Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
aaf6d9a
Rust side implementation for GoFlow search index
afg1 Oct 13, 2025
53f2fef
Python bits of the goflowllm search index export
afg1 Oct 13, 2025
ffa4e42
Add necessary sql and nextflow bits
afg1 Oct 13, 2025
7c0b270
Merge branch 'dev' into gfllm-search-export
afg1 Oct 13, 2025
ca3ee09
Update rnacentral_pipeline/cli/r2dt.py
afg1 Oct 14, 2025
bbb78e2
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 14, 2025
c40cef0
Remove unused fetching of ensembl latest release from parsing release…
afg1 Oct 14, 2025
3e818d6
Remove some debugging print statements
afg1 Oct 14, 2025
a4901b7
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 18, 2025
0efdf8d
Remove some dead code relating to finding the ensembl release number
afg1 Oct 18, 2025
f30bead
Reinstate conditionals for running r2dt
afg1 Oct 18, 2025
0c2d944
Fix other instances of release matching not being defenzive about no …
afg1 Oct 18, 2025
96d37fd
Update rnacentral_pipeline/rnacentral/r2dt/parser.py
afg1 Oct 18, 2025
29dc710
Link up go flow search export processes properly
afg1 Oct 25, 2025
6332d64
Use less stringent regex in ensembl release detection
afg1 Oct 27, 2025
cd6e07d
Remove some trailing whitespace
afg1 Oct 27, 2025
fa07512
Raise value errors instead of relying on assertions
afg1 Oct 27, 2025
b045965
Merge branch 'dev' into gfllm-search-export
afg1 Oct 27, 2025
f00a676
Fix not passing goflow data to merging process correctly
afg1 Oct 27, 2025
acdba3c
Update expected key from search export rust code
afg1 Oct 27, 2025
fa64872
Rust side CLI requires kebab-case filename to match file type enum, s…
afg1 Oct 27, 2025
77583ac
Improve documentation comment in sequence raw handler
afg1 Oct 27, 2025
ea94cdc
Cargo clippy fixes
afg1 Nov 18, 2025
e8c00c3
Remove an unused polars dependency
afg1 Nov 18, 2025
e8524b7
Merge branch 'dev' into gfllm-search-export
afg1 Dec 3, 2025
03da0dd
Merge branch 'dev' into gfllm-search-export
afg1 Dec 17, 2025
93d5a39
Fix misnamed editing events export process
afg1 Dec 17, 2025
6ba1a0a
Repair so_tree fetching
afg1 Dec 17, 2025
aaaff28
Don't use a container for goflow text search export
afg1 Dec 17, 2025
007966d
Use infernal version arg for copying binaries into final container
afg1 Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 43 additions & 251 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ ENV RIBOTIMEDIR="/usr/bin"
ENV BIOEASELDIR="$RNA/Bio-Easel/blib/lib:$RNA/Bio-Easel/blib/arch:$RNA/Bio-Easel:$RNA/Bio-Easel/lib"
ENV PERL5LIB="$BIOEASELDIR:$RIBODIR:$EPNOPTDIR:$EPNOFILEDIR:$EPNTESTDIR:$PERL5LIB"

ENV PATH="$RNA/infernal-1.1.2/bin:$PATH"
ENV PATH="$RNA/infernal-${INFERNAL_VERSION}/bin:$PATH"
ENV PATH="$RNA/blat_suite:$PATH"
ENV PATH="$RNA/seqkit:$PATH"
ENV PATH="$RNACENTRAL_IMPORT_PIPELINE/bin:$PATH"
Expand Down
13 changes: 13 additions & 0 deletions files/search-export/parts/goflow.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
COPY (
SELECT
json_build_object(
'id', todo.id,
'urs_taxid', todo.urs_taxid,
'should_show_goflow', true
)
FROM search_export_urs todo
JOIN go_flow_llm_curation_results gfllm
ON
todo.urs_taxid = gfllm.urs_taxid
ORDER by todo.id
) TO STDOUT
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not find release number in README")
release = match.group(1)
Expand Down
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)
Expand Down
4 changes: 1 addition & 3 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
.rename({"column_1": "urs"})
)

raw_tracked = pl.scan_csv(
tracked_urs.name, low_memory=True
).unique()
raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique()

to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")

Expand Down
8 changes: 6 additions & 2 deletions rnacentral_pipeline/rnacentral/r2dt/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,11 +409,15 @@ def dot_bracket(self):
seq_dot = str(record.seq)
## Use indices instead, assert that the string is even length
## If not, then the two parts are not the same length
assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}"
if len(seq_dot) % 2 != 0:
raise ValueError(f"Odd length sequence {len(seq_dot)}")
seq_dot_len = len(seq_dot)
sequence = seq_dot[0 : seq_dot_len // 2]
dot_bracket = seq_dot[(seq_dot_len // 2) :]
assert len(sequence) == len(dot_bracket)
if len(sequence) != len(dot_bracket):
raise ValueError(
f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}"
)
return dot_bracket

def basepair_count(self):
Expand Down
9 changes: 9 additions & 0 deletions rnacentral_pipeline/rnacentral/search_export/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,10 @@ def has_litsumm(litsumm):
return str(bool(litsumm))


def has_go_flow_llm_annotation(go_flow):
return str(bool(go_flow))


def has_editing_event(editing_events):
return str(bool(editing_events))

Expand Down Expand Up @@ -880,6 +884,11 @@ def edit_ref_to_edit(editing_events):
edit_repeat_type,
keys="editing_events",
),
field(
"has_go_flow_llm_annotation",
has_go_flow_llm_annotation,
keys="go_flow_llm_annotations",
),
## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
Expand Down
1 change: 0 additions & 1 deletion utils/precompute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ sorted-iter = "0.1.7"
structopt = "0.3"
strum = "0.21"
strum_macros = "0.21"
polars = { version = "0.33.2", features = ["lazy", "streaming"] }

[dev-dependencies]
rand = "0.8"
1 change: 0 additions & 1 deletion utils/precompute/src/releases.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ use anyhow::{
Result,
};

use polars::prelude::*;

#[derive(Serialize, Deserialize, Debug)]
pub struct UrsEntry {
Expand Down
11 changes: 11 additions & 0 deletions utils/search-export/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub enum Groupable {
SoInfo,
LitsummSummaries,
EditingEvents,
GoFlowAnnotation,
}

#[derive(Debug, StructOpt)]
Expand Down Expand Up @@ -145,6 +146,10 @@ enum SequenceCommand {
/// RNA editing events
editing_events: PathBuf,

#[structopt(parse(from_os_str))]
/// GoFlowLLM annotations
go_flow_llm_annotations: PathBuf,

// Add new arguments above this line!
#[structopt(parse(from_os_str))]
/// Filename to write the results to, '-' means stdout
Expand Down Expand Up @@ -260,6 +265,9 @@ fn main() -> Result<()> {
Groupable::EditingEvents => {
sequences::editing_events::group(&path, max_count, &output)?
},
Groupable::GoFlowAnnotation => {
sequences::go_flow_annotations::group(&path, max_count, &output)?
},
},
Subcommand::Sequences {
command,
Expand All @@ -280,6 +288,8 @@ fn main() -> Result<()> {
litsumm_summaries,
editing_events,
so_term_tree,
go_flow_llm_annotations,
// Add new arguments above this line!
output,
} => sequences::writers::write_merge(
vec![
Expand All @@ -298,6 +308,7 @@ fn main() -> Result<()> {
editing_events,
orfs,
so_term_tree,
go_flow_llm_annotations,
],
&output,
)?,
Expand Down
15 changes: 15 additions & 0 deletions utils/search-export/src/sequences/file_joiner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use super::{
editing_events::EditingEvent,
feedback::Feedback,
go_annotation::GoAnnotation,
go_flow_annotations::GoFlowLLMAnnotation,
interacting_protein::InteractingProtein,
interacting_rna::InteractingRna,
litsumm::LitsummSummaries,
Expand Down Expand Up @@ -98,6 +99,7 @@ pub enum FileTypes {
PublicationCount,
LitsummSummaries,
EditingEvents,
GoFlowLLMAnnotations,
SoTermTree,
}

Expand All @@ -116,6 +118,8 @@ pub struct FileJoiner<'de> {
rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
go_flow_llm_annotations:
StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
so_info: SoMapping,
}
Expand Down Expand Up @@ -203,6 +207,7 @@ impl FileJoinerBuilder {
let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;

Ok(FileJoiner {
Expand All @@ -220,6 +225,7 @@ impl FileJoinerBuilder {
publication_counts,
lit_summ,
editing_events,
go_flow_llm_annotations,
so_info,
})
}
Expand All @@ -244,6 +250,7 @@ impl<'de> Iterator for FileJoiner<'de> {
self.publication_counts.next(),
self.lit_summ.next(),
self.editing_events.next(),
self.go_flow_llm_annotations.next(),
);

match current {
Expand All @@ -262,6 +269,7 @@ impl<'de> Iterator for FileJoiner<'de> {
None,
None,
None,
None,
) => None,
(
Some(Ok(Required {
Expand Down Expand Up @@ -320,6 +328,10 @@ impl<'de> Iterator for FileJoiner<'de> {
id: id14,
data: editing_events,
})),
Some(Ok(Multiple {
id: id15,
data: goflow_llm_annotations,
})),
) => {
if id1 != id2
|| id1 != id3
Expand All @@ -334,9 +346,11 @@ impl<'de> Iterator for FileJoiner<'de> {
|| id1 != id12
|| id1 != id13
|| id1 != id14
|| id1 != id15
{
return Some(Err(Error::OutofSyncData(vec![
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
id15,
])));
}

Expand All @@ -362,6 +376,7 @@ impl<'de> Iterator for FileJoiner<'de> {
.publication_counts(publication_counts)
.litsumm_summaries(lit_summ)
.editing_events(editing_events)
.go_flow_llm_annotations(goflow_llm_annotations)
.so_tree(so_tree)
.build();

Expand Down
35 changes: 35 additions & 0 deletions utils/search-export/src/sequences/go_flow_annotations.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use serde::{
Deserialize,
Serialize,
};
use std::path::Path;

use anyhow::Result;
use rnc_core::grouper;

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct GoFlowLLMAnnotation {
pub id: usize,
urs_taxid: String,
should_show_goflow: bool,
}

impl grouper::HasIndex for GoFlowLLMAnnotation {
fn index(&self) -> usize {
self.id
}
}

pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, &path, 1, max, &output)
}

impl GoFlowLLMAnnotation {
pub fn should_show_goflow(&self) -> bool {
self.should_show_goflow
}

pub fn urs_taxid(&self) -> &str {
&self.urs_taxid
}
}
1 change: 1 addition & 0 deletions utils/search-export/src/sequences/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod editing_events;
pub mod feedback;
pub mod file_joiner;
pub mod go_annotation;
pub mod go_flow_annotations;
pub mod interacting_protein;
pub mod interacting_rna;
pub mod litsumm;
Expand Down
3 changes: 3 additions & 0 deletions utils/search-export/src/sequences/normalized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use crate::{
editing_events::EditingEvent,
feedback::FeedbackVec,
go_annotation::GoAnnotation,
go_flow_annotations::GoFlowLLMAnnotation,
interacting_protein::InteractingProtein,
interacting_rna::InteractingRna,
litsumm::LitsummSummaries,
Expand Down Expand Up @@ -80,6 +81,7 @@ pub struct Normalized {
publication_count: usize,
litsumm: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_rna_type_tree: so_tree::SoTree,

#[serde(flatten)]
Expand Down Expand Up @@ -140,6 +142,7 @@ impl Normalized {
rfam_hits: raw.rfam_hits().iter().cloned().collect(),
orfs: raw.orfs().iter().cloned().collect(),
litsumm: raw.litsumm_summaries().to_vec(),
go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
editing_events: raw.editing_events().to_vec(),
})
}
Expand Down
7 changes: 7 additions & 0 deletions utils/search-export/src/sequences/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use crate::sequences::{
editing_events::EditingEvent,
feedback::Feedback,
go_annotation::GoAnnotation,
go_flow_annotations::GoFlowLLMAnnotation,
interacting_protein::InteractingProtein,
interacting_rna::InteractingRna,
litsumm::LitsummSummaries,
Expand Down Expand Up @@ -46,6 +47,7 @@ pub struct Raw {
publication_counts: Option<PublicationCount>,
litsumm_summaries: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_tree: so_tree::SoTree,
}

Expand Down Expand Up @@ -148,6 +150,11 @@ impl Raw {
&self.editing_events
}

/// Get a reference to the raw's GoFlowlLM annotations
pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] {
&self.go_flow_llm_annotations
}

/// Get this raw's publication count.
pub fn publication_count(&self) -> usize {
self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0)
Expand Down
Loading
Loading