diff --git a/Cargo.lock b/Cargo.lock index b7731b06b..2b75281e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -188,35 +188,6 @@ dependencies = [ "zstd", ] -[[package]] -name = "arrow2" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" -dependencies = [ - "ahash", - "arrow-format", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "foreign_vec", - "getrandom 0.2.16", - "hash_hasher", - "hashbrown 0.14.5", - "lexical-core", - "lz4", - "multiversion", - "num-traits", - "regex", - "regex-syntax 0.7.5", - "rustc_version 0.4.1", - "simdutf8", - "strength_reduce", - "zstd", -] - [[package]] name = "async-trait" version = "0.1.89" @@ -1712,28 +1683,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" dependencies = [ "getrandom 0.2.16", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-lazy 0.32.1", - "polars-ops 0.32.1", - "polars-sql 0.32.1", - "polars-time 0.32.1", - "version_check", -] - -[[package]] -name = "polars" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3030de163b9ff2c9dac9a12dcb9be25cc0f2bc7c8e7cd2e4b2592ebed458ce6a" -dependencies = [ - "getrandom 0.2.16", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-lazy 0.33.2", - "polars-ops 0.33.2", - "polars-sql 0.33.2", - "polars-time 0.33.2", + "polars-core", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-sql", + "polars-time", "version_check", ] @@ -1743,26 +1698,11 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" dependencies = [ - "arrow2 0.17.4", - "hashbrown 0.14.5", - "multiversion", - "num-traits", - "polars-error 0.32.1", - "thiserror", - "version_check", -] - -[[package]] -name = "polars-arrow" 
-version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35cd38a64fb389fd990e4efd433a36331c995c981d353bfef83b5de4d87f1828" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "hashbrown 0.14.5", "multiversion", "num-traits", - "polars-error 0.33.2", + "polars-error", "thiserror", "version_check", ] @@ -1833,18 +1773,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" dependencies = [ - "arrow2 0.17.4", - "regex", - "thiserror", -] - -[[package]] -name = "polars-error" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b20a09651a299979354945819dc2ce017964b80b916954e9d2ce39002a5f949" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "regex", "thiserror", ] @@ -1856,7 +1785,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" dependencies = [ "ahash", - "arrow2 0.17.4", + "arrow2", "async-trait", "bytes", "chrono", @@ -1869,45 +1798,17 @@ dependencies = [ "memmap2", "num-traits", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-error 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", + "polars-arrow", + "polars-core", + "polars-error", + "polars-time", + "polars-utils", "rayon", "regex", "simdutf8", "tokio", ] -[[package]] -name = "polars-io" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf4a89c18a90ac20dfbcdfd19ab50ad4ac5a76fc7bb775d3c28bb738cf1f34" -dependencies = [ - "ahash", - "arrow2 0.18.0", - "bytes", - "chrono", - "fast-float", - "home", - "lexical", - "lexical-core", - "memchr", - "memmap2", - "num-traits", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-error 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", - "rayon", - "regex", - "simdutf8", -] - 
[[package]] name = "polars-lazy" version = "0.32.1" @@ -1961,7 +1862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" dependencies = [ "argminmax", - "arrow2 0.17.4", + "arrow2", "either", "indexmap 2.11.1", "memchr", @@ -2002,36 +1903,13 @@ dependencies = [ "enum_dispatch", "hashbrown 0.14.5", "num-traits", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-plan 0.32.1", - "polars-row 0.32.1", - "polars-utils 0.32.1", - "rayon", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-pipe" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f30c5e77c5594ddc958a46fe2e021da2feba9c94e767e1d798bd82ac5a33c3b" -dependencies = [ - "crossbeam-channel", - "crossbeam-queue", - "enum_dispatch", - "hashbrown 0.14.5", - "num-traits", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-plan 0.33.2", - "polars-row 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", "rayon", "smartstring", "version_check", @@ -2044,36 +1922,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" dependencies = [ "ahash", - "arrow2 0.17.4", - "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", - "rayon", - "regex", - "smartstring", - "strum_macros 0.25.3", - "version_check", -] - -[[package]] -name = "polars-plan" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678cbeb730e29e50f0f8d844102d15454fc6113a74c667eab046c0e4a4322a9e" -dependencies = [ - "ahash", - "arrow2 0.18.0", + "arrow2", 
"once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", "rayon", "regex", "smartstring", @@ -2087,20 +1943,9 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" dependencies = [ - "arrow2 0.17.4", - "polars-error 0.32.1", - "polars-utils 0.32.1", -] - -[[package]] -name = "polars-row" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c52ef8885b9d13f848839594fbab21ad79fc63f7e11c19cdc2cfe9bb03c313ac" -dependencies = [ - "arrow2 0.18.0", - "polars-error 0.33.2", - "polars-utils 0.33.2", + "arrow2", + "polars-error", + "polars-utils", ] [[package]] @@ -2109,25 +1954,10 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" dependencies = [ - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-lazy 0.32.1", - "polars-plan 0.32.1", - "serde", - "serde_json", - "sqlparser", -] - -[[package]] -name = "polars-sql" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d716855267e3516f722287f68cf10e650e33f7197df83a79e680602471456fc" -dependencies = [ - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-lazy 0.33.2", - "polars-plan 0.33.2", + "polars-arrow", + "polars-core", + "polars-lazy", + "polars-plan", "serde", "serde_json", "sqlparser", @@ -2139,34 +1969,15 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" dependencies = [ - "arrow2 0.17.4", - "atoi", - "chrono", - "now", - "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - 
"polars-ops 0.32.1", - "polars-utils 0.32.1", - "regex", - "smartstring", -] - -[[package]] -name = "polars-time" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb75a24f11b55a400b52dc19a2a3e949aaaa46a911f99496de4485b1127063" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "atoi", "chrono", "now", "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-ops 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-ops", + "polars-utils", "regex", "smartstring", ] @@ -2181,25 +1992,7 @@ dependencies = [ "hashbrown 0.14.5", "num-traits", "once_cell", - "polars-error 0.32.1", - "rayon", - "smartstring", - "sysinfo", - "version_check", -] - -[[package]] -name = "polars-utils" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4a5e743509096322cad39104d56e329fe2748483a3354a0f0c354724f3cef6" -dependencies = [ - "ahash", - "bytemuck", - "hashbrown 0.14.5", - "num-traits", - "once_cell", - "polars-error 0.33.2", + "polars-error", "rayon", "smartstring", "sysinfo", @@ -2223,7 +2016,6 @@ dependencies = [ "csv", "itertools 0.10.5", "log", - "polars 0.33.2", "rand 0.8.5", "rnc-core", "rnc-utils", diff --git a/Dockerfile b/Dockerfile index a0b21c868..e70280993 100644 --- a/Dockerfile +++ b/Dockerfile @@ -145,7 +145,7 @@ ENV RIBOTIMEDIR="/usr/bin" ENV BIOEASELDIR="$RNA/Bio-Easel/blib/lib:$RNA/Bio-Easel/blib/arch:$RNA/Bio-Easel:$RNA/Bio-Easel/lib" ENV PERL5LIB="$BIOEASELDIR:$RIBODIR:$EPNOPTDIR:$EPNOFILEDIR:$EPNTESTDIR:$PERL5LIB" -ENV PATH="$RNA/infernal-1.1.2/bin:$PATH" +ENV PATH="$RNA/infernal-${INFERNAL_VERSION}/bin:$PATH" ENV PATH="$RNA/blat_suite:$PATH" ENV PATH="$RNA/seqkit:$PATH" ENV PATH="$RNACENTRAL_IMPORT_PIPELINE/bin:$PATH" diff --git a/files/search-export/parts/goflow.sql b/files/search-export/parts/goflow.sql new file mode 100644 index 000000000..9999d8fd4 --- /dev/null +++ b/files/search-export/parts/goflow.sql @@ -0,0 +1,13 @@ 
+COPY ( + SELECT + json_build_object( + 'id', todo.id, + 'urs_taxid', todo.urs_taxid, + 'should_show_goflow', true + ) + FROM search_export_urs todo + JOIN go_flow_llm_curation_results gfllm + ON + todo.urs_taxid = gfllm.urs_taxid + ORDER by todo.id +) TO STDOUT diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 202ac5209..fe034337b 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -30,8 +30,9 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." + match = re.search(pattern, cur_readme, re.IGNORECASE) + if not match: raise ValueError("Could not find release number in README") release = match.group(1) diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py index e338038f2..a1b3eb7fc 100644 --- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py +++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py @@ -29,8 +29,9 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." 
+ match = re.search(pattern, cur_readme, re.IGNORECASE) + if not match: raise ValueError("Could not determine latest Ensembl release from README") release = match.group(1) diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index 190f00c98..3d74880b3 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): .rename({"column_1": "urs"}) ) - raw_tracked = pl.scan_csv( - tracked_urs.name, low_memory=True - ).unique() + raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique() to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti") diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index dcb47486d..ed204163c 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -409,11 +409,15 @@ def dot_bracket(self): seq_dot = str(record.seq) ## Use indices instead, assert that the string is even length ## If not, then the two parts are not the same length - assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}" + if len(seq_dot) % 2 != 0: + raise ValueError(f"Odd length sequence {len(seq_dot)}") seq_dot_len = len(seq_dot) sequence = seq_dot[0 : seq_dot_len // 2] dot_bracket = seq_dot[(seq_dot_len // 2) :] - assert len(sequence) == len(dot_bracket) + if len(sequence) != len(dot_bracket): + raise ValueError( + f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}" + ) return dot_bracket def basepair_count(self): diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py index 32c1ad958..dd1b4824c 100644 --- a/rnacentral_pipeline/rnacentral/search_export/data.py +++ b/rnacentral_pipeline/rnacentral/search_export/data.py @@ -709,6 +709,10 @@ def 
has_litsumm(litsumm): return str(bool(litsumm)) +def has_go_flow_llm_annotation(go_flow): + return str(bool(go_flow)) + + def has_editing_event(editing_events): return str(bool(editing_events)) @@ -880,6 +884,11 @@ def edit_ref_to_edit(editing_events): edit_repeat_type, keys="editing_events", ), + field( + "has_go_flow_llm_annotation", + has_go_flow_llm_annotation, + keys="go_flow_llm_annotations", + ), ## Add new fields above this line! Otherwise editing the produced xml is hard. tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"), ], diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml index a7fbdd22e..36ae91b23 100644 --- a/utils/precompute/Cargo.toml +++ b/utils/precompute/Cargo.toml @@ -20,7 +20,6 @@ sorted-iter = "0.1.7" structopt = "0.3" strum = "0.21" strum_macros = "0.21" -polars = { version = "0.33.2", features = ["lazy", "streaming"] } [dev-dependencies] rand = "0.8" diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs index becf9a9c8..5f55755db 100644 --- a/utils/precompute/src/releases.rs +++ b/utils/precompute/src/releases.rs @@ -26,7 +26,6 @@ use anyhow::{ Result, }; -use polars::prelude::*; #[derive(Serialize, Deserialize, Debug)] pub struct UrsEntry { diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs index 16733b393..30f6b19c9 100644 --- a/utils/search-export/src/main.rs +++ b/utils/search-export/src/main.rs @@ -34,6 +34,7 @@ pub enum Groupable { SoInfo, LitsummSummaries, EditingEvents, + GoFlowAnnotation, } #[derive(Debug, StructOpt)] @@ -145,6 +146,10 @@ enum SequenceCommand { /// RNA editing events editing_events: PathBuf, + #[structopt(parse(from_os_str))] + /// GoFlowLLM annotations + go_flow_llm_annotations: PathBuf, + // Add new arguments above this line! 
#[structopt(parse(from_os_str))] /// Filename to write the results to, '-' means stdout @@ -260,6 +265,9 @@ fn main() -> Result<()> { Groupable::EditingEvents => { sequences::editing_events::group(&path, max_count, &output)? }, + Groupable::GoFlowAnnotation => { + sequences::go_flow_annotations::group(&path, max_count, &output)? + }, }, Subcommand::Sequences { command, @@ -280,6 +288,8 @@ fn main() -> Result<()> { litsumm_summaries, editing_events, so_term_tree, + go_flow_llm_annotations, + // Add new arguments above this line! output, } => sequences::writers::write_merge( vec![ @@ -298,6 +308,7 @@ fn main() -> Result<()> { editing_events, orfs, so_term_tree, + go_flow_llm_annotations, ], &output, )?, diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs index 13723fea7..368877037 100644 --- a/utils/search-export/src/sequences/file_joiner.rs +++ b/utils/search-export/src/sequences/file_joiner.rs @@ -37,6 +37,7 @@ use super::{ editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -98,6 +99,7 @@ pub enum FileTypes { PublicationCount, LitsummSummaries, EditingEvents, + GoFlowLLMAnnotations, SoTermTree, } @@ -116,6 +118,8 @@ pub struct FileJoiner<'de> { rfam_hits: StreamDeserializer<'de, IoRead>, Grouped>, publication_counts: StreamDeserializer<'de, IoRead>, Grouped>, lit_summ: StreamDeserializer<'de, IoRead>, Grouped>, + go_flow_llm_annotations: + StreamDeserializer<'de, IoRead>, Grouped>, editing_events: StreamDeserializer<'de, IoRead>, Grouped>, so_info: SoMapping, } @@ -203,6 +207,7 @@ impl FileJoinerBuilder { let publication_counts = self.iterator_for(FileTypes::PublicationCount)?; let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?; let editing_events = self.iterator_for(FileTypes::EditingEvents)?; + let 
go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?; let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?; Ok(FileJoiner { @@ -220,6 +225,7 @@ impl FileJoinerBuilder { publication_counts, lit_summ, editing_events, + go_flow_llm_annotations, so_info, }) } @@ -244,6 +250,7 @@ impl<'de> Iterator for FileJoiner<'de> { self.publication_counts.next(), self.lit_summ.next(), self.editing_events.next(), + self.go_flow_llm_annotations.next(), ); match current { @@ -262,6 +269,7 @@ impl<'de> Iterator for FileJoiner<'de> { None, None, None, + None, ) => None, ( Some(Ok(Required { @@ -320,6 +328,10 @@ impl<'de> Iterator for FileJoiner<'de> { id: id14, data: editing_events, })), + Some(Ok(Multiple { + id: id15, + data: goflow_llm_annotations, + })), ) => { if id1 != id2 || id1 != id3 @@ -334,9 +346,11 @@ impl<'de> Iterator for FileJoiner<'de> { || id1 != id12 || id1 != id13 || id1 != id14 + || id1 != id15 { return Some(Err(Error::OutofSyncData(vec![ id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, + id15, ]))); } @@ -362,6 +376,7 @@ impl<'de> Iterator for FileJoiner<'de> { .publication_counts(publication_counts) .litsumm_summaries(lit_summ) .editing_events(editing_events) + .go_flow_llm_annotations(goflow_llm_annotations) .so_tree(so_tree) .build(); diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs new file mode 100644 index 000000000..5213ed13d --- /dev/null +++ b/utils/search-export/src/sequences/go_flow_annotations.rs @@ -0,0 +1,35 @@ +use serde::{ + Deserialize, + Serialize, +}; +use std::path::Path; + +use anyhow::Result; +use rnc_core::grouper; + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GoFlowLLMAnnotation { + pub id: usize, + urs_taxid: String, + should_show_goflow: bool, +} + +impl grouper::HasIndex for GoFlowLLMAnnotation { + fn index(&self) -> usize { + self.id + } +} + +pub fn group(path: 
&Path, max: usize, output: &Path) -> Result<()> { + grouper::group::(grouper::Criteria::AnyNumber, &path, 1, max, &output) +} + +impl GoFlowLLMAnnotation { + pub fn should_show_goflow(&self) -> bool { + self.should_show_goflow + } + + pub fn urs_taxid(&self) -> &str { + &self.urs_taxid + } +} diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs index b3febfd50..a00fbe3d5 100644 --- a/utils/search-export/src/sequences/mod.rs +++ b/utils/search-export/src/sequences/mod.rs @@ -5,6 +5,7 @@ pub mod editing_events; pub mod feedback; pub mod file_joiner; pub mod go_annotation; +pub mod go_flow_annotations; pub mod interacting_protein; pub mod interacting_rna; pub mod litsumm; diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index 792e090c4..65123b69c 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -38,6 +38,7 @@ use crate::{ editing_events::EditingEvent, feedback::FeedbackVec, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -80,6 +81,7 @@ pub struct Normalized { publication_count: usize, litsumm: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_rna_type_tree: so_tree::SoTree, #[serde(flatten)] @@ -140,6 +142,7 @@ impl Normalized { rfam_hits: raw.rfam_hits().iter().cloned().collect(), orfs: raw.orfs().iter().cloned().collect(), litsumm: raw.litsumm_summaries().to_vec(), + go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(), editing_events: raw.editing_events().to_vec(), }) } diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index 4daf502d0..52bea2f9f 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -16,6 +16,7 @@ use crate::sequences::{ 
editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -46,6 +47,7 @@ pub struct Raw { publication_counts: Option, litsumm_summaries: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_tree: so_tree::SoTree, } @@ -148,6 +150,11 @@ impl Raw { &self.editing_events } + /// Get a reference to the raw's GoFlowLLM annotations + pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] { + &self.go_flow_llm_annotations + } + /// Get this raw's publication count. pub fn publication_count(&self) -> usize { self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0) diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 768507457..704d3a434 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -65,13 +65,14 @@ process build_metadata { path(text) path(litsumm) path(editing_events) + path(go_flow_annotations) path(so_tree) output: path("merged.json") """ - search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events merged.json + search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events $go_flow_annotations merged.json """ } @@ -141,10 +142,25 @@ process litsumm_summaries { """ } +process go_flow_annotations { + container '' + input: + val(max_count) + path (query) + + output: + path("go-flow-llm-annotations.json") + + """ + psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json + search-export group go-flow-annotation raw.json ${max_count} go-flow-llm-annotations.json + """ +} + process editing_events { input: val(max_count) - path(query) + path (query) output: path("editing-events.json") @@ -202,6 +218,7 @@ 
workflow sequences { Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql } Channel.fromPath('files/search-export/parts/litsumm.sql') | set { litsumm_sql } Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql } + Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql } Channel.fromPath('files/search-export/so-rna-types.sql') | set { so_sql } Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql } @@ -233,6 +250,7 @@ workflow sequences { text_mining_query(search_count, text_sql), litsumm_summaries(search_count, litsumm_sql), editing_events(search_count, editing_events_sql), + go_flow_annotations(search_count, goflow_sql), so_tree, )\ | set { metadata }