@click.group("mirtrondb")
def cli():
    """
    Commands for parsing mirtrondb data.
    """


@cli.command("parse")
@click.argument("tsv", type=click.File("r"))
@click.argument(
    "output",
    default=".",
    type=click.Path(writable=True, dir_okay=True, file_okay=False),
)
def process_json_schema(tsv, output):
    """
    This parses the TSV file containing all sequences to produce the CSV we can
    import.
    """
    # `tsv` arrives as an already-open text handle (click.File("r")) and
    # `output` as a directory path string (click.Path with file_okay=False).
    records = parser.parse(tsv)
    destination = Path(output)
    with entry_writer(destination) as sink:
        sink.write(records)
LOGGER = logging.getLogger(__name__)

# Record type as written in the `type` column -> SO term id stored on the entry.
RNA_TYPES = {
    "mature": "SO:0000276",
    "precursor": "SO:0001244",
}

# Species label as written in the `specie` column -> value used for
# `ncbi_tax_id`. Keys must match the export exactly (including case).
SPECIES = {
    "A. thaliana": 3702,
    "B. taurus": 9913,
    "C. elegans": 6239,
    "C. familiaris": 9615,
    "D. melanogaster": 7227,
    "D. pseudoobscura": 7237,
    "D. rerio": 7955,
    "D. simulans": 7240,
    "G. gallus": 9031,
    "H. sapiens": 9606,
    "M. esculenta": 3983,
    "M. mulatta": 9544,
    "M. musculus": 10090,
    "M. truncatula": 3880,
    "O. sativa": 4530,
    "P. troglodytes": 9598,
    "S. Italica": 4555,
    "S. scrofa": 9823,
}


def text_value(row: ty.Dict[str, str], name: str) -> ty.Optional[str]:
    """Return the stripped value of column `name`, or None when it is "-".

    Raises:
        KeyError: if `name` is not a key of `row`.
    """
    value = row[name].strip()
    if value == "-":
        return None
    return value


def find_coords(id: str, target: str, query: str) -> "ty.List[RelatedCoordinate]":
    """Locate `query` inside `target` and describe where it sits.

    Args:
        id: Identifier used only in the log message when nothing is found.
        target: The sequence to search in.
        query: The sequence to search for.

    Returns:
        A one-element list holding a RelatedCoordinate for the first
        occurrence, with `start` being the 0-based string index and `stop`
        being `start + len(query)`; an empty list when `query` does not occur.
    """
    # A single scan: str.find returns -1 instead of raising when absent.
    start = target.find(query)
    if start < 0:
        LOGGER.warning("Mature not found in precusor for %s", id)
        return []
    return [RelatedCoordinate(start=start, stop=start + len(query))]


def _check_header(handle: ty.IO) -> None:
    """Consume and validate the two preamble lines that precede the table.

    Raises:
        ValueError: if the first line is not blank or the second line is not
            the `##mirtronDB tabular format` marker.
    """
    blank = handle.readline().strip()
    if blank:
        raise ValueError(f"Invalid first line `{blank}`")
    notification = handle.readline().strip()
    if notification != "##mirtronDB tabular format":
        raise ValueError(f"Invalid format line `{notification}`")


def _build_entry(raw: ty.Dict[str, str]) -> "ty.Tuple[str, Entry]":
    """Turn one table row into `(rna_type, Entry)`.

    Raises:
        ValueError: if the row's `type` is not one of RNA_TYPES.
        KeyError: if a column is missing or the species is not in SPECIES.
    """
    rna_type = raw["type"].strip()
    # Reject unknown types before anything else so the caller gets the
    # descriptive error rather than a bare KeyError from the lookup below.
    if rna_type not in RNA_TYPES:
        raise ValueError(f"Cannot handle {raw}")

    name = raw["name"].strip()
    species = raw["specie"].strip()
    host_gene = raw["host gene"].strip()

    description = f"{species} {name} {rna_type} miRNA"
    if host_gene:
        description += f" ({host_gene})"

    entry = Entry(
        primary_id=f"MIRTRONDB:{raw['id'].strip()}",
        accession=name,
        ncbi_tax_id=SPECIES[species],
        database="MIRTRONDB",
        sequence=raw["sequence"].strip(),
        regions=[],
        rna_type=RNA_TYPES[rna_type],
        url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={name}",
        seq_version="1",
        gene=host_gene,
        description=description,
    )
    return rna_type, entry


def parse(handle: ty.IO) -> "ty.Iterator[Entry]":
    """Parse a mirtronDB tabular export into entries.

    The input must start with a blank line followed by the line
    `##mirtronDB tabular format`; the remainder is read as a tab-separated
    table with a header row. Every row becomes one entry. After all rows are
    read, each mature entry is linked to the precursor whose accession equals
    the mature accession with every `-3p`/`-5p` removed: the precursor gets a
    `mature_product` relation (with coordinates when the mature sequence is
    found inside it) and the mature entry gets a `precursor` relation. Mature
    entries without a matching precursor are logged and left unlinked.

    Yields:
        All precursor entries first, then all mature entries.

    Raises:
        ValueError: on a malformed preamble, an unknown `type` value, or an
            accession that appears more than once.
        KeyError: on a missing column or a species not listed in SPECIES.
    """
    _check_header(handle)
    reader = csv.DictReader(handle, delimiter="\t")

    pre = {}
    mature = {}
    by_type = {"mature": mature, "precursor": pre}
    for raw in reader:
        rna_type, entry = _build_entry(raw)
        # Accessions must be unique across both kinds, because the linking
        # step below looks entries up by accession.
        if entry.accession in pre or entry.accession in mature:
            raise ValueError(f"Duplicate accession {entry.accession}")
        by_type[rna_type][entry.accession] = entry

    for mature_id, entry in mature.items():
        pre_id = re.sub(r"-[35]p", "", mature_id)
        pre_entry = pre.get(pre_id)
        if pre_entry is None:
            LOGGER.warning("Failed to find precursor for %s", mature_id)
            continue

        pre_entry.related_sequences.append(
            RelatedSequence(
                sequence_id=mature_id,
                relationship="mature_product",
                coordinates=find_coords(
                    mature_id, pre_entry.sequence, entry.sequence
                ),
            )
        )
        entry.related_sequences.append(
            RelatedSequence(
                sequence_id=pre_id,
                relationship="precursor",
            )
        )

    yield from pre.values()
    yield from mature.values()
+ """ +} diff --git a/workflows/parse-databases.nf b/workflows/parse-databases.nf index a76cfde49..250280959 100644 --- a/workflows/parse-databases.nf +++ b/workflows/parse-databases.nf @@ -15,6 +15,7 @@ include { lncipedia } from './databases/lncipedia' include { mgnify } from './databases/mgnify' include { mirbase } from './databases/mirbase' include { mirgenedb } from './databases/mirgenedb' +include { mirtrondb } from './databases/mirtrondb' include { pdbe } from './databases/pdbe' include { pirbase } from './databases/pirbase' include { plncdb } from './databases/plncdb' @@ -78,6 +79,7 @@ workflow parse_databases { mirbase(), mgnify(), mirgenedb(), + mirtrondb(), pdbe(), pirbase(), plncdb(),