RNAcentral · blakesweeney · Jun 18, 2024 · Jun 21, 2024
diff --git a/rnacentral_pipeline/cli/__init__.py b/rnacentral_pipeline/cli/__init__.py
@@ -41,6 +41,7 @@
     mgnify,
     mirbase,
     mirgenedb,
+    mirtrondb,
     misc,
     ncbi,
     notify,
@@ -54,7 +55,6 @@
     qa,
     quickgo,
     r2dt,
-    rediportal,
     refseq,
     release,
     repeats,
@@ -117,6 +117,7 @@ def cli(log_level):
 cli.add_command(mgnify.cli)
 cli.add_command(mirbase.cli)
 cli.add_command(mirgenedb.cli)
+cli.add_command(mirtrondb.cli)
 cli.add_command(misc.crs_data)
 cli.add_command(misc.find_upi_ranges)
 cli.add_command(misc.validate_pgloader)

diff --git a/rnacentral_pipeline/cli/mirtrondb.py b/rnacentral_pipeline/cli/mirtrondb.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright [2009-2024] EMBL-European Bioinformatics Institute
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from pathlib import Path
+
+import click
+
+from rnacentral_pipeline.databases.mirtrondb import parser
+from rnacentral_pipeline.writers import entry_writer
+
+
+# Root of the `mirtrondb` sub-commands; `parse` below registers itself on it.
+@click.group(name="mirtrondb")
+def cli():
+    """Commands for parsing mirtrondb data."""
+    # No body needed: the group exists only to namespace its sub-commands.
+
+
+@cli.command(name="parse")
+@click.argument("tsv", type=click.File(mode="r"))
+@click.argument(
+    "output",
+    default=".",
+    type=click.Path(file_okay=False, dir_okay=True, writable=True),
+)
+def process_json_schema(tsv, output):
+    """
+    This parses the TSV file containing all sequences to produce the CSV we can
+    import.
+    """
+    parsed = parser.parse(tsv)  # generator: nothing is read until it is iterated
+    with entry_writer(Path(output)) as sink:  # `output` is a directory path
+        sink.write(parsed)
diff --git a/rnacentral_pipeline/databases/data/databases.py b/rnacentral_pipeline/databases/data/databases.py
@@ -58,6 +58,7 @@ class Database(enum.Enum):
     mgnify = DatabaseValue(55, "MGNIFY")
     mirbase = DatabaseValue(22, "miRBase")
     mirgenedb = DatabaseValue(23, "MirGeneDB")
+    mirtrondb = DatabaseValue(56, "mirtronDB")
     modomics = DatabaseValue(24, "Modomics")
     noncode = DatabaseValue(25, "NONCODE")
     pdbe = DatabaseValue(26, "PDBe")

diff --git a/rnacentral_pipeline/databases/mirtrondb/__init__.py b/rnacentral_pipeline/databases/mirtrondb/__init__.py
diff --git a/rnacentral_pipeline/databases/mirtrondb/parser.py b/rnacentral_pipeline/databases/mirtrondb/parser.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+
+# Copyright [2009-2024] EMBL-European Bioinformatics Institute
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import logging
+import re
+import typing as ty
+
+from rnacentral_pipeline.databases.data import Entry, RelatedCoordinate, RelatedSequence
+
+LOGGER = logging.getLogger(__name__)
+# Value of the `type` column -> `SO:` identifier passed to Entry as `rna_type`.
+RNA_TYPES = {
+    "mature": "SO:0000276",
+    "precursor": "SO:0001244",
+}
+# Value of the `specie` column -> integer passed to Entry as `ncbi_tax_id`.
+SPECIES = {
+    "A. thaliana": 3702,
+    "B. taurus": 9913,
+    "C. elegans": 6239,
+    "C. familiaris": 9615,
+    "D. melanogaster": 7227,
+    "D. pseudoobscura": 7237,
+    "D. rerio": 7955,
+    "D. simulans": 7240,
+    "G. gallus": 9031,
+    "H. sapiens": 9606,
+    "M. esculenta": 3983,
+    "M. mulatta": 9544,
+    "M. musculus": 10090,
+    "M. truncatula": 3880,
+    "O. sativa": 4530,
+    "P. troglodytes": 9598,
+    "S. Italica": 4555,  # parse() looks keys up exactly, so spelling/case matter
+    "S. scrofa": 9823,
+}
+
+
+def text_value(row: ty.Dict[str, str], name: str) -> str | None:
+    """Return `row[name]` stripped of whitespace, or None when that is "-"."""
+    cleaned = row[name].strip()
+    # A lone dash is this function's marker for "no value".
+    return None if cleaned == "-" else cleaned
+
+
+def find_coords(id: str, target: str, query: str) -> ty.List[RelatedCoordinate]:
+    """Return [coordinate] of the first `query` match in `target`, or [] if absent."""
+    if (start := target.find(query)) < 0:  # one scan instead of `in` + `index`
+        LOGGER.warning("Mature not found in precursor for %s", id)
+        return []
+    return [RelatedCoordinate(start=start, stop=start + len(query))]
+
+
+def parse(handle: ty.IO):
+    """Parse a mirtronDB tabular export, yielding precursor then mature entries.
+
+    Expects a blank line, then `##mirtronDB tabular format`, then tab separated
+    rows with a header. Mature entries are cross-linked to the precursor named
+    like them minus the `-3p`/`-5p` label. Raises ValueError on a bad preamble
+    or repeated name; KeyError on a species/type absent from SPECIES/RNA_TYPES.
+    """
+    # Explicit raises rather than asserts: asserts are stripped by `python -O`.
+    blank = handle.readline().strip()
+    if blank:
+        raise ValueError(f"Invalid first line `{blank}`")
+    notification = handle.readline().strip()
+    if notification != "##mirtronDB tabular format":
+        raise ValueError(f"Invalid format line `{notification}`")
+    pre, mature = {}, {}
+    for raw in csv.DictReader(handle, delimiter="\t"):
+        rna_type = raw["type"].strip()
+        name = raw["name"].strip()
+        species = raw["specie"].strip()
+        host_gene = raw["host gene"].strip()
+        description = f"{species} {name} {rna_type} miRNA"
+        if host_gene:
+            description += f" ({host_gene})"
+        entry = Entry(
+            primary_id=f"MIRTRONDB:{raw['id'].strip()}",
+            accession=name,
+            ncbi_tax_id=SPECIES[species],
+            database="MIRTRONDB",
+            sequence=raw["sequence"].strip(),
+            regions=[],
+            rna_type=RNA_TYPES[rna_type],
+            url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={name}",
+            seq_version="1",
+            gene=host_gene,
+            description=description,
+        )
+        if entry.accession in pre or entry.accession in mature:
+            raise ValueError(f"Duplicate accession {entry.accession}")
+        if rna_type == "mature":
+            mature[entry.accession] = entry
+        elif rna_type == "precursor":
+            pre[entry.accession] = entry
+        else:
+            raise ValueError(f"Cannot handle {raw}")
+    # Link each mature entry and its precursor to one another.
+    for mature_id, entry in mature.items():
+        pre_id = re.sub(r"-[35]p", "", mature_id)
+        pre_entry = pre.get(pre_id)
+        if pre_entry is None:
+            LOGGER.warning("Failed to find precursor for %s", mature_id)
+            continue
+        coords = find_coords(mature_id, pre_entry.sequence, entry.sequence)
+        mature_link = RelatedSequence(
+            sequence_id=mature_id, relationship="mature_product", coordinates=coords
+        )
+        pre_entry.related_sequences.append(mature_link)
+        entry.related_sequences.append(
+            RelatedSequence(sequence_id=pre_id, relationship="precursor")
+        )
+    yield from pre.values()
+    yield from mature.values()
diff --git a/workflows/databases/mirtrondb.nf b/workflows/databases/mirtrondb.nf
@@ -0,0 +1,11 @@
+process mirtrondb {
+  when: { params.databases.mirtrondb.run }  // guard read from pipeline params
+  // Output: every CSV file present in the task directory once the script ends.
+  output:
+  path('*.csv')
+  // Script: copy the configured source file to all.tsv and parse it into '.'.
+  """
+  cp ${params.databases.mirtrondb.remote} all.tsv
+  rnac mirtrondb parse all.tsv .
+  """
+}
diff --git a/workflows/parse-databases.nf b/workflows/parse-databases.nf
@@ -15,6 +15,7 @@ include { lncipedia } from './databases/lncipedia'
 include { mgnify } from './databases/mgnify'
 include { mirbase } from './databases/mirbase'
 include { mirgenedb } from './databases/mirgenedb'
+include { mirtrondb } from './databases/mirtrondb'
 include { pdbe } from './databases/pdbe'
 include { pirbase } from './databases/pirbase'
 include { plncdb } from './databases/plncdb'
@@ -78,6 +79,7 @@ workflow parse_databases {
       mirbase(),
       mgnify(),
       mirgenedb(),
+      mirtrondb(),
       pdbe(),
       pirbase(),
       plncdb(),