Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion rnacentral_pipeline/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
mgnify,
mirbase,
mirgenedb,
mirtrondb,
misc,
ncbi,
notify,
Expand All @@ -54,7 +55,6 @@
qa,
quickgo,
r2dt,
rediportal,
refseq,
release,
repeats,
Expand Down Expand Up @@ -117,6 +117,7 @@ def cli(log_level):
cli.add_command(mgnify.cli)
cli.add_command(mirbase.cli)
cli.add_command(mirgenedb.cli)
cli.add_command(mirtrondb.cli)
cli.add_command(misc.crs_data)
cli.add_command(misc.find_upi_ranges)
cli.add_command(misc.validate_pgloader)
Expand Down
45 changes: 45 additions & 0 deletions rnacentral_pipeline/cli/mirtrondb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-

"""
Copyright [2009-2024] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from pathlib import Path

import click

from rnacentral_pipeline.databases.mirtrondb import parser
from rnacentral_pipeline.writers import entry_writer


# Click group that collects the mirtronDB subcommands defined in this module
# (attached below through ``@cli.command``).
# NOTE(review): click normally shows the docstring as the group's ``--help``
# text, so treat its wording as user-facing.
@click.group("mirtrondb")
def cli():
    """
    Commands for parsing mirtrondb data.
    """


@cli.command("parse")
@click.argument("tsv", type=click.File("r"))
@click.argument(
    "output",
    default=".",
    type=click.Path(writable=True, dir_okay=True, file_okay=False),
)
def process_json_schema(tsv, output):
    """
    This parses the TSV file containing all sequences to produce the CSV we can
    import.
    """
    # The command is exposed to users as ``parse`` (see the decorator); the
    # Python function name is not what appears on the command line.
    # ``tsv`` arrives as an already opened text handle and ``output`` as a
    # directory path (current directory when omitted).
    destination = Path(output)
    with entry_writer(destination) as sink:
        sink.write(parser.parse(tsv))
1 change: 1 addition & 0 deletions rnacentral_pipeline/databases/data/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class Database(enum.Enum):
mgnify = DatabaseValue(55, "MGNIFY")
mirbase = DatabaseValue(22, "miRBase")
mirgenedb = DatabaseValue(23, "MirGeneDB")
mirtrondb = DatabaseValue(56, "mirtronDB")
modomics = DatabaseValue(24, "Modomics")
noncode = DatabaseValue(25, "NONCODE")
pdbe = DatabaseValue(26, "PDBe")
Expand Down
Empty file.
126 changes: 126 additions & 0 deletions rnacentral_pipeline/databases/mirtrondb/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-

# Copyright [2009-2024] EMBL-European Bioinformatics Institute
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import logging
import re
import typing as ty

from rnacentral_pipeline.databases.data import Entry, RelatedCoordinate, RelatedSequence

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Maps the value of the ``type`` column to the identifier stored as the
# entry's ``rna_type``. Only these two row types are understood by ``parse``.
# NOTE(review): the values look like Sequence Ontology accessions (presumably
# miRNA and pre_miRNA respectively) -- confirm against the SO browser.
RNA_TYPES = {
    "mature": "SO:0000276",
    "precursor": "SO:0001244",
}

# Maps the abbreviated species label from the ``specie`` column to the integer
# passed to ``Entry`` as ``ncbi_tax_id``. Lookups are exact string matches, so
# a label missing from this table makes ``parse`` fail with ``KeyError``.
# NOTE(review): the capitalised "S. Italica" presumably mirrors the spelling
# used in the mirtronDB export -- verify before normalising it.
SPECIES = {
    "A. thaliana": 3702,
    "B. taurus": 9913,
    "C. elegans": 6239,
    "C. familiaris": 9615,
    "D. melanogaster": 7227,
    "D. pseudoobscura": 7237,
    "D. rerio": 7955,
    "D. simulans": 7240,
    "G. gallus": 9031,
    "H. sapiens": 9606,
    "M. esculenta": 3983,
    "M. mulatta": 9544,
    "M. musculus": 10090,
    "M. truncatula": 3880,
    "O. sativa": 4530,
    "P. troglodytes": 9598,
    "S. Italica": 4555,
    "S. scrofa": 9823,
}


def text_value(row: ty.Dict[str, str], name: str) -> ty.Optional[str]:
    """
    Fetch a column from a parsed row, treating ``-`` as "no value".

    :param row: A mapping of column name to raw cell text.
    :param name: The column to read.
    :returns: The cell text with surrounding whitespace removed, or ``None``
        when the stripped text is exactly ``-``.
    :raises KeyError: If ``name`` is not a key of ``row``.
    """
    # ``ty.Optional`` rather than ``str | None``: the annotation is evaluated
    # when the function is defined, and the ``|`` form fails on interpreters
    # older than 3.10. It also matches the ``ty.List``/``ty.Dict`` style used
    # by the rest of this module.
    value = row[name].strip()
    return None if value == "-" else value


def find_coords(id: str, target: str, query: str) -> ty.List[RelatedCoordinate]:
    """
    Locate ``query`` inside ``target`` and report where it sits.

    :param id: Identifier used only in the warning that is logged when the
        query cannot be found.
    :param target: The sequence to search in (the precursor when called from
        ``parse``).
    :param query: The sequence to search for (the mature product when called
        from ``parse``).
    :returns: A single-element list holding a ``RelatedCoordinate`` whose
        ``start`` is the 0-based offset of the first occurrence and whose
        ``stop`` is ``start + len(query)``; an empty list when ``query`` does
        not occur in ``target``.
    """
    # ``str.find`` does the membership test and the lookup in one scan and
    # returns -1 when there is no match.
    start = target.find(query)
    if start < 0:
        # ``Logger.warn`` is a deprecated alias, so use ``warning``. The
        # message is a plain %-style template (no f-string needed).
        LOGGER.warning("Mature not found in precursor for %s", id)
        return []
    return [RelatedCoordinate(start=start, stop=start + len(query))]


def parse(handle: ty.IO) -> ty.Iterator["Entry"]:
    """
    Parse a mirtronDB tabular export into entries.

    The input must start with an empty line followed by the marker line
    ``##mirtronDB tabular format``. Everything after that is read as tab
    separated data with a header row; the columns used are ``id``, ``name``,
    ``type``, ``specie``, ``sequence`` and ``host gene``.

    After all rows are read, each mature entry is linked to its precursor.
    The precursor name is obtained by removing ``-3p``/``-5p`` from the mature
    name. The precursor gains a ``mature_product`` relation (with coordinates
    from ``find_coords``) and the mature entry gains a ``precursor`` relation.
    Mature entries without a matching precursor are kept, with a warning.

    This is a generator, so nothing is read or validated until it is
    iterated.

    :param handle: An open text handle positioned at the start of the file.
    :returns: An iterator producing every precursor entry followed by every
        mature entry.
    :raises ValueError: If the two header lines are not as expected, if a row
        has a ``type`` other than ``mature`` or ``precursor``, or if a name
        appears more than once.
    :raises KeyError: If a row's species is not listed in ``SPECIES``.
    """
    # Explicit exceptions instead of ``assert``: asserts are removed when
    # Python runs with -O, which would let malformed input through.
    first_line = handle.readline().strip()
    if first_line:
        raise ValueError(f"Invalid first line `{first_line}`")
    format_line = handle.readline().strip()
    if format_line != "##mirtronDB tabular format":
        raise ValueError(f"Invalid format line `{format_line}`")

    reader = csv.DictReader(handle, delimiter="\t")

    pre = {}
    mature = {}
    by_type = {"mature": mature, "precursor": pre}
    for raw in reader:
        rna_type = raw["type"].strip()
        # Reject unknown row types before touching RNA_TYPES; otherwise the
        # lookup below fails with a bare KeyError and the intended error is
        # never reached.
        if rna_type not in by_type:
            raise ValueError(f"Cannot handle {raw}")

        name = raw["name"].strip()
        species = raw["specie"].strip()
        host_gene = raw["host gene"].strip()
        description = f"{species} {name} {rna_type} miRNA"
        if host_gene:
            description += f" ({host_gene})"

        entry = Entry(
            primary_id=f"MIRTRONDB:{raw['id'].strip()}",
            accession=name,
            ncbi_tax_id=SPECIES[species],
            database="MIRTRONDB",
            sequence=raw["sequence"].strip(),
            regions=[],
            rna_type=RNA_TYPES[rna_type],
            url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={name}",
            seq_version="1",
            gene=host_gene,
            description=description,
        )

        # Names must be unique across both kinds of entry, since they are the
        # keys used to link mature products to precursors below.
        if entry.accession in pre or entry.accession in mature:
            raise ValueError(f"Duplicate accession `{entry.accession}`")
        by_type[rna_type][entry.accession] = entry

    # Not anchored to the end of the name, matching the original behaviour:
    # every "-3p"/"-5p" occurrence is removed.
    arm_suffix = re.compile(r"-[35]p")
    for mature_id, entry in mature.items():
        pre_id = arm_suffix.sub("", mature_id)
        pre_entry = pre.get(pre_id)
        if pre_entry is None:
            LOGGER.warning("Failed to find precursor for %s", mature_id)
            continue

        pre_entry.related_sequences.append(
            RelatedSequence(
                sequence_id=mature_id,
                relationship="mature_product",
                coordinates=find_coords(
                    mature_id, pre_entry.sequence, entry.sequence
                ),
            )
        )

        entry.related_sequences.append(
            RelatedSequence(
                sequence_id=pre_id,
                relationship="precursor",
            )
        )

    yield from pre.values()
    yield from mature.values()
11 changes: 11 additions & 0 deletions workflows/databases/mirtrondb.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copies the mirtronDB export named by params.databases.mirtrondb.remote into
// the task directory as all.tsv and runs `rnac mirtrondb parse` on it, writing
// into the current directory.
process mirtrondb {
  // Only runs when the database is switched on in the pipeline parameters.
  when: { params.databases.mirtrondb.run }

  // Every CSV file found in the task directory is emitted.
  output:
  path('*.csv')

  """
  cp ${params.databases.mirtrondb.remote} all.tsv
  rnac mirtrondb parse all.tsv .
  """
}
2 changes: 2 additions & 0 deletions workflows/parse-databases.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ include { lncipedia } from './databases/lncipedia'
include { mgnify } from './databases/mgnify'
include { mirbase } from './databases/mirbase'
include { mirgenedb } from './databases/mirgenedb'
include { mirtrondb } from './databases/mirtrondb'
include { pdbe } from './databases/pdbe'
include { pirbase } from './databases/pirbase'
include { plncdb } from './databases/plncdb'
Expand Down Expand Up @@ -78,6 +79,7 @@ workflow parse_databases {
mirbase(),
mgnify(),
mirgenedb(),
mirtrondb(),
pdbe(),
pirbase(),
plncdb(),
Expand Down