From abfd83cc47c9714b2bbdc79b77eeb7cb1d3a8dd8 Mon Sep 17 00:00:00 2001
From: Blake Sweeney
Date: Tue, 18 Jun 2024 13:28:54 +0100
Subject: [PATCH 1/2] Outline mirtronDB parsing

This covers most of the data they provide. The key missing bit is
genomic coordinates. That may be tricky as they do not provide which
genome in an easily accessible way. A bit more digging will probably
find it though.
---
Notes (review):
    - As committed here, databases/mirtrondb/parser.py is an outline
      rather than a working parser: Entry() is passed `accesion=` while
      the code then reads `entry.accession`, and parse() calls
      `find_coord`, which this patch neither defines nor imports.
      PATCH 2/2 corrects both.
    - parse() compares the two header lines against raw readline()
      output, which still carries the line terminator, so the header
      asserts cannot pass on a real file. PATCH 2/2 adds the .strip()
      calls.
    - Twelve SPECIES entries are set to -1 (apparently placeholders);
      PATCH 2/2 replaces them with taxon ids.
    - cli/mirtrondb.py: the function behind the `parse` command is named
      `process_json_schema` although it receives a TSV file handle.

 rnacentral_pipeline/cli/__init__.py           |   2 +
 rnacentral_pipeline/cli/mirtrondb.py          |  45 +++++++
 .../databases/data/databases.py               |   1 +
 .../databases/mirtrondb/__init__.py           |   0
 .../databases/mirtrondb/parser.py             | 114 ++++++++++++++++++
 workflows/databases/mirtrondb.nf              |  11 ++
 workflows/parse-databases.nf                  |   2 +
 7 files changed, 175 insertions(+)
 create mode 100644 rnacentral_pipeline/cli/mirtrondb.py
 create mode 100644 rnacentral_pipeline/databases/mirtrondb/__init__.py
 create mode 100644 rnacentral_pipeline/databases/mirtrondb/parser.py
 create mode 100644 workflows/databases/mirtrondb.nf

diff --git a/rnacentral_pipeline/cli/__init__.py b/rnacentral_pipeline/cli/__init__.py
index 5415be2cd..3838504c3 100644
--- a/rnacentral_pipeline/cli/__init__.py
+++ b/rnacentral_pipeline/cli/__init__.py
@@ -41,6 +41,7 @@
     mgnify,
     mirbase,
     mirgenedb,
+    mirtrondb,
     misc,
     ncbi,
     notify,
@@ -117,6 +118,7 @@ def cli(log_level):
 cli.add_command(mgnify.cli)
 cli.add_command(mirbase.cli)
 cli.add_command(mirgenedb.cli)
+cli.add_command(mirtrondb.cli)
 cli.add_command(misc.crs_data)
 cli.add_command(misc.find_upi_ranges)
 cli.add_command(misc.validate_pgloader)
diff --git a/rnacentral_pipeline/cli/mirtrondb.py b/rnacentral_pipeline/cli/mirtrondb.py
new file mode 100644
index 000000000..2a77a0167
--- /dev/null
+++ b/rnacentral_pipeline/cli/mirtrondb.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright [2009-2024] EMBL-European Bioinformatics Institute
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from pathlib import Path
+
+import click
+
+from rnacentral_pipeline.databases.mirtrondb import parser
+from rnacentral_pipeline.writers import entry_writer
+
+
+@click.group("mirtrondb")
+def cli():
+    """
+    Commands for parsing mirtrondb data.
+    """
+
+
+@cli.command("parse")
+@click.argument("tsv", type=click.File("r"))
+@click.argument(
+    "output",
+    default=".",
+    type=click.Path(writable=True, dir_okay=True, file_okay=False),
+)
+def process_json_schema(tsv, output):
+    """
+    This parses the TSV file containing all sequences to produce the CSV we can
+    import.
+    """
+    entries = parser.parse(tsv)
+    with entry_writer(Path(output)) as writer:
+        writer.write(entries)
diff --git a/rnacentral_pipeline/databases/data/databases.py b/rnacentral_pipeline/databases/data/databases.py
index b9e916780..97ea5cec0 100644
--- a/rnacentral_pipeline/databases/data/databases.py
+++ b/rnacentral_pipeline/databases/data/databases.py
@@ -58,6 +58,7 @@ class Database(enum.Enum):
     mgnify = DatabaseValue(55, "MGNIFY")
     mirbase = DatabaseValue(22, "miRBase")
     mirgenedb = DatabaseValue(23, "MirGeneDB")
+    mirtrondb = DatabaseValue(56, "mirtronDB")
     modomics = DatabaseValue(24, "Modomics")
     noncode = DatabaseValue(25, "NONCODE")
     pdbe = DatabaseValue(26, "PDBe")
diff --git a/rnacentral_pipeline/databases/mirtrondb/__init__.py b/rnacentral_pipeline/databases/mirtrondb/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/rnacentral_pipeline/databases/mirtrondb/parser.py b/rnacentral_pipeline/databases/mirtrondb/parser.py
new file mode 100644
index 000000000..7ae642ef5
--- /dev/null
+++ b/rnacentral_pipeline/databases/mirtrondb/parser.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+
+# Copyright [2009-2024] EMBL-European Bioinformatics Institute
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import re
+import typing as ty
+
+from rnacentral_pipeline.databases.data import Entry, Region, RelatedSequence
+
+RNA_TYPES = {
+    "mature": "SO:0000276",
+    "precursor": "SO:0001244",
+}
+
+SPECIES = {
+    "A. thaliana": 3702,
+    "B. taurus": 9913,
+    "C. elegans": 6239,
+    "C. familiaris": -1,
+    "D. melanogaster": 7227,
+    "D. pseudoobscura": -1,
+    "D. rerio": -1,
+    "D. simulans": -1,
+    "G. gallus": -1,
+    "H. sapiens": 9606,
+    "M. esculenta": -1,
+    "M. mulatta": -1,
+    "M. musculus": -1,
+    "M. truncatula": -1,
+    "O. sativa": -1,
+    "P. troglodytes": -1,
+    "S. Italica": 4555,
+    "S. scrofa": 9823,
+}
+
+
+def text_value(row: ty.Dict[str, str], name: str) -> str | None:
+    value = row[name].strip()
+    if value == "-":
+        return None
+    return value
+
+
+def regions(entry: ty.Dict[str, str]) -> ty.List[Region]:
+    return None
+
+
+def tax_id(entry: ty.Dict[str, str]) -> int:
+    return SPECIES[entry["specie"].strip()]
+
+
+def parse(handle: ty.IO):
+    blank = handle.readline()
+    assert not blank
+    notification = handle.readline()
+    assert notification == "##mirtronDB tabular format"
+    reader = csv.DictReader(handle, delimiter="\t")
+
+    pre = {}
+    mature = {}
+    for raw in reader:
+        entry = Entry(
+            primary_id=f"MIRTRONDB:{raw['id']}",
+            accesion=raw["name"],
+            ncbi_tax_id=tax_id(raw),
+            database="MIRTRONDB",
+            sequence=raw["sequence"],
+            regions=regions(raw),
+            rna_type=RNA_TYPES[raw["type"]],
+            url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={raw['name']}",
+            seq_version="1",
+            gene=raw["host gene"],
+        )
+        assert entry.accession not in pre
+        assert entry.accession not in mature
+        if raw["type"] == "mature":
+            mature[entry.accession] = entry
+        elif raw["type"] == "precursor":
+            pre[entry.accession] = entry
+        else:
+            raise ValueError(f"Cannot handle {raw}")
+
+    for id, entry in mature.items():
+        pre_id = re.sub(r"-[35]p", "", id)
+        if pre_id not in pre:
+            continue
+        pre_entry = pre[pre_id]
+        pre_entry.related_sequences.append(
+            RelatedSequence(
+                sequence_id=id,
+                relationship="mature_product",
+                coordinates=find_coord(pre_entry.sequence, entry.sequence),
+            )
+        )
+
+        entry.related_sequences.append(
+            RelatedSequence(
+                sequence_id=pre_id,
+                relationship="precursor",
+            )
+        )
+
+    yield from pre.values()
+    yield from mature.values()
diff --git a/workflows/databases/mirtrondb.nf b/workflows/databases/mirtrondb.nf
new file mode 100644
index 000000000..a26b4396b
--- /dev/null
+++ b/workflows/databases/mirtrondb.nf
@@ -0,0 +1,11 @@
+process mirtrondb {
+  when: { params.databases.mirtrondb.run }
+
+  output:
+  path('*.csv')
+
+  """
+  cp ${params.databases.mirtrondb.remote} all.tsv
+  rnac mirtrondb parse all.tsv .
+  """
+}
diff --git a/workflows/parse-databases.nf b/workflows/parse-databases.nf
index a76cfde49..250280959 100644
--- a/workflows/parse-databases.nf
+++ b/workflows/parse-databases.nf
@@ -15,6 +15,7 @@ include { lncipedia } from './databases/lncipedia'
 include { mgnify } from './databases/mgnify'
 include { mirbase } from './databases/mirbase'
 include { mirgenedb } from './databases/mirgenedb'
+include { mirtrondb } from './databases/mirtrondb'
 include { pdbe } from './databases/pdbe'
 include { pirbase } from './databases/pirbase'
 include { plncdb } from './databases/plncdb'
@@ -78,6 +79,7 @@ workflow parse_databases {
     mirbase(),
     mgnify(),
     mirgenedb(),
+    mirtrondb(),
     pdbe(),
     pirbase(),
     plncdb(),

From e41a2b39f6ed391660a7e5f94fba8c18b399dbf4 Mon Sep 17 00:00:00 2001
From: Blake Sweeney
Date: Fri, 21 Jun 2024 10:53:22 +0100
Subject: [PATCH 2/2] Initial mirtronDB parser

This parses the dataset and seems to provide reasonable data. It does
not extract the sequence region information because the data file does
not include which genome it maps to, making that complicated to figure
out.
---
Notes (review):
    - Fix-ups applied to this patch during review: both logging calls
      added to parser.py use LOGGER.warning(...) instead of the
      deprecated Logger.warn alias, and the "Mature not found" message
      is a plain %-style format string (no placeholder-less f-prefix)
      with "precursor" spelled correctly. Added/removed line counts are
      unchanged.
    - cli/__init__.py: this commit also removes the `rediportal` import,
      which the commit message does not mention.
      NOTE(review): confirm nothing in that module still uses
      `rediportal`.
    - parser.py: text_value() has no caller in the file after this
      change.
    - parse() checks the two header lines and accession uniqueness with
      `assert`, so those checks are skipped under `python -O`.
    - NOTE(review): find_coords() reports 0-based, end-exclusive offsets
      (str.index() and start + len(query)); confirm that is what
      RelatedCoordinate expects.

 rnacentral_pipeline/cli/__init__.py           |  1 -
 .../databases/mirtrondb/parser.py             | 76 +++++++++++--------
 2 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/rnacentral_pipeline/cli/__init__.py b/rnacentral_pipeline/cli/__init__.py
index 3838504c3..e16d25d99 100644
--- a/rnacentral_pipeline/cli/__init__.py
+++ b/rnacentral_pipeline/cli/__init__.py
@@ -55,7 +55,6 @@
     qa,
     quickgo,
     r2dt,
-    rediportal,
     refseq,
     release,
     repeats,
diff --git a/rnacentral_pipeline/databases/mirtrondb/parser.py b/rnacentral_pipeline/databases/mirtrondb/parser.py
index 7ae642ef5..5fa735971 100644
--- a/rnacentral_pipeline/databases/mirtrondb/parser.py
+++ b/rnacentral_pipeline/databases/mirtrondb/parser.py
@@ -12,10 +12,13 @@
 # limitations under the License.

 import csv
+import logging
 import re
 import typing as ty

-from rnacentral_pipeline.databases.data import Entry, Region, RelatedSequence
+from rnacentral_pipeline.databases.data import Entry, RelatedCoordinate, RelatedSequence
+
+LOGGER = logging.getLogger(__name__)

 RNA_TYPES = {
     "mature": "SO:0000276",
@@ -26,19 +29,19 @@
     "A. thaliana": 3702,
     "B. taurus": 9913,
     "C. elegans": 6239,
-    "C. familiaris": -1,
+    "C. familiaris": 9615,
     "D. melanogaster": 7227,
-    "D. pseudoobscura": -1,
-    "D. rerio": -1,
-    "D. simulans": -1,
-    "G. gallus": -1,
+    "D. pseudoobscura": 7237,
+    "D. rerio": 7955,
+    "D. simulans": 7240,
+    "G. gallus": 9031,
     "H. sapiens": 9606,
-    "M. esculenta": -1,
-    "M. mulatta": -1,
-    "M. musculus": -1,
-    "M. truncatula": -1,
-    "O. sativa": -1,
-    "P. troglodytes": -1,
+    "M. esculenta": 3983,
+    "M. mulatta": 9544,
+    "M. musculus": 10090,
+    "M. truncatula": 3880,
+    "O. sativa": 4530,
+    "P. troglodytes": 9598,
     "S. Italica": 4555,
     "S. scrofa": 9823,
 }
@@ -51,41 +54,49 @@ def text_value(row: ty.Dict[str, str], name: str) -> str | None:
     return value


-def regions(entry: ty.Dict[str, str]) -> ty.List[Region]:
-    return None
-
-
-def tax_id(entry: ty.Dict[str, str]) -> int:
-    return SPECIES[entry["specie"].strip()]
+def find_coords(id: str, target: str, query: str) -> ty.List[RelatedCoordinate]:
+    if query not in target:
+        LOGGER.warning("Mature not found in precursor for %s", id)
+        return []
+    start = target.index(query)
+    return [RelatedCoordinate(start=start, stop=start + len(query))]


 def parse(handle: ty.IO):
-    blank = handle.readline()
-    assert not blank
-    notification = handle.readline()
+    blank = handle.readline().strip()
+    assert not blank, f"Invalid first line `{blank}`"
+    notification = handle.readline().strip()
     assert notification == "##mirtronDB tabular format"
     reader = csv.DictReader(handle, delimiter="\t")

     pre = {}
     mature = {}
     for raw in reader:
+        rna_type = raw["type"].strip()
+        name = raw["name"].strip()
+        species = raw["specie"].strip()
+        description = f"{species} {name} {rna_type} miRNA"
+        if raw["host gene"].strip():
+            description += f" ({raw['host gene'].strip()})"
         entry = Entry(
-            primary_id=f"MIRTRONDB:{raw['id']}",
-            accesion=raw["name"],
-            ncbi_tax_id=tax_id(raw),
+            primary_id=f"MIRTRONDB:{raw['id'].strip()}",
+            accession=name,
+            ncbi_tax_id=SPECIES[species],
             database="MIRTRONDB",
-            sequence=raw["sequence"],
-            regions=regions(raw),
-            rna_type=RNA_TYPES[raw["type"]],
-            url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={raw['name']}",
+            sequence=raw["sequence"].strip(),
+            regions=[],
+            rna_type=RNA_TYPES[rna_type],
+            url=f"http://mirtrondb.cp.utfpr.edu.br/fetch_details.php?mrt_details={name}",
             seq_version="1",
-            gene=raw["host gene"],
+            gene=raw["host gene"].strip(),
+            description=description,
         )
         assert entry.accession not in pre
         assert entry.accession not in mature
-        if raw["type"] == "mature":
+
+        if rna_type == "mature":
             mature[entry.accession] = entry
-        elif raw["type"] == "precursor":
+        elif rna_type == "precursor":
             pre[entry.accession] = entry
         else:
             raise ValueError(f"Cannot handle {raw}")
@@ -93,13 +104,14 @@ def parse(handle: ty.IO):
     for id, entry in mature.items():
         pre_id = re.sub(r"-[35]p", "", id)
         if pre_id not in pre:
+            LOGGER.warning("Failed to find precursor for %s", id)
             continue
         pre_entry = pre[pre_id]
         pre_entry.related_sequences.append(
             RelatedSequence(
                 sequence_id=id,
                 relationship="mature_product",
-                coordinates=find_coord(pre_entry.sequence, entry.sequence),
+                coordinates=find_coords(id, pre_entry.sequence, entry.sequence),
             )
         )
 