Skip to content

Commit 3f6f13f

Browse files
committed
Ignore SRA runs that do not contain transcriptomic RNA-Seq data
1 parent 14711d4 commit 3f6f13f

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

rnaseq_pipeline/sources/sra.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ class SraRunMetadata:
8080

8181
def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
8282
"""
83+
Extract transcriptomic RNA-Seq runs from the given SRA XML metadata file.
8384
:param path: Path to the XML file containing SRA run metadata.
8485
:param include_invalid_runs: If True, include runs that do not have any suitable metadata that can be used to
8586
determine the layout.
@@ -92,6 +93,20 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
9293
srr = run.attrib['accession']
9394

9495
srx = run.find('EXPERIMENT_REF').attrib['accession']
96+
97+
library_strategy = root.find(
98+
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY')
99+
library_source = root.find(
100+
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE')
101+
102+
if library_strategy is not None and library_strategy.text not in ['RNA-Seq']:
103+
logger.warning('%s Ignoring run with %s library strategy.', srr, library_strategy.text)
104+
continue
105+
106+
if library_source is not None and library_source.text not in ['TRANSCRIPTOMIC', 'TRANSCRIPTOMIC SINGLE CELL']:
107+
logger.warning('%s: Ignoring run with %s library source.', srr, library_source.text)
108+
continue
109+
95110
is_single_end = root.find(
96111
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/SINGLE') is not None
97112
is_paired = root.find(
@@ -389,7 +404,7 @@ def run(self):
389404
meta = [r for r in meta if r.srr in self.srr]
390405

391406
if not meta:
392-
raise ValueError(f'No SRA runs found for {self.srx}.')
407+
raise ValueError(f'No valid SRA runs found for {self.srx}. Valid runs must be transcriptomic RNA-Seq.')
393408

394409
metadata = dict(self.metadata)
395410
# do not override the sample_id when invoked from DownloadGeoSample or DownloadGemmaExperiment

0 commit comments

Comments
 (0)