11"""
22This module contains all the logic to retrieve RNA-Seq data from SRA.
33"""
4-
4+ import enum
55import gzip
66import logging
77import os
@@ -52,6 +52,14 @@ def read_runinfo(path):
5252 df = pd .read_csv (path , names = SRA_RUNINFO_COLUMNS [:len (df .columns )])
5353 return df
5454
class SraRunIssue(enum.IntFlag):
    """Issues that can occur when processing SRA runs.

    Each member occupies its own bit, so several issues can be recorded for a
    single run by combining members with ``|``. ``SraRunIssue(0)`` denotes a
    run for which no issue was recorded.
    """
    # the run has no SRA file with the 'fastq' semantic name
    NO_SRA_FILES = enum.auto()
    # no spot statistics were available for the run
    NO_SPOT_STATISTICS = enum.auto()
    # fastq-load.py options could not be used; spot statistics were used instead
    NO_FASTQ_LOAD_OPTIONS = enum.auto()
    # the SRA files do not match the arguments that were passed to fastq-load.py
    MISMATCHED_FASTQ_LOAD_OPTIONS = enum.auto()
    # the per-read sizes are not all distinct, so files cannot be ordered by size
    AMBIGUOUS_READ_SIZES = enum.auto()
62+
5563@dataclass
5664class SraRunMetadata :
5765 """A digested SRA run metadata"""
@@ -62,21 +70,36 @@ class SraRunMetadata:
6270 fastq_file_sizes : list [int ]
6371 # only available if statistics were present in the XML metadata
6472 average_read_lengths : Optional [list [float ]]
73+ fastq_load_options : Optional [dict ]
6574 layout : list [SequencingFileType ]
75+ issues : SraRunIssue
6676
67- def read_xml_metadata (path ) -> List [SraRunMetadata ]:
77+ def read_xml_metadata (path , include_invalid_runs = False ) -> List [SraRunMetadata ]:
78+ """
79+ :param path: Path to the XML file containing SRA run metadata.
80+ :param include_invalid_runs: If True, include runs that do not have any suitable metadata that can be used to
81+ determine the layout.
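82+ :return: A list of SraRunMetadata, one per run that was kept; runs for which no ordering information was found are only included (with an empty layout) if include_invalid_runs is True.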
83+ """
6884 root = ET .parse (path )
6985 runs = root .findall ('EXPERIMENT_PACKAGE/RUN_SET/RUN' )
7086 result = []
7187 for run in runs :
88+ srr = run .attrib ['accession' ]
89+
7290 srx = run .find ('EXPERIMENT_REF' ).attrib ['accession' ]
7391 is_single_end = root .find (
7492 'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\' ' + srx + '\' ]/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/SINGLE' ) is not None
7593 is_paired = root .find (
7694 'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\' ' + srx + '\' ]/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/PAIRED' ) is not None
77- srr = run . attrib [ 'accession' ]
95+
7896 sra_files = run .findall ('SRAFiles/SRAFile[@semantic_name=\' fastq\' ]' )
7997
98+ issues = SraRunIssue (0 )
99+
100+ if not sra_files :
101+ issues |= SraRunIssue .NO_SRA_FILES
102+
80103 # if the data was loaded with fastq-load.py, we can obtain the order of the files from the options
81104 loader , options = None , None
82105 run_attributes = run .findall ('RUN_ATTRIBUTES/RUN_ATTRIBUTE' )
@@ -101,6 +124,7 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
101124 number_of_spots = None
102125 reads = None
103126 spot_read_lengths = None
127+ issues |= SraRunIssue .NO_SPOT_STATISTICS
104128
105129 # sort the SRA files to match the spots using fastq-load.py options
106130 if loader == 'fastq-load.py' and options :
@@ -122,32 +146,60 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
122146 fastq_file_sizes = [int (sf .attrib ['size' ]) for sf in sra_files ]
123147 else :
124148 logging .warning (
125- "The SRA files of %s : %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead." ,
126- srx ,
149+ "%s: The SRA files: %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead." ,
150+ srr ,
127151 [sf .attrib ['filename' ] for sf in sra_files ],
128152 options )
129153 fastq_filenames = fastq_load_files
130154 fastq_file_sizes = None
155+ issues |= SraRunIssue .MISMATCHED_FASTQ_LOAD_OPTIONS
131156
132157 # use spot statistics to determine the order of the files by matching their sizes with the sizes of the files
133158 # this is less reliable than using the fastq-load.py options, but it is still better than nothing
159+ # we can only use this strategy if all the read sizes are different and can be related to the file sizes
134160 elif statistics :
135- # sort the files according to the layout
136- # sort the layout according to the average read size
137- reads_by_size = [e [0 ] for e in sorted (enumerate (reads ),
138- key = lambda e : int (e [1 ].attrib ['count' ]) * float (
139- e [1 ].attrib ['average' ]))]
140- files_by_size = [e [0 ] for e in sorted (enumerate (sra_files ), key = lambda e : int (e [1 ].attrib ['size' ]))]
141-
142- if reads_by_size != files_by_size :
143- logger .info ('Reordering SRA files to match the read sizes in the spot...' )
144- sra_files = [sra_files [reads_by_size .index (files_by_size [i ])] for i , sra_file in
145- enumerate (sra_files )]
146- fastq_filenames = [sf .attrib ['filename' ] for sf in sra_files ]
147- fastq_file_sizes = [int (sf .attrib ['size' ]) for sf in sra_files ]
161+ # would be nicer to have this in an else block
162+ issues |= SraRunIssue .NO_FASTQ_LOAD_OPTIONS
163+ # the sizes are unambiguous only if every read has a distinct total size (count * average length)
164+ read_sizes = [int (read .attrib ['count' ]) * float (read .attrib ['average' ]) for read in reads ]
165+ if len (set (read_sizes )) == len (read_sizes ):
166+ # sort the files according to the layout
167+ # sort the layout according to the average read size
168+ reads_by_size = [e [0 ] for e in sorted (enumerate (reads ),
169+ key = lambda e : int (e [1 ].attrib ['count' ]) * float (
170+ e [1 ].attrib ['average' ]))]
171+ files_by_size = [e [0 ] for e in sorted (enumerate (sra_files ), key = lambda e : int (e [1 ].attrib ['size' ]))]
172+
173+ if reads_by_size != files_by_size :
174+ logger .info ('Reordering SRA files to match the read sizes in the spot...' )
175+ sra_files = [sra_files [reads_by_size .index (files_by_size [i ])] for i , sra_file in
176+ enumerate (sra_files )]
177+ fastq_filenames = [sf .attrib ['filename' ] for sf in sra_files ]
178+ fastq_file_sizes = [int (sf .attrib ['size' ]) for sf in sra_files ]
179+ else :
180+ # this is extremely common, so it's not worth warning about it
181+ logger .info (
182+ '%s: Number of bps per read are ambiguous: %s, cannot use them to order SRA files by filesize. Only the spot metadata will be used to determine the layout.' ,
183+ srr , read_sizes )
184+ fastq_filenames = None
185+ fastq_file_sizes = None
186+ issues |= SraRunIssue .AMBIGUOUS_READ_SIZES
148187
149188 else :
150- logger .warning (f'No information found that can be used to order SRA files from { srx } , ignoring.' )
189+ logger .warning (
190+ '%s: No information found that can be used to order SRA files, ignoring that run.' ,
191+ srr )
192+ if include_invalid_runs :
193+ fastq_filenames = [sf .attrib ['filename' ] for sf in sra_files ]
194+ fastq_file_sizes = [int (sf .attrib ['size' ]) for sf in sra_files ]
195+ result .append (SraRunMetadata (srx , srr ,
196+ is_paired = is_paired ,
197+ fastq_filenames = fastq_filenames ,
198+ fastq_file_sizes = fastq_file_sizes ,
199+ average_read_lengths = None ,
200+ fastq_load_options = None ,
201+ layout = [],
202+ issues = issues ))
151203 continue
152204
153205 layout = detect_layout (srr , fastq_filenames , fastq_file_sizes , spot_read_lengths , is_single_end , is_paired )
@@ -157,7 +209,9 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
157209 fastq_filenames = fastq_filenames ,
158210 fastq_file_sizes = fastq_file_sizes ,
159211 average_read_lengths = spot_read_lengths ,
160- layout = layout ))
212+ fastq_load_options = options if loader == 'fastq-load.py' else None ,
213+ layout = layout ,
214+ issues = issues ))
161215 return result
162216
163217class PrefetchSraRun (TaskWithMetadataMixin , luigi .Task ):
@@ -263,13 +317,6 @@ class DownloadSraExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
263317 srx : str
264318 srr = luigi .OptionalListParameter (default = None , description = 'Specific SRA run accessions to use (defaults to all)' )
265319
266- force_single_end = luigi .BoolParameter (positional = False , significant = False , default = False ,
267- description = 'Force the library layout to be single-end' )
268- force_paired_reads = luigi .BoolParameter (positional = False , significant = False , default = False ,
269- description = 'Force the library layout to be paired' )
270- force_layout = luigi .ListParameter (positional = False , significant = False , default = False ,
271- description = 'Force the library layout to be either single-end or paired-end.' )
272-
273320 metadata : dict
274321
275322 @property
@@ -286,16 +333,8 @@ def run(self):
286333 if self .srr is not None :
287334 meta = [r for r in meta if r .srr in self .srr ]
288335
289- # make sure that all the run metadata are aligned
290-
291- if self .force_layout :
292- layout = self .force_layout
293- if self .force_paired_reads :
294- layout = [SequencingFileType .R1 .name ]
295- elif self .force_single_end :
296- layout = [SequencingFileType .R1 .name , SequencingFileType .R2 .name ]
297- else :
298- layout = self .force_layout
336+ if not meta :
337+ raise ValueError (f'No SRA runs found for { self .srx } .' )
299338
300339 metadata = dict (self .metadata )
301340 # do not override the sample_id when invoked from DownloadGeoSample or DownloadGemmaExperiment
0 commit comments