Skip to content

Commit 89ec252

Browse files
committed
Improve layout detection from SRA metadata
Detect bcl2fastq standard filenames and also commonly used names. Add a fallback that checks for the presence of I1/I2/R1/R2, but warns since this is very unreliable. Track issues encountered in runs using an enumerated flag.
1 parent 98854a7 commit 89ec252

File tree

6 files changed

+251
-83
lines changed

6 files changed

+251
-83
lines changed

luigi.cfg

Lines changed: 0 additions & 1 deletion
This file was deleted.

rnaseq_pipeline/rnaseq_utils.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ class SequencingFileType(enum.Enum):
1414
I2 = 2
1515
R1 = 3
1616
R2 = 4
17+
R3 = 5
1718

1819
def detect_layout(run_id: str, filenames: Optional[list[str]],
1920
file_sizes: Optional[list[int]] = None,
2021
average_read_lengths: Optional[list[float]] = None,
2122
is_single_end: bool = False, is_paired: bool = False):
22-
"""Detects the layout of the sequencing run files based on their names
23+
"""Detects the layout of the sequencing run files based on their names and various additional information.
2324
2425
:param run_id: Identifier for the run
2526
:param filenames: List of filenames, if known
@@ -32,18 +33,19 @@ def detect_layout(run_id: str, filenames: Optional[list[str]],
3233
"""
3334

3435
if filenames:
35-
if layout := detect_bcl2fastq_name(filenames):
36-
logger.info('%s: Inferred file types from file names conforming to bcl2fastq output.', run_id)
36+
if layout := detect_bcl2fastq_name(run_id, filenames):
37+
logger.info('%s: Inferred file types: %s from file names conforming to bcl2fastq output: %s.', run_id,
38+
layout, filenames)
3739
return layout
3840

39-
if layout := detect_simple_fastq_name(filenames):
40-
logger.info('%s: Inferred file types from file names conforming to a simple output.', run_id)
41+
if layout := detect_simple_fastq_name(run_id, filenames):
42+
logger.info('%s: Inferred file types: %s from file names conforming to a simple output.', run_id,
43+
layout, filenames)
4144
return layout
4245

43-
if layout := detect_common_name(filenames):
44-
logger.warning('%s: Inferred file types from file names with common name patterns:\n\t%s',
45-
run_id,
46-
'\n\t'.join(filenames))
46+
if layout := detect_common_name(run_id, filenames):
47+
logger.warning('%s: Inferred file types: %s from file names with common name patterns: %s.',
48+
run_id, layout, filenames)
4749
return layout
4850

4951
number_of_files = len(filenames)
@@ -87,7 +89,7 @@ def detect_layout(run_id: str, filenames: Optional[list[str]],
8789
raise ValueError(
8890
f'Unable to detect sequencing layout for {run_id} from: {filenames=} {file_sizes=} {average_read_lengths=} {is_single_end=} {is_paired=}')
8991

90-
def detect_bcl2fastq_name(filenames):
92+
def detect_bcl2fastq_name(run_id, filenames):
9193
# try to detect the file types based on the filenames
9294
# from bcl2fastq manual, this is the expected format of the filenames:
9395
# <sample name>_<barcode sequence>_L<lane>_R<read number>_<set number>.fastq.gz
@@ -108,21 +110,24 @@ def detect_bcl2fastq_name(filenames):
108110
dt = SequencingFileType.R1
109111
elif read_type == 'R' and read_number == 2:
110112
dt = SequencingFileType.R2
113+
elif read_type == 'R' and read_number == 3:
114+
dt = SequencingFileType.R3
111115
else:
112-
logger.warning(f'Unrecognized read type: {read_type}{read_number} in {filename}.')
116+
logger.warning('%s: Unrecognized read type: %s%d in %s.', run_id, read_type, read_number, filename)
113117
break
114118
detected_types.append(dt)
115119

116120
if len(set(detected_types)) < len(detected_types):
117-
logger.warning(f"Non-unique sequencing file type detected: {detected_types} from {filenames}.")
121+
logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
118122
return None
119123
elif len(detected_types) == len(filenames):
120124
return detected_types
121125
else:
122126
return None
123127

124-
def detect_simple_fastq_name(filenames):
125-
simple_name_pattern = re.compile(r'(.+)_([RI])(\d)(_\d+)?\.(fastq|fq)(\.gz)?')
128+
def detect_simple_fastq_name(run_id, filenames):
129+
"""Flexible detection of sequencing file types based on common, but valid naming patterns."""
130+
simple_name_pattern = re.compile(r'(.+)([RI])(\d)(_\d+)?\.(fastq|fq)(\.gz)?')
126131

127132
detected_types = []
128133
for filename in filenames:
@@ -138,20 +143,22 @@ def detect_simple_fastq_name(filenames):
138143
dt = SequencingFileType.R1
139144
elif read_type == 'R' and read_number == 2:
140145
dt = SequencingFileType.R2
146+
elif read_type == 'R' and read_number == 3:
147+
dt = SequencingFileType.R3
141148
else:
142-
logger.warning(f'Unrecognized read type: {read_type}{read_number} in {filename}.')
149+
logger.warning('%s: Unrecognized read type: %s%d in %s.', run_id, read_type, read_number, filename)
143150
break
144151
detected_types.append(dt)
145152

146153
if len(set(detected_types)) < len(detected_types):
147-
logger.warning(f"Non-unique sequencing file type detected: {detected_types} from {filenames}.")
154+
logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
148155
return None
149156
elif len(detected_types) == len(filenames):
150157
return detected_types
151158
else:
152159
return None
153160

154-
def detect_common_name(filenames):
161+
def detect_common_name(run_id, filenames):
155162
detected_types = []
156163

157164
# this is the most robust way of detecting file types
@@ -168,11 +175,13 @@ def detect_common_name(filenames):
168175
detected_types.append(SequencingFileType.R1)
169176
elif 'R2' in filename:
170177
detected_types.append(SequencingFileType.R2)
178+
elif 'R3' in filename:
179+
detected_types.append(SequencingFileType.R3)
171180
else:
172181
break
173182

174183
if len(set(detected_types)) < len(detected_types):
175-
logger.warning(f"Non-unique sequencing file type detected: {detected_types}.")
184+
logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
176185
return None
177186
elif len(detected_types) == len(filenames):
178187
return detected_types

rnaseq_pipeline/sources/sra.py

Lines changed: 76 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
This module contains all the logic to retrieve RNA-Seq data from SRA.
33
"""
4-
4+
import enum
55
import gzip
66
import logging
77
import os
@@ -52,6 +52,14 @@ def read_runinfo(path):
5252
df = pd.read_csv(path, names=SRA_RUNINFO_COLUMNS[:len(df.columns)])
5353
return df
5454

55+
class SraRunIssue(enum.IntFlag):
56+
"""Issues that can occur when processing SRA runs."""
57+
NO_SRA_FILES = enum.auto()
58+
NO_SPOT_STATISTICS = enum.auto()
59+
NO_FASTQ_LOAD_OPTIONS = enum.auto()
60+
MISMATCHED_FASTQ_LOAD_OPTIONS = enum.auto()
61+
AMBIGUOUS_READ_SIZES = enum.auto()
62+
5563
@dataclass
5664
class SraRunMetadata:
5765
"""A digested SRA run metadata"""
@@ -62,21 +70,36 @@ class SraRunMetadata:
6270
fastq_file_sizes: list[int]
6371
# only available if statistics were present in the XML metadata
6472
average_read_lengths: Optional[list[float]]
73+
fastq_load_options: Optional[dict]
6574
layout: list[SequencingFileType]
75+
issues: SraRunIssue
6676

67-
def read_xml_metadata(path) -> List[SraRunMetadata]:
77+
def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
78+
"""
79+
:param path: Path to the XML file containing SRA run metadata.
80+
:param include_invalid_runs: If True, include runs that do not have any suitable metadata that can be used to
81+
determine the layout.
82+
:return:
83+
"""
6884
root = ET.parse(path)
6985
runs = root.findall('EXPERIMENT_PACKAGE/RUN_SET/RUN')
7086
result = []
7187
for run in runs:
88+
srr = run.attrib['accession']
89+
7290
srx = run.find('EXPERIMENT_REF').attrib['accession']
7391
is_single_end = root.find(
7492
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/SINGLE') is not None
7593
is_paired = root.find(
7694
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/PAIRED') is not None
77-
srr = run.attrib['accession']
95+
7896
sra_files = run.findall('SRAFiles/SRAFile[@semantic_name=\'fastq\']')
7997

98+
issues = SraRunIssue(0)
99+
100+
if not sra_files:
101+
issues |= SraRunIssue.NO_SRA_FILES
102+
80103
# if the data was loaded with fastq-load.py, we can obtain the order of the files from the options
81104
loader, options = None, None
82105
run_attributes = run.findall('RUN_ATTRIBUTES/RUN_ATTRIBUTE')
@@ -101,6 +124,7 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
101124
number_of_spots = None
102125
reads = None
103126
spot_read_lengths = None
127+
issues |= SraRunIssue.NO_SPOT_STATISTICS
104128

105129
# sort the SRA files to match the spots using fastq-load.py options
106130
if loader == 'fastq-load.py' and options:
@@ -122,32 +146,60 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
122146
fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
123147
else:
124148
logging.warning(
125-
"The SRA files of %s: %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead.",
126-
srx,
149+
"%s: The SRA files: %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead.",
150+
srr,
127151
[sf.attrib['filename'] for sf in sra_files],
128152
options)
129153
fastq_filenames = fastq_load_files
130154
fastq_file_sizes = None
155+
issues |= SraRunIssue.MISMATCHED_FASTQ_LOAD_OPTIONS
131156

132157
# use spot statistics to determine the order of the files by matching their sizes with the sizes of the files
133158
# this is less reliable than using the fastq-load.py options, but it is still better than nothing
159+
# we can only use this strategy if all the read sizes are different and can be related to the file sizes
134160
elif statistics:
135-
# sort the files according to the layout
136-
# sort the layout according to the average read size
137-
reads_by_size = [e[0] for e in sorted(enumerate(reads),
138-
key=lambda e: int(e[1].attrib['count']) * float(
139-
e[1].attrib['average']))]
140-
files_by_size = [e[0] for e in sorted(enumerate(sra_files), key=lambda e: int(e[1].attrib['size']))]
141-
142-
if reads_by_size != files_by_size:
143-
logger.info('Reordering SRA files to match the read sizes in the spot...')
144-
sra_files = [sra_files[reads_by_size.index(files_by_size[i])] for i, sra_file in
145-
enumerate(sra_files)]
146-
fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
147-
fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
161+
# would be nicer to have this in an else block
162+
issues |= SraRunIssue.NO_FASTQ_LOAD_OPTIONS
163+
# check if the sizes are unambiguous?
164+
read_sizes = [int(read.attrib['count']) * float(read.attrib['average']) for read in reads]
165+
if len(set(read_sizes)) == len(read_sizes):
166+
# sort the files according to the layout
167+
# sort the layout according to the average read size
168+
reads_by_size = [e[0] for e in sorted(enumerate(reads),
169+
key=lambda e: int(e[1].attrib['count']) * float(
170+
e[1].attrib['average']))]
171+
files_by_size = [e[0] for e in sorted(enumerate(sra_files), key=lambda e: int(e[1].attrib['size']))]
172+
173+
if reads_by_size != files_by_size:
174+
logger.info('Reordering SRA files to match the read sizes in the spot...')
175+
sra_files = [sra_files[reads_by_size.index(files_by_size[i])] for i, sra_file in
176+
enumerate(sra_files)]
177+
fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
178+
fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
179+
else:
180+
# this is extremely common, so it's not worth warning about it
181+
logger.info(
182+
'%s: Number of bps per read are ambiguous: %s, cannot use them to order SRA files by filesize. Only the spot metadata will be used to determine the layout.',
183+
srr, read_sizes)
184+
fastq_filenames = None
185+
fastq_file_sizes = None
186+
issues |= SraRunIssue.AMBIGUOUS_READ_SIZES
148187

149188
else:
150-
logger.warning(f'No information found that can be used to order SRA files from {srx}, ignoring.')
189+
logger.warning(
190+
'%s: No information found that can be used to order SRA files, ignoring that run.',
191+
srr)
192+
if include_invalid_runs:
193+
fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
194+
fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
195+
result.append(SraRunMetadata(srx, srr,
196+
is_paired=is_paired,
197+
fastq_filenames=fastq_filenames,
198+
fastq_file_sizes=fastq_file_sizes,
199+
average_read_lengths=None,
200+
fastq_load_options=None,
201+
layout=[],
202+
issues=issues))
151203
continue
152204

153205
layout = detect_layout(srr, fastq_filenames, fastq_file_sizes, spot_read_lengths, is_single_end, is_paired)
@@ -157,7 +209,9 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
157209
fastq_filenames=fastq_filenames,
158210
fastq_file_sizes=fastq_file_sizes,
159211
average_read_lengths=spot_read_lengths,
160-
layout=layout))
212+
fastq_load_options=options if loader == 'fastq-load.py' else None,
213+
layout=layout,
214+
issues=issues))
161215
return result
162216

163217
class PrefetchSraRun(TaskWithMetadataMixin, luigi.Task):
@@ -263,13 +317,6 @@ class DownloadSraExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
263317
srx: str
264318
srr = luigi.OptionalListParameter(default=None, description='Specific SRA run accessions to use (defaults to all)')
265319

266-
force_single_end = luigi.BoolParameter(positional=False, significant=False, default=False,
267-
description='Force the library layout to be single-end')
268-
force_paired_reads = luigi.BoolParameter(positional=False, significant=False, default=False,
269-
description='Force the library layout to be paired')
270-
force_layout = luigi.ListParameter(positional=False, significant=False, default=False,
271-
description='Force the library layout to be either single-end or paired-end.')
272-
273320
metadata: dict
274321

275322
@property
@@ -286,16 +333,8 @@ def run(self):
286333
if self.srr is not None:
287334
meta = [r for r in meta if r.srr in self.srr]
288335

289-
# make sure that all the run metadata are aligned
290-
291-
if self.force_layout:
292-
layout = self.force_layout
293-
if self.force_paired_reads:
294-
layout = [SequencingFileType.R1.name]
295-
elif self.force_single_end:
296-
layout = [SequencingFileType.R1.name, SequencingFileType.R2.name]
297-
else:
298-
layout = self.force_layout
336+
if not meta:
337+
raise ValueError(f'No SRA runs found for {self.srx}.')
299338

300339
metadata = dict(self.metadata)
301340
# do not override the sample_id when invoked from DownloadGeoSample or DownloadGemmaExperiment

tests/luigi.cfg

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#
2+
# This is a configuration example for Luigi and the RNA-Seq pipeline.
3+
#
4+
5+
#
6+
# This section contains scheduler resources dedicated to the pipeline
7+
# execution.
8+
#
9+
10+
[core]
11+
autoload_range=true
12+
13+
[resources]
14+
# in number of available CPUs
15+
cpus=16
16+
# in gigabytes
17+
memory=32
18+
geo_http_connections=4
19+
edirect_http_connections=4
20+
array_express_http_connections=4
21+
sra_connections=4
22+
# If you specify the 'slurm' scheduler in Bioluigi, you must set this resource
23+
slurm_jobs=384
24+
prefetch_jobs=2
25+
fastq_dump_jobs=40
26+
submit_data_jobs=1
27+
submit_batch_info_jobs=2
28+
29+
[bioluigi]
30+
scheduler=slurm
31+
scheduler_partition=
32+
scheduler_extra_args=[]
33+
34+
#
35+
# This section contains the necessary variables for the pipeline execution
36+
#
37+
38+
[rnaseq_pipeline]
39+
# pipeline output directories (relative to OUTPUT_DIR)
40+
OUTPUT_DIR=pipeline-output
41+
GENOMES=genomes
42+
REFERENCES=references
43+
SINGLE_CELL_REFERENCES=single-cell-references
44+
METADATA=metadata
45+
DATA=data
46+
DATAQCDIR=data-qc
47+
ALIGNDIR=aligned
48+
ALIGNQCDIR=aligned-qc
49+
QUANTDIR=quantified
50+
BATCHINFODIR=batch-info
51+
52+
# RSEM
53+
RSEM_DIR=contrib/RSEM
54+
55+
SLACK_WEBHOOK_URL=
56+
57+
[rnaseq_pipeline.gemma]
58+
cli_bin=gemma-cli
59+
# values for $JAVA_HOME and $JAVA_OPTS environment variables
60+
cli_JAVA_HOME=
61+
cli_JAVA_OPTS=
62+
baseurl=https://gemma.msl.ubc.ca
63+
appdata_dir=/space/gemmaData
64+
human_reference_id=hg38_ncbi
65+
mouse_reference_id=mm10_ncbi
66+
rat_reference_id=rn7_ncbi
67+
68+
[rnaseq_pipeline.sources.sra]
69+
ncbi_public_dir=/tmp/ncbi/public

tests/test_rnaseq_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from rnaseq_utils import detect_simple_fastq_name, SequencingFileType
2+
3+
R1, R2 = SequencingFileType.R1, SequencingFileType.R2
4+
5+
def test_detect_simple_fastq_name():
6+
assert detect_simple_fastq_name('123', ['3543_OF1B_5-2-D6-F3-R1.fq', '3543_OF1B_5-2-D6-F3-R2.fq']) == [R1, R2]

0 commit comments

Comments
 (0)