Improve layout detection from SRA metadata

arteymix · arteymix · commit 89ec25283d7f · 2025-09-15T15:31:25.000-07:00
Detect bcl2fastq standard filenames as well as other commonly used naming
patterns. Add a fallback that only checks for the presence of I1/I2/R1/R2/R3
in the filename; a warning is logged when it is used because it is very unreliable.

Track issues encountered in runs using an enumerated flag.
diff --git a/luigi.cfg b/luigi.cfg
diff --git a/rnaseq_pipeline/rnaseq_utils.py b/rnaseq_pipeline/rnaseq_utils.py
@@ -14,12 +14,13 @@ class SequencingFileType(enum.Enum):
     I2 = 2
     R1 = 3
     R2 = 4
+    R3 = 5
 
 def detect_layout(run_id: str, filenames: Optional[list[str]],
                   file_sizes: Optional[list[int]] = None,
                   average_read_lengths: Optional[list[float]] = None,
                   is_single_end: bool = False, is_paired: bool = False):
-    """Detects the layout of the sequencing run files based on their names
+    """Detects the layout of the sequencing run files based on their names and various additional information.
 
     :param run_id: Identifier for the run
     :param filenames: List of filenames, if known
@@ -32,18 +33,19 @@ def detect_layout(run_id: str, filenames: Optional[list[str]],
     """
 
     if filenames:
-        if layout := detect_bcl2fastq_name(filenames):
-            logger.info('%s: Inferred file types from file names conforming to bcl2fastq output.', run_id)
+        if layout := detect_bcl2fastq_name(run_id, filenames):
+            logger.info('%s: Inferred file types: %s from file names conforming to bcl2fastq output: %s.', run_id,
+                        layout, filenames)
             return layout
 
-        if layout := detect_simple_fastq_name(filenames):
-            logger.info('%s: Inferred file types from file names conforming to a simple output.', run_id)
+        if layout := detect_simple_fastq_name(run_id, filenames):
+            logger.info('%s: Inferred file types: %s from file names conforming to a simple output.', run_id,
+                        layout, filenames)
             return layout
 
-        if layout := detect_common_name(filenames):
-            logger.warning('%s: Inferred file types from file names with common name patterns:\n\t%s',
-                           run_id,
-                           '\n\t'.join(filenames))
+        if layout := detect_common_name(run_id, filenames):
+            logger.warning('%s: Inferred file types: %s from file names with common name patterns: %s.',
+                           run_id, layout, filenames)
             return layout
 
         number_of_files = len(filenames)
@@ -87,7 +89,7 @@ def detect_layout(run_id: str, filenames: Optional[list[str]],
     raise ValueError(
         f'Unable to detect sequencing layout for {run_id} from: {filenames=} {file_sizes=} {average_read_lengths=} {is_single_end=} {is_paired=}')
 
-def detect_bcl2fastq_name(filenames):
+def detect_bcl2fastq_name(run_id, filenames):
     # try to detect the file types based on the filenames
     # from bcl2fastq manual, this is the expected format of the filenames:
     # <sample name>_<barcode sequence>_L<lane>_R<read number>_<set number>.fastq.gz
@@ -108,21 +110,24 @@ def detect_bcl2fastq_name(filenames):
                 dt = SequencingFileType.R1
             elif read_type == 'R' and read_number == 2:
                 dt = SequencingFileType.R2
+            elif read_type == 'R' and read_number == 3:
+                dt = SequencingFileType.R3
             else:
-                logger.warning(f'Unrecognized read type: {read_type}{read_number} in {filename}.')
+                logger.warning('%s: Unrecognized read type: %s%d in %s.', run_id, read_type, read_number, filename)
                 break
             detected_types.append(dt)
 
     if len(set(detected_types)) < len(detected_types):
-        logger.warning(f"Non-unique sequencing file type detected: {detected_types} from {filenames}.")
+        logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
         return None
     elif len(detected_types) == len(filenames):
         return detected_types
     else:
         return None
 
-def detect_simple_fastq_name(filenames):
-    simple_name_pattern = re.compile(r'(.+)_([RI])(\d)(_\d+)?\.(fastq|fq)(\.gz)?')
+def detect_simple_fastq_name(run_id, filenames):
+    """Flexible detection of sequencing file types based on common, but valid naming patterns."""
+    simple_name_pattern = re.compile(r'(.+)([RI])(\d)(_\d+)?\.(fastq|fq)(\.gz)?')
 
     detected_types = []
     for filename in filenames:
@@ -138,20 +143,22 @@ def detect_simple_fastq_name(filenames):
                 dt = SequencingFileType.R1
             elif read_type == 'R' and read_number == 2:
                 dt = SequencingFileType.R2
+            elif read_type == 'R' and read_number == 3:
+                dt = SequencingFileType.R3
             else:
-                logger.warning(f'Unrecognized read type: {read_type}{read_number} in {filename}.')
+                logger.warning('%s: Unrecognized read type: %s%d in %s.', run_id, read_type, read_number, filename)
                 break
             detected_types.append(dt)
 
     if len(set(detected_types)) < len(detected_types):
-        logger.warning(f"Non-unique sequencing file type detected: {detected_types} from {filenames}.")
+        logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
         return None
     elif len(detected_types) == len(filenames):
         return detected_types
     else:
         return None
 
-def detect_common_name(filenames):
+def detect_common_name(run_id, filenames):
     detected_types = []
 
     # this is the most robust way of detecting file types
@@ -168,11 +175,13 @@ def detect_common_name(filenames):
             detected_types.append(SequencingFileType.R1)
         elif 'R2' in filename:
             detected_types.append(SequencingFileType.R2)
+        elif 'R3' in filename:
+            detected_types.append(SequencingFileType.R3)
         else:
             break
 
     if len(set(detected_types)) < len(detected_types):
-        logger.warning(f"Non-unique sequencing file type detected: {detected_types}.")
+        logger.warning("%s: Non-unique sequencing file type detected: %s from %s.", run_id, detected_types, filenames)
         return None
     elif len(detected_types) == len(filenames):
         return detected_types
diff --git a/rnaseq_pipeline/sources/sra.py b/rnaseq_pipeline/sources/sra.py
@@ -1,7 +1,7 @@
 """
 This module contains all the logic to retrieve RNA-Seq data from SRA.
 """
-
+import enum
 import gzip
 import logging
 import os
@@ -52,6 +52,14 @@ def read_runinfo(path):
         df = pd.read_csv(path, names=SRA_RUNINFO_COLUMNS[:len(df.columns)])
     return df
 
+class SraRunIssue(enum.IntFlag):
+    """Issues that can occur when processing SRA runs."""
+    NO_SRA_FILES = enum.auto()
+    NO_SPOT_STATISTICS = enum.auto()
+    NO_FASTQ_LOAD_OPTIONS = enum.auto()
+    MISMATCHED_FASTQ_LOAD_OPTIONS = enum.auto()
+    AMBIGUOUS_READ_SIZES = enum.auto()
+
 @dataclass
 class SraRunMetadata:
     """A digested SRA run metadata"""
@@ -62,21 +70,36 @@ class SraRunMetadata:
     fastq_file_sizes: list[int]
     # only available if statistics were present in the XML metadata
     average_read_lengths: Optional[list[float]]
+    fastq_load_options: Optional[dict]
     layout: list[SequencingFileType]
+    issues: SraRunIssue
 
-def read_xml_metadata(path) -> List[SraRunMetadata]:
+def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
+    """
+    :param path: Path to the XML file containing SRA run metadata.
+    :param include_invalid_runs: If True, also include runs for which no metadata usable to determine the layout was
+    found; such runs are reported with an empty layout.
+    :return: A list of SraRunMetadata, one for each run that was retained.
+    """
     root = ET.parse(path)
     runs = root.findall('EXPERIMENT_PACKAGE/RUN_SET/RUN')
     result = []
     for run in runs:
+        srr = run.attrib['accession']
+
         srx = run.find('EXPERIMENT_REF').attrib['accession']
         is_single_end = root.find(
             'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/SINGLE') is not None
         is_paired = root.find(
             'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/PAIRED') is not None
-        srr = run.attrib['accession']
+
         sra_files = run.findall('SRAFiles/SRAFile[@semantic_name=\'fastq\']')
 
+        issues = SraRunIssue(0)
+
+        if not sra_files:
+            issues |= SraRunIssue.NO_SRA_FILES
+
         # if the data was loaded with fastq-load.py, we can obtain the order of the files from the options
         loader, options = None, None
         run_attributes = run.findall('RUN_ATTRIBUTES/RUN_ATTRIBUTE')
@@ -101,6 +124,7 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
             number_of_spots = None
             reads = None
             spot_read_lengths = None
+            issues |= SraRunIssue.NO_SPOT_STATISTICS
 
         # sort the SRA files to match the spots using fastq-load.py options
         if loader == 'fastq-load.py' and options:
@@ -122,32 +146,60 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
                 fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
             else:
                 logging.warning(
-                    "The SRA files of %s: %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead.",
-                    srx,
+                    "%s: The SRA files: %s do not match arguments passed to fastq-load.py: %s. The filenames passed to fastq-load.py will be used instead.",
+                    srr,
                     [sf.attrib['filename'] for sf in sra_files],
                     options)
                 fastq_filenames = fastq_load_files
                 fastq_file_sizes = None
+                issues |= SraRunIssue.MISMATCHED_FASTQ_LOAD_OPTIONS
 
         # use spot statistics to determine the order of the files by matching their sizes with the sizes of the files
         # this is less reliable than using the fastq-load.py options, but it is still better than nothing
+        # we can only use this strategy if all the read sizes are different and can be related to the file sizes
         elif statistics:
-            # sort the files according to the layout
-            # sort the layout according to the average read size
-            reads_by_size = [e[0] for e in sorted(enumerate(reads),
-                                                  key=lambda e: int(e[1].attrib['count']) * float(
-                                                      e[1].attrib['average']))]
-            files_by_size = [e[0] for e in sorted(enumerate(sra_files), key=lambda e: int(e[1].attrib['size']))]
-
-            if reads_by_size != files_by_size:
-                logger.info('Reordering SRA files to match the read sizes in the spot...')
-                sra_files = [sra_files[reads_by_size.index(files_by_size[i])] for i, sra_file in
-                             enumerate(sra_files)]
-            fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
-            fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
+            # would be nicer to have this in an else block
+            issues |= SraRunIssue.NO_FASTQ_LOAD_OPTIONS
+            # check if the sizes are unambiguous?
+            read_sizes = [int(read.attrib['count']) * float(read.attrib['average']) for read in reads]
+            if len(set(read_sizes)) == len(read_sizes):
+                # sort the files according to the layout
+                # sort the layout according to the average read size
+                reads_by_size = [e[0] for e in sorted(enumerate(reads),
+                                                      key=lambda e: int(e[1].attrib['count']) * float(
+                                                          e[1].attrib['average']))]
+                files_by_size = [e[0] for e in sorted(enumerate(sra_files), key=lambda e: int(e[1].attrib['size']))]
+
+                if reads_by_size != files_by_size:
+                    logger.info('Reordering SRA files to match the read sizes in the spot...')
+                    sra_files = [sra_files[reads_by_size.index(files_by_size[i])] for i, sra_file in
+                                 enumerate(sra_files)]
+                fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
+                fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
+            else:
+                # this is extremely common, so it's not worth warning about it
+                logger.info(
+                    '%s: Number of bps per read are ambiguous: %s, cannot use them to order SRA files by filesize. Only the spot metadata will be used to determine the layout.',
+                    srr, read_sizes)
+                fastq_filenames = None
+                fastq_file_sizes = None
+                issues |= SraRunIssue.AMBIGUOUS_READ_SIZES
 
         else:
-            logger.warning(f'No information found that can be used to order SRA files from {srx}, ignoring.')
+            logger.warning(
+                '%s: No information found that can be used to order SRA files, ignoring that run.',
+                srr)
+            if include_invalid_runs:
+                fastq_filenames = [sf.attrib['filename'] for sf in sra_files]
+                fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_files]
+                result.append(SraRunMetadata(srx, srr,
+                                             is_paired=is_paired,
+                                             fastq_filenames=fastq_filenames,
+                                             fastq_file_sizes=fastq_file_sizes,
+                                             average_read_lengths=None,
+                                             fastq_load_options=None,
+                                             layout=[],
+                                             issues=issues))
             continue
 
         layout = detect_layout(srr, fastq_filenames, fastq_file_sizes, spot_read_lengths, is_single_end, is_paired)
@@ -157,7 +209,9 @@ def read_xml_metadata(path) -> List[SraRunMetadata]:
                                      fastq_filenames=fastq_filenames,
                                      fastq_file_sizes=fastq_file_sizes,
                                      average_read_lengths=spot_read_lengths,
-                                     layout=layout))
+                                     fastq_load_options=options if loader == 'fastq-load.py' else None,
+                                     layout=layout,
+                                     issues=issues))
     return result
 
 class PrefetchSraRun(TaskWithMetadataMixin, luigi.Task):
@@ -263,13 +317,6 @@ class DownloadSraExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
     srx: str
     srr = luigi.OptionalListParameter(default=None, description='Specific SRA run accessions to use (defaults to all)')
 
-    force_single_end = luigi.BoolParameter(positional=False, significant=False, default=False,
-                                           description='Force the library layout to be single-end')
-    force_paired_reads = luigi.BoolParameter(positional=False, significant=False, default=False,
-                                             description='Force the library layout to be paired')
-    force_layout = luigi.ListParameter(positional=False, significant=False, default=False,
-                                       description='Force the library layout to be either single-end or paired-end.')
-
     metadata: dict
 
     @property
@@ -286,16 +333,8 @@ def run(self):
         if self.srr is not None:
             meta = [r for r in meta if r.srr in self.srr]
 
-        # make sure that all the run metadata are aligned
-
-        if self.force_layout:
-            layout = self.force_layout
-        if self.force_paired_reads:
-            layout = [SequencingFileType.R1.name]
-        elif self.force_single_end:
-            layout = [SequencingFileType.R1.name, SequencingFileType.R2.name]
-        else:
-            layout = self.force_layout
+        if not meta:
+            raise ValueError(f'No SRA runs found for {self.srx}.')
 
         metadata = dict(self.metadata)
         # do not override the sample_id when invoked from DownloadGeoSample or DownloadGemmaExperiment
diff --git a/tests/luigi.cfg b/tests/luigi.cfg
@@ -0,0 +1,69 @@
+#
+# This is a configuration example for Luigi and the RNA-Seq pipeline.
+#
+
+#
+# This section contains scheduler resources dedicated to the pipeline
+# execution.
+#
+
+[core]
+autoload_range=true
+
+[resources]
+# in number of available CPUs
+cpus=16
+# in gigabytes
+memory=32
+geo_http_connections=4
+edirect_http_connections=4
+array_express_http_connections=4
+sra_connections=4
+# If you specify the 'slurm' scheduler in Bioluigi, you must set this resource
+slurm_jobs=384
+prefetch_jobs=2
+fastq_dump_jobs=40
+submit_data_jobs=1
+submit_batch_info_jobs=2
+
+[bioluigi]
+scheduler=slurm
+scheduler_partition=
+scheduler_extra_args=[]
+
+#
+# This section contains the necessary variables for the pipeline execution
+#
+
+[rnaseq_pipeline]
+# pipeline output directories (relative to OUTPUT_DIR)
+OUTPUT_DIR=pipeline-output
+GENOMES=genomes
+REFERENCES=references
+SINGLE_CELL_REFERENCES=single-cell-references
+METADATA=metadata
+DATA=data
+DATAQCDIR=data-qc
+ALIGNDIR=aligned
+ALIGNQCDIR=aligned-qc
+QUANTDIR=quantified
+BATCHINFODIR=batch-info
+
+# RSEM
+RSEM_DIR=contrib/RSEM
+
+SLACK_WEBHOOK_URL=
+
+[rnaseq_pipeline.gemma]
+cli_bin=gemma-cli
+# values for $JAVA_HOME and $JAVA_OPTS environment variables
+cli_JAVA_HOME=
+cli_JAVA_OPTS=
+baseurl=https://gemma.msl.ubc.ca
+appdata_dir=/space/gemmaData
+human_reference_id=hg38_ncbi
+mouse_reference_id=mm10_ncbi
+rat_reference_id=rn7_ncbi
+
+[rnaseq_pipeline.sources.sra]
+ncbi_public_dir=/tmp/ncbi/public
diff --git a/tests/test_rnaseq_utils.py b/tests/test_rnaseq_utils.py
@@ -0,0 +1,6 @@
+from rnaseq_utils import detect_simple_fastq_name, SequencingFileType
+
+R1, R2 = SequencingFileType.R1, SequencingFileType.R2
+
+def test_detect_simple_fastq_name():
+    assert detect_simple_fastq_name('123', ['3543_OF1B_5-2-D6-F3-R1.fq', '3543_OF1B_5-2-D6-F3-R2.fq']) == [R1, R2]
diff --git a/tests/test_sra.py b/tests/test_sra.py