PavlidisLab · arteymix · Apr 28, 2025 · Apr 28, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+/tests/data/* linguist-generated=true
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -7,20 +7,16 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       max-parallel: 5
+    defaults:
+      run:
+        shell: bash -el {0}
 
     steps:
     - uses: actions/checkout@v4
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v5
+    - uses: conda-incubator/setup-miniconda@v3
       with:
-        python-version: '3.9'
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-    - name: Setup Conda environment
-      run: |
-        conda env update --file environment.yml --name base
+        activate-environment: rnaseq-pipeline
+        environment-file: environment.yml
     - name: Install package
       run: |
         pip install .[gsheet,webviewer]
@@ -29,7 +25,7 @@ jobs:
         make -C scripts
     - name: Configure the pipeline
       run: |
-        cp example.luigi.cfg luigi.cfg
+        cp tests/luigi.cfg ./
     - name: Test with pytest
       run: |
         conda install pytest

diff --git a/Makefile b/Makefile
@@ -16,7 +16,7 @@ install: install-python install-systemd-units install-RSEM install-scripts insta
 
 install-fish-completion:
 	mkdir -p "${DESTDIR}/etc/fish/completions"
-	install data/luigi.fish "${DESTDIR}/etc/fish/completions/"
+	install -m644 data/luigi.fish "${DESTDIR}/etc/fish/completions/"
 
 install-scripts:
 	$(MAKE) -C scripts install
@@ -26,7 +26,8 @@ install-python:
 
+# Plain globs are used instead of brace expansion, which is a bash extension; recipes run with /bin/sh unless SHELL is set.
 install-systemd-units:
 	mkdir -p "${DESTDIR}/etc/systemd/system/"
-	install data/systemd/*.{service,timer,target} "${DESTDIR}/etc/systemd/system/"
-	@echo "Remember to run 'systemctl override rnaseq-pipeline-viewer' and 'systemctl override rnaseq-pipeline-worker@' and set CONDA_BIN, CONDA_ENV, GEMMA_USERNAME and GEMMA_PASSWORD environment variables."
+	install -m644 data/systemd/*.service data/systemd/*.timer "${DESTDIR}/etc/systemd/system/"
+	@echo "Remember to run 'systemctl edit rnaseq-pipeline-viewer' and 'systemctl edit rnaseq-pipeline-worker@' and set CONDA_BIN, CONDA_ENV, GEMMA_USERNAME and GEMMA_PASSWORD environment variables."
 
 install-RSEM:

diff --git a/README.md b/README.md
@@ -71,8 +71,7 @@ your tasks at http://localhost:8082/.
 luigid
 ```
 
-For convenience, we provide a `luigi-wrapper` script that sets the `--module`
-flag to `rnaseq_pipeline.tasks` for you.
+For convenience, we provide a `rnaseq-pipeline-cli` tool to run high-level tasks:
 
 ```bash
-luigi-wrapper <task> <task_args>
+rnaseq-pipeline-cli run <task> <task_args>
@@ -102,13 +101,16 @@ The output is organized as follow:
 
 ```
 pipeline-output/
-    genomes/<reference_id>/                 # Genomic references
-    references/<reference_id>/              # RSEM/STAR indexes
-    data/<source>                           # FASTQs (note that GEO source uses SRA)
-    data-qc/<experiment_id>/<sample_id>/    # FastQC reports
-    aligned/<reference_id>/<experiment_id>/ # alignments and quantification results
-    quantified/<reference_id>               # quantification matrices for isoforms and genes
-    report/<reference_id>/<experiment_id>/  # MultiQC reports for reads and alignments
+    genomes/<reference_id>/                       # Genomic references
+    references/<reference_id>/                    # RSEM/STAR indexes
+    references-single-cell/<reference_id>/        # Cell Ranger references
+    data/<source>/                                # FASTQs (organization is source-specific; note that GEO source uses SRA)
+    data-qc/<experiment_id>/<sample_id>/          # FastQC reports
+    data-single-cell/<experiment_id>/<sample_id>/ # Single-cell data (hard links to files from data/)
+    aligned/<reference_id>/<experiment_id>/       # alignments and quantification results
+    quantified/<reference_id>                     # quantification matrices for isoforms and genes
+    quantified-single-cell/<reference_id>         # quantified single-cell data (Cell Ranger outputs)
+    report/<reference_id>/<experiment_id>/        # MultiQC reports for reads and alignments
 ```
 
 You can adjust the pipeline output directory by setting `OUTPUT_DIR` under
@@ -146,6 +148,26 @@ pip install .[webviewer]
 gunicorn rnaseq_pipeline.viewer:app
 ```
 
+## Tool Wrappers
+
+A few wrappers are provided to make some tools run more efficiently. For this to work, you have to configure Bioluigi to
+use the wrappers instead of the actual tools.
+
+Examples of behaviors:
+
+ - copy the reference directory to a local scratch directory (Cell Ranger & RSEM)
+ - preload genome reference in shared memory and release unused ones (RSEM only)
+
+```ini
+[bioluigi]
+cellranger_bin=rnaseq-pipeline-cellranger
+rsem_calculate_expression_bin=rnaseq-pipeline-rsem-calculate-expression
+
+[rnaseq_pipeline.wrapped_tools]
+cellranger_bin=/absolute/path/to/cell/ranger/bin
+rsem_calculate_expression_bin=/absolute/path/to/rsem
+```
+
 ## Gemma integration
 
 The RNA-Seq pipeline is capable of communicating with Gemma using its [RESTful API](https://gemma.msl.ubc.ca/resources/restapidocs/).

diff --git a/environment.yml b/environment.yml
@@ -4,13 +4,15 @@ channels:
 - bioconda
 - nodefaults
 dependencies:
-- python=3.10
+- python=3.12
 - pip
-- cutadapt==4.8
-- multiqc==1.29
+- cutadapt==4.9
+- multiqc==1.32
 - polars-lts-cpu # for our older servers that lack support for AVX2
 - sra-tools
 - fastqc==0.12.1
 - star==2.7.3a
 - entrez-direct
 - perl # rsem expects this
+- samtools
+- curl
diff --git a/example.luigi.cfg b/example.luigi.cfg
@@ -27,9 +27,12 @@ submit_data_jobs=1
 submit_batch_info_jobs=2
 
 [bioluigi]
-scheduler=slurm
+scheduler=local
 scheduler_partition=
 scheduler_extra_args=[]
+# Default tools, override as needed
+#cutadapt_bin=cutadapt
+#cellranger_bin=cellranger
 
 #
 # This section contains the necessary variables for the pipeline execution
@@ -40,19 +43,33 @@ scheduler_extra_args=[]
 OUTPUT_DIR=pipeline-output
 GENOMES=genomes
 REFERENCES=references
+SINGLE_CELL_REFERENCES=references-single-cell
 METADATA=metadata
 DATA=data
 DATAQCDIR=data-qc
 ALIGNDIR=aligned
-ALIGNQCDIR=aligned-qc
 QUANTDIR=quantified
 BATCHINFODIR=batch-info
 
 # RSEM
 RSEM_DIR=contrib/RSEM
+rsem_calculate_expression_bin=contrib/RSEM/rsem-calculate-expression
 
 SLACK_WEBHOOK_URL=
 
+[rnaseq_pipeline.wrapped_tools]
+rsem_calculate_expression_bin=rsem-calculate-expression
+cellranger_bin=cellranger
+
+[rnaseq_pipeline.sources.sra]
+# location where tools like prefetch and fastq-dump will store downloaded SRA files
+# you can get this value with vdb-config -p
+ncbi_public_dir=/cosmos/scratch/ncbi/public
+samtools_bin=samtools
+bamtofastq_bin=bamtofastq
+# location where BAM headers downloaded from SRA will be cached
+bam_headers_cache_dir=bam_headers
+
 [rnaseq_pipeline.gemma]
 cli_bin=gemma-cli
 # values for $JAVA_HOME and $JAVA_OPTS environment variables
@@ -63,3 +80,6 @@ appdata_dir=/space/gemmaData
 human_reference_id=hg38_ncbi
 mouse_reference_id=mm10_ncbi
 rat_reference_id=rn7_ncbi
+human_single_cell_reference_id=refdata-gex-GRCh38-2024-A
+mouse_single_cell_reference_id=refdata-gex-GRCm39-2024-A
+rat_single_cell_reference_id=refdata-gex-mRatBN7-2-2024-A
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = ["setuptools>=77.0.3"]  # older setuptools reject the SPDX `license` string and `license-files` key used in [project]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "rnaseq-pipeline"
+version = "2.1.12"
+description = "RNA-Seq pipeline for the Pavlidis Lab"
+authors = [
+    {name = "Guillaume Poirier-Morency", email = "poirigui@msl.ubc.ca"}
+]
+readme = "README.md"
+license = "Unlicense"
+license-files = ["LICENSE"]
+requires-python = "==3.12.*"
+dependencies = ['luigi', 'python-daemon<3.0.0',
+             'bioluigi@git+https://github.com/PavlidisLab/bioluigi@master',
+             'requests', 'pandas']
+
+[project.optional-dependencies]
+gsheet = ['google-api-python-client', 'google-auth-httplib2', 'google-auth-oauthlib', 'pyxdg']
+webviewer =  ['Flask', 'gunicorn']
+
+[dependency-groups]
+dev = ["pytest", "mypy"]
+
+[project.scripts]
+rnaseq-pipeline-cli = "rnaseq_pipeline.cli:main"
+rnaseq-pipeline-cellranger = "rnaseq_pipeline.wrapped_tools:cellranger_wrapper"
+rnaseq-pipeline-rsem-calculate-expression = "rnaseq_pipeline.wrapped_tools:rsem_calculate_expression_wrapper"
+
+[tool.setuptools]
+packages = ["rnaseq_pipeline", "rnaseq_pipeline.sources", "rnaseq_pipeline.webviewer"]
+
+[tool.mypy]
+plugins = ["luigi.mypy"]
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+testpaths=tests
+log_cli=1
+log_cli_level=info
diff --git a/rnaseq_pipeline/__init__.py b/rnaseq_pipeline/__init__.py
@@ -1,3 +1,7 @@
 import luigi
 
 luigi.auto_namespace(scope=__name__)
+
+from rnaseq_pipeline.tasks import *
+from rnaseq_pipeline.sources.sra import *
+
diff --git a/rnaseq_pipeline/cli.py b/rnaseq_pipeline/cli.py
@@ -0,0 +1,111 @@
+"""Command-line interface of the RNA-Seq pipeline (``rnaseq-pipeline-cli``).
+
+Every command runs Luigi under a temporary umask and returns a process exit
+status: 0 when Luigi reports success, 1 otherwise.
+"""
+import argparse
+import sys
+import os
+from contextlib import contextmanager
+
+import luigi
+import luigi.cmdline
+
+from rnaseq_pipeline.tasks import SubmitExperimentToGemma, SubmitExperimentsFromGoogleSpreadsheetToGemma, \
+    SubmitExperimentBatchInfoToGemma
+
+@contextmanager
+def umask(umask):
+    """Set the process umask to ``umask`` and restore the previous value on exit."""
+    # the values are rendered in octal, hence the 0o prefix
+    print(f'Setting umask to 0o{umask:03o}')
+    prev_umask = os.umask(umask)
+    try:
+        yield None
+    finally:
+        print(f'Restoring umask to 0o{prev_umask:03o}')
+        os.umask(prev_umask)
+
+def parse_octal(s):
+    """Parse a string of octal digits (e.g. ``'002'``) into an integer."""
+    return int(s, 8)
+
+def _add_common_arguments(parser, default_workers):
+    """Add the options shared by all the submit-* commands to ``parser``."""
+    parser.add_argument('--umask', type=parse_octal, default='002',
+                        help='Set a umask (defaults to 002 to make created files group-writable)')
+    parser.add_argument('--workers', type=int, default=default_workers,
+                        help=f'Number of workers to use (defaults to {default_workers})')
+    parser.add_argument('--local-scheduler', action='store_true', default=False)
+
+def run_luigi_task(task, args):
+    """Build ``task`` with the umask, workers and scheduler options from ``args``.
+
+    Returns 0 if Luigi reports that the run succeeded, 1 otherwise.
+    """
+    with umask(args.umask):
+        result = luigi.build([task], workers=args.workers, detailed_summary=True,
+                             local_scheduler=args.local_scheduler)
+    # with detailed_summary=True, luigi.build() returns a LuigiRunResult instead of a bool
+    return 0 if result.scheduling_succeeded else 1
+
+def run(args):
+    """Pass ``args`` to the Luigi command-line runner under a 002 umask."""
+    with umask(0o002):
+        succeeded = luigi.run(args)
+    return 0 if succeeded else 1
+
+def submit_experiment(argv):
+    """Submit one experiment to Gemma (``submit-experiment`` command)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment-id', required=True, help='Experiment ID to submit to Gemma')
+    parser.add_argument('--rerun', action='store_true', default=False, help='Rerun the experiment')
+    parser.add_argument('--priority', type=int, default=100)
+    _add_common_arguments(parser, default_workers=30)
+    args = parser.parse_args(argv)
+    task = SubmitExperimentToGemma(experiment_id=args.experiment_id, rerun=args.rerun, priority=args.priority)
+    return run_luigi_task(task, args)
+
+def submit_experiment_batch_info(argv):
+    """Submit the batch information of one experiment to Gemma (``submit-experiment-batch-info`` command)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment-id', required=True, help='Experiment ID to submit to Gemma')
+    parser.add_argument('--ignored-samples', nargs='+', default=[])
+    parser.add_argument('--rerun', action='store_true', default=False, help='Rerun the experiment')
+    _add_common_arguments(parser, default_workers=30)
+    args = parser.parse_args(argv)
+    task = SubmitExperimentBatchInfoToGemma(experiment_id=args.experiment_id, ignored_samples=args.ignored_samples,
+                                            rerun=args.rerun)
+    return run_luigi_task(task, args)
+
+def submit_experiments_from_gsheet(argv):
+    """Submit the experiments listed in a Google spreadsheet to Gemma (``submit-experiments-from-gsheet`` command)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--spreadsheet-id', required=True, help='Spreadsheet ID')
+    parser.add_argument('--sheet-name', required=True, help='Sheet name')
+    parser.add_argument('--ignore-priority', action='store_true', help='Ignore the priority column in the spreadsheet')
+    _add_common_arguments(parser, default_workers=200)
+    args = parser.parse_args(argv)
+    task = SubmitExperimentsFromGoogleSpreadsheetToGemma(args.spreadsheet_id, args.sheet_name,
+                                                         ignore_priority=args.ignore_priority)
+    return run_luigi_task(task, args)
+
+# maps each command name to the function that handles its arguments
+_COMMANDS = {
+    'run': run,
+    'submit-experiment': submit_experiment,
+    'submit-experiment-batch-info': submit_experiment_batch_info,
+    'submit-experiments-from-gsheet': submit_experiments_from_gsheet,
+}
+
+def main():
+    """Dispatch ``sys.argv[1]`` to the matching command and return its exit status."""
+    if len(sys.argv) < 2:
+        print('Usage: rnaseq-pipeline-cli <command>')
+        return 1
+    command = sys.argv[1]
+    handler = _COMMANDS.get(command)
+    if handler is None:
+        print(f'Unknown command {command}. Possible values are: {", ".join(_COMMANDS)}.')
+        return 1
+    return handler(sys.argv[2:])
diff --git a/rnaseq_pipeline/config.py b/rnaseq_pipeline/config.py
@@ -1,21 +1,28 @@
+from typing import Optional
+
 import luigi
 
 # see luigi.cfg for details
-class rnaseq_pipeline(luigi.Config):
-    task_namespace = ''
+class Config(luigi.Config):
+    @classmethod
+    def get_task_family(cls):
+        return 'rnaseq_pipeline'
+
+    OUTPUT_DIR: str = luigi.Parameter(default='pipeline-output')
 
-    GENOMES = luigi.Parameter()
+    GENOMES: str = luigi.Parameter(default='genomes')
+    REFERENCES: str = luigi.Parameter(default='references')
+    SINGLE_CELL_REFERENCES: str = luigi.Parameter(default='references-single-cell')
+    METADATA: str = luigi.Parameter(default='metadata')
+    DATA: str = luigi.Parameter(default='data')
+    DATAQCDIR: str = luigi.Parameter(default='data-qc')
+    ALIGNDIR: str = luigi.Parameter(default='aligned')
+    QUANTDIR: str = luigi.Parameter(default='quantified')
+    QUANT_SINGLE_CELL_DIR: str = luigi.Parameter(default='quantified-single-cell')
+    BATCHINFODIR: str = luigi.Parameter(default='batch-info')
 
-    OUTPUT_DIR = luigi.Parameter()
-    REFERENCES = luigi.Parameter()
-    METADATA = luigi.Parameter()
-    DATA = luigi.Parameter()
-    DATAQCDIR = luigi.Parameter()
-    ALIGNDIR = luigi.Parameter()
-    ALIGNQCDIR = luigi.Parameter()
-    QUANTDIR = luigi.Parameter()
-    BATCHINFODIR = luigi.Parameter()
+    RSEM_DIR: str = luigi.Parameter(default='contrib/RSEM')
 
-    RSEM_DIR = luigi.Parameter()
+    rsem_calculate_expression_bin: str = luigi.Parameter(default='contrib/RSEM/rsem-calculate-expression')
 
-    SLACK_WEBHOOK_URL = luigi.OptionalParameter(default=None)
+    SLACK_WEBHOOK_URL: Optional[str] = luigi.OptionalParameter(default=None)