Skip to content

Commit 9e01957

Browse files
committed
fixup! fixup! WIP
1 parent 91df897 commit 9e01957

File tree

9 files changed

+129
-25
lines changed

9 files changed

+129
-25
lines changed

README.md

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,15 @@ The output is organized as follows:
102102

103103
```
104104
pipeline-output/
105-
genomes/<reference_id>/ # Genomic references
106-
references/<reference_id>/ # RSEM/STAR indexes
107-
data/<source> # FASTQs (note that GEO source uses SRA)
108-
data-qc/<experiment_id>/<sample_id>/ # FastQC reports
109-
aligned/<reference_id>/<experiment_id>/ # alignments and quantification results
110-
quantified/<reference_id> # quantification matrices for isoforms and genes
111-
report/<reference_id>/<experiment_id>/ # MultiQC reports for reads and alignments
105+
genomes/<reference_id>/ # Genomic references
106+
references/<reference_id>/ # RSEM/STAR indexes
107+
data/<source>/ # FASTQs (organization is source-specific; note that GEO source uses SRA)
108+
data-qc/<experiment_id>/<sample_id>/ # FastQC reports
109+
data-single-cell/<experiment_id>/<sample_id>/ # Single-cell data (hard links to files from data/)
110+
aligned/<reference_id>/<experiment_id>/ # alignments and quantification results
111+
quantified/<reference_id>/ # quantification matrices for isoforms and genes
112+
quantified-single-cell/<reference_id>/<experiment_id>/<sample_id>/ # quantified single-cell data (Cell Ranger outputs)
113+
report/<reference_id>/<experiment_id>/ # MultiQC reports for reads and alignments
112114
```
113115

114116
You can adjust the pipeline output directory by setting `OUTPUT_DIR` under

example.luigi.cfg

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ submit_batch_info_jobs=2
3030
scheduler=local
3131
scheduler_partition=
3232
scheduler_extra_args=[]
33+
# Default tools, override as needed
34+
#cutadapt_bin=cutadapt
35+
#cell_ranger_bin=cellranger
3336

3437
#
3538
# This section contains the necessary variables for the pipeline execution
@@ -40,7 +43,7 @@ scheduler_extra_args=[]
4043
OUTPUT_DIR=pipeline-output
4144
GENOMES=genomes
4245
REFERENCES=references
43-
REFERENCES_CELL_RANGER=references-cell-ranger
46+
SINGLE_CELL_REFERENCES=references-single-cell
4447
METADATA=metadata
4548
DATA=data
4649
DATAQCDIR=data-qc
@@ -52,9 +55,6 @@ BATCHINFODIR=batch-info
5255
# RSEM
5356
RSEM_DIR=contrib/RSEM
5457

55-
# Cell Ranger
56-
cell_ranger_bin=cellranger
57-
5858
SLACK_WEBHOOK_URL=
5959

6060
[rnaseq_pipeline.sources.sra]
@@ -72,3 +72,6 @@ appdata_dir=/space/gemmaData
7272
human_reference_id=hg38_ncbi
7373
mouse_reference_id=mm10_ncbi
7474
rat_reference_id=rn7_ncbi
75+
human_single_cell_reference_id=refdata-gex-GRCh38-2024-A
76+
mouse_single_cell_reference_id=refdata-gex-GRCm39-2024-A
77+
rat_single_cell_reference_id=refdata-gex-mRatBN7-2-2024-A

luigi.cfg

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#
2+
# This is a configuration example for Luigi and the RNA-Seq pipeline.
3+
#
4+
5+
#
6+
# This section contains scheduler resources dedicated to the pipeline
7+
# execution.
8+
#
9+
10+
[core]
11+
autoload_range=true
12+
13+
[resources]
14+
# in number of available CPUs
15+
cpus=16
16+
# in gigabytes
17+
memory=32
18+
geo_http_connections=4
19+
edirect_http_connections=4
20+
array_express_http_connections=4
21+
sra_connections=4
22+
# If you specify the 'slurm' scheduler in Bioluigi, you must set this resource
23+
slurm_jobs=384
24+
prefetch_jobs=2
25+
fastq_dump_jobs=40
26+
submit_data_jobs=1
27+
submit_batch_info_jobs=2
28+
29+
[bioluigi]
30+
scheduler=local
31+
scheduler_partition=
32+
scheduler_extra_args=[]
33+
34+
#
35+
# This section contains the necessary variables for the pipeline execution
36+
#
37+
38+
[rnaseq_pipeline]
39+
# pipeline output directories (relative to OUTPUT_DIR)
40+
OUTPUT_DIR=pipeline-output
41+
GENOMES=genomes
42+
REFERENCES=references
43+
SINGLE_CELL_REFERENCES=references-single-cell
44+
METADATA=metadata
45+
DATA=data
46+
DATAQCDIR=data-qc
47+
ALIGNDIR=aligned
48+
ALIGNQCDIR=aligned-qc
49+
QUANTDIR=quantified
50+
BATCHINFODIR=batch-info
51+
52+
# RSEM
53+
RSEM_DIR=contrib/RSEM
54+
55+
# Cell Ranger
56+
cell_ranger_bin=cellranger
57+
58+
SLACK_WEBHOOK_URL=
59+
60+
[rnaseq_pipeline.sources.sra]
61+
# location where tools like prefetch and fastq-dump will store downloaded SRA files
62+
# you can get this value with vdb-config -p
63+
ncbi_public_dir=/home/guillaume/Projets/Gemma/gemma-data/cache/ncbi/public
64+
65+
[rnaseq_pipeline.gemma]
66+
cli_bin=gemma-cli
67+
# values for $JAVA_HOME and $JAVA_OPTS environment variables
68+
cli_JAVA_HOME=
69+
cli_JAVA_OPTS=
70+
baseurl=https://gemma.msl.ubc.ca
71+
appdata_dir=/space/gemmaData
72+
human_reference_id=hg38_ncbi
73+
mouse_reference_id=mm10_ncbi
74+
rat_reference_id=rn7_ncbi
75+
human_single_cell_reference_id=hg38_ncbi
76+
mouse_single_cell_reference_id=mm10_ncbi
77+
rat_single_cell_reference_id=rn7_ncbi

rnaseq_pipeline/gemma.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ class gemma(luigi.Config):
1818
human_reference_id: str = luigi.Parameter()
1919
mouse_reference_id: str = luigi.Parameter()
2020
rat_reference_id: str = luigi.Parameter()
21+
human_single_cell_reference_id: str = luigi.Parameter()
22+
mouse_single_cell_reference_id: str = luigi.Parameter()
23+
rat_single_cell_reference_id: str = luigi.Parameter()
2124

2225
cfg = gemma()
2326

@@ -98,6 +101,13 @@ def reference_id(self):
98101
except KeyError:
99102
raise ValueError('Unsupported Gemma taxon {}.'.format(self.taxon))
100103

104+
def single_cell_reference_id(self):
    """Return the single-cell reference ID configured for ``self.taxon``.

    The lookup covers the 'human', 'mouse' and 'rat' taxa, each mapped to
    the matching ``*_single_cell_reference_id`` value of the module-level
    ``cfg`` object.

    Raises:
        ValueError: if ``self.taxon`` is not one of the supported taxa.
    """
    try:
        # The mapping is built inside the ``try`` so that the exception
        # scope is the same as a single inline lookup expression.
        references_by_taxon = {
            'human': cfg.human_single_cell_reference_id,
            'mouse': cfg.mouse_single_cell_reference_id,
            'rat': cfg.rat_single_cell_reference_id,
        }
        return references_by_taxon[self.taxon]
    except KeyError:
        raise ValueError('Unsupported Gemma taxon {}.'.format(self.taxon))
110+
101111
@property
102112
def platform_short_name(self):
103113
return f'Generic_{self.taxon}_ncbiIds'

rnaseq_pipeline/sources/arrayexpress.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class DownloadArrayExpressRun(luigi.Task):
5050

5151
@property
5252
def platform(self):
53+
# TODO: detect platforms from ArrayExpress metadata
5354
return IlluminaPlatform('HiSeq 2500')
5455

5556
def run(self):

rnaseq_pipeline/targets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def is_stale(self):
8585
def exists(self):
8686
return super().exists() and not self.is_stale()
8787

88-
class DownloadRunTarget(luigi.LocalTarget):
88+
class DownloadRunTarget(luigi.Target):
8989
run_id: str
9090
files: list[str]
9191
layout: list[str]

rnaseq_pipeline/tasks.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import uuid
55
from glob import glob
66
from os.path import join, dirname
7-
from typing import Optional
87

98
import luigi
109
import luigi.task
@@ -456,8 +455,8 @@ class OrganizeSingleCellSample(luigi.Task):
456455
def run(self):
457456
runs = self.input()
458457
os.makedirs(self.output().path)
459-
for run in runs:
460-
for lane, (f, read_type) in enumerate(zip(run.files, run.layout)):
458+
for lane, run in enumerate(runs):
459+
for f, read_type in zip(run.files, run.layout):
461460
dest = join(self.output().path, f'{self.sample_id}_S1_L{lane + 1:03}_{read_type}_001.fastq.gz')
462461
if os.path.exists(dest):
463462
os.unlink(dest)
@@ -470,10 +469,6 @@ def output(self):
470469
class AlignSingleCellSample(DynamicWrapperTask):
471470
experiment_id: str
472471
sample_id: str
473-
expect_cells: Optional[int] = luigi.OptionalIntParameter(default=None, positional=False)
474-
force_cells: Optional[int] = luigi.OptionalIntParameter(default=None, positional=False)
475-
chemistry: Optional[str] = luigi.OptionalParameter(default=None, positional=False,
476-
description='Chemistry to use for Cell Ranger (default is to auto-detect)')
477472

478473
def run(self):
479474
fastqs_dir, transcriptome_dir = self.input()
@@ -482,15 +477,13 @@ def run(self):
482477
transcriptome_dir=transcriptome_dir,
483478
fastqs_dir=fastqs_dir,
484479
output_dir=self.output().path,
485-
expect_cells=self.expect_cells,
486-
force_cells=self.force_cells,
487-
chemistry=self.chemistry,
488-
# FIXME: request a node with AVX512
480+
# TODO: add an avx feature on slurm
489481
scheduler_extra_args=['--constraint', 'thrd64']
490482
)
491483

492484
def output(self):
493-
return luigi.LocalTarget(join(cfg.OUTPUT_DIR, 'quantified-single-cell', self.experiment_id, self.sample_id))
485+
return luigi.LocalTarget(
486+
join(cfg.OUTPUT_DIR, 'quantified-single-cell', self.reference_id, self.experiment_id, self.sample_id))
494487

495488
class AlignSingleCellExperiment(DynamicTaskWithOutputMixin, DynamicWrapperTask):
496489
experiment_id: str = luigi.Parameter()
@@ -591,6 +584,24 @@ def output(self):
591584
return luigi.LocalTarget(
592585
join(gemma_cfg.appdata_dir, 'metadata', self.experiment_id, 'MultiQCReports/multiqc_report.html'))
593586

587+
class SubmitSingleCellExperimentDataToGemma(RerunnableTaskMixin, GemmaCliTask):
    """Submit the single-cell data of an experiment to Gemma.

    Declares the ``loadSingleCellData`` subcommand and its arguments;
    presumably the subcommand is executed by ``GemmaCliTask`` (verify
    against that base class). The task depends on the single-cell
    alignment of the same experiment.
    """
    experiment_id: str = luigi.Parameter()
    subcommand = 'loadSingleCellData'

    def requires(self):
        """Require the single-cell alignment of this experiment from the 'gemma' source."""
        return AlignSingleCellExperiment(
            experiment_id=self.experiment_id,
            reference_id=self.single_cell_reference_id(),
            source='gemma',
        )

    def subcommand_args(self):
        """Build the argument list passed to the ``loadSingleCellData`` subcommand."""
        # TODO: add sequencing metadata
        # FIXME: add --replace
        args = ['-e', self.experiment_id, '-a', self.platform_short_name]
        # The data path is whatever the required alignment task outputs.
        args += ['--data-path', self.input().path]
        args += [
            '--quantitation-type-recomputed-from-raw-data',
            '--preferred-quantitation-type',
        ]
        return args
604+
594605
@requires(SubmitExperimentDataToGemma, SubmitExperimentBatchInfoToGemma, SubmitExperimentReportToGemma)
595606
class SubmitExperimentToGemma(TaskWithOutputMixin, WrapperTask):
596607
"""

rnaseq_pipeline/webviewer/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from os.path import basename, getctime, join, dirname
21
import datetime
32
from glob import glob
43
from os.path import basename, getctime, join, dirname

tests/test_webviewer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def test_experiment_summary(client):
1111
res = client.get('/experiment/GSE87750')
1212
assert res.status == '200 OK'
1313

14+
@pytest.mark.skip()
1415
def test_experiment_batch_info(client):
1516
res = client.get('/experiment/GSE87750/batch-info')
1617
assert res.status == '200 OK'

0 commit comments

Comments
 (0)