Skip to content

Commit 12230be

Browse files
author
Pablo Riesgo Ferreiro
committed
Merge branch 'improve-mark-dupliocates-performance' into 'develop'
Improve mark duplicates performance. See merge request tron/tron-bam-preprocessing!15
2 parents e6ba496 + 92c925e commit 12230be

File tree

5 files changed

+44
-27
lines changed

5 files changed

+44
-27
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ test:
1515
nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
1616
nextflow main.nf -profile test,conda --output output/test6 --intervals false
1717
nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
18-
nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
18+
nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ Optional input:
7474
* --skip_bqsr: optionally skip BQSR (default: false)
7575
* --skip_realignment: optionally skip realignment (default: false)
7676
* --skip_deduplication: optionally skip deduplication (default: false)
77+
* --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
7778
* --skip_metrics: optionally skip metrics (default: false)
7879
* --output: the folder where to publish output (default: ./output)
7980
* --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# You can use this file to create a conda environment for this pipeline:
22
# conda env create -f environment.yml
3-
name: tronflow-bam-preprocessing-1.3.1
3+
name: tronflow-bam-preprocessing-1.4.0
44
channels:
55
- conda-forge
66
- bioconda

main.nf

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
1313
params.skip_bqsr = false
1414
params.skip_realignment = false
1515
params.skip_deduplication = false
16+
params.remove_duplicates = true
1617
params.skip_metrics = false
1718
params.output = false
1819
params.platform = "ILLUMINA"
@@ -30,6 +31,8 @@ params.bqsr_cpus = 3
3031
params.bqsr_memory = "4g"
3132
params.metrics_cpus = 1
3233
params.metrics_memory = "8g"
34+
params.index_cpus = 1
35+
params.index_memory = "8g"
3336

3437

3538

@@ -83,9 +86,10 @@ process prepareBam {
8386
output:
8487
set val(name),
8588
val("${bam.baseName}"),
86-
val(type), file("${bam.baseName}.prepared.bam"),
87-
file("${bam.baseName}.prepared.bai") into prepared_bams
89+
val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
8890

91+
script:
92+
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
8993
"""
9094
mkdir tmp
9195
@@ -109,8 +113,7 @@ process prepareBam {
109113
--RGSM ${type} \
110114
--RGLB 1 \
111115
--RGPL ${params.platform} \
112-
--SORT_ORDER coordinate \
113-
--CREATE_INDEX true
116+
${order}
114117
"""
115118
}
116119

@@ -126,7 +129,7 @@ if (!params.skip_deduplication) {
126129
publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
127130

128131
input:
129-
set name, bam_name, type, file(bam), file(bai) from prepared_bams
132+
set name, bam_name, type, file(bam) from prepared_bams
130133

131134
output:
132135
set val(name), val(bam_name), val(type),
@@ -136,6 +139,7 @@ if (!params.skip_deduplication) {
136139

137140
script:
138141
dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
142+
remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
139143
"""
140144
mkdir tmp
141145
@@ -144,12 +148,34 @@ if (!params.skip_deduplication) {
144148
--input ${bam} \
145149
--output ${bam.baseName}.dedup.bam \
146150
--conf 'spark.executor.cores=${task.cpus}' \
151+
${remove_duplicates} \
147152
${dedup_metrics}
148153
"""
149154
}
150155
}
151156
else {
152-
prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
157+
process indexBam {
158+
cpus "${params.index_cpus}"
159+
memory "${params.index_memory}"
160+
tag "${name}"
161+
162+
input:
163+
set name, bam_name, type, file(bam) from prepared_bams
164+
165+
output:
166+
set val(name), val(bam_name), val(type),
167+
file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
168+
deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics
169+
170+
script:
171+
"""
172+
mkdir tmp
173+
174+
gatk BuildBamIndex \
175+
--java-options '-Xmx8g -Djava.io.tmpdir=tmp' \
176+
--INPUT ${bam}
177+
"""
178+
}
153179
}
154180

155181
if (! params.skip_metrics) {

nextflow.config

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,16 @@ profiles {
2323
params.bqsr_memory = "3g"
2424
params.metrics_cpus = 1
2525
params.metrics_memory = "3g"
26+
params.index_cpus = 1
27+
params.index_memory = "3g"
2628
params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
2729
params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
2830
params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
2931
params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
32+
timeline.enabled = false
33+
report.enabled = false
34+
trace.enabled = false
35+
dag.enabled = false
3036
}
3137
}
3238

@@ -40,29 +46,12 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
4046

4147
cleanup = true
4248

43-
timeline {
44-
enabled = true
45-
//file = "${params.output}/execution_timeline.html"
46-
}
47-
report {
48-
enabled = true
49-
//file = "${params.output}/execution_report.html"
50-
}
51-
trace {
52-
enabled = true
53-
//file = "${params.output}/execution_trace.txt"
54-
}
55-
dag {
56-
enabled = true
57-
//file = "${params.output}/pipeline_dag.svg"
58-
}
59-
60-
VERSION = '1.3.1'
49+
VERSION = '1.4.0'
6150
DOI = 'https://zenodo.org/badge/latestdoi/358400957'
6251

6352
manifest {
6453
name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
65-
author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
54+
author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
6655
homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
6756
description = 'Picard and GATK BAM preprocessing pipeline'
6857
mainScript = 'main.nf'
@@ -99,6 +88,7 @@ Optional input:
9988
* --skip_bqsr: optionally skip BQSR (default: false)
10089
* --skip_realignment: optionally skip realignment (default: false)
10190
* --skip_deduplication: optionally skip deduplication (default: false)
91+
* --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
10292
* --skip_metrics: optionally skip metrics (default: false)
10393
* --output: the folder where to publish output (default: ./output)
10494
* --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)

0 commit comments

Comments
 (0)