Skip to content

Commit 8ce785f

Browse files
author
Pablo Riesgo Ferreiro
committed
Merge branch 'develop' into 'master'
Release v1.4.0 See merge request tron/tron-bam-preprocessing!16
2 parents 6c2b617 + 12230be commit 8ce785f

File tree

5 files changed

+45
-28
lines changed

5 files changed

+45
-28
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ test:
1515
nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
1616
nextflow main.nf -profile test,conda --output output/test6 --intervals false
1717
nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
18-
nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
18+
nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ GATK has been providing a well known best practices document on BAM preprocessin
1616

1717
We aim to provide a single implementation of the BAM preprocessing pipeline that can be used across different situations. For this purpose there are some required steps and some optional steps. This is implemented as a Nextflow pipeline to simplify parallelization of execution in the cluster. The default configuration uses reference genome hg19; if another reference is needed, the appropriate resources must be provided. The reference genome resources for hg19 were downloaded from https://software.broadinstitute.org/gatk/download/bundle
1818

19-
The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs.
19+
The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs.
2020

2121
## Implementation
2222

@@ -74,6 +74,7 @@ Optional input:
7474
* --skip_bqsr: optionally skip BQSR (default: false)
7575
* --skip_realignment: optionally skip realignment (default: false)
7676
* --skip_deduplication: optionally skip deduplication (default: false)
77+
* --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
7778
* --skip_metrics: optionally skip metrics (default: false)
7879
* --output: the folder where to publish output (default: ./output)
7980
* --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# You can use this file to create a conda environment for this pipeline:
22
# conda env create -f environment.yml
3-
name: tronflow-bam-preprocessing-1.3.1
3+
name: tronflow-bam-preprocessing-1.4.0
44
channels:
55
- conda-forge
66
- bioconda

main.nf

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
1313
params.skip_bqsr = false
1414
params.skip_realignment = false
1515
params.skip_deduplication = false
16+
params.remove_duplicates = true
1617
params.skip_metrics = false
1718
params.output = false
1819
params.platform = "ILLUMINA"
@@ -30,6 +31,8 @@ params.bqsr_cpus = 3
3031
params.bqsr_memory = "4g"
3132
params.metrics_cpus = 1
3233
params.metrics_memory = "8g"
34+
params.index_cpus = 1
35+
params.index_memory = "8g"
3336

3437

3538

@@ -83,9 +86,10 @@ process prepareBam {
8386
output:
8487
set val(name),
8588
val("${bam.baseName}"),
86-
val(type), file("${bam.baseName}.prepared.bam"),
87-
file("${bam.baseName}.prepared.bai") into prepared_bams
89+
val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
8890

91+
script:
92+
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
8993
"""
9094
mkdir tmp
9195
@@ -109,8 +113,7 @@ process prepareBam {
109113
--RGSM ${type} \
110114
--RGLB 1 \
111115
--RGPL ${params.platform} \
112-
--SORT_ORDER coordinate \
113-
--CREATE_INDEX true
116+
${order}
114117
"""
115118
}
116119

@@ -126,7 +129,7 @@ if (!params.skip_deduplication) {
126129
publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
127130

128131
input:
129-
set name, bam_name, type, file(bam), file(bai) from prepared_bams
132+
set name, bam_name, type, file(bam) from prepared_bams
130133

131134
output:
132135
set val(name), val(bam_name), val(type),
@@ -136,6 +139,7 @@ if (!params.skip_deduplication) {
136139

137140
script:
138141
dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
142+
remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
139143
"""
140144
mkdir tmp
141145
@@ -144,12 +148,34 @@ if (!params.skip_deduplication) {
144148
--input ${bam} \
145149
--output ${bam.baseName}.dedup.bam \
146150
--conf 'spark.executor.cores=${task.cpus}' \
151+
${remove_duplicates} \
147152
${dedup_metrics}
148153
"""
149154
}
150155
}
151156
else {
152-
prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
157+
process indexBam {
    // Builds a BAM index with GATK BuildBamIndex for BAMs coming out of
    // prepareBam when deduplication is skipped, and emits the BAM together with
    // its index on the three channels consumed downstream.
    cpus "${params.index_cpus}"
    memory "${params.index_memory}"
    tag "${name}"

    input:
    set name, bam_name, type, file(bam) from prepared_bams

    output:
    set val(name), val(bam_name), val(type),
        file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
        deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics

    script:
    // The JVM heap follows params.index_memory instead of a fixed 8g, so that
    // lowering the memory directive (e.g. "3g" in the test profile) also lowers
    // the heap the JVM is allowed to claim.
    // NOTE(review): assumes params.index_memory uses a suffix valid for -Xmx
    // (e.g. "8g"); confirm if values such as "8 GB" are ever passed.
    """
    mkdir tmp

    gatk BuildBamIndex \
    --java-options '-Xmx${params.index_memory} -Djava.io.tmpdir=tmp' \
    --INPUT ${bam}
    """
}
153179
}
154180

155181
if (! params.skip_metrics) {

nextflow.config

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,16 @@ profiles {
2323
params.bqsr_memory = "3g"
2424
params.metrics_cpus = 1
2525
params.metrics_memory = "3g"
26+
params.index_cpus = 1
27+
params.index_memory = "3g"
2628
params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
2729
params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
2830
params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
2931
params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
32+
timeline.enabled = false
33+
report.enabled = false
34+
trace.enabled = false
35+
dag.enabled = false
3036
}
3137
}
3238

@@ -40,29 +46,12 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
4046

4147
cleanup = true
4248

43-
timeline {
44-
enabled = true
45-
//file = "${params.output}/execution_timeline.html"
46-
}
47-
report {
48-
enabled = true
49-
//file = "${params.output}/execution_report.html"
50-
}
51-
trace {
52-
enabled = true
53-
//file = "${params.output}/execution_trace.txt"
54-
}
55-
dag {
56-
enabled = true
57-
//file = "${params.output}/pipeline_dag.svg"
58-
}
59-
60-
VERSION = '1.3.1'
49+
VERSION = '1.4.0'
6150
DOI = 'https://zenodo.org/badge/latestdoi/358400957'
6251

6352
manifest {
6453
name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
65-
author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
54+
author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
6655
homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
6756
description = 'Picard and GATK BAM preprocessing pipeline'
6857
mainScript = 'main.nf'
@@ -99,6 +88,7 @@ Optional input:
9988
* --skip_bqsr: optionally skip BQSR (default: false)
10089
* --skip_realignment: optionally skip realignment (default: false)
10190
* --skip_deduplication: optionally skip deduplication (default: false)
91+
* --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
10292
* --skip_metrics: optionally skip metrics (default: false)
10393
* --output: the folder where to publish output (default: ./output)
10494
* --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)

0 commit comments

Comments
 (0)