Merge branch 'develop' into 'master'

Pablo Riesgo Ferreiro · Pablo Riesgo Ferreiro · commit 8ce785f358be · 2021-05-26T11:10:25.000Z
Release v1.4.0

See merge request tron/tron-bam-preprocessing!16
diff --git a/Makefile b/Makefile
@@ -15,4 +15,4 @@ test:
 	nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
 	nextflow main.nf -profile test,conda --output output/test6 --intervals false
 	nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
-	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
+	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ GATK has been providing a well known best practices document on BAM preprocessin
 
 We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different situations. For this purpose there are some required steps and some optional steps. This is implemented as a Nextflow pipeline to simplify parallelization of execution in the cluster. The default configuration uses reference genome hg19, if another reference is needed the adequate resources must be provided. The reference genome resources  for hg19 were downloaded from https://software.broadinstitute.org/gatk/download/bundle
 
-The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs.
+The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs. 
 
 ## Implementation
 
@@ -74,6 +74,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
diff --git a/environment.yml b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 #   conda env create -f environment.yml
-name: tronflow-bam-preprocessing-1.3.1
+name: tronflow-bam-preprocessing-1.4.0
 channels:
   - conda-forge
   - bioconda
diff --git a/main.nf b/main.nf
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
 params.skip_bqsr = false
 params.skip_realignment = false
 params.skip_deduplication = false
+params.remove_duplicates = true
 params.skip_metrics = false
 params.output = false
 params.platform = "ILLUMINA"
@@ -30,6 +31,8 @@ params.bqsr_cpus = 3
 params.bqsr_memory = "4g"
 params.metrics_cpus = 1
 params.metrics_memory = "8g"
+params.index_cpus = 1
+params.index_memory = "8g"
 
 
 
@@ -83,9 +86,10 @@ process prepareBam {
     output:
       set val(name),
         val("${bam.baseName}"),
-        val(type), file("${bam.baseName}.prepared.bam"),
-        file("${bam.baseName}.prepared.bai")  into prepared_bams
+        val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
 
+    script:
+    order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
     """
     mkdir tmp
 
@@ -109,8 +113,7 @@ process prepareBam {
     --RGSM ${type} \
     --RGLB 1 \
     --RGPL ${params.platform} \
-    --SORT_ORDER coordinate \
-    --CREATE_INDEX true
+    ${order}
     """
 }
 
@@ -126,7 +129,7 @@ if (!params.skip_deduplication) {
 	    publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
 
 	    input:
-	    	set name, bam_name, type, file(bam), file(bai) from prepared_bams
+	    	set name, bam_name, type, file(bam) from prepared_bams
 
 	    output:
 	    	set val(name), val(bam_name), val(type),
@@ -136,6 +139,7 @@ if (!params.skip_deduplication) {
 
         script:
         dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
+        remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
 	    """
 	    mkdir tmp
 
@@ -144,12 +148,34 @@ if (!params.skip_deduplication) {
         --input  ${bam} \
         --output ${bam.baseName}.dedup.bam \
         --conf 'spark.executor.cores=${task.cpus}' \
+        ${remove_duplicates} \
         ${dedup_metrics}
 	    """
 	}
 }
 else {
-    prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
+    process indexBam {
+	    cpus "${params.index_cpus}"
+        memory "${params.index_memory}"
+	    tag "${name}"
+
+	    input:
+	    	set name, bam_name, type, file(bam) from prepared_bams
+
+	    output:
+	    	set val(name), val(bam_name), val(type),
+	    	    file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
+	    	    deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics
+
+        script:
+	    """
+	    mkdir tmp
+
+        gatk BuildBamIndex \
+        --java-options '-Xmx8g  -Djava.io.tmpdir=tmp' \
+        --INPUT  ${bam}
+	    """
+	}
 }
 
 if (! params.skip_metrics) {
diff --git a/nextflow.config b/nextflow.config
@@ -23,10 +23,16 @@ profiles {
     params.bqsr_memory = "3g"
     params.metrics_cpus = 1
     params.metrics_memory = "3g"
+    params.index_cpus = 1
+    params.index_memory = "3g"
     params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
     params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
     params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
     params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
+    timeline.enabled = false
+    report.enabled = false
+    trace.enabled = false
+    dag.enabled = false
   }
 }
 
@@ -40,29 +46,12 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
 
 cleanup = true
 
-timeline {
-  enabled = true
-  //file = "${params.output}/execution_timeline.html"
-}
-report {
-  enabled = true
-  //file = "${params.output}/execution_report.html"
-}
-trace {
-  enabled = true
-  //file = "${params.output}/execution_trace.txt"
-}
-dag {
-  enabled = true
-  //file = "${params.output}/pipeline_dag.svg"
-}
-
-VERSION = '1.3.1'
+VERSION = '1.4.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
-  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
+  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
   homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
   description = 'Picard and GATK BAM preprocessing pipeline'
   mainScript = 'main.nf'
@@ -99,6 +88,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)