Merge branch 'improve-mark-dupliocates-performance' into 'develop'

Pablo Riesgo Ferreiro · Pablo Riesgo Ferreiro · commit 12230be29b40 · 2021-05-26T10:35:16.000Z
Improve mark duplicates performance

See merge request tron/tron-bam-preprocessing!15
diff --git a/Makefile b/Makefile
@@ -15,4 +15,4 @@ test:
 	nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
 	nextflow main.nf -profile test,conda --output output/test6 --intervals false
 	nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
-	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
+	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false
diff --git a/README.md b/README.md
@@ -74,6 +74,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
diff --git a/environment.yml b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 #   conda env create -f environment.yml
-name: tronflow-bam-preprocessing-1.3.1
+name: tronflow-bam-preprocessing-1.4.0
 channels:
   - conda-forge
   - bioconda
diff --git a/main.nf b/main.nf
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
 params.skip_bqsr = false
 params.skip_realignment = false
 params.skip_deduplication = false
+params.remove_duplicates = true
 params.skip_metrics = false
 params.output = false
 params.platform = "ILLUMINA"
@@ -30,6 +31,8 @@ params.bqsr_cpus = 3
 params.bqsr_memory = "4g"
 params.metrics_cpus = 1
 params.metrics_memory = "8g"
+params.index_cpus = 1
+params.index_memory = "8g"
 
 
 
@@ -83,9 +86,10 @@ process prepareBam {
     output:
       set val(name),
         val("${bam.baseName}"),
-        val(type), file("${bam.baseName}.prepared.bam"),
-        file("${bam.baseName}.prepared.bai")  into prepared_bams
+        val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
 
+    script:
+    order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
     """
     mkdir tmp
 
@@ -109,8 +113,7 @@ process prepareBam {
     --RGSM ${type} \
     --RGLB 1 \
     --RGPL ${params.platform} \
-    --SORT_ORDER coordinate \
-    --CREATE_INDEX true
+    ${order}
     """
 }
 
@@ -126,7 +129,7 @@ if (!params.skip_deduplication) {
 	    publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
 
 	    input:
-	    	set name, bam_name, type, file(bam), file(bai) from prepared_bams
+	    	set name, bam_name, type, file(bam) from prepared_bams
 
 	    output:
 	    	set val(name), val(bam_name), val(type),
@@ -136,6 +139,7 @@ if (!params.skip_deduplication) {
 
         script:
         dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
+        remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
 	    """
 	    mkdir tmp
 
@@ -144,12 +148,34 @@ if (!params.skip_deduplication) {
         --input  ${bam} \
         --output ${bam.baseName}.dedup.bam \
         --conf 'spark.executor.cores=${task.cpus}' \
+        ${remove_duplicates} \
         ${dedup_metrics}
 	    """
 	}
 }
 else {
-    prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
+    process indexBam {  // builds the BAM index when deduplication is skipped
+	    cpus "${params.index_cpus}"
+        memory "${params.index_memory}"
+	    tag "${name}"
+        // input: prepared BAM tuples emitted by prepareBam (no index file is produced there)
+	    input:
+	    	set name, bam_name, type, file(bam) from prepared_bams
+        // output: the unchanged BAM plus its new .bai, fanned out to the three downstream channels
+	    output:
+	    	set val(name), val(bam_name), val(type),
+	    	    file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
+	    	    deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics
+        // the JVM heap follows params.index_memory so it never exceeds the memory requested above
+        script:
+	    """
+	    mkdir tmp
+
+        gatk BuildBamIndex \
+        --java-options '-Xmx${params.index_memory}  -Djava.io.tmpdir=tmp' \
+        --INPUT  ${bam}
+	    """
+	}
 }
 
 if (! params.skip_metrics) {
diff --git a/nextflow.config b/nextflow.config
@@ -23,10 +23,16 @@ profiles {
     params.bqsr_memory = "3g"
     params.metrics_cpus = 1
     params.metrics_memory = "3g"
+    params.index_cpus = 1
+    params.index_memory = "3g"
     params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
     params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
     params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
     params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
+    timeline.enabled = false
+    report.enabled = false
+    trace.enabled = false
+    dag.enabled = false
   }
 }
 
@@ -40,29 +46,12 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
 
 cleanup = true
 
-timeline {
-  enabled = true
-  //file = "${params.output}/execution_timeline.html"
-}
-report {
-  enabled = true
-  //file = "${params.output}/execution_report.html"
-}
-trace {
-  enabled = true
-  //file = "${params.output}/execution_trace.txt"
-}
-dag {
-  enabled = true
-  //file = "${params.output}/pipeline_dag.svg"
-}
-
-VERSION = '1.3.1'
+VERSION = '1.4.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
-  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
+  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
   homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
   description = 'Picard and GATK BAM preprocessing pipeline'
   mainScript = 'main.nf'
@@ -99,6 +88,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)