Merge pull request #11 from TRON-Bioinformatics/create-indices

priesgo · web-flow · commit 266c18bda083 · 2022-10-19T14:09:42.000+02:00
Create indices
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,6 @@ report.html*
 timeline.html*
 trace.txt*
 dag.dot*
-*.swp
+*.swp
+/test_data/ucsc.hg19.minimal.without_indices.dict
+/test_data/ucsc.hg19.minimal.without_indices.fasta.fai
diff --git a/Makefile b/Makefile
@@ -17,3 +17,4 @@ test:
 	bash tests/test_08.sh
 	bash tests/test_09.sh
 	bash tests/test_10.sh
+	bash tests/test_11.sh
diff --git a/main.nf b/main.nf
@@ -7,6 +7,7 @@ include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplica
 include { METRICS; HS_METRICS; COVERAGE_ANALYSIS; FLAGSTAT } from './modules/03_metrics'
 include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels'
 include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr'
+include { CREATE_FAIDX; CREATE_DICT } from './modules/00_reference_indices'
 
 params.help= false
 params.input_files = false
@@ -82,10 +83,36 @@ else if (params.input_files) {
     .set { input_files }
 }
 
+workflow CHECK_REFERENCE {
+    take:
+        reference
+
+    main:
+        // Validate the FASTA, then launch index creation for a missing .fai/.dict. NOTE(review): the emitted value is the raw input, so nothing here makes downstream steps wait for the indices - confirm.
+        reference_file = file(reference)
+        if (reference_file.isEmpty()) {
+            log.error "--reference points to a non existing file"
+            exit 1
+        }
+        faidx = file("${reference}.fai")
+        if (faidx.isEmpty()) {
+            CREATE_FAIDX(reference)
+        }
+        dict =  file("${reference_file.getParent() }/${reference_file.baseName }*.dict")
+        if (dict.isEmpty()) {
+            CREATE_DICT(reference)
+        }
+
+    emit:
+        checked_reference = reference
+}
+
 
 workflow {
 
-    PREPARE_BAM(input_files, params.reference)
+    CHECK_REFERENCE(params.reference)
+
+    PREPARE_BAM(input_files, CHECK_REFERENCE.out.checked_reference)
 
     if (!params.skip_deduplication) {
         MARK_DUPLICATES(PREPARE_BAM.out.prepared_bams)
@@ -97,29 +124,29 @@ workflow {
     }
 
     if (params.split_cigarn) {
-        SPLIT_CIGAR_N_READS(deduplicated_bams, params.reference)
+        SPLIT_CIGAR_N_READS(deduplicated_bams, CHECK_REFERENCE.out.checked_reference)
         deduplicated_bams = SPLIT_CIGAR_N_READS.out.split_cigarn_bams
     }
 
     if (! params.skip_metrics) {
         if (params.intervals) {
             HS_METRICS(deduplicated_bams)
         }
-        METRICS(deduplicated_bams, params.reference)
+        METRICS(deduplicated_bams, CHECK_REFERENCE.out.checked_reference)
         COVERAGE_ANALYSIS(deduplicated_bams)
         FLAGSTAT(deduplicated_bams)
     }
 
     if (!params.skip_realignment) {
-        REALIGNMENT_AROUND_INDELS(deduplicated_bams, params.reference)
+        REALIGNMENT_AROUND_INDELS(deduplicated_bams, CHECK_REFERENCE.out.checked_reference)
         realigned_bams = REALIGNMENT_AROUND_INDELS.out.realigned_bams
     }
     else {
         realigned_bams = deduplicated_bams
     }
 
     if (!params.skip_bqsr) {
-        BQSR(realigned_bams, params.reference)
+        BQSR(realigned_bams, CHECK_REFERENCE.out.checked_reference)
         preprocessed_bams = BQSR.out.recalibrated_bams
     }
     else {
diff --git a/modules/00_reference_indices.nf b/modules/00_reference_indices.nf
@@ -0,0 +1,30 @@
+
+process CREATE_FAIDX {
+    // Runs `samtools faidx` on the reference FASTA; the path is received as a plain value and no outputs are declared.
+    cpus "1"
+    memory "4g"
+    tag "${reference}"  // the only declared input; the previous `name` variable is not defined in this process
+    conda (params.enable_conda ? "bioconda::samtools=1.12" : null)
+
+    input:
+    val(reference)
+
+    """
+    samtools faidx ${reference}
+    """
+}
+
+process CREATE_DICT {
+    // Runs `gatk CreateSequenceDictionary` on the reference FASTA; the path is received as a plain value and no outputs are declared.
+    cpus "1"
+    memory "4g"
+    tag "${reference}"  // the only declared input; the previous `name` variable is not defined in this process
+    conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
+
+    input:
+    val(reference)
+
+    """
+    gatk CreateSequenceDictionary --REFERENCE ${reference}
+    """
+}
diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf
@@ -1,5 +1,7 @@
 params.mark_duplicates_cpus = 2
 params.mark_duplicates_memory = "16g"
+params.split_reads_cpus = 2
+params.split_reads_memory = "4g"
 params.remove_duplicates = true
 params.output = 'output'
 
@@ -50,8 +52,8 @@ process MARK_DUPLICATES {
 }
 
 process SPLIT_CIGAR_N_READS {
-    cpus "${params.prepare_bam_cpus}"
-    memory "${params.prepare_bam_memory}"
+    cpus "${params.split_reads_cpus}"
+    memory "${params.split_reads_memory}"
     tag "${name}"
     publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
 
@@ -70,7 +72,7 @@ process SPLIT_CIGAR_N_READS {
     mkdir tmp
 
     gatk SplitNCigarReads \
-    --java-options '-Xmx${params.prepare_bam_memory}  -Djava.io.tmpdir=./tmp' \
+    --java-options '-Xmx${params.split_reads_memory}  -Djava.io.tmpdir=./tmp' \
     --input ${bam} \
     --output ${name}.split_cigarn.bam \
     --create-output-bam-index true \
diff --git a/nextflow.config b/nextflow.config
@@ -44,7 +44,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
 
 cleanup = true
 
-VERSION = '2.0.1'
+VERSION = '2.1.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 
 manifest {
diff --git a/test_data/ucsc.hg19.minimal.without_indices.fasta b/test_data/ucsc.hg19.minimal.without_indices.fasta
diff --git a/tests/test_11.sh b/tests/test_11.sh