kundajelab
diff --git a/‎README.md‎
Lines changed: 116 additions & 1 deletion b/‎README.md‎
Lines changed: 116 additions & 1 deletion
diff --git a/‎chipseq.bds‎
Lines changed: 28 additions & 2 deletions b/‎chipseq.bds‎
Lines changed: 28 additions & 2 deletions
diff --git a/‎etc/broadPeak.as‎
Lines changed: 13 additions & 0 deletions b/‎etc/broadPeak.as‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎etc/gappedPeak.as‎
Lines changed: 19 additions & 0 deletions b/‎etc/gappedPeak.as‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎etc/narrowPeak.as‎
Lines changed: 14 additions & 0 deletions b/‎etc/narrowPeak.as‎
Lines changed: 14 additions & 0 deletions
@@ -361,7 +361,122 @@ There are two kinds of HTML reports provided by the pipeline.
 
 If you stop a BDS pipeline with `Ctrl+C` while calling peaks with `spp`. Temporary files generated by `Rscript` are not removed and they are still on `$TMP` (or `/tmp` if not explicitly exported). You need to manually remove them.
 
-## Programming with BDS
+
+# Output directory structure and file naming
+
+For more details, refer to the file table section in an HTML report generated by the pipeline.
+
+```
+out                               # root dir. of outputs
+│
+├ *report.html                    #  HTML report
+├ *tracks.json                    #  Tracks datahub (JSON) for WashU browser
+├ ENCODE_summary.json             #  Metadata of all datafiles and QC results
+│
+├ align                           #  mapped alignments
+│ ├ rep1                          #   for true replicate 1 
+│ │ ├ *.bam                       #    raw bam
+│ │ ├ *.nodup.bam                 #    filtered and deduped bam
+│ │ ├ *.tagAlign.gz               #    tagAlign (bed6) generated from filtered bam
+│ │ ├ *.*M.tagAlign.gz            #    subsampled tagAlign for cross-corr. analysis
+│ ├ rep2                          #   for true replicate 2
+│ ...
+│ ├ ctl1                          #   for control 1
+│ ...
+│ ├ pooled_rep                    #   for pooled replicate
+│ ├ pseudo_reps                   #   for self pseudo replicates
+│ │ ├ rep1                        #    for replicate 1
+│ │ │ ├ pr1                       #     for self pseudo replicate 1 of replicate 1
+│ │ │ └ pr2                       #     for self pseudo replicate 2 of replicate 1
+│ │ ├ rep2                        #    for replicate 2
+│ │ ...                           
+│ └ pooled_pseudo_reps            #   for pooled pseudo replicates
+│   ├ ppr1                        #    for pooled pseudo replicate 1 (rep1-pr1 + rep2-pr1 + ...)
+│   └ ppr2                        #    for pooled pseudo replicate 2 (rep1-pr2 + rep2-pr2 + ...)
+│
+├ peak                            #  peaks called
+│ ├ macs2                         #   peaks generated by MACS2
+│ │ ├ rep1                        #    for replicate 1
+│ │ │ ├ *.narrowPeak.gz           #     narrowPeak
+│ │ │ ├ *.gappedPeak.gz           #     gappedPeak
+│ │ │ ├ *.filt.narrowPeak.gz      #     blacklist filtered narrowPeak 
+│ │ │ ├ *.filt.gappedPeak.gz      #     blacklist filtered gappedPeak
+│ │ ├ rep2                        #    for replicate 2
+│ │ ...
+│ │ ├ pseudo_reps                 #   for self pseudo replicates
+│ │ └ pooled_pseudo_reps          #   for pooled pseudo replicates
+│ │
+│ ├ spp                           #   peaks generated by SPP
+│ │ ├ rep1                        #    for replicate 1
+│ │ │ ├ *.regionPeak.gz           #     regionPeak (narrowPeak format) generated from SPP
+│ │ │ ├ *.filt.regionPeak.gz      #     blacklist filtered regionPeak
+│ │ ├ rep2                        #    for replicate 2
+│ │ ...
+│ │ ├ pseudo_reps                 #   for self pseudo replicates
+│ │ └ pooled_pseudo_reps          #   for pooled pseudo replicates
+│ │
+│ └ idr                           #   IDR thresholded peaks (using peaks from SPP)
+│   ├ true_reps                   #    for true replicates
+│   │ ├ *.narrowPeak.gz           #     IDR thresholded narrowPeak
+│   │ ├ *.filt.narrowPeak.gz      #     IDR thresholded narrowPeak (blacklist filtered)
+│   │ └ *.12-col.bed.gz           #     IDR thresholded narrowPeak track for WashU browser
+│   ├ pseudo_reps                 #    for self pseudo replicates
+│   │ ├ rep1                      #    for replicate 1
+│   │ ...
+│   ├ optimal_set                 #    optimal IDR thresholded peaks
+│   │ └ *.filt.narrowPeak.gz      #     IDR thresholded narrowPeak (blacklist filtered)
+│   ├ conservative_set            #    conservative IDR thresholded peaks
+│   │ └ *.filt.narrowPeak.gz      #     IDR thresholded narrowPeak (blacklist filtered)
+│   ├ pseudo_reps                 #    for self pseudo replicates
+│   └ pooled_pseudo_reps          #    for pooled pseudo replicate
+│
+├ qc                              #  QC logs
+│ ├ *IDR_final.qc                 #   Final IDR QC
+│ ├ rep1                          #   for true replicate 1
+│ │ ├ *.flagstat.qc               #    Flagstat QC for raw bam
+│ │ ├ *.dup.qc                    #    Picard (or sambamba) MarkDuplicate QC log
+│ │ ├ *.pbc.qc                    #    PBC QC
+│ │ ├ *.nodup.flagstat.qc         #    Flagstat QC for filtered bam
+│ │ ├ *M.cc.qc                    #    Cross-correlation analysis score for tagAlign
+│ │ └ *M.cc.plot.pdf/png          #    Cross-correlation analysis plot for tagAlign
+│ ...
+│
+├ signal                          #  signal tracks
+│ ├ macs2                         #   signal tracks generated by MACS2
+│ │ ├ rep1                        #    for true replicate 1 
+│ │ │ ├ *.pval.signal.bigwig (E)  #     signal track for p-val
+│ │ │ └ *.fc.signal.bigwig   (E)  #     signal track for fold change
+│ ...
+│ └ pooled_rep                    #   for pooled replicate
+│ 
+└ report                          # files for HTML report
+```
+
+## QC metrics spreadsheet (TSV) generation
+
+For each pipeline run, an `ENCODE_summary.json` file is generated under the output directory (`-out_dir`). This JSON file includes all metadata and QC metrics.
+
+`./utils/parse_summary_qc_recursively.py` recursively finds `ENCODE_summary.json` files and parses them to generate one big TSV spreadsheet of QC metrics.
+
+```
+$ python parse_summary_qc_recursively.py -h
+usage: ENCODE_summary.json parser for QC [-h] [--out-file OUT_FILE]
+                                         [--search-dir SEARCH_DIR]
+                                         [--json-file JSON_FILE]
+
+Recursively find ENCODE_summary.json, parse it and make a TSV spreadsheet of
+QC metrics.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --out-file OUT_FILE   Output TSV filename)
+  --search-dir SEARCH_DIR
+                        Root directory to search for ENCODE_summary.json
+  --json-file JSON_FILE
+                        Specify json file name to be parsed
+```
+
+# Programming with BDS
 
 * [Using genomic pipeline modules in Kundaje lab](https://kundajelab.github.io/bds_pipeline_modules/programming.html)
 
 
@@ -35,6 +35,7 @@ include "modules/callpeak_spp.bds"
 include "modules/callpeak_macs2.bds"
 include "modules/callpeak_naive_overlap.bds"
 include "modules/callpeak_idr.bds"
+include "modules/callpeak_blacklist_filter.bds"
 
 include "modules/signal.bds"
 
@@ -89,6 +90,9 @@ void main() { // chipseq pipeline starts here
 
 	create_sig_trk() // (BETA)
 
+	// blacklist-filter peaks
+	filter_peak() 
+
 	report()
 }
 
@@ -1237,6 +1241,27 @@ void create_sig_trk( int rep ) {
 	}
 }
 
+// Blacklist-filter narrowPeak files of the selected peak caller: the pooled peak (if any) and each replicate's peak.
+void filter_peak() {
+	// peak caller name; used as the key (or key prefix) into the peak_pooled / peak maps
+	pc := get_peak_caller()
+	// pooled peak: only attempted with more than one replicate and when an entry exists for this caller
+	if ( get_num_rep() > 1 && peak_pooled.hasKey(pc) ) {
+		filt_peak_pooled := \
+			blacklist_filter_peak( "narrowPeak", peak_pooled{pc}, (peak_pooled{pc}).dirName(), "peak_pooled" )
+	}
+
+	// per-replicate peaks, keyed as "<peak caller>,<rep>"; the filtered file goes to the input peak's directory
+	for (int rep=1; rep<=get_num_rep(); rep++) {
+		if ( !peak.hasKey(pc+",$rep") ) continue
+		filt_peak := \
+			blacklist_filter_peak( "narrowPeak", peak{pc+",$rep"}, (peak{pc+",$rep"}).dirName(), "peak $rep" )
+	}
+	// NOTE(review): presumably blocks until the filtering work started above has finished -- confirm against blacklist_filter_peak()
+	wait
+	print( "\n== Done filter_peak()\n" )
+}
+
 void report() {
 
 	wait
@@ -1248,6 +1273,7 @@ void report() {
 	html += html_chipseq_QC()	// show QC tables and images
 
 	report( html )
+	write_summary_json()
 
 	print( "\n== Done report()\n" )		
 }
@@ -1292,9 +1318,9 @@ string html_chipseq_QC() {
 
 	html := "<div id='chipseq_qc'>"
 
-	html += parse_flagstat_to_html( "all", 	flagstat_headers, flagstat_qcs, flagstat_headers )
+	html += parse_flagstat_to_html( "all", 	flagstat_headers, flagstat_qcs, flagstat_headers, false )
 	html += parse_dup_to_html( "all", 		dup_headers, dup_qcs, dup_headers )
-	html += parse_flagstat_to_html( "all, filtered",flagstat_nodup_headers, flagstat_nodup_qcs, flagstat_nodup_headers )
+	html += parse_flagstat_to_html( "all, filtered",flagstat_nodup_headers, flagstat_nodup_qcs, flagstat_nodup_headers, true )
 	html += parse_pbc_to_html( "all", 		pbc_headers, pbc_qcs, pbc_headers )
 	html += parse_xcor_to_html( "all", 		xcor_headers, xcor_qcs, xcor_plots, xcor_headers )
 
 
@@ -0,0 +1,13 @@
+table broadPeak
+"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
+(
+    string chrom;        "Reference sequence chromosome or scaffold"
+    uint   chromStart;   "Start position in chromosome"
+    uint   chromEnd;     "End position in chromosome"
+    string name;	 "Name given to a region (preferably unique). Use . if no name is assigned."
+    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000)"
+    char[1]   strand;     "+ or - or . for unknown"
+    float  signalValue;  "Measurement of average enrichment for the region"
+    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
+    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
+)
@@ -0,0 +1,19 @@
+table gappedPeak
+"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
+    (
+    string chrom;	"Reference sequence chromosome or scaffold"
+    uint chromStart;	"Pseudogene alignment start position"
+    uint chromEnd;      "Pseudogene alignment end position"
+    string name;        "Name of pseudogene"
+    uint score;          "Score of pseudogene with gene (0-1000)"
+    char[1] strand;     "+ or - or . for unknown"
+    uint thickStart;    "Start of where display should be thick (start codon)"
+    uint thickEnd;      "End of where display should be thick (stop codon)"
+    uint reserved;      "Always zero for now"
+    int blockCount;     "Number of blocks"
+    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+    float  signalValue;  "Measurement of average enrichment for the region"
+    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
+    float  qValue;       "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
+)
@@ -0,0 +1,14 @@
+table narrowPeak
+"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
+(
+    string chrom;        "Reference sequence chromosome or scaffold"
+    uint   chromStart;   "Start position in chromosome"
+    uint   chromEnd;     "End position in chromosome"
+    string name;	 "Name given to a region (preferably unique). Use . if no name is assigned"
+    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000) "
+    char[1]  strand;     "+ or - or . for unknown"
+    float  signalValue;  "Measurement of average enrichment for the region"
+    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
+    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
+    int   peak;         "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
+)