diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 51a3a665..b1f35bf0 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -15,9 +15,9 @@ experiment: resources: ref: species: homo_sapiens - release: "114" + release: "115" build: GRCh38 - pfam: "37.0" + pfam: "37.1" representative_transcripts: canonical ontology: gene_ontology: "https://release.geneontology.org/2025-07-22/ontology/go-basic.obo" @@ -44,7 +44,9 @@ diffexp: qq-plot: 0.05 genes_of_interest: activate: true - genelist: "resources/gene_list.tsv" + gene_lists: + gene_list_1: "resources/gene_list.tsv" + gene_list_2: "resources/gene_list_2.tsv" diffsplice: activate: false @@ -81,7 +83,7 @@ report: offer_excel: true bootstrap_plots: - FDR: 0.01 + FDR: 0.5 # Intentionally high for testing. top_n: 3 color_by: condition diff --git a/.test/resources/gene_list.tsv b/.test/resources/gene_list.tsv index ad43e18d..4de8c11f 100644 --- a/.test/resources/gene_list.tsv +++ b/.test/resources/gene_list.tsv @@ -8,11 +8,4 @@ DCT MLANA MITF CDK2 -SOX10 -ERBB3 -LEF1 -CTNNB1 -CDH1 -FN1 -NGFR -AXL +SOX10 \ No newline at end of file diff --git a/.test/resources/gene_list_2.tsv b/.test/resources/gene_list_2.tsv new file mode 100644 index 00000000..f14c2735 --- /dev/null +++ b/.test/resources/gene_list_2.tsv @@ -0,0 +1,11 @@ +MLANA +MITF +CDK2 +SOX10 +ERBB3 +LEF1 +CTNNB1 +CDH1 +FN1 +NGFR +AXL \ No newline at end of file diff --git a/.test/three_prime/config/config.yaml b/.test/three_prime/config/config.yaml index 08d60770..2b70f148 100644 --- a/.test/three_prime/config/config.yaml +++ b/.test/three_prime/config/config.yaml @@ -32,6 +32,7 @@ scatter: diffexp: exclude: + - SRR8309099_2 models: model_X: full: ~condition @@ -44,7 +45,8 @@ diffexp: qq-plot: 0.05 genes_of_interest: activate: false - genelist: "resources/gene_list.tsv" + gene_lists: + gene_list_1: "resources/gene_list.tsv" diffsplice: activate: false diff --git a/.test/three_prime/config/samples.tsv b/.test/three_prime/config/samples.tsv index db857bfc..82fabe49 100644 --- a/.test/three_prime/config/samples.tsv +++ b/.test/three_prime/config/samples.tsv @@ -4,4 +4,5 @@ SRR8309094 Control SRR8309095 Treated SRR8309097 Treated SRR8309098 Control -SRR8309099 Treated \ No newline at end of file +SRR8309099 Treated +SRR8309099_2 Treated \ No newline at end of file diff --git a/.test/three_prime/config/units.tsv b/.test/three_prime/config/units.tsv index a9c35146..1f069afc 100644 --- a/.test/three_prime/config/units.tsv +++ b/.test/three_prime/config/units.tsv @@ -4,4 +4,5 @@ SRR8309094 u1 430 43 quant_seq_test_data/SRR8309094.fastq.gz SRR8309095 u1 430 43 quant_seq_test_data/SRR8309095.fastq.gz SRR8309097 u1 430 43 quant_seq_test_data/SRR8309097.fastq.gz SRR8309098 u1 430 43 quant_seq_test_data/SRR8309098.fastq.gz -SRR8309099 u1 430 43 quant_seq_test_data/SRR8309099.fastq.gz \ No newline at end of file +SRR8309099 u1 430 43 quant_seq_test_data/SRR8309099.fastq.gz +SRR8309099_2 u1 430 43 quant_seq_test_data/SRR8309099.fastq.gz \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml index 2aba6891..b63cad94 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,27 +24,30 @@ resources: # For a quick check, see the Ensembl species list: # https://www.ensembl.org/info/about/species.html # For full valid species names, consult the respective table for the release - # you specify, for example for ‘114’ this is at: - # https://ftp.ensembl.org/pub/release-114/species_EnsemblVertebrates.txt + # you specify, for example for ‘115’ this is at: + # https://ftp.ensembl.org/pub/release-115/species_EnsemblVertebrates.txt # And to browse available downloads in more detail, see the FTP server: # https://ftp.ensembl.org/pub/ species: homo_sapiens # ensembl release version: # Update this to the latest working version, when you first set up a new - # analysis on a dataset. Later, it only makes sense to update (or downgrade) - # the release versions if either (a) the version you are using consistently - # fails to download (some Ensembl release versions are just broken) or - # (b) you know that a newer version will include changes that will fix some - # error or adds transcripts that will be relevant to your analysis. - release: "114" + # analysis on a dataset. You can usually find the latest release in the + # Ensembl blog, by looking at the latest posts of the release category: + # https://www.ensembl.info/category/01-release/ + # Later, it only makes sense to update (or downgrade) the release versions + # if either (a) the version you are using consistently fails to download + # (some Ensembl release versions are just broken) or (b) you know that a + # newer version will include changes that will fix some error or adds + # transcripts that will be relevant to your analysis. + release: "115" # genome build: # Usually, this should just be the main build listed in: - # https://ftp.ensembl.org/pub/release-114/species_EnsemblVertebrates.txt + # https://ftp.ensembl.org/pub/release-115/species_EnsemblVertebrates.txt # For example, for homo_sapiens, you strip the assembly column entry # "GRCh38.p12" down to "GRCh38". If in doubt, navigate to the respective # cdna folder on the FTP server, and look for the correct build in the # file names there. For example "GRCh38" in: - # https://ftp.ensembl.org/pub/release-114/fasta/homo_sapiens/cdna/ + # https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/cdna/ build: GRCh38 # pfam release: # This is used for annotation of domains in differential splicing analysis. @@ -54,7 +57,7 @@ resources: # https://xfam.wordpress.com/ # For debugging file downloads, you can browse the FTP download server: # https://ftp.ebi.ac.uk/pub/databases/Pfam/releases/ - pfam: "37.0" + pfam: "37.1" # representative transcripts: # Strategy for selecting a representative transcript for each gene. # kallisto quantifies expression on the transcript level. For datasets @@ -91,8 +94,11 @@ scatter: diffexp: # Samples to exclude from differential expression modeling (for example, - # outliers due to technical problems). + # outliers due to technical problems). List their `sample_name` column + # entry in the form of a YAML list. exclude: + # - sample_X + # - sample_Y # model for sleuth differential expression analysis # For an introduction to sleuth, see its online manual: # https://pachterlab.github.io/sleuth/about @@ -135,14 +141,19 @@ diffexp: volcano-plot: 0.05 ma-plot: 0.05 qq-plot: 0.05 - # heatmap and bootstrap plots for given set of genes: - # If you want a heatmap and bootstrap plots for a particular set of genes, - # you can set `activate: true` and provide a genelist file. In this file, - # list all your HGNC gene symbols of interest in one line, separated by - # tabulators (tabs). + # heatmap and bootstrap plots for given sets of genes: + # If you want a heatmap and bootstrap plots for particular sets of genes, + # you can set `activate: true` and provide one or more gene list files. In + # those files, list all your HGNC gene symbols of interest, one gene per line. + # The workflow will generate one heatmap plot per gene set file, and a + # bootstrap plot for each of the genes contained in any of the provided files. genes_of_interest: activate: false - genelist: "config/gene_list.tsv" + gene_lists: + # Use a descriptive gene list name, as this will be part of + # the filename of the resulting heatmap plot. + gene_list_1: "config/gene_list.tsv" + diffsplice: # isoformSwitchAnalyzer diff --git a/workflow/envs/heatmap.yaml b/workflow/envs/heatmap.yaml index fcfea229..147f6638 100644 --- a/workflow/envs/heatmap.yaml +++ b/workflow/envs/heatmap.yaml @@ -3,6 +3,6 @@ channels: - bioconda - nodefaults dependencies: - - r-pheatmap =1.0.12 - - r-dplyr =1.0.9 - - r-tidyr =1.2.0 \ No newline at end of file + - r-base >=4.1 + - r-tidyverse =2.0 + - r-ggalign =1.1 \ No newline at end of file diff --git a/workflow/envs/sleuth.yaml b/workflow/envs/sleuth.yaml index b8fba429..ef81c1b8 100644 --- a/workflow/envs/sleuth.yaml +++ b/workflow/envs/sleuth.yaml @@ -8,5 +8,5 @@ dependencies: - r-pheatmap =1.0.12 - r-tidyverse =2.0 - r-ggpubr =0.6 - - r-base =4 + - r-base >=4.1 - bioconductor-limma =3.56 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index df7bfcb9..51083d98 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -242,15 +242,6 @@ def kallisto_params(wildcards, input): return extra -def input_genelist(predef_genelist): - if config["diffexp"]["genes_of_interest"]["activate"] == True: - predef_genelist = config["diffexp"]["genes_of_interest"]["genelist"] - else: - predef_genelist = [] - - return predef_genelist - - def all_input(wildcards): """ Function defining all requested inputs for the rule all (below). @@ -329,20 +320,22 @@ def all_input(wildcards): "results/tables/tpm-matrix/{model}.tpm-matrix.tsv", "results/sleuth/{model}.samples.tsv", "results/datavzrd-reports/diffexp-{model}", - "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{mode}.pdf", + "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{gene_list}.pdf", ], model=config["diffexp"]["models"], - mode=["topn"], + gene_list=["topn"], ) ) if config["diffexp"]["genes_of_interest"]["activate"]: wanted_input.extend( expand( [ - "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{mode}.pdf", + "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{gene_list}.pdf", ], model=config["diffexp"]["models"], - mode=["predefined"], + gene_list=lookup( + within=config, dpath="diffexp/genes_of_interest/gene_lists" + ), ) ) diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index cc2ee61e..4656e2f8 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -146,7 +146,7 @@ rule plot_bootstrap: color_by=config["bootstrap_plots"]["color_by"], fdr=config["bootstrap_plots"]["FDR"], top_n=config["bootstrap_plots"]["top_n"], - genes=config["diffexp"]["genes_of_interest"], + genes_of_interest=lookup(within=config, dpath="diffexp/genes_of_interest"), log: "logs/plots/bootstrap/{model}/{model}.plot_bootstrap.log", script: @@ -258,18 +258,27 @@ rule tpm_matrix: rule plot_diffexp_heatmap: input: logcountmatrix_file="results/tables/logcount-matrix/{model}.logcount-matrix.tsv", - predef_genelist=input_genelist, + # default provides a dummy file path for the "topn" case (a file path + # that is always valid, but never loaded) + predef_gene_list=lookup( + within=config, + dpath="diffexp/genes_of_interest/gene_lists/{gene_list}", + default=lookup(within=config, dpath="samples"), + ), output: diffexp_heatmap=report( - "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{mode}.pdf", + "results/plots/diffexp-heatmap/{model}.diffexp-heatmap.{gene_list}.pdf", caption="../report/plot-heatmap.rst", category="Heatmaps", - labels={"model": "{model}-{mode}"}, + labels={ + "model": "{model}", + "gene list": "{gene_list}", + }, ), params: model=get_model, log: - "logs/plots/diffexp-heatmap/{model}.diffexp-heatmap.{mode}.log", + "logs/plots/diffexp-heatmap/{model}.diffexp-heatmap.{gene_list}.log", conda: "../envs/heatmap.yaml" script: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 247a66e3..d4b387dd 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -100,10 +100,13 @@ properties: properties: activate: type: boolean - genelist: - type: string + gene_lists: + type: object + patternProperties: + "^.+$": + type: string required: - - genelist + - gene_lists required: - models - genes_of_interest diff --git a/workflow/scripts/plot-bootstrap.R b/workflow/scripts/plot-bootstrap.R index 6e092756..5671b41f 100644 --- a/workflow/scripts/plot-bootstrap.R +++ b/workflow/scripts/plot-bootstrap.R @@ -9,42 +9,68 @@ so <- sleuth_load(snakemake@input[["so"]]) top_n <- -strtoi(snakemake@params["top_n"]) -results <- read_tsv(snakemake@input[["transcripts"]]) - -top_genes <- results %>% - filter(qval <= snakemake@params[["fdr"]]) %>% - top_n(top_n, qval) %>% - dplyr::select(ext_gene) %>% - drop_na() %>% - distinct(ext_gene) - - -if (snakemake@params[["genes"]]$activate == TRUE) { - gene_table <- read.csv(snakemake@params[["genes"]]$genelist, - sep = "\t") - names(gene_table) <- c("ext_gene") - genes_of_interest <- tibble(gene_table) %>% - distinct(ext_gene) +transcripts <- read_tsv(snakemake@input[["transcripts"]]) + +top_genes <- transcripts |> + filter(qval <= snakemake@params[["fdr"]]) |> + top_n(top_n, qval) |> + dplyr::select(ext_gene) |> + drop_na() |> + distinct(ext_gene) + +if (snakemake@params[["genes_of_interest"]][["activate"]] == TRUE) { + genes_of_interest <- read_tsv( + unlist(unname(snakemake@params[["genes_of_interest"]][["gene_lists"]])), + col_names = c("ext_gene") + ) |> + unique() } else { - # "genes" is null, if the list provided in config.yaml is empty + # Create empty tibble when genes_of_interest is not activated genes_of_interest <- tibble(ext_gene = character()) } -genes <- full_join(top_genes, genes_of_interest, by = "ext_gene") %>% - add_row(ext_gene = "Custom") %>% - pull(ext_gene) + +genes <- full_join(top_genes, genes_of_interest, by = "ext_gene") |> + add_row(ext_gene = "Custom") dir.create(snakemake@output[[1]]) -for (gene in genes) { - transcripts <- results %>% - filter(ext_gene == gene) %>% - drop_na() %>% - pull(target_id) - - if (length(transcripts > 0)) { - for (transcript in transcripts) { - plot_bootstrap(so, transcript, color_by = snakemake@params[["color_by"]], units = "est_counts") - ggsave(file = str_c(snakemake@output[[1]], "/", gene, ".", transcript, ".", snakemake@wildcards[["model"]], ".bootstrap.pdf")) - } +# sleuth seems to do some internal filtering, and bootstrap values are only +# available for filtered transcripts +available_transcripts <- so$obs_norm_filt |> + select(target_id) + +transcripts_of_interest <- inner_join( + transcripts, + genes, + by = "ext_gene" +) |> + inner_join( + available_transcripts, + by = "target_id" + ) |> + pull(target_id) + + +if (length(transcripts_of_interest) > 0) { + for (transcript in transcripts_of_interest) { + gene = transcripts |> filter(target_id == transcript) |> pull(ext_gene) + plot_bootstrap( + so, + transcript, + color_by = snakemake@params[["color_by"]], + units = "est_counts" + ) + ggsave( + filename = str_c( + snakemake@output[[1]], + "/", + gene, + ".", + transcript, + ".", + snakemake@wildcards[["model"]], + ".bootstrap.pdf" + ) + ) } } diff --git a/workflow/scripts/plot_diffexp_heatmap.R b/workflow/scripts/plot_diffexp_heatmap.R index 4180992b..6ab72656 100644 --- a/workflow/scripts/plot_diffexp_heatmap.R +++ b/workflow/scripts/plot_diffexp_heatmap.R @@ -3,71 +3,76 @@ sink(log) sink(log, type = "message") -library(pheatmap) -library(dplyr) -library(tidyr) +library(tidyverse) +library(ggalign) + # Reading the sleuth log count matrix file -sleuth_file <- read.csv(snakemake@input[["logcountmatrix_file"]], - sep = "\t", header = TRUE +log_counts <- read_tsv( + snakemake@input[["logcountmatrix_file"]] ) -# Check the config file if it contains pre-defined gene list -if (snakemake@wildcards[["mode"]] == "topn") { - # Replacing the rownames with transcript Id and gene name - rownames(sleuth_file) <- paste( - sleuth_file$transcript, - ":", sleuth_file$gene - ) - sleuth_file$gene <- NULL - sleuth_file$transcript <- NULL - # Getting top variable genes - vargenes <- - apply(sleuth_file, 1, var) - # Selecting top 50 variable genes - selectedgenes <- sleuth_file[row.names(sleuth_file) - %in% names(vargenes[order(vargenes, decreasing = TRUE)][1:50]), ] - # If the config file contains pre-defined gene list -} else if (snakemake@wildcards[["mode"]] == "predefined") { +# Check the mode of the rule's execution, topn vs. predefined gene list +if (snakemake@wildcards[["gene_list"]] == "topn") { + selected_genes <- log_counts |> + pivot_longer( + !c(transcript, gene), + names_to = "sample", + values_to = "log_count" + ) |> + summarize( + .by = c(gene, transcript), + variance = var(log_count) + ) |> + left_join( + log_counts, + by = join_by(gene, transcript) + ) |> + slice_max(variance, n = 50) |> + select(-variance) +} else { # Adding gene list to the variable - predefine_genelist <- - read.table(snakemake@input[["predef_genelist"]], - sep = "\t" + predef_gene_list <- + read_tsv( + snakemake@input[["predef_gene_list"]], + col_names = c("gene") + ) + selected_genes <- + log_counts |> + inner_join( + predef_gene_list, + by = join_by(gene) ) - selectedgenes <- - sleuth_file %>% filter(sleuth_file$gene - %in% predefine_genelist$V1) - rownames(selectedgenes) <- paste( - selectedgenes$transcript, - ":", selectedgenes$gene - ) - selectedgenes$gene <- NULL - selectedgenes$transcript <- NULL } -# Checks if selectedgenes variable contain genes that have expression values -if (all(selectedgenes == 0)) { - txt <- "cannot plot, all values are zero" - pdf(snakemake@output[["diffexp_heatmap"]], height = 10, width = 10) - plot.new() - text(.5, .5, txt, font = 2, cex = 1.5) - dev.off() - # If number of transcripts is more than 100, - # Since for RNA-seq multiple transcripts may present for a single gene. -} else if (nrow(selectedgenes) > 100) { - pdf_height <- round(nrow(selectedgenes) / 5) - pdf(snakemake@output[["diffexp_heatmap"]], - height = pdf_height, width = ncol(selectedgenes) * 2 - ) - pheatmap(selectedgenes, selectedgenes, - cluster_rows = FALSE, display_numbers = TRUE, - cellheight = 10, scale = "row" - ) - dev.off() -} else { - pdf_height <- round(nrow(selectedgenes) / 5) - pdf( - file = snakemake@output[["diffexp_heatmap"]], - height = pdf_height, width = ncol(selectedgenes) * 2 + +matrix <- selected_genes |> + mutate(transcript = str_pad(transcript, 18, "right")) |> + replace_na(list(gene = " ")) |> + mutate(t = str_c(gene, " - ", transcript)) |> + select(-gene, -transcript) |> + column_to_rownames(var = "t") + +heatmap <- ggheatmap(matrix) + + geom_tile(aes(fill = value), color = "grey") + + theme(axis.text.x = element_text(angle = -45, hjust = 0)) + + anno_top(size = unit(3, "cm")) + + align_dendro() + + theme( + axis.title = element_blank(), + axis.ticks = element_blank(), + axis.text = element_blank() + ) + + anno_right(size = unit(3, "cm")) + + align_dendro() + + theme( + axis.title = element_blank(), + axis.ticks = element_blank(), + axis.text = element_blank() ) - pheatmap(selectedgenes, scale = "row") - dev.off() -} \ No newline at end of file + +ggsave( + snakemake@output[["diffexp_heatmap"]], + heatmap, + width = length(colnames(matrix)) * 0.5 + 6, + height = length(rownames(matrix)) * 0.2 + 3, + limitsize = FALSE +)