diff --git a/modules/local/scanpy/filter/main.nf b/modules/local/scanpy/filter/main.nf index 6273490d..a845d9c5 100644 --- a/modules/local/scanpy/filter/main.nf +++ b/modules/local/scanpy/filter/main.nf @@ -14,6 +14,7 @@ process SCANPY_FILTER { val(min_counts_gene) val(min_counts_cell) val(max_mito_percentage) + val dynamic_filtering output: tuple val(meta), path("${prefix}.h5ad"), emit: h5ad diff --git a/modules/local/scanpy/filter/templates/filter.py b/modules/local/scanpy/filter/templates/filter.py index 333f4f1c..deba8a75 100644 --- a/modules/local/scanpy/filter/templates/filter.py +++ b/modules/local/scanpy/filter/templates/filter.py @@ -6,37 +6,79 @@ os.environ["MPLCONFIGDIR"] = "./tmp/mpl" os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba" -import yaml +import numpy as np import scanpy as sc +import yaml +from scipy.stats import median_abs_deviation from threadpoolctl import threadpool_limits + threadpool_limits(int("${task.cpus}")) sc.settings.n_jobs = int("${task.cpus}") +def is_outlier(adata, metric: str, nmads: int): + """Identify outliers using MAD (Median Absolute Deviation) method.""" + M = adata.obs[metric] + outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | ( + np.median(M) + nmads * median_abs_deviation(M) < M + ) + return outlier + + adata = sc.read_h5ad("${h5ad}") prefix = "${prefix}" -adata.var["mt"] = adata.var_names.str.lower().str.startswith("mt-") +# mitochondrial genes +adata.var["mt"] = adata.var_names.str.startswith("MT-") +# ribosomal genes +adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL")) +# hemoglobin genes. +adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]") + sc.pp.calculate_qc_metrics( - adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True + adata, qc_vars=["mt", "ribo", "hb"], percent_top=[20], log1p=True, inplace=True ) -adata = adata[adata.obs.pct_counts_mt < int("${max_mito_percentage}"), :].copy() -sc.pp.filter_cells(adata, min_counts=int("${min_counts_cell}")) -sc.pp.filter_genes(adata, min_counts=int("${min_counts_gene}")) +if "${dynamic_filtering}" == "true": + # Dynamic filtering using MAD strategy + print("Applying dynamic filtering using MAD strategy...") + + # Filter cells based on general QC metrics (5 MADs) + adata.obs["outlier"] = ( + is_outlier(adata, "log1p_total_counts", 5) + | is_outlier(adata, "log1p_n_genes_by_counts", 5) + | is_outlier(adata, "pct_counts_in_top_20_genes", 5) + ) -sc.pp.filter_cells(adata, min_genes=int("${min_genes}")) -sc.pp.filter_genes(adata, min_cells=int("${min_cells}")) + # Filter mitochondrial outliers (3 MADs) with additional max threshold + adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | ( + adata.obs["pct_counts_mt"] > int("${max_mito_percentage}") + ) + + # Apply filtering + print(f"Total number of cells before filtering: {adata.n_obs}") + adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy() + print(f"Number of cells after dynamic filtering: {adata.n_obs}") + +else: + # Static filtering using user-defined thresholds + print("Applying static filtering using user-defined thresholds...") + + # Apply mitochondrial filtering + adata = adata[adata.obs.pct_counts_mt < int("${max_mito_percentage}"), :].copy() + + # Apply count and gene filtering + sc.pp.filter_cells(adata, min_counts=int("${min_counts_cell}")) + sc.pp.filter_genes(adata, min_counts=int("${min_counts_gene}")) + + sc.pp.filter_cells(adata, min_genes=int("${min_genes}")) + sc.pp.filter_genes(adata, min_cells=int("${min_cells}")) adata.write_h5ad(f"{prefix}.h5ad") # Versions - versions = { - "${task.process}": { - "python": platform.python_version(), - "scanpy": sc.__version__ - } + "${task.process}": {"python": platform.python_version(), "scanpy": sc.__version__} } with open("versions.yml", "w") as f: diff --git a/modules/local/scanpy/filter/tests/main.nf.test b/modules/local/scanpy/filter/tests/main.nf.test index 58927e20..9b4d7661 100644 --- a/modules/local/scanpy/filter/tests/main.nf.test +++ b/modules/local/scanpy/filter/tests/main.nf.test @@ -25,6 +25,7 @@ nextflow_process { input[3] = 50 input[4] = 50 input[5] = 50 + input[6] = false """ } } @@ -58,6 +59,7 @@ nextflow_process { input[3] = 50 input[4] = 50 input[5] = 50 + input[6] = false """ } } diff --git a/nextflow.config b/nextflow.config index 3237146f..91195f4c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,6 +25,7 @@ params { doublet_detection = 'scrublet' doublet_detection_threshold = 1 cellbender_epochs = 150 + dynamic_filtering = false // Integration options integration_methods = 'scvi' diff --git a/nextflow_schema.json b/nextflow_schema.json index 6735225b..f552e847 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -105,6 +105,10 @@ "type": "integer", "default": 150, "description": "Number of epochs to train the CellBender model" + }, + "dynamic_filtering": { + "type": "boolean", + "description": "Employ deviation-based filtering" } } }, diff --git a/subworkflows/local/integrate.nf b/subworkflows/local/integrate.nf index ef65db40..7c08ae26 100644 --- a/subworkflows/local/integrate.nf +++ b/subworkflows/local/integrate.nf @@ -38,7 +38,7 @@ workflow INTEGRATE { ch_var = ch_var.mix(SCANPY_HVGS.out.var) // Filter out empty cells from the AnnData object - SCANPY_FILTER(ch_h5ad_hvg, 1, 0, 0, 0, 100) + SCANPY_FILTER(ch_h5ad_hvg, 1, 0, 0, 0, 100, params.dynamic_filtering) ch_h5ad_hvg = SCANPY_FILTER.out.h5ad ch_versions = ch_versions.mix(SCANPY_FILTER.out.versions) } diff --git a/subworkflows/local/quality_control/main.nf b/subworkflows/local/quality_control/main.nf index 211af8cc..76aadfa9 100644 --- a/subworkflows/local/quality_control/main.nf +++ b/subworkflows/local/quality_control/main.nf @@ -74,7 +74,7 @@ workflow QUALITY_CONTROL { min_counts_cell: meta.min_counts_cell ?: 0 max_mito_percentage: meta.max_mito_percentage ?: 100 } - SCANPY_FILTER(ch_filtering.h5ad, ch_filtering.min_genes, ch_filtering.min_cells, ch_filtering.min_counts_gene, ch_filtering.min_counts_cell, ch_filtering.max_mito_percentage) + SCANPY_FILTER(ch_filtering.h5ad, ch_filtering.min_genes, ch_filtering.min_cells, ch_filtering.min_counts_gene, ch_filtering.min_counts_cell, ch_filtering.max_mito_percentage, params.dynamic_filtering) ch_h5ad = SCANPY_FILTER.out.h5ad ch_versions = ch_versions.mix(SCANPY_FILTER.out.versions)