Skip to content

Commit 0b70ad7

Browse files
authored
Merge pull request #1490 from nf-core/optional_fasta
Make genomic FASTA input optional
2 parents bdd52ec + 21eb5ad commit 0b70ad7

File tree

15 files changed

+925
-332
lines changed

15 files changed

+925
-332
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ jobs:
5151
matrix:
5252
NXF_VER:
5353
- "24.04.2"
54-
- "latest-everything"
5554
nf_test_files: ["${{ fromJson(needs.nf-test-changes.outputs.nf_test_files) }}"]
5655
profile:
5756
- "docker"

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
- [PR #1480](https://github.com/nf-core/rnaseq/pull/1480) - Bump version after release 3.18.0
1313
- [PR #1482](https://github.com/nf-core/rnaseq/pull/1482) - Update trimgalore module for save_unpaired fix
14-
- [pR #1486](https://github.com/nf-core/rnaseq/pull/1486) - Bump STAR build for multiprocessing fix
14+
- [PR #1486](https://github.com/nf-core/rnaseq/pull/1486) - Bump STAR build for multiprocessing fix
15+
- [PR #1490](https://github.com/nf-core/rnaseq/pull/1490) - Make genomic FASTA input optional
1516

1617
# 3.18.0 - 2024-12-19
1718

bin/filter_gtf.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import argparse
77
import re
88
import statistics
9-
from typing import Set
9+
from typing import Optional, Set
1010

1111
# Create a logger
1212
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
@@ -27,14 +27,15 @@ def tab_delimited(file: str) -> float:
2727
return statistics.median(line.count("\t") for line in data.split("\n"))
2828

2929

30-
def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
30+
def filter_gtf(fasta: Optional[str], gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
3131
"""Filter GTF file based on FASTA sequence names."""
3232
if tab_delimited(gtf_in) != 8:
3333
raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")
3434

35-
seq_names_in_genome = extract_fasta_seq_names(fasta)
36-
logger.info(f"Extracted chromosome sequence names from {fasta}")
37-
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
35+
if (fasta is not None):
36+
seq_names_in_genome = extract_fasta_seq_names(fasta)
37+
logger.info(f"Extracted chromosome sequence names from {fasta}")
38+
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
3839

3940
seq_names_in_gtf = set()
4041
try:
@@ -44,7 +45,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
4445
seq_name = line.split("\t")[0]
4546
seq_names_in_gtf.add(seq_name) # Add sequence name to the set
4647

47-
if seq_name in seq_names_in_genome:
48+
if fasta is None or seq_name in seq_names_in_genome:
4849
if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
4950
out.write(line)
5051
line_count += 1
@@ -63,7 +64,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
6364
if __name__ == "__main__":
6465
parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
6566
parser.add_argument("--gtf", type=str, required=True, help="GTF file")
66-
parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
67+
parser.add_argument("--fasta", type=str, required=False, help="Genome fasta file")
6768
parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
6869
parser.add_argument(
6970
"--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"

docs/usage.md

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ You also have the option to pseudoalign and quantify your data directly with [Sa
132132

133133
The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet; however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). Similarly, strandedness is taken from the samplesheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args`, like `--extra_kallisto_quant_args '--fr-stranded'` (see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual)).
134134

135-
When running Salmon in mapping-based mode via `--pseudo_aligner salmon` the entire genome of the organism is used by default for the decoy-aware transcriptome when creating the indices (see second bulleted option in [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)).
135+
When running Salmon in mapping-based mode via `--pseudo_aligner salmon` with a genome FASTA supplied via `--fasta` and no pre-built Salmon index, the entire genome of the organism is used by default as the decoy for the decoy-aware transcriptome when creating the indices, as is recommended (see second bulleted option in [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)). If you supply neither a genome FASTA file nor an index, Salmon will build its index without decoys, using only transcript sequences. This second option is not usually recommended, but may be useful in limited circumstances. Note that Kallisto does not index with genomic sequences.
136136

137137
Two additional parameters `--extra_star_align_args` and `--extra_salmon_quant_args` were added in v3.10 of the pipeline that allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note, the `--seqBias` and `--gcBias` are not provided to Salmon quant by default so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required. You can now also supply additional arguments to Kallisto via `--extra_kallisto_quant_args`.
138138

@@ -209,7 +209,7 @@ When supplying reference files as discussed below, it is important to be consist
209209

210210
### Explicit reference file specification (recommended)
211211

212-
The minimum reference genome requirements for this pipeline are a FASTA and GTF file, all other files required to run the pipeline can be generated from these files. For example, the latest reference files for human can be derived from Ensembl like:
212+
The minimum reference genome requirements for this pipeline are a FASTA file (genome and/or transcriptome) and a GTF file; all other files required to run the pipeline can be generated from these files. For example, the latest reference files for human can be derived from Ensembl like:
213213

214214
```
215215
latest_release=$(curl -s 'http://rest.ensembl.org/info/software?content-type=application/json' | grep -o '"release":[0-9]*' | cut -d: -f2)
@@ -227,6 +227,7 @@ Notes:
227227
- If `--gene_bed` is not provided then it will be generated from the GTF file.
228228
- If `--additional_fasta` is provided then the features in this file (e.g. ERCC spike-ins) will be automatically concatenated onto both the reference FASTA file as well as the GTF annotation before building the appropriate indices.
229229
- When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)).
230+
- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended** with `--pseudo_aligner salmon`, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) with Salmon, unless a pre-existing decoy-aware Salmon index is supplied. For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode).
230231

231232
#### Reference genome
232233

@@ -304,7 +305,7 @@ Notes:
304305

305306
### GTF filtering
306307

307-
By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
308+
By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file (where supplied), and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
308309

309310
## Contamination screening options
310311

@@ -332,6 +333,21 @@ nextflow run \
332333
-profile docker
333334
```
334335

336+
You can also run without a genomic FASTA file, provided you skip the alignment step and provide a transcriptome FASTA directly:
337+
338+
```bash
339+
nextflow run \
340+
nf-core/rnaseq \
341+
--input <SAMPLESHEET> \
342+
--outdir <OUTDIR> \
343+
--gtf <GTF> \
344+
--transcript_fasta <TRANSCRIPTOME FASTA> \
345+
--skip_alignment \
346+
-profile docker
347+
```
348+
349+
This is not usually recommended with Salmon unless you also supply a previously generated decoy-aware Salmon transcriptome index.
350+
335351
> **NB:** Loading iGenomes configuration remains the default for reasons of consistency with other workflows, but should be disabled when not using iGenomes, applying the recommended usage above.
336352
337353
This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

modules.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@
181181
},
182182
"salmon/index": {
183183
"branch": "master",
184-
"git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358",
184+
"git_sha": "25ddc0bb25292280923eed07e6351789a671e86a",
185185
"installed_by": ["fastq_subsample_fq_salmon"]
186186
},
187187
"salmon/quant": {

modules/local/gtf_filter/main.nf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,15 @@ process GTF_FILTER {
1818
task.ext.when == null || task.ext.when
1919

2020
script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
21+
fasta_text=''
22+
if (fasta){
23+
fasta_text="--fasta $fasta"
24+
}
2125
"""
2226
filter_gtf.py \\
2327
--gtf $gtf \\
24-
--fasta $fasta \\
25-
--prefix ${fasta.baseName}
28+
$fasta_text \\
29+
--prefix ${gtf.baseName}
2630
2731
cat <<-END_VERSIONS > versions.yml
2832
"${task.process}":

modules/nf-core/salmon/index/main.nf

Lines changed: 17 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

modules/nf-core/salmon/index/tests/main.nf.test

Lines changed: 40 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

modules/nf-core/salmon/index/tests/main.nf.test.snap

Lines changed: 23 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)