Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions pvactools/lib/aggregate_all_epitopes.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,17 +598,31 @@ def sort_transcripts(self, annotations, included_df):
transcript_table = pd.DataFrame.from_records(records)

transcript_table['Biotype Sort'] = transcript_table.Biotype.map(lambda x: 1 if x == 'protein_coding' else 2)
transcript_table['TSL Sort'] = transcript_table.apply(lambda x: 1 if is_preferred_transcript(x, self.transcript_prioritization_strategy, self.maximum_transcript_support_level) else 2, axis=1)

transcript_table['mane_select_sort'] = transcript_table["MANE Select"].apply(lambda x: 1 if x else 2)
transcript_table['canonical_sort'] = transcript_table["Canonical"].apply(lambda x: 1 if x else 2)
transcript_table['tsl_sort'] = transcript_table["Transcript Support Level"].apply(lambda x: 6 if x in ['NA', 'Not Supported'] or pd.isna(x) else int(x))
sort_columns = [
"Biotype Sort",
"mane_select_sort",
"canonical_sort",
"tsl_sort",
"Length",
"Expr"
]
sort_orders = [
True,
True,
True,
True,
False,
False
]
if self.allow_incomplete_transcripts:
transcript_table['Transcript CDS Flags Sort'] = transcript_table['Transcript CDS Flags'].apply(lambda x: 1 if x == "None" else (2 if any(flag in str(x) for flag in ["cds_start_nf", "cds_end_nf"]) else 1))
sort_columns = ["Biotype Sort", "Transcript CDS Flags Sort", "TSL Sort", "Length"]
sort_order = [True, True, True, False]
else:
sort_columns = ["Biotype Sort", "TSL Sort", "Length"]
sort_order = [True, True, False]
sort_columns.insert(0, 'Transcript CDS Flags Sort')
sort_orders.insert(0, True)

transcript_table.sort_values(by=sort_columns, ascending=sort_order, inplace=True)
transcript_table.sort_values(by=sort_columns, ascending=sort_orders, inplace=True)
return transcript_table

def calculate_unique_peptide_count(self, included_df):
Expand Down
118 changes: 71 additions & 47 deletions pvactools/lib/get_best_candidate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

from pvactools.lib.anchor_residue_pass import AnchorResiduePass
from pvactools.lib.run_utils import is_preferred_transcript

Expand All @@ -19,11 +21,19 @@ def __init__(
self.allow_incomplete_transcripts = allow_incomplete_transcripts

def get(self, df):
#get all entries with Biotype 'protein_coding'
biotype_df = df[df['Biotype'] == 'protein_coding']
#get all entries that don't have CDS Flags
if self.allow_incomplete_transcripts:
cds_df = df[df['Transcript CDS Flags'] == 'None']
if cds_df.shape[0] == 0:
cds_df = df
else:
cds_df = df

#subset cds dataframe to only get entries with Biotype 'protein_coding'
biotype_df = cds_df[cds_df['Biotype'] == 'protein_coding']
#if there are none, reset to previous dataframe
if biotype_df.shape[0] == 0:
biotype_df = df
biotype_df = cds_df

#subset protein_coding dataframe to only preferred transcripts
biotype_df['transcript_pass'] = biotype_df.apply(lambda x: is_preferred_transcript(x, self.transcript_prioritization_strategy, self.maximum_transcript_support_level), axis=1)
Expand All @@ -32,7 +42,7 @@ def get(self, df):
if transcript_df.shape[0] == 0:
transcript_df = biotype_df

#subset tsl dataframe to only include entries with no problematic positions
#subset transcript dataframe to only include entries with no problematic positions
if 'Problematic Positions' in transcript_df:
prob_pos_df = transcript_df[transcript_df['Problematic Positions'] == "None"]
#if this results in an empty dataframe, reset to previous dataframe
Expand All @@ -47,28 +57,31 @@ def get(self, df):
if anchor_residue_pass_df.shape[0] == 0:
anchor_residue_pass_df = prob_pos_df

# if allow_incomplete_transcripts is True, deprioritize certain flags
if self.allow_incomplete_transcripts:
anchor_residue_pass_df['Transcript CDS Flags Sort'] = anchor_residue_pass_df['Transcript CDS Flags'].apply(
lambda x: 1 if x == "None" else (2 if any(flag in str(x) for flag in ["cds_start_nf", "cds_end_nf"]) else 1)
)
sort_columns = [
'Transcript CDS Flags Sort',
f"{self.mt_top_score_metric} MT {self.top_score_mode}",
'Transcript Length',
]
sort_order = [True, True, False]
else:
sort_columns = [
f"{self.mt_top_score_metric} MT {self.top_score_mode}",
'Transcript Length',
]
sort_order = [True, False]

#determine the entry with the lowest IC50 Score, transcript prioritization status, and longest Transcript
#set up sorting criteria
anchor_residue_pass_df['mane_select_sort'] = anchor_residue_pass_df["MANE Select"].apply(lambda x: 1 if x else 2)
anchor_residue_pass_df['canonical_sort'] = anchor_residue_pass_df["Canonical"].apply(lambda x: 1 if x else 2)
anchor_residue_pass_df['tsl_sort'] = anchor_residue_pass_df["Transcript Support Level"].apply(lambda x: 6 if x in ['NA', 'Not Supported'] or pd.isna(x) else int(x))
sort_columns = [
f"{self.mt_top_score_metric} MT {self.top_score_mode}",
"mane_select_sort",
"canonical_sort",
"tsl_sort",
"Transcript Length",
"Transcript Expression"
]
sort_orders = [
True,
True,
True,
True,
False,
False
]

#Sort the dataframe according to the criteria and pick the first (best) one
anchor_residue_pass_df.sort_values(
by=sort_columns,
ascending=sort_order,
ascending=sort_orders,
inplace=True
)
return anchor_residue_pass_df.iloc[0]
Expand Down Expand Up @@ -127,11 +140,19 @@ def __init__(
self.allow_incomplete_transcripts=allow_incomplete_transcripts

def get(self, df):
#get all entries with Biotype 'protein_coding'
biotype_df = df[df['Biotype'] == 'protein_coding']
#get all entries that don't have CDS Flags
if self.allow_incomplete_transcripts:
cds_df = df[df['Transcript CDS Flags'] == 'None']
if cds_df.shape[0] == 0:
cds_df = df
else:
cds_df = df

#subset cds dataframe to only get entries with Biotype 'protein_coding'
biotype_df = cds_df[cds_df['Biotype'] == 'protein_coding']
#if there are none, reset to previous dataframe
if biotype_df.shape[0] == 0:
biotype_df = df
biotype_df = cds_df

#subset protein_coding dataframe to only preferred transcripts
biotype_df['transcript_pass'] = biotype_df.apply(lambda x: is_preferred_transcript(x, self.transcript_prioritization_strategy, self.maximum_transcript_support_level), axis=1)
Expand All @@ -149,28 +170,31 @@ def get(self, df):
else:
prob_pos_df = transcript_df

# if allow_incomplete_transcripts is True, deprioritize certain flags
if self.allow_incomplete_transcripts:
prob_pos_df['Transcript CDS Flags Sort'] = prob_pos_df['Transcript CDS Flags'].apply(
lambda x: 1 if x == "None" else (2 if any(flag in str(x) for flag in ["cds_start_nf", "cds_end_nf"]) else 1)
)
sort_columns = [
'Transcript CDS Flags Sort',
f"{self.mt_top_score_metric} {self.top_score_mode}",
"WT Protein Length",
]
sort_order = [True, True, False]
else:
sort_columns = [
f"{self.mt_top_score_metric} {self.top_score_mode}",
"WT Protein Length",
]
sort_order = [True, False]

#determine the entry with the lowest IC50 Score, transcript prioritization status, and longest Transcript
#set up sorting criteria
prob_pos_df['mane_select_sort'] = prob_pos_df["MANE Select"].apply(lambda x: 1 if x else 2)
prob_pos_df['canonical_sort'] = prob_pos_df["Canonical"].apply(lambda x: 1 if x else 2)
prob_pos_df['tsl_sort'] = prob_pos_df["Transcript Support Level"].apply(lambda x: 6 if x in ['NA', 'Not Supported'] or pd.isna(x) else int(x))
sort_columns = [
f"{self.mt_top_score_metric} {self.top_score_mode}",
"mane_select_sort",
"canonical_sort",
"tsl_sort",
"WT Protein Length",
"Transcript Expression"
]
sort_orders = [
True,
True,
True,
True,
False,
False
]

#Sort the dataframe according to the criteria and pick the first (best) one
prob_pos_df.sort_values(
by=sort_columns,
ascending=sort_order,
ascending=sort_orders,
inplace=True
)
return prob_pos_df.iloc[0]
2 changes: 1 addition & 1 deletion pvactools/lib/input_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ def execute(self):
else:
output_row['mane_select'] = 'True'
else:
output_row['mane_select'] = 'Not Supported'
output_row['mane_select'] = 'Not Run'

for (tag, key, comparison_fields) in zip(['TX', 'GX'], ['transcript_expression', 'gene_expression'], [[transcript_name], [ensembl_gene_id, gene_name]]):
if tag in self.vcf_reader.header.format_ids():
Expand Down
8 changes: 4 additions & 4 deletions pvactools/lib/post_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pvactools.lib.aggregate_all_epitopes import PvacseqAggregateAllEpitopes, PvacfuseAggregateAllEpitopes, PvacbindAggregateAllEpitopes, PvacspliceAggregateAllEpitopes
from pvactools.lib.binding_filter import BindingFilter
from pvactools.lib.filter import Filter, FilterCriterion
from pvactools.lib.transcript_filter import TranscriptFilter
from pvactools.lib.top_score_filter import PvacseqTopScoreFilter, PvacfuseTopScoreFilter, PvacbindTopScoreFilter, PvacspliceTopScoreFilter
from pvactools.lib.calculate_manufacturability import CalculateManufacturability
from pvactools.lib.calculate_reference_proteome_similarity import CalculateReferenceProteomeSimilarity
Expand Down Expand Up @@ -225,12 +226,11 @@ def execute_coverage_filter(self):
def execute_transcript_support_level_filter(self):
if self.run_transcript_support_level_filter:
print("Running Transcript Support Level Filter")
filter_criteria = [FilterCriterion('Transcript Support Level', '<=', self.maximum_transcript_support_level, exclude_nas=True, skip_value='Not Supported')]
Filter(
TranscriptFilter(
self.coverage_filter_fh.name,
self.transcript_support_level_filter_fh.name,
filter_criteria,
['Transcript Support Level'],
self.transcript_prioritization_strategy,
self.maximum_transcript_support_level
).execute()
print("Complete")
else:
Expand Down
42 changes: 41 additions & 1 deletion pvactools/lib/update_tiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import tempfile
import shutil
import argparse
import json

from pvactools.lib.prediction_class import PredictionClass
from pvactools.lib.run_utils import is_preferred_transcript, float_range, transcript_prioritization_strategy
Expand Down Expand Up @@ -33,6 +34,7 @@ def execute(self):
output_df.to_csv(self.output_file, sep='\t', na_rep='NA', index=False, float_format='%.3f')
shutil.copy(self.output_file.name, self.input_file)
self.output_file.close()
self.update_metrics_file()

@abstractmethod
def get_tier(self, mutation):
Expand All @@ -42,6 +44,9 @@ def get_tier(self, mutation):
def sort_table(self, output_lines):
raise Exception("Must implement method in child class")

def update_metrics_file(self):
pass

@classmethod
def parser(cls, tool):
parser = argparse.ArgumentParser(
Expand All @@ -53,6 +58,11 @@ def parser(cls, tool):
'input_file',
help="Input aggregated file with tiers to update. This file will be overwritten with the output."
)
if tool in ['pvacseq']:
parser.add_argument(
'metrics_file',
help="metrics.json file corresponding to the input aggregated file. This file will be overwritten to update tiering parameters used by this command."
)
if tool in ['pvacseq', 'pvacsplice']:
parser.add_argument(
'vaf_clonal', type=float_range(0.0, 1.0),
Expand Down Expand Up @@ -99,7 +109,13 @@ def parser(cls, tool):
help="Tumor RNA Coverage Cutoff to consider when evaluating the expression criteria. Only sites above this read depth cutoff will be considered.",
default=10
)
if tool in ['pvacseq', 'pvacfuse', 'pvacsplice']:
if tool in ['pvacseq', 'pvacsplice']:
parser.add_argument(
'--expn-val', type=float,
help="Gene and Transcript Expression cutoff. Sites above this cutoff will be considered.",
default=1.0
)
if tool in ['pvacfuse']:
parser.add_argument(
'--expn-val', type=float,
help="Expression Cutoff to consider when evaluating the expression criteria. Expression is meassured as FFPM (fusion fragments per million total reads). Sites above this cutoff will be considered.",
Expand Down Expand Up @@ -167,6 +183,7 @@ def __init__(
allele_specific_anchors=False,
anchor_contribution_threshold=0.8,
top_score_metric2='ic50',
metrics_file=None,
):
self.input_file = input_file
self.output_file = tempfile.NamedTemporaryFile()
Expand All @@ -180,6 +197,7 @@ def __init__(
self.trna_vaf = trna_vaf
self.transcript_prioritization_strategy = transcript_prioritization_strategy
self.maximum_transcript_support_level = maximum_transcript_support_level
self.metrics_file=metrics_file
if top_score_metric2 == "percentile":
self.top_score_mode = "%ile MT"
else:
Expand Down Expand Up @@ -341,6 +359,28 @@ def sort_table(self, output_lines):

return df

def update_metrics_file(self):
if self.metrics_file is not None:
output_metrics_file = tempfile.NamedTemporaryFile()
with open(self.metrics_file, 'r') as input_fh, open(output_metrics_file.name, 'w') as output_fh:
metrics = json.loads(input_fh.read())
metrics['vaf_clonal'] = round(self.vaf_clonal, 3)
metrics['vaf_subclonal'] = round(self.vaf_clonal/2, 3)
metrics['binding_threshold'] = self.binding_threshold
metrics['trna_vaf'] = self.trna_vaf
metrics['trna_cov'] = self.trna_cov
metrics['allele_expr_threshold'] = self.allele_expr_threshold
metrics['transcript_prioritization_strategy'] = sorted(self.transcript_prioritization_strategy)
metrics['maximum_transcript_support_level'] = self.maximum_transcript_support_level
metrics['percentile_threshold'] = self.percentile_threshold
metrics['percentile_threshold_strategy'] = self.percentile_threshold_strategy
metrics['use_allele_specific_binding_thresholds'] = self.use_allele_specific_binding_thresholds
metrics['top_score_metric2'] = 'ic50' if self.top_score_mode == "IC50 MT" else 'percentile'
metrics['allele_specific_anchors'] = self.anchor_calculator.use_allele_specific_anchors
metrics['anchor_contribution_threshold'] = self.anchor_calculator.anchor_contribution_threshold
json.dump(metrics, output_fh, indent=2, separators=(',', ': '))
shutil.copy(output_metrics_file.name, self.metrics_file)

class PvacfuseUpdateTiers(UpdateTiers, metaclass=ABCMeta):
def __init__(
self,
Expand Down
2 changes: 2 additions & 0 deletions pvactools/tools/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ def define_parser():
default_reference_match_columns = ["Peptide", "Match Window"]

parser = argparse.ArgumentParser(
"pvactools compare",
description="Run a comparison between two output results folders",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
Expand Down
1 change: 1 addition & 0 deletions pvactools/tools/pvacseq/update_tiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def main(args_input = sys.argv[1:]):
allele_specific_anchors=args.allele_specific_anchors,
anchor_contribution_threshold=args.anchor_contribution_threshold,
top_score_metric2=args.top_score_metric2,
metrics_file=args.metrics_file,
).execute()

if __name__ == "__main__":
Expand Down
10 changes: 10 additions & 0 deletions pvactools/tools/pvacsplice/generate_aggregated_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ def define_parser():
default=1.0,
help="Gene and Expression cutoff. Used to calculate the allele expression cutoff for tiering.",
)
parser.add_argument(
"--transcript-prioritization-strategy", type=transcript_prioritization_strategy(),
help="Specify the criteria to consider when prioritizing or filtering transcripts of the neoantigen candidates during aggregate report creation or TSL filtering. "
+ "'canonical' will prioritize/select candidates resulting from variants on a Ensembl canonical transcript. "
+ "'mane_select' will prioritize/select candidates resulting from variants on a MANE select transcript. "
+ "'tsl' will prioritize/select candidates where the transcript support level (TSL) matches the --maximum-transcript-support-level. "
+ "When selecting more than one criteria, a transcript meeting EITHER of the selected criteria will be prioritized/selected.",
default=['canonical', 'mane_select', 'tsl']
)
parser.add_argument(
"--maximum-transcript-support-level", type=int,
help="The threshold to use for filtering epitopes on the Ensembl transcript support level (TSL). "
Expand Down Expand Up @@ -114,6 +123,7 @@ def main(args_input = sys.argv[1:]):
trna_vaf=args.trna_vaf,
trna_cov=args.trna_cov,
expn_val=args.expn_val,
transcript_prioritization_strategy=args.transcript_prioritization_strategy,
maximum_transcript_support_level=args.maximum_transcript_support_level,
top_score_metric=args.top_score_metric,
top_score_metric2=args.top_score_metric2,
Expand Down
Loading