From 221fc4b8391e1c97a2bbb2a030be8f4ec4662c59 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Wed, 12 Mar 2025 12:53:59 +0100 Subject: [PATCH 1/2] Refactor deeplc and im2deep and batching --- ms2pip/core.py | 40 +++++++++++++++++++++++++++++----------- ms2pip/search_space.py | 8 +++++++- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/ms2pip/core.py b/ms2pip/core.py index 3ffd82b..7462056 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -112,15 +112,15 @@ def predict_batch( psms = PSMList(psm_list=psms) psm_list = read_psms(psms, filetype=psm_filetype) - if add_retention_time: - logger.info("Adding retention time predictions") - rt_predictor = RetentionTime(processes=processes) - rt_predictor.add_rt_predictions(psm_list) + # if add_retention_time: + # logger.info("Adding retention time predictions") + # rt_predictor = RetentionTime(processes=processes) + # rt_predictor.add_rt_predictions(psm_list) - if add_ion_mobility: - logger.info("Adding ion mobility predictions") - im_predictor = IonMobility(processes=processes) - im_predictor.add_im_predictions(psm_list) + # if add_ion_mobility: + # logger.info("Adding ion mobility predictions") + # im_predictor = IonMobility(processes=processes) + # im_predictor.add_im_predictions(psm_list) with Encoder.from_psm_list(psm_list) as encoder: ms2pip_parallelized = _Parallelized( @@ -189,16 +189,34 @@ def predict_library( raise ValueError("Either `fasta_file` or `config` must be provided.") search_space = ProteomeSearchSpace.from_any(config) - search_space.build() + search_space.build(processes=processes) + + # Convert to PSMList + psm_list = search_space.to_psm_list() + + # Filter PSMs by mz + # TODO: Parallelize this step? + psm_list_filtered = search_space.filter_psms_by_mz(psm_list) + + # Add retention time and ion mobility predictions + if add_retention_time: + logger.info("Adding retention time predictions...") + rt_predictor = RetentionTime(processes=processes) + rt_predictor.add_rt_predictions(psm_list_filtered) + if add_ion_mobility: + logger.info("Adding ion mobility predictions...") + im_predictor = IonMobility(processes=processes) + im_predictor.add_im_predictions(psm_list_filtered) for batch in track( - _into_batches(search_space, batch_size=batch_size), + _into_batches(psm_list_filtered, batch_size=batch_size), description="Predicting spectra...", total=ceil(len(search_space) / batch_size), ): + logging.disable(logging.CRITICAL) yield predict_batch( - search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))), + batch, add_retention_time=add_retention_time, add_ion_mobility=add_ion_mobility, model=model, diff --git a/ms2pip/search_space.py b/ms2pip/search_space.py index 2ac3a87..e969c53 100644 --- a/ms2pip/search_space.py +++ b/ms2pip/search_space.py @@ -265,7 +265,9 @@ def build(self, processes: int = 1): Number of processes to use for parallelization. """ - processes = processes if processes else multiprocessing.cpu_count() + processes = ( + processes if processes else multiprocessing.cpu_count() + ) # Always ignored because of the default value self._digest_fasta(processes) self._remove_redundancy() self._add_modifications(processes) @@ -308,6 +310,10 @@ def filter_psms_by_mz(self, psms: PSMList) -> PSMList: ] ) + def to_psm_list(self) -> PSMList: + """Convert search space to PSMList.""" + return PSMList(psm_list=list(self)) + def _digest_fasta(self, processes: int = 1): """Digest FASTA file to peptides and populate search space.""" # Convert to string to avoid issues with Path objects From 59da00ce10812e685d0bec5a3ef9a9a268e83725 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Wed, 12 Mar 2025 13:10:51 +0100 Subject: [PATCH 2/2] Remove IM and RT parameters from predict_batch call inside predict_library --- ms2pip/core.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ms2pip/core.py b/ms2pip/core.py index 7462056..fe50112 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -112,15 +112,15 @@ def predict_batch( psms = PSMList(psm_list=psms) psm_list = read_psms(psms, filetype=psm_filetype) - # if add_retention_time: - # logger.info("Adding retention time predictions") - # rt_predictor = RetentionTime(processes=processes) - # rt_predictor.add_rt_predictions(psm_list) + if add_retention_time: + logger.info("Adding retention time predictions") + rt_predictor = RetentionTime(processes=processes) + rt_predictor.add_rt_predictions(psm_list) - # if add_ion_mobility: - # logger.info("Adding ion mobility predictions") - # im_predictor = IonMobility(processes=processes) - # im_predictor.add_im_predictions(psm_list) + if add_ion_mobility: + logger.info("Adding ion mobility predictions") + im_predictor = IonMobility(processes=processes) + im_predictor.add_im_predictions(psm_list) with Encoder.from_psm_list(psm_list) as encoder: ms2pip_parallelized = _Parallelized( @@ -217,8 +217,6 @@ def predict_library( logging.disable(logging.CRITICAL) yield predict_batch( batch, - add_retention_time=add_retention_time, - add_ion_mobility=add_ion_mobility, model=model, model_dir=model_dir, processes=processes,