compomics · ArthurDeclercq · Feb 24, 2024 · Feb 25, 2024 · Feb 25, 2024 · Feb 26, 2024
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -13,7 +13,11 @@
 from ms2rescore.parse_spectra import add_precursor_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
+from ms2rescore.rescoring_engines.mokapot import (
+    add_peptide_confidence,
+    add_psm_confidence,
+)
+from ms2rescore.utils import filter_mumble_psms
 
 logger = logging.getLogger(__name__)
 
@@ -106,6 +110,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         )
     psm_list = psm_list[psms_with_features]
 
+    if "mumble" in config["psm_generator"]:
+        # Remove PSMs where matched_ions_pct drops 25% below the original hit
+        psm_list = filter_mumble_psms(psm_list, threshold=0.75)
+        # Replacing the score with the hyperscore for Mumble is currently disabled (statement below is commented out)
+        # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix
+
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
 
@@ -211,7 +221,10 @@ def _write_feature_names(feature_names, output_file_root):
 def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
     """Log #PSMs identified before rescoring."""
     id_psms_before = (
-        (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
+        (psm_list["qvalue"] <= 0.01)
+        & (psm_list["rank"] <= max_rank)
+        & (~psm_list["is_decoy"])
+        & ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]])
     ).sum()
     logger.info(
         f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
@@ -277,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
     )
 
     # Recalculate confidence
-    new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+    new_confidence = lin_psm_data.assign_confidence(
+        scores=list(psm_list["score"])
+    )  # explicitly make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot
 
     # Add new confidence estimations to PSMList
     add_psm_confidence(psm_list, new_confidence)

diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
@@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError):
     """Error while rescoring PSMs."""
 
     pass
+
+
+class ParseSpectrumError(MS2RescoreError):
+    """Error while parsing spectra."""
+
+    pass
diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py
@@ -4,10 +4,11 @@
 
 from ms2rescore.feature_generators.basic import BasicFeatureGenerator
 from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
+from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
 from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
 from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
+from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator
 from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
-from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
 
 FEATURE_GENERATORS = {
     "basic": BasicFeatureGenerator,
@@ -16,4 +17,5 @@
     "maxquant": MaxQuantFeatureGenerator,
     "ionmob": IonMobFeatureGenerator,
     "im2deep": IM2DeepFeatureGenerator,
+    "ms2": MS2FeatureGenerator,
 }
diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py
@@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None:
         charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list])
         precursor_mzs = psm_list["precursor_mz"]
         scores = psm_list["score"]
+        peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list])
 
         has_charge = None not in charge_states
         has_mz = None not in precursor_mzs and has_charge
@@ -74,13 +75,25 @@ def add_features(self, psm_list: PSMList) -> None:
         if has_score:
             self._feature_names.append("search_engine_score")
 
+        if has_mz and has_charge:
+            experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812)
+            theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812)
+            mass_error = experimental_mass - theoretical_mass
+            self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"])
+
+        self._feature_names.append("pep_len")
+
         for i, psm in enumerate(psm_list):
             psm.rescoring_features.update(
                 dict(
                     **{"charge_n": charge_n[i]} if has_charge else {},
                     **charge_one_hot[i] if has_charge else {},
                     **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {},
                     **{"search_engine_score": scores[i]} if has_score else {},
+                    **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {},
+                    **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {},
+                    **{"mass_error": mass_error[i]} if has_mz and has_charge else {},
+                    **{"pep_len": peptide_lengths[i]},
                 )
             )
 

diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
@@ -43,6 +43,7 @@ def __init__(
         *args,
         lower_score_is_better: bool = False,
         calibration_set_size: Union[int, float, None] = None,
+        calibration_set: Union[str, None] = None,
         processes: int = 1,
         **kwargs,
     ) -> None:
@@ -74,6 +75,7 @@ def __init__(
 
         self.lower_psm_score_better = lower_score_is_better
         self.calibration_set_size = calibration_set_size
+        self.calibration_set = calibration_set
         self.processes = processes
         self.deeplc_kwargs = kwargs or {}
 
@@ -123,7 +125,6 @@ def add_features(self, psm_list: PSMList) -> None:
         # Run DeepLC for each spectrum file
         current_run = 1
         total_runs = sum(len(runs) for runs in psm_dict.values())
-
         for runs in psm_dict.values():
             # Reset DeepLC predictor for each collection of runs
             self.deeplc_predictor = None
@@ -141,13 +142,13 @@ def add_features(self, psm_list: PSMList) -> None:
                 )
 
                 # Disable wild logging to stdout by Tensorflow, unless in debug mode
-
-                with contextlib.redirect_stdout(
-                    open(os.devnull, "w", encoding="utf-8")
-                ) if not self._verbose else contextlib.nullcontext():
+                with (
+                    contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8"))
+                    if not self._verbose
+                    else contextlib.nullcontext()
+                ):
                     # Make new PSM list for this run (chain PSMs per spectrum to flat list)
                     psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-
                     psm_list_calibration = self._get_calibration_psms(psm_list_run)
                     logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
                     self.deeplc_predictor = self.DeepLC(
@@ -197,7 +198,10 @@ def add_features(self, psm_list: PSMList) -> None:
 
     def _get_calibration_psms(self, psm_list: PSMList):
         """Get N best scoring target PSMs for calibration."""
-        psm_list_targets = psm_list[~psm_list["is_decoy"]]
+        psm_list_targets = psm_list[
+            ~psm_list["is_decoy"]
+            & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]]
+        ]
         if self.calibration_set_size:
             n_psms = self._get_number_of_calibration_psms(psm_list_targets)
             indices = np.argsort(psm_list_targets["score"])

diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py
@@ -161,7 +161,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p
         identified_psms = psm_list_df[
             (psm_list_df["qvalue"] < 0.01)
             & (~psm_list_df["is_decoy"])
-            & (psm_list_df["charge"] < 5)  # predictions do not go higher for IM2Deep
+            & (psm_list_df["charge"] < 7)  # predictions do not go higher for IM2Deep
+            & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]])
         ]
         calibration_psms = identified_psms[
             identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)

diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py
@@ -29,7 +29,11 @@
 try:
     from ionmob import __file__ as ionmob_file
     from ionmob.preprocess.data import to_tf_dataset_inference
-    from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs
+    from ionmob.utilities.chemistry import (
+        VARIANT_DICT,
+        calculate_mz,
+        reduced_mobility_to_ccs,
+    )
     from ionmob.utilities.tokenization import tokenizer_from_json
     from ionmob.utilities.utility import get_ccs_shift
 except ImportError: