Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spectrum feature generator #178

Open
wants to merge 42 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
fdceeba
initial commit
ArthurDeclercq Feb 24, 2024
5374ed8
finalize ms2 feature generation
ArthurDeclercq Feb 25, 2024
60207a3
add rustyms
ArthurDeclercq Feb 25, 2024
ae39844
remove exit statement fixed IM required value
ArthurDeclercq Feb 26, 2024
9b98c4d
change logger.info to debug
ArthurDeclercq Feb 26, 2024
5e45756
added profile decorator to get timings for functions
ArthurDeclercq Feb 26, 2024
304777c
removed profile as standard rescore debug statement
ArthurDeclercq Feb 26, 2024
95ee475
added new basic features
ArthurDeclercq Feb 26, 2024
73f4573
fixes for ms2 feature generator, removed multiprocessing
ArthurDeclercq Feb 26, 2024
947233e
return empty list on parsing error with rustyms, removed multiprocessing
ArthurDeclercq Feb 28, 2024
24ce565
add deeplc_calibration psm set
ArthurDeclercq Mar 15, 2024
114b006
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 17, 2024
33c38b0
remove unused import
ArthurDeclercq Apr 17, 2024
40425c7
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
b810b8c
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
69b5d1a
Merge tag 'main' of https://github.com/compomics/ms2rescore into spec…
ArthurDeclercq Aug 16, 2024
6e2d102
Merge pull request #177 from compomics/main
ArthurDeclercq Aug 16, 2024
11fdc51
integrate mumble into ms2branch
ArthurDeclercq Aug 21, 2024
3140c44
Merge remote-tracking branch 'origin/main' into spectrum-feature-gene…
ArthurDeclercq Sep 23, 2024
883169a
temp removal of sage features before rescoring
ArthurDeclercq Sep 27, 2024
97865e7
Merge branch 'main' of https://github.com/compomics/ms2rescore into s…
ArthurDeclercq Sep 27, 2024
da39ae8
remove psm_file features when rescoring with mumble
ArthurDeclercq Nov 8, 2024
37fff28
linting
SamvPy Nov 19, 2024
e8b59f3
add hyperscore calculation
SamvPy Nov 19, 2024
c51cd34
calibration fixes
ArthurDeclercq Nov 21, 2024
295e37f
changes for mumble implementation
ArthurDeclercq Nov 21, 2024
909860d
change openms peptide formatting
SamvPy Nov 22, 2024
c5902c2
add mumble psm filtering functionality
ArthurDeclercq Nov 22, 2024
6eaceb2
Merge branch 'spectrum-feature-generator' of https://github.com/compo…
ArthurDeclercq Nov 22, 2024
5ce55f5
remove pyopenms dependency for hyperscore calculation
SamvPy Nov 22, 2024
986c5f6
fix spectrum_id accession
ArthurDeclercq Nov 22, 2024
bbecf6a
Merge branch 'spectrum-feature-generator' of https://github.com/compo…
ArthurDeclercq Nov 22, 2024
6fd6053
Merge remote-tracking branch 'origin/main' into spectrum-feature-gene…
paretje Jan 14, 2025
5333e46
remove unused imports
paretje Jan 17, 2025
dd2259f
remove unused import in deeplc feature generator
paretje Jan 17, 2025
d24ef30
add rustyms dependency
paretje Jan 17, 2025
21cafc7
drop rustyms requirement to 0.8.3
paretje Jan 17, 2025
ca9da7d
mumble related changes
ArthurDeclercq Jan 17, 2025
c5b6eb0
add mumble
paretje Jan 17, 2025
aee8ec7
update mumble to use user cache dir
paretje Jan 21, 2025
7ce56c2
bump im2deep dependency
paretje Jan 24, 2025
106ad8f
make mumble and rustyms optional dependancy
ArthurDeclercq Feb 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from ms2rescore.parse_spectra import add_precursor_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
from ms2rescore.rescoring_engines.mokapot import (
add_peptide_confidence,
add_psm_confidence,
)
from ms2rescore.utils import filter_mumble_psms

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -106,6 +110,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)
psm_list = psm_list[psms_with_features]

if "mumble" in config["psm_generator"]:
# Remove PSMs where matched_ions_pct drops 25% below the original hit
psm_list = filter_mumble_psms(psm_list, threshold=0.75)
# Currently replace the score with the hyperscore for Mumble
# psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix

# Write feature names to file
_write_feature_names(feature_names, output_file_root)

Expand Down Expand Up @@ -211,7 +221,10 @@ def _write_feature_names(feature_names, output_file_root):
def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
"""Log #PSMs identified before rescoring."""
id_psms_before = (
(psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
(psm_list["qvalue"] <= 0.01)
& (psm_list["rank"] <= max_rank)
& (~psm_list["is_decoy"])
& ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it might be quite inefficient; however, I'm not sure whether it can be improved significantly, given that original_psm is in the metadata dict. Maybe keeping it as a series instead of a list would be better, or adding it to the dataframe.

).sum()
logger.info(
f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
Expand Down Expand Up @@ -277,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
)

# Recalculate confidence
new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
new_confidence = lin_psm_data.assign_confidence(
scores=list(psm_list["score"])
) # explicitly make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot

# Add new confidence estimations to PSMList
add_psm_confidence(psm_list, new_confidence)
Expand Down
6 changes: 6 additions & 0 deletions ms2rescore/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError):
"""Error while rescoring PSMs."""

pass


class ParseSpectrumError(MS2RescoreError):
    """Error while parsing spectra."""

    pass
4 changes: 3 additions & 1 deletion ms2rescore/feature_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

from ms2rescore.feature_generators.basic import BasicFeatureGenerator
from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator

FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
Expand All @@ -16,4 +17,5 @@
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
"im2deep": IM2DeepFeatureGenerator,
"ms2": MS2FeatureGenerator,
}
13 changes: 13 additions & 0 deletions ms2rescore/feature_generators/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None:
charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list])
precursor_mzs = psm_list["precursor_mz"]
scores = psm_list["score"]
peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list])

has_charge = None not in charge_states
has_mz = None not in precursor_mzs and has_charge
Expand All @@ -74,13 +75,25 @@ def add_features(self, psm_list: PSMList) -> None:
if has_score:
self._feature_names.append("search_engine_score")

if has_mz and has_charge:
experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812)
theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812)
mass_error = experimental_mass - theoretical_mass
self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"])

self._feature_names.append("pep_len")

for i, psm in enumerate(psm_list):
psm.rescoring_features.update(
dict(
**{"charge_n": charge_n[i]} if has_charge else {},
**charge_one_hot[i] if has_charge else {},
**{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {},
**{"search_engine_score": scores[i]} if has_score else {},
**{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {},
**{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {},
**{"mass_error": mass_error[i]} if has_mz and has_charge else {},
**{"pep_len": peptide_lengths[i]},
)
)

Expand Down
18 changes: 11 additions & 7 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
*args,
lower_score_is_better: bool = False,
calibration_set_size: Union[int, float, None] = None,
calibration_set: Union[str, None] = None,
processes: int = 1,
**kwargs,
) -> None:
Expand Down Expand Up @@ -74,6 +75,7 @@ def __init__(

self.lower_psm_score_better = lower_score_is_better
self.calibration_set_size = calibration_set_size
self.calibration_set = calibration_set
self.processes = processes
self.deeplc_kwargs = kwargs or {}

Expand Down Expand Up @@ -123,7 +125,6 @@ def add_features(self, psm_list: PSMList) -> None:
# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
# Reset DeepLC predictor for each collection of runs
self.deeplc_predictor = None
Expand All @@ -141,13 +142,13 @@ def add_features(self, psm_list: PSMList) -> None:
)

# Disable wild logging to stdout by Tensorflow, unless in debug mode

with contextlib.redirect_stdout(
open(os.devnull, "w", encoding="utf-8")
) if not self._verbose else contextlib.nullcontext():
with (
contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8"))
if not self._verbose
else contextlib.nullcontext()
):
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

psm_list_calibration = self._get_calibration_psms(psm_list_run)
logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
self.deeplc_predictor = self.DeepLC(
Expand Down Expand Up @@ -197,7 +198,10 @@ def add_features(self, psm_list: PSMList) -> None:

def _get_calibration_psms(self, psm_list: PSMList):
"""Get N best scoring target PSMs for calibration."""
psm_list_targets = psm_list[~psm_list["is_decoy"]]
psm_list_targets = psm_list[
~psm_list["is_decoy"]
& [metadata.get("original_psm", True) for metadata in psm_list["metadata"]]
]
if self.calibration_set_size:
n_psms = self._get_number_of_calibration_psms(psm_list_targets)
indices = np.argsort(psm_list_targets["score"])
Expand Down
3 changes: 2 additions & 1 deletion ms2rescore/feature_generators/im2deep.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p
identified_psms = psm_list_df[
(psm_list_df["qvalue"] < 0.01)
& (~psm_list_df["is_decoy"])
& (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep
& (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep
& ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]])
]
calibration_psms = identified_psms[
identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)
Expand Down
6 changes: 5 additions & 1 deletion ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
try:
from ionmob import __file__ as ionmob_file
from ionmob.preprocess.data import to_tf_dataset_inference
from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs
from ionmob.utilities.chemistry import (
VARIANT_DICT,
calculate_mz,
reduced_mobility_to_ccs,
)
from ionmob.utilities.tokenization import tokenizer_from_json
from ionmob.utilities.utility import get_ccs_shift
except ImportError:
Expand Down
Loading
Loading