From 44a0eb18a29d5f927d60e55bd723ae7f4237e2c6 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Fri, 20 Aug 2021 14:44:34 +0200 Subject: [PATCH 01/27] Added Peaks_pipeline --- ms2rescore/__init__.py | 2 + ms2rescore/id_file_parser.py | 250 ++++++++++++++++++++++++++++++++--- 2 files changed, 230 insertions(+), 22 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 04e148af..197f1f91 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -119,6 +119,8 @@ def _select_pipeline(self): pipeline = id_file_parser.TandemPipeline elif self.config["general"]["pipeline"] == "peptideshaker": pipeline = id_file_parser.PeptideShakerPipeline + elif self.config["general"]["pipeline"] == "Peaks": + pipeline = id_file_parser.PeaksPipeline else: raise NotImplementedError(self.config["general"]["pipeline"]) return pipeline diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 5cc7d6ca..aac5898c 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -4,11 +4,13 @@ import os import re from abc import ABC, abstractmethod -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, List import numpy as np import pandas as pd from pyteomics import tandem +from pyteomics import mzid +from tqdm import tqdm from ms2rescore._exceptions import MS2ReScoreError from ms2rescore.maxquant import MSMSAccessor @@ -81,7 +83,7 @@ def _validate_mgf_path( passed_path: Union[None, str, os.PathLike], default_dir: Union[str, os.PathLike], expected_rootname: str, - expected_ext: str = ".mgf" + expected_ext: str = ".mgf", ) -> Union[str, os.PathLike]: """ Infer MGF path from passed path and expected filename (e.g. from ID file). @@ -113,8 +115,7 @@ def _validate_mgf_path( # If passed path is directory, join with expected rootname + ext elif os.path.isdir(passed_path): path_to_mgf_file = os.path.join( - passed_path, - expected_rootname + expected_ext + passed_path, expected_rootname + expected_ext ) # If passed path is file, use that, but warn if basename doesn't match expected @@ -125,7 +126,7 @@ def _validate_mgf_path( "Passed MGF name root `%s` does not match MGF name root `%s` from " "identifications file. Continuing with passed MGF name.", passed_rootname, - expected_rootname + expected_rootname, ) path_to_mgf_file = passed_path @@ -144,7 +145,7 @@ def _run_percolator_converter(self): self.path_to_id_file, self._path_to_original_pin, id_decoy_pattern=self.id_decoy_pattern, - log_level=self.log_level + log_level=self.log_level, ) @property @@ -153,7 +154,7 @@ def path_to_mgf_file(self) -> Union[str, os.PathLike]: path_to_mgf_file = self._validate_mgf_path( self.passed_mgf_path, os.path.dirname(self.path_to_id_file), - os.path.basename(os.path.splitext(self.path_to_id_file)[0]) + os.path.basename(os.path.splitext(self.path_to_id_file)[0]), ) return path_to_mgf_file @@ -167,8 +168,7 @@ def original_pin(self) -> PercolatorIn: self._run_percolator_converter() self._original_pin = PercolatorIn(self._path_to_original_pin) self._original_pin.modification_mapping_from_list( - self.modification_list, - label_style=self._pin_modification_style + self.modification_list, label_style=self._pin_modification_style ) return self._original_pin @@ -264,7 +264,7 @@ def path_to_mgf_file(self) -> Union[str, os.PathLike]: path_to_mgf_file = self._validate_mgf_path( self.passed_mgf_path, os.path.dirname(self.path_to_id_file), - self.original_pin.get_spectrum_filename() + self.original_pin.get_spectrum_filename(), ) return path_to_mgf_file @@ -285,14 +285,14 @@ def get_peprec(self) -> PeptideRecord: "rt": "observed_retention_time", "expect": "psm_score", "hyperscore": "hyperscore_tandem", - "id": "tandem_id" + "id": "tandem_id", } peprec_df = tandem_df[tandem_peprec_mapping.keys()].rename( columns=tandem_peprec_mapping ) # Set PSM score as -log(e-value) - peprec_df["psm_score"] = - np.log(peprec_df["psm_score"]) + peprec_df["psm_score"] = -np.log(peprec_df["psm_score"]) logger.debug("Adding modifications from original PIN to PEPREC...") pin = self.original_pin @@ -300,18 +300,24 @@ def get_peprec(self) -> PeptideRecord: pin.add_spectrum_index_column(label="tandem_id") pin.df["ModPeptide"] = pin.df["Peptide"] peprec_df = peprec_df.merge( - pin.df[["modifications", "tandem_id", "hyperscore", "Label", "ModPeptide", "Proteins"]], - on="tandem_id" + pin.df[ + [ + "modifications", + "tandem_id", + "hyperscore", + "Label", + "ModPeptide", + "Proteins", + ] + ], + on="tandem_id", ) # Validate merge by comparing the hyperscore columns if not (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all(): raise IDFileParserError( "Could not merge tandem xml and generated pin files." ) - peprec_df.drop( - columns=["tandem_id", "hyperscore_tandem"], - inplace=True - ) + peprec_df.drop(columns=["tandem_id", "hyperscore_tandem"], inplace=True) peprec = PeptideRecord.from_dataframe(peprec_df) peprec.reorder_columns() @@ -374,9 +380,9 @@ def parse_mgf_files(self, peprec): peprec.df, self.passed_mgf_path, outname=path_to_new_mgf, - filename_col='Raw file', - spec_title_col='spec_id', - title_parsing_method='run.scan.scan', + filename_col="Raw file", + spec_title_col="spec_id", + title_parsing_method="run.scan.scan", ) self._path_to_new_mgf = path_to_new_mgf @@ -385,7 +391,7 @@ def get_peprec(self, parse_mgf: bool = True) -> PeptideRecord: logger.debug("Converting MaxQuant msms.txt to PEPREC...") peprec = self.msms_df.msms.to_peprec( modification_mapping=self._modification_mapping, - fixed_modifications=self._fixed_modifications + fixed_modifications=self._fixed_modifications, ) if parse_mgf: self.parse_mgf_files(peprec) @@ -437,3 +443,203 @@ def get_peprec(self) -> PeptideRecord: def get_search_engine_features(self) -> pd.DataFrame: """Get pandas.DataFrame with search engine features.""" return self.extended_psm_report.ext_psm_report.get_search_engine_features() + + +class PeaksPipeline(_Pipeline): + """Peaks mzid report to PeptideRecord and search engine features.""" + + def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> None: + super().__init__(config, output_basename) + + # Private attributes, specific to this pipeline + self.df = self.read_df_from_mzid() + + @property + def original_pin(self): + """Get PercolatorIn object from identification file.""" + raise NotImplementedError( + "Property `original_pin` is not implemented in class " "`PeaksPipeline`." + ) + + def peprec_from_pin(self): + """Get PeptideRecord from PIN file and MGF file.""" + raise NotImplementedError( + "Method `peprec_from_pin` is not implemented in class " "`PeaksPipeline`." + ) + + @staticmethod + def _get_peprec_modifications(modifications: List): + """get peprec modifications out of the peaks id file""" + suffix_list = ["Phospho"] + if isinstance(modifications, List): + mods = [] + for m in modifications: + if m["name"] in suffix_list: + mods.append(f"{m['location']}|{m['name'] + m['residues'][0]}") + elif "residues" not in m.keys() and m["location"] != 0: + mods.append(f"-1|{m['name']}") + else: + mods.append(f"{m['location']}|{m['name']}") + return "|".join(mods) + else: + raise TypeError + + def _convert_to_flat_dict(self, nested_dict, parent_key="", sep="_"): + """Convert nested dict to flat dict with concatenated keys""" + + items = [] + for k, v in nested_dict.items(): + new_key = parent_key + sep + k if parent_key else k + + if isinstance(v, Dict): + items.extend(self._convert_to_flat_dict(v, new_key, sep=sep)) + elif isinstance(v, List): + if isinstance(v[0], Dict) and k != "Modification": + items.extend(self._convert_to_flat_dict(v[0], new_key, sep=sep)) + elif isinstance(v[0], Dict) and k == "Modification": + items.append((new_key, v)) + else: + items.append((new_key, v[0])) + else: + items.append((new_key, v)) + return items + + def read_df_from_mzid(self) -> pd.DataFrame: + """Read mzid to Dataframe.""" + df = pd.DataFrame( + columns=[ + "spec_id", + "peptide", + "modifications", + "charge", + "PEAKS:peptideScore", + "Label", + "Raw file", + "rank", + "protein_description", + ] + ) + with mzid.read(self.path_to_id_file) as reader: + for spectrum_identification_result in tqdm(reader): + retrieved_data = pd.Series( + index=[ + "spec_id", + "peptide", + "modifications", + "charge", + "protein_list", + "PEAKS:peptideScore", + "Label", + "Raw file", + "rank", + "protein_description", + ] + ) + flat_dict = dict( + self._convert_to_flat_dict(spectrum_identification_result) + ) + spec_id = ( + flat_dict["location"] + .rsplit("/", 1)[1] + .split(".", 1)[0] + .replace(",", "_", 1) + + "." + + re.findall(r"\d+", flat_dict["spectrumID"]).group(0) + + "." + + re.findall(r"\d+", flat_dict["spectrumID"]).group(0) + ) + retrieved_data["spec_id"] = spec_id + retrieved_data["peptide"] = flat_dict[ + "SpectrumIdentificationItem_PeptideSequence" + ] + try: + retrieved_data["modifications"] = self._get_peprec_modifications( + flat_dict["SpectrumIdentificationItem_Modification"] + ) + except KeyError: + retrieved_data["modifications"] = "-" + retrieved_data["charge"] = flat_dict[ + "SpectrumIdentificationItem_chargeState" + ] + retrieved_data["protein_list"] = flat_dict[ + "SpectrumIdentificationItem_PeptideEvidenceRef_accession" + ] + retrieved_data["psm_score"] = flat_dict[ + "SpectrumIdentificationItem_PEAKS:peptideScore" + ] + retrieved_data["Label"] = flat_dict[ + "SpectrumIdentificationItem_PeptideEvidenceRef_isDecoy" + ] + retrieved_data["Raw file"] = ( + flat_dict["location"] + .rsplit("/", 1)[1] + .split(".", 1)[0] + .replace(",", "_", 1) + ) + retrieved_data["Rank"] = flat_dict["SpectrumIdentificationItem_rank"] + df = df.append(retrieved_data, ignore_index=True) + retrieved_data["protein_description"] = flat_dict[ + "SpectrumIdentificationItem_PeptideEvidenceRef_protein description" + ] + + df["Label"] = df["Label"].apply(lambda x: 1 if x else -1) + + return df + + def parse_mgf_files(self, peprec): + """Parse multiple MGF files into one for MS²PIP.""" + logger.debug("Parsing MGF files into one for MS²PIP") + path_to_new_mgf = self.output_basename + "_unified.mgf" + parse_mgf( + peprec, + self.passed_mgf_path, + outname=path_to_new_mgf, + filename_col="Raw file", + spec_title_col="spec_id", + title_parsing_method="run.scan.scan", + ) + self._path_to_new_mgf = path_to_new_mgf + + def get_peprec(self) -> PeptideRecord: + """Get PeptideRecord.""" + + peprec = self.df[ + [ + "spec_id", + "peptide", + "modifications", + "charge", + "protein_list", + "PEAKS:peptideScore", + "Label", + "Raw file", + ] + ].rename({"PEAKS:peptideScore": "psm_score"}) + peprec.drop("Raw file", axis=1, inplace=True) + self.parse_mgf_files(peprec) + + titles, rt = parse_mgf_title_rt(self._path_to_new_mgf) + id_rt_dict = { + "spec_id": list(titles.values()), + "observed_retention_time": list(rt.values()), + } + peprec.merge(pd.DataFrame.from_dict(id_rt_dict), on="spec_id", how="inner") + + return peprec + + def get_search_engine_features(self) -> pd.DataFrame: + """Get pandas.DataFrame with search engine features.""" + + peprec_cols = [ + "spec_id", + "peptide", + "modifications", + "charge", + "observed_retention_time", + "Raw file", + "protein_list", + "Label", + ] + non_feature_cols = peprec_cols + feature_cols = [col for col in self.df.columns if col not in non_feature_cols] + return self.df[feature_cols] From fd7fb3ef22051e36cd9b871749979f31be9ea224 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Fri, 27 Aug 2021 09:52:33 +0200 Subject: [PATCH 02/27] Peaks pipeline update, plotting bug fix --- ms2rescore/id_file_parser.py | 36 ++++++++++------------ ms2rescore/package_data/config_schema.json | 2 +- ms2rescore/parse_mgf.py | 5 +-- ms2rescore/plotting.py | 12 +++++--- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index aac5898c..2ce239e3 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -515,7 +515,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: "PEAKS:peptideScore", "Label", "Raw file", - "rank", + "Rank", "protein_description", ] ) @@ -531,7 +531,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: "PEAKS:peptideScore", "Label", "Raw file", - "rank", + "Rank", "protein_description", ] ) @@ -544,9 +544,9 @@ def read_df_from_mzid(self) -> pd.DataFrame: .split(".", 1)[0] .replace(",", "_", 1) + "." - + re.findall(r"\d+", flat_dict["spectrumID"]).group(0) + + re.search(r"\d+", flat_dict["spectrumID"]).group(0) + "." - + re.findall(r"\d+", flat_dict["spectrumID"]).group(0) + + re.search(r"\d+", flat_dict["spectrumID"]).group(0) ) retrieved_data["spec_id"] = spec_id retrieved_data["peptide"] = flat_dict[ @@ -564,7 +564,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: retrieved_data["protein_list"] = flat_dict[ "SpectrumIdentificationItem_PeptideEvidenceRef_accession" ] - retrieved_data["psm_score"] = flat_dict[ + retrieved_data["PEAKS:peptideScore"] = flat_dict[ "SpectrumIdentificationItem_PEAKS:peptideScore" ] retrieved_data["Label"] = flat_dict[ @@ -577,13 +577,11 @@ def read_df_from_mzid(self) -> pd.DataFrame: .replace(",", "_", 1) ) retrieved_data["Rank"] = flat_dict["SpectrumIdentificationItem_rank"] - df = df.append(retrieved_data, ignore_index=True) retrieved_data["protein_description"] = flat_dict[ "SpectrumIdentificationItem_PeptideEvidenceRef_protein description" ] - + df = df.append(retrieved_data, ignore_index=True) df["Label"] = df["Label"].apply(lambda x: 1 if x else -1) - return df def parse_mgf_files(self, peprec): @@ -598,12 +596,12 @@ def parse_mgf_files(self, peprec): spec_title_col="spec_id", title_parsing_method="run.scan.scan", ) - self._path_to_new_mgf = path_to_new_mgf + self.passed_mgf_path = path_to_new_mgf def get_peprec(self) -> PeptideRecord: """Get PeptideRecord.""" - peprec = self.df[ + peprec_df = self.df[ [ "spec_id", "peptide", @@ -614,16 +612,16 @@ def get_peprec(self) -> PeptideRecord: "Label", "Raw file", ] - ].rename({"PEAKS:peptideScore": "psm_score"}) - peprec.drop("Raw file", axis=1, inplace=True) - self.parse_mgf_files(peprec) - - titles, rt = parse_mgf_title_rt(self._path_to_new_mgf) + ].rename({"PEAKS:peptideScore": "psm_score"}, axis=1) + self.parse_mgf_files(peprec_df) + peprec_df.drop("Raw file", axis=1, inplace=True) + titles, rt = parse_mgf_title_rt(self.passed_mgf_path) id_rt_dict = { "spec_id": list(titles.values()), "observed_retention_time": list(rt.values()), } - peprec.merge(pd.DataFrame.from_dict(id_rt_dict), on="spec_id", how="inner") + peprec_df = pd.merge(peprec_df, pd.DataFrame.from_dict(id_rt_dict), on="spec_id", how='inner') + peprec = PeptideRecord.from_dataframe(peprec_df) return peprec @@ -631,15 +629,13 @@ def get_search_engine_features(self) -> pd.DataFrame: """Get pandas.DataFrame with search engine features.""" peprec_cols = [ - "spec_id", "peptide", "modifications", - "charge", "observed_retention_time", "Raw file", "protein_list", "Label", ] - non_feature_cols = peprec_cols - feature_cols = [col for col in self.df.columns if col not in non_feature_cols] + + feature_cols = [col for col in self.df.columns if col not in peprec_cols] return self.df[feature_cols] diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index c2468538..03fde771 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -13,7 +13,7 @@ "pipeline": { "description": "Pipeline to use, depending on input format", "type": "string", - "enum": ["infer", "pin", "tandem", "maxquant", "msgfplus", "peptideshaker"], + "enum": ["infer", "pin", "tandem", "maxquant", "msgfplus", "peptideshaker", "Peaks"], "default": "infer" }, "feature_sets": { diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index 57110831..8e012138 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -129,5 +129,6 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', logger.debug( "%i/%i spectra found and written to new MGF file.", count, num_expected ) - if not count == num_expected: - raise ParseMGFError("Not all PSMs could be found in the provided MGF files.") + + # if not count == num_expected: + # raise ParseMGFError("Not all PSMs could be found in the provided MGF files.") diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index 4e4168f3..0a0d8ae8 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -371,7 +371,8 @@ def calculate_loss_gain_df(cls, reference="Before rescoring", FDR_threshold=[0.0 if "After rescoring: searchengine" in ft_dict.keys(): reference = "After rescoring: searchengine" total = len(ft_dict[reference]) - + if len(total) == 0: + continue for feature in cls.unique_df["rescoring"][ cls.unique_df["sample"] == sample ].unique(): @@ -403,14 +404,15 @@ def loss_gain_plot(cls, FDR): single FDR threshold used for the plot """ - if cls.loss_gain_df.empty: - cls.calculate_loss_gain_df(FDR_threshold=[FDR]) - if FDR not in cls.loss_gain_df["FDR"].unique(): + if FDR not in cls.unique_df["FDR"].unique(): cls._separate_unique_peptides(FDR_threshold=[FDR]) cls.calculate_loss_gain_df(FDR_threshold=[FDR]) fig = plt.figure() - tmp = cls.loss_gain_df[cls.loss_gain_df["FDR"] == FDR] + try: + tmp = cls.loss_gain_df[cls.loss_gain_df["FDR"] == FDR] + except KeyError: + return for sample in zip( tmp["sample"].unique(), list(range(1, len(tmp["sample"].unique()) + 1)) ): From c3d8778138bd7fd2eb6fcb4bd57bb3d75c9c39f9 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 1 Sep 2021 11:14:44 +0200 Subject: [PATCH 03/27] bug fix percolator modification mapping --- ms2rescore/percolator.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index f9341cdb..a71f6b20 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -98,7 +98,7 @@ def modification_mapping( self._modification_label_type = "str" self._modification_mapping = value elif all([isinstance(label, float) for label in mod_labels]): - self.modification_pattern = r"\[([0-9\-\.]*)\]" + self.modification_pattern = r"\[(\+[0-9\-\.]*)\]" self._modification_label_type = "float" self._modification_mapping = { (aa, self._round(shift)): name @@ -250,13 +250,13 @@ def _get_sequence_column(self) -> pd.Series: def _get_charge_column(self) -> pd.Series: """Get charge column from one-hot encoded `ChargeX` columns.""" - charge_cols = [col for col in self.df.columns if col.startswith("Charge")] + charge_cols = [col for col in self.df.columns if col.startswith("charge")] if not (self.df[charge_cols] == 1).any(axis=1).all(): raise PercolatorInError("Not all PSMs have an assigned charge state.") return ( self.df[charge_cols] .rename( - columns={col: int(col.replace("Charge", "")) for col in charge_cols} + columns={col: int(col.replace("charge", "")) for col in charge_cols} ) .idxmax(1) ) @@ -426,7 +426,7 @@ def get_feature_table(self) -> pd.DataFrame: def to_peptide_record( self, - extract_spectrum_index: Optional[bool] = True, + extract_spectrum_index: Optional[bool] = False, spectrum_index_pattern: Optional[str] = None, score_column_label: Optional[str] = None, ) -> PeptideRecord: @@ -453,7 +453,7 @@ def to_peptide_record( """ # Assign one of the default score column labels, if available if not score_column_label: - for col in ["lnEValue", "hyperscore", "Score"]: + for col in ["lnEValue", "hyperscore", "Score", "COMET:lnExpect"]: if col in self.df.columns: score_column_label = col logger.debug( @@ -478,7 +478,8 @@ def to_peptide_record( peprec_df["psm_score"] = self.df[score_column_label] peprec_df["label"] = self.df["Label"] peprec_df["Proteins"] = self.df["Proteins"] - + if "RT" in self.df.columns: + peprec_df["observed_retention_time"] = self.df["RT"] peprec = PeptideRecord.from_dataframe(peprec_df) return peprec From b27dad33abf1cab0dab3ee4dd53a34a924de1ed6 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 1 Sep 2021 11:15:58 +0200 Subject: [PATCH 04/27] updating Peaks pipeline, fixes plotting module --- ms2rescore/__init__.py | 18 ++++++++++++------ ms2rescore/id_file_parser.py | 35 ++++++++++++++++------------------- ms2rescore/parse_mgf.py | 15 +++++++++------ ms2rescore/plotting.py | 12 ++++++------ 4 files changed, 43 insertions(+), 37 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 197f1f91..117b64c8 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -8,6 +8,8 @@ from multiprocessing import cpu_count from typing import Dict, Optional, Union +import pandas as pd + from ms2rescore import id_file_parser, rescore_core, setup_logging from ms2rescore._exceptions import MS2ReScoreError from ms2rescore._version import __version__ @@ -271,12 +273,16 @@ def run(self): + "_".join(fset) + "_features.pout_dec" ) - plotting.POUT( - pout_file, - pout_decoy_file, - self.config["general"]["output_filename"], - " ".join(fset) - ) + try: + plotting.POUT( + pout_file, + pout_decoy_file, + self.config["general"]["output_filename"], + " ".join(fset) + ) + except pd.errors.EmptyDataError: + continue + plotting.RescoreRecord.save_plots_to_pdf( self.config["general"]["output_filename"] + "_plots.pdf", FDR_thresholds=[0.01, 0.001], diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 2ce239e3..a2360143 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -178,11 +178,11 @@ def peprec_from_pin(self) -> PeptideRecord: peprec = self.original_pin.to_peptide_record( spectrum_index_pattern=self._pin_spec_id_patterns[self._pin_spec_id_style] ) - - # Map MGF titles and observed retention times - titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) - peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times) - peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) + if "observed_retention_time" not in peprec.df.columns: + # Map MGF titles and observed retention times + titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) + peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times) + peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) if not ~peprec.df["observed_retention_time"].isna().any(): raise IDFileParserError( "Could not map all MGF retention times to spectrum indices." @@ -516,7 +516,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: "Label", "Raw file", "Rank", - "protein_description", + #"protein_description", ] ) with mzid.read(self.path_to_id_file) as reader: @@ -532,7 +532,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: "Label", "Raw file", "Rank", - "protein_description", + # "protein_description", ] ) flat_dict = dict( @@ -543,10 +543,8 @@ def read_df_from_mzid(self) -> pd.DataFrame: .rsplit("/", 1)[1] .split(".", 1)[0] .replace(",", "_", 1) - + "." - + re.search(r"\d+", flat_dict["spectrumID"]).group(0) - + "." - + re.search(r"\d+", flat_dict["spectrumID"]).group(0) + + ":" + + flat_dict["spectrumID"] ) retrieved_data["spec_id"] = spec_id retrieved_data["peptide"] = flat_dict[ @@ -577,11 +575,9 @@ def read_df_from_mzid(self) -> pd.DataFrame: .replace(",", "_", 1) ) retrieved_data["Rank"] = flat_dict["SpectrumIdentificationItem_rank"] - retrieved_data["protein_description"] = flat_dict[ - "SpectrumIdentificationItem_PeptideEvidenceRef_protein description" - ] + # retrieved_data["protein_description"] = flat_dict["SpectrumIdentificationItem_PeptideEvidenceRef_protein description"] df = df.append(retrieved_data, ignore_index=True) - df["Label"] = df["Label"].apply(lambda x: 1 if x else -1) + df["Label"] = df["Label"].apply(lambda x: -1 if x else 1) return df def parse_mgf_files(self, peprec): @@ -594,7 +590,7 @@ def parse_mgf_files(self, peprec): outname=path_to_new_mgf, filename_col="Raw file", spec_title_col="spec_id", - title_parsing_method="run.scan.scan", + title_parsing_method="full_run", ) self.passed_mgf_path = path_to_new_mgf @@ -620,10 +616,11 @@ def get_peprec(self) -> PeptideRecord: "spec_id": list(titles.values()), "observed_retention_time": list(rt.values()), } - peprec_df = pd.merge(peprec_df, pd.DataFrame.from_dict(id_rt_dict), on="spec_id", how='inner') - peprec = PeptideRecord.from_dataframe(peprec_df) + # TODO try: except ValueError if lists are unequal? + id_rt_df = pd.DataFrame.from_dict(id_rt_dict) + peprec_df = pd.merge(peprec_df, id_rt_df, on="spec_id", how='inner') - return peprec + return PeptideRecord.from_dataframe(peprec_df) def get_search_engine_features(self) -> pd.DataFrame: """Get pandas.DataFrame with search engine features.""" diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index 8e012138..9e8ad9f3 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -4,7 +4,7 @@ import mmap import os.path import re - +import random from tqdm import tqdm from ms2rescore._exceptions import MS2ReScoreError @@ -38,6 +38,7 @@ def title_parser(line, method='full', run=None): line: string, line from MGF file containing 'TITLE=' method: string, one of the following: - 'full': take everything after 'TITLE=' + - 'full_run': take everything after 'TITLE=' and add run to title - 'first_space': take everything between 'TITLE=' and first space. - 'first_space_no_charge': take everything between 'TITLE=' and first space, but leave out everything after last dot. (required for MaxQuant pipeline). @@ -46,6 +47,10 @@ def title_parser(line, method='full', run=None): if method == 'full': title = line[6:].strip() + elif method == 'full_run': + if not run: + raise TypeError("If `method` is `full_run`, `run` cannot be None.") + title = run + ':' + line[6:].strip() elif method == 'first_space': title = line[6:].split(' ')[0].strip() elif method == 'first_space_no_charge': @@ -92,7 +97,6 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', for run in runs: current_mgf_file = os.path.join(mgf_folder, run + file_suffix) spec_set = set(df_in[(df_in[filename_col] == run)][spec_title_col].values) - # Temporary fix: replace charges in MGF with ID'ed charges # Until MS2PIP uses ID'ed charge instead of MGF charge id_charges = df_in[(df_in[filename_col] == run)].set_index('spec_id')['charge'].to_dict() @@ -122,13 +126,12 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', out.write("CHARGE=" + str(charge) + "+\n") continue # Only print lines when spectrum is found and intensity != 0 - if found and line[-4:] != '0.0\n': + if found and line[-4:] != ' 0.0\n': out.write(line) num_expected = len(df_in[spec_title_col].unique()) logger.debug( "%i/%i spectra found and written to new MGF file.", count, num_expected ) - - # if not count == num_expected: - # raise ParseMGFError("Not all PSMs could be found in the provided MGF files.") + if not count == num_expected: + raise ParseMGFError("Not all PSMs could be found in the provided MGF files.") diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index 0a0d8ae8..e5a91f13 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -88,7 +88,7 @@ def target_decoy_distribution( score_cutoff = None # Score distribution plot - plot_list =[ + plot_list = [ list(self.df[self.df[decoy_label]][score_label]), list(self.df[~self.df[decoy_label]][score_label]), ] @@ -305,18 +305,18 @@ def count_plot(cls, unique=False): unique : Boolean If true only plot unique identifications If false plot total amount of identifications - """ if unique: if cls.unique_df.empty: cls._separate_unique_peptides() y_label = "number of unique identified peptides" - count_df = cls.unique_df + counts_ = cls.unique_df else: + print("niet unique") if cls.count_df.empty: cls._count_identifications() y_label = "number of identified peptides" - count_df = cls.unique_df + count_df = cls.count_df g = sns.catplot( x="sample", @@ -371,7 +371,7 @@ def calculate_loss_gain_df(cls, reference="Before rescoring", FDR_threshold=[0.0 if "After rescoring: searchengine" in ft_dict.keys(): reference = "After rescoring: searchengine" total = len(ft_dict[reference]) - if len(total) == 0: + if total == 0: continue for feature in cls.unique_df["rescoring"][ cls.unique_df["sample"] == sample @@ -402,8 +402,8 @@ def loss_gain_plot(cls, FDR): ---------- FDR : float single FDR threshold used for the plot - """ + if FDR not in cls.unique_df["FDR"].unique(): cls._separate_unique_peptides(FDR_threshold=[FDR]) cls.calculate_loss_gain_df(FDR_threshold=[FDR]) From e5d288e86d25db0a7e8b464f5e3317b796796e9a Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Fri, 3 Sep 2021 16:53:37 +0200 Subject: [PATCH 05/27] bux fixes percolator pipeline, peaks pipeline and plotting module --- ms2rescore/id_file_parser.py | 21 ++++++++++++++------- ms2rescore/percolator.py | 2 +- ms2rescore/plotting.py | 20 +++++++++----------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index a2360143..38f895dc 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -25,7 +25,6 @@ class IDFileParserError(MS2ReScoreError): """Error parsing ID file.""" - pass @@ -181,7 +180,9 @@ def peprec_from_pin(self) -> PeptideRecord: if "observed_retention_time" not in peprec.df.columns: # Map MGF titles and observed retention times titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) - peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times) + peprec.df["observed_retention_time"] = peprec.df["spec_id"].map( + retention_times + ) peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) if not ~peprec.df["observed_retention_time"].isna().any(): raise IDFileParserError( @@ -446,6 +447,7 @@ def get_search_engine_features(self) -> pd.DataFrame: class PeaksPipeline(_Pipeline): + # TODO: move code to separate file and create mzid class """Peaks mzid report to PeptideRecord and search engine features.""" def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> None: @@ -516,7 +518,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: "Label", "Raw file", "Rank", - #"protein_description", + # "protein_description", ] ) with mzid.read(self.path_to_id_file) as reader: @@ -559,8 +561,14 @@ def read_df_from_mzid(self) -> pd.DataFrame: retrieved_data["charge"] = flat_dict[ "SpectrumIdentificationItem_chargeState" ] - retrieved_data["protein_list"] = flat_dict[ - "SpectrumIdentificationItem_PeptideEvidenceRef_accession" + # TODO: create class for SpectrumIdentificationItem + # print(spectrum_identification_result) + retrieved_data["protein_list"] = [ + d["accession"] + for d in spectrum_identification_result[ + "SpectrumIdentificationItem" + ][0]["PeptideEvidenceRef"] + if "accession" in d.keys() ] retrieved_data["PEAKS:peptideScore"] = flat_dict[ "SpectrumIdentificationItem_PEAKS:peptideScore" @@ -575,7 +583,6 @@ def read_df_from_mzid(self) -> pd.DataFrame: .replace(",", "_", 1) ) retrieved_data["Rank"] = flat_dict["SpectrumIdentificationItem_rank"] - # retrieved_data["protein_description"] = flat_dict["SpectrumIdentificationItem_PeptideEvidenceRef_protein description"] df = df.append(retrieved_data, ignore_index=True) df["Label"] = df["Label"].apply(lambda x: -1 if x else 1) return df @@ -618,7 +625,7 @@ def get_peprec(self) -> PeptideRecord: } # TODO try: except ValueError if lists are unequal? id_rt_df = pd.DataFrame.from_dict(id_rt_dict) - peprec_df = pd.merge(peprec_df, id_rt_df, on="spec_id", how='inner') + peprec_df = pd.merge(peprec_df, id_rt_df, on="spec_id", how="inner") return PeptideRecord.from_dataframe(peprec_df) diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index a71f6b20..c7aa069d 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -476,7 +476,7 @@ def to_peptide_record( peprec_df["charge"] = self._get_charge_column() if score_column_label: peprec_df["psm_score"] = self.df[score_column_label] - peprec_df["label"] = self.df["Label"] + peprec_df["Label"] = self.df["Label"] peprec_df["Proteins"] = self.df["Proteins"] if "RT" in self.df.columns: peprec_df["observed_retention_time"] = self.df["RT"] diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index e5a91f13..087aa8a0 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -312,7 +312,6 @@ def count_plot(cls, unique=False): y_label = "number of unique identified peptides" counts_ = cls.unique_df else: - print("niet unique") if cls.count_df.empty: cls._count_identifications() y_label = "number of identified peptides" @@ -403,16 +402,16 @@ def loss_gain_plot(cls, FDR): FDR : float single FDR threshold used for the plot """ + fig = plt.figure() - if FDR not in cls.unique_df["FDR"].unique(): + if cls.loss_gain_df.empty: + cls.calculate_loss_gain_df(FDR_threshold=[FDR]) + if FDR not in cls.loss_gain_df["FDR"].unique(): cls._separate_unique_peptides(FDR_threshold=[FDR]) cls.calculate_loss_gain_df(FDR_threshold=[FDR]) - fig = plt.figure() - try: - tmp = cls.loss_gain_df[cls.loss_gain_df["FDR"] == FDR] - except KeyError: - return + tmp = cls.loss_gain_df[cls.loss_gain_df["FDR"] == FDR] + for sample in zip( tmp["sample"].unique(), list(range(1, len(tmp["sample"].unique()) + 1)) ): @@ -511,10 +510,9 @@ def save_plots_to_pdf(cls, filename, FDR_thresholds=[0.01]): plt.tight_layout() pdf.savefig() - for fdr in FDR_thresholds: - cls.loss_gain_plot(FDR=fdr) - plt.tight_layout() - pdf.savefig() + cls.loss_gain_plot(FDR=0.01) + plt.tight_layout() + pdf.savefig() pdf.close() From bc1b0ce17c5cd62d3f6b4818355b2a53a1789045 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 15 Sep 2021 11:19:43 +0200 Subject: [PATCH 06/27] bug fix parse mgf --- ms2rescore/parse_mgf.py | 7 +++++-- ms2rescore/plotting.py | 2 +- ms2rescore/rescore_core.py | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index 9e8ad9f3..76e33ec1 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -119,12 +119,15 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', out.write(line + '\n') found = False continue - # Temporary fix (see above) - if 'CHARGE=' in line: + # Temporary fix (see above, write charge if pepmass is written) + if 'PEPMASS=' in line: if found: charge = id_charges[title] + out.write(line) out.write("CHARGE=" + str(charge) + "+\n") continue + if 'CHARGE=' in line: + continue # Only print lines when spectrum is found and intensity != 0 if found and line[-4:] != ' 0.0\n': out.write(line) diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index 087aa8a0..cd16a935 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -310,7 +310,7 @@ def count_plot(cls, unique=False): if cls.unique_df.empty: cls._separate_unique_peptides() y_label = "number of unique identified peptides" - counts_ = cls.unique_df + count_df = cls.unique_df else: if cls.count_df.empty: cls._count_identifications() diff --git a/ms2rescore/rescore_core.py b/ms2rescore/rescore_core.py index b29b808b..bba1bb5f 100644 --- a/ms2rescore/rescore_core.py +++ b/ms2rescore/rescore_core.py @@ -593,6 +593,7 @@ def write_pin_files( col_names = col_names + search_engine_feature_names outname = "_".join(fset) + complete_df.sort_values("SpecId", axis=0, inplace=True) complete_df[ ["SpecId", "Label", "ScanNr"] + col_names + ["Peptide", "Proteins"] ].to_csv( From 195164e2dd471bc88a74beaa4158597aee86a8a7 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Tue, 28 Sep 2021 12:05:46 +0200 Subject: [PATCH 07/27] retention time calibration for each raw file --- ms2rescore/id_file_parser.py | 4 +- ms2rescore/retention_time.py | 96 +++++++++++++++++++++++------------- 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 38f895dc..f6a7d893 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -396,7 +396,7 @@ def get_peprec(self, parse_mgf: bool = True) -> PeptideRecord: ) if parse_mgf: self.parse_mgf_files(peprec) - peprec.df.drop("Raw file", axis=1, inplace=True) + # peprec.df.drop("Raw file", axis=1, inplace=True) return peprec def get_search_engine_features(self) -> pd.DataFrame: @@ -617,7 +617,7 @@ def get_peprec(self) -> PeptideRecord: ] ].rename({"PEAKS:peptideScore": "psm_score"}, axis=1) self.parse_mgf_files(peprec_df) - peprec_df.drop("Raw file", axis=1, inplace=True) + # peprec_df.drop("Raw file", axis=1, inplace=True) titles, rt = parse_mgf_title_rt(self.passed_mgf_path) id_rt_dict = { "spec_id": list(titles.values()), diff --git a/ms2rescore/retention_time.py b/ms2rescore/retention_time.py index d4ed6684..e5c0193a 100644 --- a/ms2rescore/retention_time.py +++ b/ms2rescore/retention_time.py @@ -7,9 +7,11 @@ import click import pandas as pd +from deeplc import DeepLC + from ms2rescore.peptide_record import PeptideRecord -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -65,14 +67,8 @@ def __init__( # Until fixed upstream: https://github.com/compomics/DeepLC/issues/19 if "NUMEXPR_MAX_THREADS" not in os.environ: - os.environ['NUMEXPR_MAX_THREADS'] = str(self.num_cpu) - - from deeplc import DeepLC + os.environ["NUMEXPR_MAX_THREADS"] = str(self.num_cpu) - self.deeplc_predictor = DeepLC( - split_cal=10, - n_jobs=self.num_cpu, cnn_model=True, verbose=False - ) self.peprec = None self.feature_df = None @@ -99,7 +95,7 @@ def num_calibration_psms(self): "Requested number of calibration PSMs (%s) is larger than total number " "of PSMs in PEPREC (%s). Using all PSMs for calibration.", self.calibration_set_size, - self.peprec.df + self.peprec.df, ) num_calibration_psms = len(self.peprec.df) else: @@ -112,41 +108,35 @@ def num_calibration_psms(self): logger.debug("Using %i PSMs for calibration", num_calibration_psms) return num_calibration_psms - @property - def calibration_data(self): + def get_calibration_data(self, peprec): """Get calibration peptides (N best PSMs in PEPREC).""" ascending = not self.higher_psm_score_better - if "label" in self.peprec.df.columns: + if "label" in peprec.columns: label_col = "label" - elif "Label" in self.peprec.df.columns: + elif "Label" in peprec.columns: label_col = "Label" else: raise ValueError("No label column found in peptide record.") calibration_data = ( - self.peprec.df[self.peprec.df[label_col] == 1] + peprec[peprec[label_col] == 1] .sort_values(["psm_score"], ascending=ascending) .head(self.num_calibration_psms) - .rename(columns={"observed_retention_time": "tr", "peptide": "seq",}) - [["tr", "seq", "modifications"]] + .rename( + columns={ + "observed_retention_time": "tr", + "peptide": "seq", + } + )[["tr", "seq", "modifications"]] .copy() ) return calibration_data - @property - def prediction_data(self): + def get_prediction_data(self, peprec): """Get prediction peptides.""" - return self.peprec.df[["peptide", "modifications"]].rename( - columns={"peptide": "seq",} - ) - - def _calibrate_predictor(self): - """Calibrate retention time predictor.""" - self.deeplc_predictor.calibrate_preds(seq_df=self.calibration_data) - - def _get_predictions(self): - """Get retention time predictions.""" - self.predicted_rts = pd.Series( - self.deeplc_predictor.make_preds(seq_df=self.prediction_data) + return peprec[["peptide", "modifications"]].rename( + columns={ + "peptide": "seq", + } ) def _calculate_features(self): @@ -200,9 +190,49 @@ def _calculate_features(self): def run(self): """Get retention time predictions for PEPREC and calculate features.""" - self._calibrate_predictor() - self._get_predictions() - self.peprec.df["predicted_retention_time"] = self.predicted_rts + if "Raw file" in self.peprec.df.columns: + raw_specific_predicted_dfs = [] + for raw_file, df in self.peprec.df.groupby("Raw file"): + peprec_raw_df = df.copy().reset_index() + self.deeplc_predictor = DeepLC( + split_cal=10, n_jobs=self.num_cpu, cnn_model=True, verbose=False + ) + retention_time_df = pd.DataFrame( + columns=["spec_id", "predicted_retention_time"] + ) + logger.info(f"Calibrating for {raw_file}") + self.deeplc_predictor.calibrate_preds( + seq_df=self.get_calibration_data(peprec_raw_df) + ) + predicted_rts = pd.Series( + self.deeplc_predictor.make_preds( + seq_df=self.get_prediction_data(peprec_raw_df) + ) + ) + retention_time_df["spec_id"] = peprec_raw_df["spec_id"].copy() + retention_time_df["predicted_retention_time"] = predicted_rts + + raw_specific_predicted_dfs.append(retention_time_df) + self.peprec.df = pd.merge( + self.peprec.df, + pd.concat(raw_specific_predicted_dfs, ignore_index=True), + on="spec_id", + how="inner", + ) + else: + self.deeplc_predictor = DeepLC( + split_cal=10, n_jobs=self.num_cpu, cnn_model=True, verbose=False + ) + self.deeplc_predictor.calibrate_preds( + seq_df=self.get_calibration_data(self.peprec.df) + ) + predicted_rts = pd.Series( + self.deeplc_predictor.make_preds( + seq_df=self.get_prediction_data(self.peprec.df) + ) + ) + self.peprec.df["predicted_retention_time"] = predicted_rts + self._calculate_features() self.feature_df.to_csv(self.feature_path, index=False) From bfc65ae91b5f8ae18ef827d879e27c2ca5e0dec2 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Tue, 28 Sep 2021 22:17:43 +0200 Subject: [PATCH 08/27] bug fix config parser --- ms2rescore/__init__.py | 1 + ms2rescore/config_parser.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 117b64c8..c33b2a72 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -46,6 +46,7 @@ def __init__( self.config = parse_config( parse_cli_args=parse_cli_args, config_class=configuration ) + exit() if set_logger: setup_logging.setup_logging(self.config["general"]["log_level"]) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 797099dc..5530a078 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -142,7 +142,14 @@ def parse_config(parse_cli_args: bool = True, config_class: Optional[Dict] = Non raise MS2ReScoreConfigurationError( "If `parse_cli_args` is False, `config_class` arguments are required." ) - + # print(json.load(config_default)["general"]["output_filename"]) + # with open(config_user, "rt") as f: + # print(json.load(f)["general"]["output_filename"]) + # print(args.output_filename) + # if config_class: + # print(config_class["output_filename"]) + # else: + # print(None) cascade_conf = CascadeConfig(validation_schema=json.load(config_schema)) cascade_conf.add_dict(json.load(config_default)) if config_user: @@ -152,7 +159,8 @@ def parse_config(parse_cli_args: bool = True, config_class: Optional[Dict] = Non elif config_class: cascade_conf.add_dict(config_class, subkey="general") config = cascade_conf.parse() - + print(config) + exit() config = _validate_filenames(config) config = _validate_num_cpu(config) From 4260ddcd05b6c28e6b37fc97f89c892b18b1b8dc Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Tue, 28 Sep 2021 22:45:51 +0200 Subject: [PATCH 09/27] removing print sstatements --- ms2rescore/__init__.py | 2 +- ms2rescore/config_parser.py | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index c33b2a72..48f3f5f2 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -46,7 +46,7 @@ def __init__( self.config = parse_config( parse_cli_args=parse_cli_args, config_class=configuration ) - exit() + if set_logger: setup_logging.setup_logging(self.config["general"]["log_level"]) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 5530a078..797099dc 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -142,14 +142,7 @@ def parse_config(parse_cli_args: bool = True, config_class: Optional[Dict] = Non raise MS2ReScoreConfigurationError( "If `parse_cli_args` is False, `config_class` arguments are required." ) - # print(json.load(config_default)["general"]["output_filename"]) - # with open(config_user, "rt") as f: - # print(json.load(f)["general"]["output_filename"]) - # print(args.output_filename) - # if config_class: - # print(config_class["output_filename"]) - # else: - # print(None) + cascade_conf = CascadeConfig(validation_schema=json.load(config_schema)) cascade_conf.add_dict(json.load(config_default)) if config_user: @@ -159,8 +152,7 @@ def parse_config(parse_cli_args: bool = True, config_class: Optional[Dict] = Non elif config_class: cascade_conf.add_dict(config_class, subkey="general") config = cascade_conf.parse() - print(config) - exit() + config = _validate_filenames(config) config = _validate_num_cpu(config) From a6156a92355c35e95e97375154b348014df9c831 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Thu, 30 Sep 2021 11:21:49 +0200 Subject: [PATCH 10/27] num calibration fix --- ms2rescore/id_file_parser.py | 2 +- ms2rescore/retention_time.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index f6a7d893..71ae2e5d 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -396,7 +396,7 @@ def get_peprec(self, parse_mgf: bool = True) -> PeptideRecord: ) if parse_mgf: self.parse_mgf_files(peprec) - # peprec.df.drop("Raw file", axis=1, inplace=True) + peprec.df.drop("Raw file", axis=1, inplace=True) return peprec def get_search_engine_features(self) -> pd.DataFrame: diff --git a/ms2rescore/retention_time.py b/ms2rescore/retention_time.py index e5c0193a..58f79df2 100644 --- a/ms2rescore/retention_time.py +++ b/ms2rescore/retention_time.py @@ -75,8 +75,7 @@ def __init__( if self.peprec_path: self.peprec = PeptideRecord(path=self.peprec_path) - @property - def num_calibration_psms(self): + def num_calibration_psms(self, peprec): """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.""" if isinstance(self.calibration_set_size, float): if self.calibration_set_size > 1: @@ -87,17 +86,17 @@ def num_calibration_psms(self): ) else: num_calibration_psms = round( - len(self.peprec.df) * self.calibration_set_size + len(peprec) * self.calibration_set_size ) elif isinstance(self.calibration_set_size, int): - if self.calibration_set_size > len(self.peprec.df): + if self.calibration_set_size > len(peprec): logger.warning( "Requested number of calibration PSMs (%s) is larger than total number " "of PSMs in PEPREC (%s). Using all PSMs for calibration.", self.calibration_set_size, - self.peprec.df, + peprec, ) - num_calibration_psms = len(self.peprec.df) + num_calibration_psms = len(peprec) else: num_calibration_psms = self.calibration_set_size else: @@ -120,7 +119,7 @@ def get_calibration_data(self, peprec): calibration_data = ( peprec[peprec[label_col] == 1] .sort_values(["psm_score"], ascending=ascending) - .head(self.num_calibration_psms) + .head(self.num_calibration_psms(peprec=peprec)) .rename( columns={ "observed_retention_time": "tr", From 2ce3667bff864b9c127006fad126b1229862d999 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Mon, 11 Oct 2021 15:06:15 +0200 Subject: [PATCH 11/27] tiny fixes in plottting module and Peaks pipeline --- ms2rescore/id_file_parser.py | 3 +-- ms2rescore/plotting.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 71ae2e5d..9a804208 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -25,6 +25,7 @@ class IDFileParserError(MS2ReScoreError): """Error parsing ID file.""" + pass @@ -617,13 +618,11 @@ def get_peprec(self) -> PeptideRecord: ] ].rename({"PEAKS:peptideScore": "psm_score"}, axis=1) self.parse_mgf_files(peprec_df) - # peprec_df.drop("Raw file", axis=1, inplace=True) titles, rt = parse_mgf_title_rt(self.passed_mgf_path) id_rt_dict = { "spec_id": list(titles.values()), "observed_retention_time": list(rt.values()), } - # TODO try: except ValueError if lists are unequal? id_rt_df = pd.DataFrame.from_dict(id_rt_dict) peprec_df = pd.merge(peprec_df, id_rt_df, on="spec_id", how="inner") diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index cd16a935..03938fed 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -15,6 +15,7 @@ from ms2rescore.percolator import PercolatorIn + from ms2rescore._exceptions import MS2ReScoreError @@ -335,7 +336,7 @@ def count_plot(cls, unique=False): return g @classmethod - def calculate_loss_gain_df(cls, reference="Before rescoring", FDR_threshold=[0.01]): + def calculate_loss_gain_df(cls, reference=None, FDR_threshold=[0.01]): """ Calculate relative amount of gained, lossed and shared unique peptides for each rescoring method relative to the reference. @@ -367,8 +368,13 @@ def calculate_loss_gain_df(cls, reference="Before rescoring", FDR_threshold=[0.0 ].item() ) ) - if "After rescoring: searchengine" in ft_dict.keys(): - reference = "After rescoring: searchengine" + if reference: + pass + elif "After rescoring: Searchengine" in ft_dict.keys(): + reference = "After rescoring: Searchengine" + else: + reference = "Before rescoring" + total = len(ft_dict[reference]) if total == 0: continue @@ -574,7 +580,7 @@ def _read_pin_from_peprec(self, path_to_peprec): pin_qvalues = pd.DataFrame( qvalues( peprec, - key=peprec['psm_score'], + key=peprec["psm_score"], is_decoy=peprec["Label"] == -1, reverse=True, remove_decoy=False, From e69001d2dcbcc93fc356eb901d00e2a75423fd54 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Mon, 11 Oct 2021 15:54:57 +0200 Subject: [PATCH 12/27] DeepLC import fix --- ms2rescore/retention_time.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ms2rescore/retention_time.py b/ms2rescore/retention_time.py index 58f79df2..1ed893ca 100644 --- a/ms2rescore/retention_time.py +++ b/ms2rescore/retention_time.py @@ -7,8 +7,6 @@ import click import pandas as pd -from deeplc import DeepLC - from ms2rescore.peptide_record import PeptideRecord os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" @@ -189,6 +187,9 @@ def _calculate_features(self): def run(self): """Get retention time predictions for PEPREC and calculate features.""" + + from deeplc import DeepLC + if "Raw file" in self.peprec.df.columns: raw_specific_predicted_dfs = [] for raw_file, df in self.peprec.df.groupby("Raw file"): From ada66dcb06a3788e13b9f711615c8e7c386d7503 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Mon, 11 Oct 2021 16:25:15 +0200 Subject: [PATCH 13/27] Pattern change in to allow for +, - or nothing --- ms2rescore/percolator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index c7aa069d..3faeed62 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -98,7 +98,7 @@ def modification_mapping( self._modification_label_type = "str" self._modification_mapping = value elif all([isinstance(label, float) for label in mod_labels]): - self.modification_pattern = r"\[(\+[0-9\-\.]*)\]" + self.modification_pattern = r"\[((\+|\-|)[0-9\-\.]*)\]" self._modification_label_type = "float" self._modification_mapping = { (aa, self._round(shift)): name @@ -426,7 +426,7 @@ def get_feature_table(self) -> pd.DataFrame: def to_peptide_record( self, - extract_spectrum_index: Optional[bool] = False, + extract_spectrum_index: Optional[bool] = True, spectrum_index_pattern: Optional[str] = None, score_column_label: Optional[str] = None, ) -> PeptideRecord: From 6bc2b14042edb89c933fcfab1dd31cf458b7afbc Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Fri, 5 Nov 2021 12:50:16 +0100 Subject: [PATCH 14/27] requested changes --- ms2rescore/__init__.py | 80 +++++++++++----------- ms2rescore/id_file_parser.py | 6 +- ms2rescore/package_data/config_schema.json | 2 +- ms2rescore/percolator.py | 5 +- 4 files changed, 47 insertions(+), 46 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 48f3f5f2..9070fd50 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -8,7 +8,7 @@ from multiprocessing import cpu_count from typing import Dict, Optional, Union -import pandas as pd +from pandas.errors import EmptyDataError from ms2rescore import id_file_parser, rescore_core, setup_logging from ms2rescore._exceptions import MS2ReScoreError @@ -108,21 +108,21 @@ def _infer_pipeline(identification_file: str): def _select_pipeline(self): """Select specific rescoring pipeline.""" - if self.config["general"]["pipeline"] == "infer": + if self.config["general"]["pipeline"].lower() == "infer": pipeline = self._infer_pipeline( self.config["general"]["identification_file"] ) - elif self.config["general"]["pipeline"] == "pin": + elif self.config["general"]["pipeline"].lower() == "pin": pipeline = id_file_parser.PinPipeline - elif self.config["general"]["pipeline"] == "maxquant": + elif self.config["general"]["pipeline"].lower() == "maxquant": pipeline = id_file_parser.MaxQuantPipeline - elif self.config["general"]["pipeline"] == "msgfplus": + elif self.config["general"]["pipeline"].lower() == "msgfplus": pipeline = id_file_parser.MSGFPipeline - elif self.config["general"]["pipeline"] == "tandem": + elif self.config["general"]["pipeline"].lower() == "tandem": pipeline = id_file_parser.TandemPipeline - elif self.config["general"]["pipeline"] == "peptideshaker": + elif self.config["general"]["pipeline"].lower() == "peptideshaker": pipeline = id_file_parser.PeptideShakerPipeline - elif self.config["general"]["pipeline"] == "Peaks": + elif self.config["general"]["pipeline"].lower() == "peaks": pipeline = id_file_parser.PeaksPipeline else: raise NotImplementedError(self.config["general"]["pipeline"]) @@ -254,39 +254,41 @@ def run(self): if self.config["general"]["run_percolator"]: self._run_percolator() - logger.info("Generating Rescore plots") - if self.config["general"]["plotting"]: + # Only use plotting module when run_percolator is true + if self.config["general"]["plotting"]: + logger.info("Generating Rescore plots") - plotting.PIN( - peprec_filename, self.config["general"]["output_filename"] - ) - - for fset in self.config["general"]["feature_sets"]: - pout_file = ( - self.config["general"]["output_filename"] - + "_" - + "_".join(fset) - + "_features.pout" + plotting.PIN( + peprec_filename, self.config["general"]["output_filename"] ) - pout_decoy_file = ( - self.config["general"]["output_filename"] - + "_" - + "_".join(fset) - + "_features.pout_dec" - ) - try: - plotting.POUT( - pout_file, - pout_decoy_file, - self.config["general"]["output_filename"], - " ".join(fset) - ) - except pd.errors.EmptyDataError: - continue - plotting.RescoreRecord.save_plots_to_pdf( - self.config["general"]["output_filename"] + "_plots.pdf", - FDR_thresholds=[0.01, 0.001], - ) + for fset in self.config["general"]["feature_sets"]: + pout_file = ( + self.config["general"]["output_filename"] + + "_" + + "_".join(fset) + + "_features.pout" + ) + pout_decoy_file = ( + self.config["general"]["output_filename"] + + "_" + + "_".join(fset) + + "_features.pout_dec" + ) + try: + plotting.POUT( + pout_file, + pout_decoy_file, + self.config["general"]["output_filename"], + " ".join(fset) + ) + except EmptyDataError: + logger.warn(f"Feature set: {'_'.join(fset)} returned empty pout file") + continue + + plotting.RescoreRecord.save_plots_to_pdf( + self.config["general"]["output_filename"] + "_plots.pdf", + FDR_thresholds=[0.01, 0.001], + ) logger.info("MS²ReScore finished!") diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 9a804208..a392bb09 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -8,8 +8,7 @@ import numpy as np import pandas as pd -from pyteomics import tandem -from pyteomics import mzid +from pyteomics import tandem, mzid from tqdm import tqdm from ms2rescore._exceptions import MS2ReScoreError @@ -67,6 +66,7 @@ def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> No "generic": r".+_([0-9]+)_[0-9]+_[0-9]+", "tandem": r".+_([0-9]+)_[0-9]+_[0-9]+", "msgfplus": r".+_SII_([0-9]+)_[0-9]+_[0-9]+_[0-9]+", + "USI": r"mzspec:PXD[0-9]{6}:[^\s\:]*:scan:([0-9]+)" } # Private attributes specific to pipeline, override these in each subclass @@ -562,8 +562,6 @@ def read_df_from_mzid(self) -> pd.DataFrame: retrieved_data["charge"] = flat_dict[ "SpectrumIdentificationItem_chargeState" ] - # TODO: create class for SpectrumIdentificationItem - # print(spectrum_identification_result) retrieved_data["protein_list"] = [ d["accession"] for d in spectrum_identification_result[ diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index 03fde771..9a877b34 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -13,7 +13,7 @@ "pipeline": { "description": "Pipeline to use, depending on input format", "type": "string", - "enum": ["infer", "pin", "tandem", "maxquant", "msgfplus", "peptideshaker", "Peaks"], + "enum": ["infer", "pin", "tandem", "maxquant", "msgfplus", "peptideshaker", "peaks"], "default": "infer" }, "feature_sets": { diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index 3faeed62..57588195 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -250,13 +250,13 @@ def _get_sequence_column(self) -> pd.Series: def _get_charge_column(self) -> pd.Series: """Get charge column from one-hot encoded `ChargeX` columns.""" - charge_cols = [col for col in self.df.columns if col.startswith("charge")] + charge_cols = [col for col in self.df.columns if col.lower().startswith("charge")] if not (self.df[charge_cols] == 1).any(axis=1).all(): raise PercolatorInError("Not all PSMs have an assigned charge state.") return ( self.df[charge_cols] .rename( - columns={col: int(col.replace("charge", "")) for col in charge_cols} + columns={col: int(col.lower().replace("charge", "")) for col in charge_cols} ) .idxmax(1) ) @@ -424,6 +424,7 @@ def get_feature_table(self) -> pd.DataFrame: feature_cols = [col for col in self.df.columns if col not in non_feature_cols] return self.df[feature_cols] + # TODO if USI is used extract_spectrum_index should be False def to_peptide_record( self, extract_spectrum_index: Optional[bool] = True, From 8b00ebc321aaba45330c4b570add278ada61b284 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 10 Nov 2021 17:59:16 +0100 Subject: [PATCH 15/27] requested changes, added CL parameters --- ms2rescore/__init__.py | 14 ++++---- ms2rescore/config_parser.py | 38 +++++++++++++++++++++ ms2rescore/id_file_parser.py | 3 +- ms2rescore/maxquant.py | 1 - ms2rescore/package_data/config_default.json | 1 - ms2rescore/parse_mgf.py | 6 ++-- ms2rescore/percolator.py | 2 +- 7 files changed, 50 insertions(+), 15 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 9070fd50..1c857976 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -108,21 +108,21 @@ def _infer_pipeline(identification_file: str): def _select_pipeline(self): """Select specific rescoring pipeline.""" - if self.config["general"]["pipeline"].lower() == "infer": + if self.config["general"]["pipeline"] == "infer": pipeline = self._infer_pipeline( self.config["general"]["identification_file"] ) - elif self.config["general"]["pipeline"].lower() == "pin": + elif self.config["general"]["pipeline"] == "pin": pipeline = id_file_parser.PinPipeline - elif self.config["general"]["pipeline"].lower() == "maxquant": + elif self.config["general"]["pipeline"] == "maxquant": pipeline = id_file_parser.MaxQuantPipeline - elif self.config["general"]["pipeline"].lower() == "msgfplus": + elif self.config["general"]["pipeline"] == "msgfplus": pipeline = id_file_parser.MSGFPipeline - elif self.config["general"]["pipeline"].lower() == "tandem": + elif self.config["general"]["pipeline"] == "tandem": pipeline = id_file_parser.TandemPipeline - elif self.config["general"]["pipeline"].lower() == "peptideshaker": + elif self.config["general"]["pipeline"] == "peptideshaker": pipeline = id_file_parser.PeptideShakerPipeline - elif self.config["general"]["pipeline"].lower() == "peaks": + elif self.config["general"]["pipeline"] == "peaks": pipeline = id_file_parser.PeaksPipeline else: raise NotImplementedError(self.config["general"]["pipeline"]) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 797099dc..ca7df87d 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -72,6 +72,42 @@ def _parse_arguments() -> argparse.Namespace: default="info", help="logging level (default: `info`)", ) + parser.add_argument( + "-n", + metavar="VALUE", + action="store", + type=int, + dest="num_cpu", + default=None, + help="number of cpus available to MS²Rescore", + ) + parser.add_argument( + "--pipeline", + metavar="PIPELINE", + action="store", + type=str, + dest="pipeline", + default=None, + help="determines which pipeline to use (default: 'infer')", + ) + parser.add_argument( + "--percolator", + metavar="BOOL", + action="store", + type=bool, + dest="run_percolator", + default=None, + help="run percolator (default: 'True')", + ) + parser.add_argument( + "--plotting", + metavar="BOOL", + action="store", + type=bool, + dest="plotting", + default=None, + help="write PDF with result plots", + ) return parser.parse_args() @@ -156,4 +192,6 @@ def parse_config(parse_cli_args: bool = True, config_class: Optional[Dict] = Non config = _validate_filenames(config) config = _validate_num_cpu(config) + config["general"]["pipeline"] = config["general"]["pipeline"].lower() + return config diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index a392bb09..836b42e2 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -66,7 +66,7 @@ def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> No "generic": r".+_([0-9]+)_[0-9]+_[0-9]+", "tandem": r".+_([0-9]+)_[0-9]+_[0-9]+", "msgfplus": r".+_SII_([0-9]+)_[0-9]+_[0-9]+_[0-9]+", - "USI": r"mzspec:PXD[0-9]{6}:[^\s\:]*:scan:([0-9]+)" + # "USI": r"mzspec:PXD[0-9]{6}:[^\s\:]*:scan:([0-9]+)" } # Private attributes specific to pipeline, override these in each subclass @@ -397,7 +397,6 @@ def get_peprec(self, parse_mgf: bool = True) -> PeptideRecord: ) if parse_mgf: self.parse_mgf_files(peprec) - peprec.df.drop("Raw file", axis=1, inplace=True) return peprec def get_search_engine_features(self) -> pd.DataFrame: diff --git a/ms2rescore/maxquant.py b/ms2rescore/maxquant.py index ee102aad..d8269f74 100644 --- a/ms2rescore/maxquant.py +++ b/ms2rescore/maxquant.py @@ -180,7 +180,6 @@ def _get_peprec_modifications( for k, v in fixed_modifications.items() } - # Apply fixed modifications if fixed_modifications: for aa, mod in fixed_modifications.items(): diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index f8717303..f5a7b643 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -44,7 +44,6 @@ "gl":"Gln->pyro-Glu" }, "fixed_modifications":{ - "C":"Carbamidomethyl" } }, "percolator": {} diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index 76e33ec1..7336232d 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -94,7 +94,8 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', with open(outname, 'w') as out: count = 0 - for run in runs: + iterator = tqdm(runs) if show_progress_bar else runs + for run in iterator: current_mgf_file = os.path.join(mgf_folder, run + file_suffix) spec_set = set(df_in[(df_in[filename_col] == run)][spec_title_col].values) # Temporary fix: replace charges in MGF with ID'ed charges @@ -103,8 +104,7 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', found = False with open(current_mgf_file, 'r') as f: - iterator = tqdm(f, total=get_num_lines(current_mgf_file)) if show_progress_bar else f - for line in iterator: + for line in f: if 'TITLE=' in line: title = title_parser(line, method=title_parsing_method, run=run) if title in spec_set: diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index 57588195..92b4b1ae 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -98,7 +98,7 @@ def modification_mapping( self._modification_label_type = "str" self._modification_mapping = value elif all([isinstance(label, float) for label in mod_labels]): - self.modification_pattern = r"\[((\+|\-|)[0-9\-\.]*)\]" + self.modification_pattern = r"\[([+-]?[0-9\-.]*)\]" self._modification_label_type = "float" self._modification_mapping = { (aa, self._round(shift)): name From 1c7bedeab33734f97808bb4262b406dc1fb75115 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 17 Nov 2021 13:02:45 +0100 Subject: [PATCH 16/27] removed mgf argument from maxquant_to_rescore dict in config file --- ms2rescore/package_data/config_default.json | 1 - ms2rescore/package_data/config_schema.json | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index f5a7b643..ec34ca74 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -35,7 +35,6 @@ ] }, "maxquant_to_rescore": { - "mgf_dir": "", "modification_mapping":{ "ox":"Oxidation", "ac":"Acetyl", diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index 9a877b34..06778c31 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -111,13 +111,9 @@ "maxquant_to_rescore":{ "description": "Settings specific to the MaxQuant pipeline", "type": "object", - "required": ["mgf_dir", "modification_mapping", "fixed_modifications"], + "required": ["modification_mapping", "fixed_modifications"], "additionalProperties": false, "properties": { - "mgf_dir": { - "description": "Path to directory with MGF files", - "type": "string" - }, "modification_mapping": { "description": "Mapping of MaxQuant modification labels to modifications names for MS²PIP", "type": "object", From 9e696fbc07054db6a5249839bb619d7ca09946cc Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Wed, 17 Nov 2021 17:42:08 +0100 Subject: [PATCH 17/27] requested changes --- ms2rescore/id_file_parser.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 836b42e2..fabff849 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -178,13 +178,14 @@ def peprec_from_pin(self) -> PeptideRecord: peprec = self.original_pin.to_peptide_record( spectrum_index_pattern=self._pin_spec_id_patterns[self._pin_spec_id_style] ) + titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) + peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) if "observed_retention_time" not in peprec.df.columns: # Map MGF titles and observed retention times - titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) peprec.df["observed_retention_time"] = peprec.df["spec_id"].map( retention_times ) - peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) + if not ~peprec.df["observed_retention_time"].isna().any(): raise IDFileParserError( "Could not map all MGF retention times to spectrum indices." @@ -460,7 +461,7 @@ def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> No def original_pin(self): """Get PercolatorIn object from identification file.""" raise NotImplementedError( - "Property `original_pin` is not implemented in class " "`PeaksPipeline`." + "Property `original_pin` is not implemented in class `PeaksPipeline`." ) def peprec_from_pin(self): @@ -471,7 +472,8 @@ def peprec_from_pin(self): @staticmethod def _get_peprec_modifications(modifications: List): - """get peprec modifications out of the peaks id file""" + # TODO: Add unit tests for this function + """get peprec modifications out of the peaks id file.""" suffix_list = ["Phospho"] if isinstance(modifications, List): mods = [] @@ -484,10 +486,10 @@ def _get_peprec_modifications(modifications: List): mods.append(f"{m['location']}|{m['name']}") return "|".join(mods) else: - raise TypeError + raise TypeError("`modifications` should be of type list.") def _convert_to_flat_dict(self, nested_dict, parent_key="", sep="_"): - """Convert nested dict to flat dict with concatenated keys""" + """Convert nested dict to flat dict with concatenated keys.""" items = [] for k, v in nested_dict.items(): @@ -518,7 +520,6 @@ def read_df_from_mzid(self) -> pd.DataFrame: "Label", "Raw file", "Rank", - # "protein_description", ] ) with mzid.read(self.path_to_id_file) as reader: @@ -534,7 +535,6 @@ def read_df_from_mzid(self) -> pd.DataFrame: "Label", "Raw file", "Rank", - # "protein_description", ] ) flat_dict = dict( From d24bfea03cff6b2c73b5cea297812ace0aed8a43 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Thu, 18 Nov 2021 14:21:22 +0100 Subject: [PATCH 18/27] optimized model selection deeplc --- ms2rescore/retention_time.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/ms2rescore/retention_time.py b/ms2rescore/retention_time.py index 1ed893ca..57a86c1b 100644 --- a/ms2rescore/retention_time.py +++ b/ms2rescore/retention_time.py @@ -192,18 +192,37 @@ def run(self): if "Raw file" in self.peprec.df.columns: raw_specific_predicted_dfs = [] - for raw_file, df in self.peprec.df.groupby("Raw file"): + for i, (raw_file, df) in enumerate(self.peprec.df.groupby("Raw file")): + logger.info(f"Calibrating {raw_file}") + peprec_raw_df = df.copy().reset_index() - self.deeplc_predictor = DeepLC( - split_cal=10, n_jobs=self.num_cpu, cnn_model=True, verbose=False - ) retention_time_df = pd.DataFrame( columns=["spec_id", "predicted_retention_time"] ) - logger.info(f"Calibrating for {raw_file}") - self.deeplc_predictor.calibrate_preds( - seq_df=self.get_calibration_data(peprec_raw_df) - ) + + if i == 0: + self.deeplc_predictor = DeepLC( + split_cal=10, + n_jobs=self.num_cpu, + cnn_model=True, + verbose=False + ) + logger.info(f"{self.deeplc_predictor.model}") + self.deeplc_predictor.calibrate_preds( + seq_df=self.get_calibration_data(peprec_raw_df) + ) + self.deeplc_model = list(self.deeplc_predictor.model.keys()) + else: + self.deeplc_predictor = DeepLC( + split_cal=10, + n_jobs=self.num_cpu, + cnn_model=True, + verbose=False, + path_model=self.deeplc_model + ) + self.deeplc_predictor.calibrate_preds( + seq_df=self.get_calibration_data(peprec_raw_df) + ) predicted_rts = pd.Series( self.deeplc_predictor.make_preds( seq_df=self.get_prediction_data(peprec_raw_df) From 6c35e195adcc6804afe696784dced2777dd81d20 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Tue, 23 Nov 2021 13:42:43 +0100 Subject: [PATCH 19/27] add plotting warning if percolator is disabled --- ms2rescore/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 1c857976..e0028906 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -290,5 +290,6 @@ def run(self): self.config["general"]["output_filename"] + "_plots.pdf", FDR_thresholds=[0.01, 0.001], ) - + if not self.config["general"]["run_percolator"] and self.config["general"]["plotting"]: + logger.warn("To plot rescore results run_percolator should be enabled") logger.info("MS²ReScore finished!") From 04c1c46c4bd82510497402b32c521ae6b8afa305 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Tue, 23 Nov 2021 14:30:21 +0100 Subject: [PATCH 20/27] MS2ReScoreError fix --- ms2rescore/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index f44c5250..391c108a 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -536,7 +536,7 @@ def __init__(self, path_to_file, sample_name, score_metric=None) -> None: self.score_metric = "psm_score" self.df = self._read_pin_from_peprec(path_to_file) else: - raise MS2RescoreError("Not a peprec or pin file") + raise MS2ReScoreError("Not a peprec or pin file") self.rerecs.append(self) # add PIN record to RescoreRecord From ba94f3eea6dd99b82f50517518c18a147ad5ded7 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Tue, 23 Nov 2021 14:45:03 +0100 Subject: [PATCH 21/27] MS2ReScoreError fix (2) --- ms2rescore/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index 391c108a..ec2f41bf 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -502,7 +502,7 @@ def save_plots_to_pdf(cls, filename, FDR_thresholds=[0.01]): """ if not cls.rerecs: - raise MS2RescoreError("no pin/pout files listed") + raise MS2ReScoreError("no pin/pout files listed") pdf = matplotlib.backends.backend_pdf.PdfPages(filename) cls._separate_unique_peptides(FDR_thresholds) From eb4e15deb5d71ccf473e9add94fe310a2ab4c235 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Tue, 23 Nov 2021 16:38:37 +0100 Subject: [PATCH 22/27] another MS2rescoreError fix --- ms2rescore/plotting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ms2rescore/plotting.py b/ms2rescore/plotting.py index ec2f41bf..124db2ce 100644 --- a/ms2rescore/plotting.py +++ b/ms2rescore/plotting.py @@ -15,7 +15,7 @@ from ms2rescore.percolator import PercolatorIn -from ms2rescore._exceptions import MS2ReScoreError +from ms2rescore._exceptions import MS2RescoreError class RescoreRecord(ABC): @@ -502,7 +502,7 @@ def save_plots_to_pdf(cls, filename, FDR_thresholds=[0.01]): """ if not cls.rerecs: - raise MS2ReScoreError("no pin/pout files listed") + raise MS2RescoreError("no pin/pout files listed") pdf = matplotlib.backends.backend_pdf.PdfPages(filename) cls._separate_unique_peptides(FDR_thresholds) @@ -536,7 +536,7 @@ def __init__(self, path_to_file, sample_name, score_metric=None) -> None: self.score_metric = "psm_score" self.df = self._read_pin_from_peprec(path_to_file) else: - raise MS2ReScoreError("Not a peprec or pin file") + raise MS2RescoreError("Not a peprec or pin file") self.rerecs.append(self) # add PIN record to RescoreRecord From 20bd6a10209cd5fa8c25a06830c40c04d145e5f3 Mon Sep 17 00:00:00 2001 From: Arthur Declercq Date: Thu, 2 Dec 2021 15:48:50 +0100 Subject: [PATCH 23/27] added decoy error, searchengine feature --- ms2rescore/__init__.py | 1 + ms2rescore/id_file_parser.py | 15 +++++++++------ ms2rescore/peptide_record.py | 8 ++++++++ ms2rescore/retention_time.py | 1 - 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index c1ff2a2f..d788607f 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -221,6 +221,7 @@ def _run_percolator(self): def run(self): """Run MS²ReScore.""" peprec = self.pipeline.get_peprec() + peprec.validate_decoy_presence() peprec_filename = self.tmpfile_basepath + ".peprec" peprec.to_csv(peprec_filename) diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 53485628..85c6732f 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -456,7 +456,7 @@ def __init__(self, config: Dict, output_basename: Union[str, os.PathLike]) -> No super().__init__(config, output_basename) # Private attributes, specific to this pipeline - self.df = self.read_df_from_mzid() + self.df = None @property def original_pin(self): @@ -475,7 +475,7 @@ def peprec_from_pin(self): def _get_peprec_modifications(modifications: List): # TODO: Add unit tests for this function """get peprec modifications out of the peaks id file.""" - suffix_list = ["Phospho"] + suffix_list = ["Phospho", "Dioxidation"] if isinstance(modifications, List): mods = [] for m in modifications: @@ -511,16 +511,17 @@ def _convert_to_flat_dict(self, nested_dict, parent_key="", sep="_"): def read_df_from_mzid(self) -> pd.DataFrame: """Read mzid to Dataframe.""" + logger.info("Processing mzid file") df = pd.DataFrame( columns=[ "spec_id", "peptide", + "peptide_length", "modifications", "charge", "PEAKS:peptideScore", "Label", "Raw file", - "Rank", ] ) with mzid.read(self.path_to_id_file) as reader: @@ -529,13 +530,13 @@ def read_df_from_mzid(self) -> pd.DataFrame: index=[ "spec_id", "peptide", + "peptide_length", "modifications", "charge", "protein_list", "PEAKS:peptideScore", "Label", "Raw file", - "Rank", ] ) flat_dict = dict( @@ -553,6 +554,7 @@ def read_df_from_mzid(self) -> pd.DataFrame: retrieved_data["peptide"] = flat_dict[ "SpectrumIdentificationItem_PeptideSequence" ] + retrieved_data["peptide_length"] = len(retrieved_data["peptide"]) try: retrieved_data["modifications"] = self._get_peprec_modifications( flat_dict["SpectrumIdentificationItem_Modification"] @@ -581,7 +583,6 @@ def read_df_from_mzid(self) -> pd.DataFrame: .split(".", 1)[0] .replace(",", "_", 1) ) - retrieved_data["Rank"] = flat_dict["SpectrumIdentificationItem_rank"] df = df.append(retrieved_data, ignore_index=True) df["Label"] = df["Label"].apply(lambda x: -1 if x else 1) return df @@ -603,6 +604,8 @@ def parse_mgf_files(self, peprec): def get_peprec(self) -> PeptideRecord: """Get PeptideRecord.""" + if not self.df: + self.df = self.read_df_from_mzid() peprec_df = self.df[ [ "spec_id", @@ -623,7 +626,6 @@ def get_peprec(self) -> PeptideRecord: } id_rt_df = pd.DataFrame.from_dict(id_rt_dict) peprec_df = pd.merge(peprec_df, id_rt_df, on="spec_id", how="inner") - return PeptideRecord.from_dataframe(peprec_df) def get_search_engine_features(self) -> pd.DataFrame: @@ -639,4 +641,5 @@ def get_search_engine_features(self) -> pd.DataFrame: ] feature_cols = [col for col in self.df.columns if col not in peprec_cols] + print(feature_cols) return self.df[feature_cols] diff --git a/ms2rescore/peptide_record.py b/ms2rescore/peptide_record.py index 49f985b5..f690ad62 100644 --- a/ms2rescore/peptide_record.py +++ b/ms2rescore/peptide_record.py @@ -154,3 +154,11 @@ def _infer_separator(path: str) -> str: line = f.readline() separator = line[7] return separator + + def validate_decoy_presence(self): + "check whether decoys are present in peprec file" + + if sum(self.df["Label"] == -1) == 0: + raise InvalidPeprecError("No decoys present, make sure to provide decoys along targets") + else: + pass diff --git a/ms2rescore/retention_time.py b/ms2rescore/retention_time.py index 57a86c1b..fe74a0f3 100644 --- a/ms2rescore/retention_time.py +++ b/ms2rescore/retention_time.py @@ -207,7 +207,6 @@ def run(self): cnn_model=True, verbose=False ) - logger.info(f"{self.deeplc_predictor.model}") self.deeplc_predictor.calibrate_preds( seq_df=self.get_calibration_data(peprec_raw_df) ) From 5e91a683d3b488cc50f616b36ade5a76abb2154b Mon Sep 17 00:00:00 2001 From: Ralf Gabriels Date: Tue, 14 Dec 2021 15:47:01 +0100 Subject: [PATCH 24/27] parse_mgf: Fix removal of lines with zero peak intensity --- ms2rescore/parse_mgf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index ce1103cc..f1d859cf 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -129,7 +129,7 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', if 'CHARGE=' in line: continue # Only print lines when spectrum is found and intensity != 0 - if found and line[-4:] != ' 0.0\n': + if found and line[-5:] != ' 0.0\n': out.write(line) num_expected = len(df_in[spec_title_col].unique()) From d13af3487426d3b11bf95141b0d652ef4ad9ba22 Mon Sep 17 00:00:00 2001 From: Ralf Gabriels Date: Tue, 14 Dec 2021 15:59:05 +0100 Subject: [PATCH 25/27] Add example to readme for maxquant config section --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 3420f1ff..4b54b9c9 100644 --- a/README.md +++ b/README.md @@ -187,6 +187,17 @@ one of the modifications listed in the MS²PIP configuration (see - `fixed_modifications`: Must list all modifications set as fixed during the MaxQuant search (as this is not denoted in the msms.txt file). Keys refer to the amino acid, values to the modification name used in the MS²PIP configuration. + - The maxquant specific configuration could for example be: + ```json + "maxquant_to_rescore": { + "modification_mapping":{ + "ox":"Oxidation", + "cm":"Carbamidomethyl" + }, + "fixed_modifications":{ + "C":"Carbamidomethyl" + } + ``` As a general rule, MS²Rescore always needs access to all target and decoy PSMs, not only the FDR-filtered targets. From efce2a77a8c3c0d271b821a1ba448f5d5212778d Mon Sep 17 00:00:00 2001 From: Ralf Gabriels Date: Tue, 14 Dec 2021 16:06:04 +0100 Subject: [PATCH 26/27] Update README.md: Add PEAKS to supported input --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4b54b9c9..92aeace3 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,13 @@ MS²Rescore uses identifications from a or from the output of one of these search engines: - [MaxQuant](https://www.maxquant.org/): Start from `msms.txt` identification - file and directory with `.mgf` files. (Be sure to export without FDR - filtering!) + file and directory with `.mgf` files. - [MSGFPlus](https://omics.pnl.gov/software/ms-gf): Start with an `.mzid` identification file and corresponding `.mgf`. - [X!Tandem](https://www.thegpm.org/tandem/): Start with an X!Tandem `.xml` identification file and corresponding `.mgf`. +- [PEAKS](https://www.bioinfor.com/peaksdb/): Start with an `.mzid` identification + file and directory with `.mgf` files. - [PeptideShaker](http://compomics.github.io/projects/peptide-shaker): Start with a PeptideShaker Extended PSM Report and corresponding `.mgf` file. From c39ef8a42977034ee587c5e4fa34223c5876e27f Mon Sep 17 00:00:00 2001 From: Ralf Gabriels Date: Tue, 14 Dec 2021 16:06:40 +0100 Subject: [PATCH 27/27] Remove obsolute maxquant mgf dir option --- configuration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/configuration.md b/configuration.md index 77e5c994..07791290 100644 --- a/configuration.md +++ b/configuration.md @@ -25,7 +25,6 @@ - **Items**: Refer to *#/definitions/modifications*. - **`percolator`** *(object)*: Command line options directly passed to Percolator (see the Percolator wiki). - **`maxquant_to_rescore`** *(object)*: Settings specific to the MaxQuant pipeline. Cannot contain additional properties. - - **`mgf_dir`** *(string)*: Path to directory with MGF files. - **`modification_mapping`** *(object)*: Mapping of MaxQuant modification labels to modifications names for MS²PIP. Default: `{}`. - **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Default: `{}`. ## Definitions