Skip to content

Commit

Permalink
Remove invalid AAs in all pipelines (fixes #31)
Browse files Browse the repository at this point in the history
  • Loading branch information
RalfG committed Mar 2, 2021
1 parent 16d5792 commit d740dd9
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 4 deletions.
7 changes: 4 additions & 3 deletions ms2rescore/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self, pandas_obj) -> None:
"""Pandas extension for MaxQuant msms.txt files."""
self._obj = pandas_obj
self._set_mass_error_unit()
self.invalid_amino_acids = r"[BJOUXZ]"

@classmethod
def from_file(
Expand All @@ -66,8 +67,8 @@ def from_file(
filter_rank1_psms : bool, optional
filter for rank 1 PSMs
validate_amino_acids : bool, optional
remove PSMs where the sequence includes an invalid amino acid
(B, J, O, U, X, Z); required for MS2PIP compatibility
remove PSMs where the sequence includes an invalid amino acid; required for
MS2PIP compatibility
Returns
-------
Expand Down Expand Up @@ -114,7 +115,7 @@ def filter_rank1_psms(self) -> pd.DataFrame:
def remove_invalid_amino_acids(self) -> pd.DataFrame:
"""Remove invalid amino acids from MSMS."""
invalid_indices = self._obj[self._obj["Sequence"].str.contains(
r"[BJOUXZ]", regex=True
self.invalid_amino_acids, regex=True
)].index
self._obj = self._obj.drop(index=invalid_indices).reset_index(drop=True)

Expand Down
17 changes: 16 additions & 1 deletion ms2rescore/peptideshaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,33 @@ def __init__(self, pandas_obj: pd.DataFrame) -> None:
def _validate(self):
"""Validate Pandas DataFrame as Extended PSM Report."""
# TODO: Implement validation of PSM report DataFrame
pass
self.drop_invalid_amino_acids()

def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"):
    """
    Drop all PSMs (rows) with peptides containing invalid amino acids.

    Parameters
    ----------
    invalid_amino_acids : str, optional
        regex pattern matched against the `Sequence` column; rows with a match are
        dropped (default: `r"[BJOUXZ]"`)

    """
    # `na=False` keeps the mask strictly boolean: a missing sequence cannot match
    # the pattern, and a mask containing NaN cannot be used for row selection.
    is_invalid = self._obj["Sequence"].str.contains(
        invalid_amino_acids, regex=True, na=False
    )
    to_drop = self._obj[is_invalid].index
    if len(to_drop) > 0:
        logger.warning(
            "Dropping %i PSMs from report due to invalid amino acids (%s)",
            len(to_drop),
            invalid_amino_acids,
        )
        self._obj = self._obj.drop(index=to_drop)

@staticmethod
def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame:
    """
    Read Extended PSM Report from TSV file.

    The file is parsed as tab-separated text, with the first column used as the
    index of the returned DataFrame.
    """
    report = pd.read_csv(path, sep="\t", index_col=0)
    # NOTE(review): `ext_psm_report` is resolved on the `pd` module here, not on
    # the DataFrame that was just read; confirm that attribute exists at call time.
    pd.ext_psm_report._validate(report)
    return report

@staticmethod
def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame:
    """
    Read Extended PSM Report from XLS file.

    Only the first sheet is read, with the first column used as the index of the
    returned DataFrame.
    """
    report = pd.read_excel(path, sheet_name=0, index_col=0)
    # NOTE(review): `ext_psm_report` is resolved on the `pd` module here, not on
    # the DataFrame that was just read; confirm that attribute exists at call time.
    pd.ext_psm_report._validate(report)
    return report

@staticmethod
Expand Down
24 changes: 24 additions & 0 deletions ms2rescore/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
modification_mapping: Optional[
Dict[Tuple[Union[str, None], Union[float, str]], str]
] = None,
invalid_amino_acids: Optional[str] = r"[BJOUXZ]"
):
"""
Percolator In (PIN).
Expand All @@ -54,10 +55,14 @@ def __init__(
(e.g. `{("Q", -17.02655): "Gln->pyro-Glu"}). If the keys are floats, they
are rounded up to three decimals to avoid rounding issues while matching.
If None, the original modification labels from the PIN file will be used.
invalid_amino_acids: str, optional
regex pattern of invalid amino acids. PSMs containing these amino acids will
be dropped. (default: `r"[BJOUXZ]"`)
"""
# Attributes
self.modification_pattern = r"\[([^\[^\]]*)\]"
self.invalid_amino_acids = invalid_amino_acids

# Parameters
self.path = path
Expand Down Expand Up @@ -301,6 +306,23 @@ def get_spectrum_filename(
else:
raise ValueError("Multiple spectrum filenames found in single PIN file.")

def drop_invalid_amino_acids(self):
    """
    Drop all PSMs (rows) with peptides containing invalid amino acids.

    Sequences are taken from the `sequence` column of `self.df` if present, and
    from `self._get_sequence_column()` otherwise. Rows whose sequence matches the
    `self.invalid_amino_acids` regex pattern are removed from `self.df`. Nothing is
    dropped if `self.invalid_amino_acids` is None or empty.
    """
    # The pattern is optional; without one there is nothing to match against.
    if not self.invalid_amino_acids:
        return

    if "sequence" in self.df.columns:
        sequences = self.df["sequence"]
    else:
        sequences = self._get_sequence_column()

    # `na=False` keeps the mask strictly boolean: a missing sequence cannot match
    # the pattern, and a mask containing NaN cannot be used for row selection.
    is_invalid = sequences.str.contains(
        self.invalid_amino_acids, regex=True, na=False
    )
    to_drop = sequences[is_invalid].index
    if len(to_drop) > 0:
        logger.warning(
            "Dropping %i PSMs from PIN due to invalid amino acids (%s)",
            len(to_drop),
            self.invalid_amino_acids,
        )
        self.df = self.df.drop(index=to_drop)

@staticmethod
def fix_tabs(
path: str, id_column: str = "SpecId", prot_sep: Optional[str] = "|||"
Expand Down Expand Up @@ -376,6 +398,8 @@ def read(self, path: Optional[str] = None):
if not self.path:
raise ValueError("No path for PIN file defined.")
self.df = pd.read_csv(self.fix_tabs(self.path), sep="\t")
if self.invalid_amino_acids:
self.drop_invalid_amino_acids()

def write(self, path: Optional[str] = None):
"""Write PIN to file."""
Expand Down

0 comments on commit d740dd9

Please sign in to comment.