Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small changes for FlashLFQ writer #131

Merged
merged 2 commits into from
Dec 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions mokapot/writers/flashlfq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""This module writes data in the generic format for FlashLFQ.

Details about the format can be found here:
https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats#generic
"""

from pathlib import Path
import logging

import pandas as pd

LOGGER = logging.getLogger(__name__)


def to_flashlfq(conf, out_file="mokapot.flashlfq.txt"):
"""Save confidenct peptides for quantification with FlashLFQ.

`FlashLFQ <https://github.com/smith-chem-wisc/FlashLFQ>`_ is an open-source
tool for label-free quantification. For mokapot to save results in a
compatible format, a few extra columns are required to be present, which
specify the MS data file name, the theoretical peptide monoisotopic mass,
the retention time, and the charge for each PSM. If these are not present,
saving to the FlashLFQ format is disabled.

Note that protein grouping in the FlashLFQ results will be more accurate if
proteins were added for analysis with mokapot.

Parameters
----------
conf : Confidence object or tuple of Confidence objects
One or more :py:class:`~mokapot.confidence.LinearConfidence` objects.
out_file : str, optional
The output file to write.

Returns
-------
str
The path to the saved file.

"""
try:
assert not isinstance(conf, str)
iter(conf)
except TypeError:
conf = [conf]
except AssertionError:
raise ValueError("'conf' should be a Confidence object, not a string.")

flashlfq = pd.concat([_format_flashlfq(c) for c in conf])
flashlfq.to_csv(str(out_file), sep="\t", index=False)
return out_file


def _format_flashlfq(conf):
"""Format peptides for quantification with FlashLFQ

If proteins are provided, use the mokapot protein groups. Else,
use the protein_column.

Parameters
----------
conf : a LinearConfidence object
A :py:class:`~mokapot.confidence.LinearConfidence` object.

Returns
-------
pandas.DataFrame
The peptides in FlashLFQ format.
"""
# Do some error checking for the required columns:
required = ["filename", "calcmass", "rt", "charge"]
missing = [c for c in required if conf._optional_columns[c] is None]
if missing:
missing = ", ".join([c + "_column" for c in missing])
raise ValueError(
"The following parameters must be specified when loading a "
"collection of PSMs in order to save them in FlashLFQ format: "
f"{missing}"
)

if conf._has_proteins:
proteins = conf._proteins
elif conf._protein_column is not None:
proteins = conf._protein_column
else:
proteins = None

# Get parameters
peptides = conf.peptides
filename_column = conf._optional_columns["filename"]
peptide_column = conf._peptide_column
mass_column = conf._optional_columns["calcmass"]
rt_column = conf._optional_columns["rt"]
charge_column = conf._optional_columns["charge"]
eval_fdr = conf._eval_fdr

# Create FlashLFQ dataframe
passing = peptides["mokapot q-value"] <= eval_fdr

out_df = pd.DataFrame()
out_df["File Name"] = peptides.loc[passing, filename_column].apply(
lambda x: Path(x).name
)

seq = peptides.loc[passing, peptide_column]
base_seq = (
seq.str.replace(r"[\[\(].*?[\]\)]", "", regex=True)
.str.replace(r"^.*?\.", "", regex=True)
.str.replace(r"\..*?$", "", regex=True)
)

out_df["Base Sequence"] = base_seq
out_df["Full Sequence"] = seq
out_df["Peptide Monoisotopic Mass"] = peptides.loc[passing, mass_column]
out_df["Scan Retention Time"] = peptides.loc[passing, rt_column]
out_df["Precursor Charge"] = peptides.loc[passing, charge_column]

if isinstance(proteins, str):
# TODO: Add delimiter sniffing.
prots = peptides.loc[passing, proteins].str.replace("\t", "; ", regex=False)
elif proteins is None:
prots = ""
else:
prots = base_seq.map(proteins.peptide_map.get)
shared = pd.isna(prots)
prots.loc[shared] = base_seq[shared].map(proteins.shared_peptides.get)

out_df["Protein Accession"] = prots
missing = pd.isna(out_df["Protein Accession"])
num_missing = missing.sum()
if num_missing:
LOGGER.warning(
"- Discarding %i peptides that could not be mapped to protein " "groups",
num_missing,
)
out_df = out_df.loc[~missing, :]

return out_df
Loading