Skip to content

Commit

Permalink
Fix test brew: (#20)
Browse files Browse the repository at this point in the history
- Create new object of OnDiskPsmDataset to use for brew tests
- Update brew function outputs and assert statements
  • Loading branch information
sambenfredj authored and gessulat committed Feb 27, 2024
1 parent f3d50c8 commit 4293410
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 51 deletions.
2 changes: 1 addition & 1 deletion mokapot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
except DistributionNotFound:
pass

from .dataset import LinearPsmDataset
from .dataset import LinearPsmDataset, OnDiskPsmDataset
from .model import Model, PercolatorModel, save_model, load_model
from .brew import brew
from .parsers.pin import read_pin, read_percolator
Expand Down
2 changes: 1 addition & 1 deletion mokapot/brew.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def brew(
descs = [desc] * len(psms)
scores = [
read_file(
_psms.file_name,
_psms.filename,
use_cols=[feat],
).values
for _psms in psms
Expand Down
50 changes: 49 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
import numpy as np
import pandas as pd
from mokapot import LinearPsmDataset
from mokapot import LinearPsmDataset, OnDiskPsmDataset


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -98,6 +98,54 @@ def psms(psm_df_1000):
return psms


@pytest.fixture
def psms_ondisk():
    """Build an OnDiskPsmDataset backed by ``data/scope2_FP97AA.pin``.

    The spectrum-level columns are loaded eagerly with pandas and the first
    line of the file, split on tabs, is handed over as the column list; the
    dataset itself only receives the file name.

    NOTE(review): the path is relative, so this presumably relies on the
    tests being run from the repository root -- confirm.
    """
    pin_path = "data/scope2_FP97AA.pin"
    spectrum_cols = ["ScanNr", "ExpMass"]
    feature_cols = [
        "CalcMass",
        "lnrSp",
        "deltLCn",
        "deltCn",
        "Sp",
        "IonFrac",
        "RefactoredXCorr",
        "NegLog10PValue",
        "NegLog10ResEvPValue",
        "NegLog10CombinePValue",
        "enzN",
        "enzC",
        "enzInt",
        "lnNumDSP",
        "dM",
        "absdM",
    ]
    metadata_cols = ["SpecId", "ScanNr", "Peptide", "Proteins", "Label"]

    # Only the columns needed to describe a spectrum (plus the label).
    spectra = pd.read_csv(
        pin_path, sep="\t", usecols=["ScanNr", "ExpMass", "Label"]
    )

    # Read just the first line of the file to get every column name.
    with open(pin_path) as handle:
        header = handle.readline().rstrip().split("\t")

    return OnDiskPsmDataset(
        filename=pin_path,
        target_column="Label",
        spectrum_columns=spectrum_cols,
        peptide_column="Peptide",
        scan_column="ScanNr",
        calcmass_column="CalcMass",
        expmass_column="ExpMass",
        rt_column="ret_time",
        charge_column=None,
        protein_column=None,
        group_column=None,
        feature_columns=feature_cols,
        metadata_columns=metadata_cols,
        filename_column=None,
        specId_column="SpecId",
        spectra_dataframe=spectra,
        columns=header,
    )


def _make_fasta(
num_proteins, peptides, peptides_per_protein, random_state, prefix=""
):
Expand Down
104 changes: 56 additions & 48 deletions tests/unit_tests/test_brew.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Tests that the brew function works"""
import copy

import pytest
import numpy as np
import mokapot
Expand All @@ -11,127 +13,133 @@
@pytest.fixture
def svm():
    """A simple Percolator model.

    The model is built with ``rng=2`` so that every test receives the same
    seed value.
    """
    return PercolatorModel(train_fdr=0.05, max_iter=10, rng=2)


def test_brew_simple(psms_ondisk, svm):
    """Test with mostly default parameters of brew.

    ``brew`` is unpacked into four values here; only the list of models is
    inspected: three are expected, each a ``PercolatorModel``.
    """
    _, models, _, _ = mokapot.brew(psms_ondisk, svm, test_fdr=0.05)
    assert len(models) == 3
    assert isinstance(models[0], PercolatorModel)


def test_brew_random_forest(psms_ondisk):
    """Verify there are no dependencies on the SVM.

    A random-forest estimator wrapped in ``Model`` is passed to ``brew`` in
    place of the Percolator model; three ``Model`` instances are expected
    back.
    """
    rfm = Model(
        RandomForestClassifier(),
        train_fdr=0.1,
    )
    _, models, _, _ = mokapot.brew(psms_ondisk, model=rfm, test_fdr=0.1)
    assert len(models) == 3
    assert isinstance(models[0], Model)


def test_brew_joint(psms_ondisk, svm):
    """Test that the multiple input PSM collections yield multiple out.

    Three collections go in (the fixture plus two shallow copies of it) and
    every per-collection output of ``brew`` is expected to have length three.
    """
    collections = [psms_ondisk, copy.copy(psms_ondisk), copy.copy(psms_ondisk)]
    psms, models, scores, desc = mokapot.brew(collections, svm, test_fdr=0.05)
    assert len(scores) == 3
    assert len(psms) == 3
    assert len(models) == 3
    assert len(desc) == 3


def test_brew_folds(psms_ondisk, svm):
    """Test that changing the number of folds works.

    With a single input collection and ``folds=4`` the test expects one
    score array, one dataset, and four models.
    """
    psms, models, scores, _ = mokapot.brew(
        psms_ondisk, svm, test_fdr=0.05, folds=4
    )
    assert len(scores) == 1
    assert len(psms) == 1
    assert len(models) == 4


def test_brew_seed(psms_ondisk, svm):
    """Test that (not) changing the split selection seed works.

    ``brew`` is run three times on separate shallow copies of the dataset:
    twice with the same ``rng`` (scores must be equal) and once with a
    different ``rng`` (scores must differ).
    """
    folds = 3
    seed = 0

    # Each run gets its own copy of the dataset object.
    psms_ondisk_b = copy.copy(psms_ondisk)
    psms_ondisk_c = copy.copy(psms_ondisk)

    _, models_a, scores_a, _ = mokapot.brew(
        psms_ondisk, svm, test_fdr=0.05, folds=folds, rng=seed
    )
    assert len(models_a) == folds

    _, models_b, scores_b, _ = mokapot.brew(
        psms_ondisk_b, svm, test_fdr=0.05, folds=folds, rng=seed
    )
    assert len(models_b) == folds

    assert np.array_equal(
        scores_a[0], scores_b[0]
    ), "Results differed with same seed"

    _, models_c, scores_c, _ = mokapot.brew(
        psms_ondisk_c, svm, test_fdr=0.05, folds=folds, rng=seed + 2
    )
    assert len(models_c) == folds

    # Use logical `not`, not bitwise `~`: `np.array_equal` returns a plain
    # bool and `~True == -2` / `~False == -1` are both truthy, so an
    # `assert ~(...)` can never fail.
    assert not np.array_equal(
        scores_a[0], scores_c[0]
    ), "Results were identical with different seed!"


def test_brew_test_fdr_error(psms_ondisk, svm):
    """Test that we get a sensible error message.

    ``brew`` is called with ``test_fdr=0.001`` and is expected to raise a
    ``RuntimeError`` whose message mentions the failed calibration.
    """
    with pytest.raises(RuntimeError) as err:
        mokapot.brew(psms_ondisk, svm, test_fdr=0.001, rng=2)

    # Inspect the raised exception itself (`err.value`), not the
    # ExceptionInfo wrapper returned by `pytest.raises`.
    assert "Failed to calibrate" in str(err.value)


# @pytest.mark.skip(reason="Not currently working, at least on MacOS.")
def test_brew_multiprocess(psms_ondisk, svm):
    """Test that multiprocessing doesn't yield an error.

    Runs ``brew`` with ``max_workers=2`` and checks that the three returned
    models have pairwise different coefficients.
    """
    _, models, _, _ = mokapot.brew(
        psms_ondisk, svm, test_fdr=0.05, max_workers=2
    )
    # The models should not be the same:
    assert_not_close(models[0].estimator.coef_, models[1].estimator.coef_)
    assert_not_close(models[1].estimator.coef_, models[2].estimator.coef_)
    assert_not_close(models[2].estimator.coef_, models[0].estimator.coef_)


def test_brew_trained_models(psms_ondisk, svm):
    """Test that using trained models reproduces same results.

    The first ``brew`` call trains models on a shallow copy of the dataset.
    Those models are then passed, in reversed order, to a second ``brew``
    call on the original dataset; the returned models and the first score
    array are expected to match the first run.
    """
    # fix a seed to have the same random split for each run
    (
        psms_with_training,
        models_with_training,
        scores_with_training,
        desc_with_training,
    ) = mokapot.brew(copy.copy(psms_ondisk), svm, test_fdr=0.05, rng=2)

    models = list(models_with_training)
    models.reverse()  # Change the model order

    (
        psms_without_training,
        models_without_training,
        scores_without_training,
        desc_without_training,
    ) = mokapot.brew(psms_ondisk, models, test_fdr=0.05, rng=2)

    assert models_with_training == models_without_training
    assert np.array_equal(scores_with_training[0], scores_without_training[0])


def test_brew_using_few_models_error(psms_ondisk, svm):
    """Test that if the number of trained models less than the number of
    folds we get the expected error message.

    Two models are passed while ``folds`` is left at its default; the
    expected message names 2 models against 3 folds.
    """
    with pytest.raises(ValueError) as err:
        mokapot.brew(psms_ondisk, [svm, svm], test_fdr=0.05)

    # Inspect the raised exception itself (`err.value`), not the
    # ExceptionInfo wrapper returned by `pytest.raises`.
    assert (
        "The number of trained models (2) must match the number of folds (3)."
        in str(err.value)
    )


def test_brew_using_non_trained_models_error(psms, svm):
def test_brew_using_non_trained_models_error(psms_ondisk, svm):
"""Test that using non trained models gives the expected error message"""
svm.is_trained = False
with pytest.raises(RuntimeError) as err:
mokapot.brew(psms, [svm, svm, svm], test_fdr=0.05)
mokapot.brew(psms_ondisk, [svm, svm, svm], test_fdr=0.05)
assert (
"One or more of the provided models was not previously trained"
in str(err)
Expand Down

0 comments on commit 4293410

Please sign in to comment.