diff --git a/mokapot/__init__.py b/mokapot/__init__.py
index 0ca4b7f4..30232dc4 100644
--- a/mokapot/__init__.py
+++ b/mokapot/__init__.py
@@ -15,7 +15,7 @@
     except DistributionNotFound:
         pass
 
-from .dataset import LinearPsmDataset
+from .dataset import LinearPsmDataset, OnDiskPsmDataset
 from .model import Model, PercolatorModel, save_model, load_model
 from .brew import brew
 from .parsers.pin import read_pin, read_percolator
diff --git a/mokapot/brew.py b/mokapot/brew.py
index c698487e..8b65e0c5 100644
--- a/mokapot/brew.py
+++ b/mokapot/brew.py
@@ -253,7 +253,7 @@ def brew(
         descs = [desc] * len(psms)
         scores = [
             read_file(
-                _psms.file_name,
+                _psms.filename,
                 use_cols=[feat],
             ).values
             for _psms in psms
diff --git a/tests/conftest.py b/tests/conftest.py
index 481d94e4..6a9bf805 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,7 +5,7 @@
 import pytest
 import numpy as np
 import pandas as pd
-from mokapot import LinearPsmDataset
+from mokapot import LinearPsmDataset, OnDiskPsmDataset
 
 
 @pytest.fixture(autouse=True)
@@ -98,6 +98,54 @@ def psms(psm_df_1000):
     return psms
 
 
+@pytest.fixture
+def psms_ondisk():
+    """A small OnDiskPsmDataset"""
+    filename = "data/scope2_FP97AA.pin"
+    df_spectra = pd.read_csv(
+        filename, sep="\t", usecols=["ScanNr", "ExpMass", "Label"]
+    )
+    with open(filename) as perc:
+        columns = perc.readline().rstrip().split("\t")
+    psms = OnDiskPsmDataset(
+        filename=filename,
+        target_column="Label",
+        spectrum_columns=["ScanNr", "ExpMass"],
+        peptide_column="Peptide",
+        scan_column="ScanNr",
+        calcmass_column="CalcMass",
+        expmass_column="ExpMass",
+        rt_column="ret_time",
+        charge_column=None,
+        protein_column=None,
+        group_column=None,
+        feature_columns=[
+            "CalcMass",
+            "lnrSp",
+            "deltLCn",
+            "deltCn",
+            "Sp",
+            "IonFrac",
+            "RefactoredXCorr",
+            "NegLog10PValue",
+            "NegLog10ResEvPValue",
+            "NegLog10CombinePValue",
+            "enzN",
+            "enzC",
+            "enzInt",
+            "lnNumDSP",
+            "dM",
+            "absdM",
+        ],
+        metadata_columns=["SpecId", "ScanNr", "Peptide", "Proteins", "Label"],
+        filename_column=None,
+        specId_column="SpecId",
+        spectra_dataframe=df_spectra,
+        columns=columns,
+    )
+    return psms
+
+
 def _make_fasta(
     num_proteins, peptides, peptides_per_protein, random_state, prefix=""
 ):
diff --git a/tests/unit_tests/test_brew.py b/tests/unit_tests/test_brew.py
index 27d04951..02534a0d 100644
--- a/tests/unit_tests/test_brew.py
+++ b/tests/unit_tests/test_brew.py
@@ -1,4 +1,6 @@
 """Tests that the brew function works"""
+import copy
+
 import pytest
 import numpy as np
 import mokapot
@@ -11,127 +13,133 @@
 @pytest.fixture
 def svm():
     """A simple Percolator model"""
-    return PercolatorModel(train_fdr=0.05, max_iter=10)
+    return PercolatorModel(train_fdr=0.05, max_iter=10, rng=2)
 
 
-def test_brew_simple(psms, svm):
+def test_brew_simple(psms_ondisk, svm):
     """Test with mostly default parameters of brew"""
-    results, models = mokapot.brew(psms, svm, test_fdr=0.05)
-    assert isinstance(results, mokapot.confidence.LinearConfidence)
+    psms, models, scores, desc = mokapot.brew(psms_ondisk, svm, test_fdr=0.05)
     assert len(models) == 3
     assert isinstance(models[0], PercolatorModel)
 
 
-def test_brew_random_forest(psms):
+def test_brew_random_forest(psms_ondisk):
     """Verify there are no dependencies on the SVM."""
     rfm = Model(
         RandomForestClassifier(),
         train_fdr=0.1,
     )
-    results, models = mokapot.brew(psms, model=rfm, test_fdr=0.1)
-    assert isinstance(results, mokapot.confidence.LinearConfidence)
+    psms, models, scores, desc = mokapot.brew(
+        psms_ondisk, model=rfm, test_fdr=0.1
+    )
     assert len(models) == 3
     assert isinstance(models[0], Model)
 
 
-def test_brew_joint(psms, svm):
+def test_brew_joint(psms_ondisk, svm):
     """Test that the multiple input PSM collections yield multiple out"""
-    collections = [psms, psms, psms]
-    results, models = mokapot.brew(collections, svm, test_fdr=0.05)
-    assert len(results) == 3
+    collections = [psms_ondisk, copy.copy(psms_ondisk), copy.copy(psms_ondisk)]
+    psms, models, scores, desc = mokapot.brew(collections, svm, test_fdr=0.05)
+    assert len(scores) == 3
+    assert len(psms) == 3
     assert len(models) == 3
+    assert len(desc) == 3
 
 
-def test_brew_folds(psms, svm):
+def test_brew_folds(psms_ondisk, svm):
     """Test that changing the number of folds works"""
-    results, models = mokapot.brew(psms, svm, test_fdr=0.1, folds=4)
-    assert isinstance(results, mokapot.confidence.LinearConfidence)
+    psms, models, scores, desc = mokapot.brew(
+        psms_ondisk, svm, test_fdr=0.05, folds=4
+    )
+    assert len(scores) == 1
+    assert len(psms) == 1
     assert len(models) == 4
 
 
-def test_brew_seed(psms, svm):
+def test_brew_seed(psms_ondisk, svm):
     """Test that (not) changing the split selection seed works"""
     folds = 3
     seed = 0
-
-    results_a, models_a = mokapot.brew(
-        psms, svm, test_fdr=0.05, folds=folds, rng=seed
+    psms_ondisk_b = copy.copy(psms_ondisk)
+    psms_ondisk_c = copy.copy(psms_ondisk)
+    psms_a, models_a, scores_a, desc_a = mokapot.brew(
+        psms_ondisk, svm, test_fdr=0.05, folds=folds, rng=seed
     )
-    assert isinstance(results_a, mokapot.confidence.LinearConfidence)
     assert len(models_a) == folds
 
-    results_b, models_b = mokapot.brew(
-        psms, svm, test_fdr=0.05, folds=folds, rng=seed
+    psms_b, models_b, scores_b, desc_b = mokapot.brew(
+        psms_ondisk_b, svm, test_fdr=0.05, folds=folds, rng=seed
     )
-    assert isinstance(results_b, mokapot.confidence.LinearConfidence)
     assert len(models_b) == folds
 
-    assert (
-        results_a.accepted == results_b.accepted
+    assert np.array_equal(
+        scores_a[0], scores_b[0]
     ), "Results differed with same seed"
 
-    results_c, models_c = mokapot.brew(
-        psms, svm, test_fdr=0.05, folds=folds, rng=seed + 2
+    psms_c, models_c, scores_c, desc_c = mokapot.brew(
+        psms_ondisk_c, svm, test_fdr=0.05, folds=folds, rng=seed + 2
     )
-    assert isinstance(results_c, mokapot.confidence.LinearConfidence)
     assert len(models_c) == folds
-
-    assert (
-        results_a.accepted != results_c.accepted
+    assert not (
+        np.array_equal(scores_a[0], scores_c[0])
     ), "Results were identical with different seed!"
 
 
-def test_brew_test_fdr_error(psms, svm):
+def test_brew_test_fdr_error(psms_ondisk, svm):
     """Test that we get a sensible error message"""
     with pytest.raises(RuntimeError) as err:
-        results, models = mokapot.brew(psms, svm)
-
+        mokapot.brew(psms_ondisk, svm, test_fdr=0.001, rng=2)
     assert "Failed to calibrate" in str(err)
 
 
 # @pytest.mark.skip(reason="Not currently working, at least on MacOS.")
-def test_brew_multiprocess(psms, svm):
+def test_brew_multiprocess(psms_ondisk, svm):
     """Test that multiprocessing doesn't yield an error"""
-    _, models = mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2)
-
+    _, models, _, _ = mokapot.brew(psms_ondisk, svm, test_fdr=0.05, max_workers=2)
     # The models should not be the same:
     assert_not_close(models[0].estimator.coef_, models[1].estimator.coef_)
     assert_not_close(models[1].estimator.coef_, models[2].estimator.coef_)
     assert_not_close(models[2].estimator.coef_, models[0].estimator.coef_)
 
 
-def test_brew_trained_models(psms, svm):
+def test_brew_trained_models(psms_ondisk, svm):
     """Test that using trained models reproduces same results"""
     # fix a seed to have the same random split for each run
-    results_with_training, models_with_training = mokapot.brew(
-        psms, svm, test_fdr=0.05, rng=3
-    )
+    (
+        psms_with_training,
+        models_with_training,
+        scores_with_training,
+        desc_with_training,
+    ) = mokapot.brew(copy.copy(psms_ondisk), svm, test_fdr=0.05, rng=2)
     models = list(models_with_training)
     models.reverse()  # Change the model order
-    results_without_training, models_without_training = mokapot.brew(
-        psms, models, test_fdr=0.05, rng=3
-    )
+    (
+        psms_without_training,
+        models_without_training,
+        scores_without_training,
+        desc_without_training,
+    ) = mokapot.brew(psms_ondisk, models, test_fdr=0.05, rng=2)
     assert models_with_training == models_without_training
-    assert results_with_training.accepted == results_without_training.accepted
+    assert np.array_equal(scores_with_training[0], scores_without_training[0])
 
 
-def test_brew_using_few_models_error(psms, svm):
+def test_brew_using_few_models_error(psms_ondisk, svm):
     """Test that if the number of trained models less than the number of
     folds we get the expected error message.
     """
     with pytest.raises(ValueError) as err:
-        mokapot.brew(psms, [svm, svm], test_fdr=0.05)
+        mokapot.brew(psms_ondisk, [svm, svm], test_fdr=0.05)
     assert (
         "The number of trained models (2) must match the number of folds (3)."
         in str(err)
     )
 
 
-def test_brew_using_non_trained_models_error(psms, svm):
+def test_brew_using_non_trained_models_error(psms_ondisk, svm):
     """Test that using non trained models gives the expected error message"""
     svm.is_trained = False
     with pytest.raises(RuntimeError) as err:
-        mokapot.brew(psms, [svm, svm, svm], test_fdr=0.05)
+        mokapot.brew(psms_ondisk, [svm, svm, svm], test_fdr=0.05)
     assert (
         "One or more of the provided models was not previously trained"
         in str(err)