import functools
import itertools
import logging
import os
import re
import subprocess
from typing import Iterator, List, Optional, Tuple

import joblib
import numba as nb
import numpy as np
import pandas as pd
from spectrum_utils import spectrum as sus
from spectrum_utils import utils as suu


logger = logging.getLogger('gleams')

regex_non_alpha = re.compile(r'[^A-Za-z]+')
# Modifications are encoded as a delta mass, e.g. "+57.021".
regex_mod = re.compile(r'\+\d+\.\d+')


def convert_massivekb_metadata(massivekb_filename: str,
                               metadata_filename: str,
                               charges: Optional[Tuple[int]] = None) -> None:
    """
    Convert the MassIVE-KB metadata file to a stripped down metadata file
    containing only the relevant information.

    The initial metadata file needs to be downloaded manually from MassIVE:
    MassIVE Knowledge Base > Human HCD Spectral Library
    > All Candidate library spectra > Download

    The new metadata file will contain PSM information in the following
    columns:
    - dataset: The MassIVE dataset identifier.
    - filename: The file in which the PSM's spectrum is present.
    - scan: The PSM's scan number in its spectral data file.
    - sequence: The PSM's peptide sequence.
    - charge: The PSM's precursor charge.
    - mz: The PSM's precursor m/z.

    If the stripped down metadata file already exists it will _not_ be
    recreated.

    Parameters
    ----------
    massivekb_filename : str
        The MassIVE-KB metadata file name.
    metadata_filename : str
        The metadata file name.
    charges : Optional[Tuple[int]]
        Optional tuple of minimum and maximum precursor charge (both
        inclusive) to include; spectra with other precursor charges will be
        omitted.
    """
    if not os.path.isfile(metadata_filename):
        logger.info('Convert the MassIVE-KB metadata file')
        metadata = pd.read_csv(massivekb_filename, sep='\t', usecols=[
            'annotation', 'charge', 'filename', 'mz', 'scan'])
        if charges is not None:
            metadata = metadata[(metadata['charge'] >= charges[0]) &
                                (metadata['charge'] <= charges[1])].copy()
        metadata = metadata.rename(columns={'annotation': 'sequence'})
        dataset_filename = metadata['filename'].str.split('/').str
        metadata['dataset'] = dataset_filename[0]
        metadata['filename'] = dataset_filename[-1]
        metadata['scan'] = metadata['scan'].astype(np.int64)
        metadata = metadata.sort_values(['dataset', 'filename', 'scan'])
        metadata = metadata[['dataset', 'filename', 'scan', 'sequence',
                             'charge', 'mz']]
        logger.debug('Save metadata file to %s', metadata_filename)
        metadata.to_parquet(metadata_filename, index=False)
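

# Example usage sketch (the file names below are hypothetical placeholders):
# convert the downloaded MassIVE-KB library TSV into a compact Parquet
# metadata file, keeping only precursor charges 2 and 3.
# convert_massivekb_metadata('massivekb_library_candidates.tsv',
#                            'massivekb_metadata.parquet', charges=(2, 3))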


def split_metadata_train_val_test(
        metadata_filename: str, val_ratio: Optional[float] = None,
        test_ratio: Optional[float] = None,
        rel_tol: Optional[float] = None) -> None:
    """
    Split the metadata file into training/validation/test files.

    The split is based on dataset, with the ratio of the number of PSMs in
    each split approximating the given ratios.

    The metadata files for each split will be saved to the same directory as
    the provided metadata file, with suffixes "_train", "_val", and "_test"
    respectively.

    If all metadata files corresponding to the training/validation/test
    splits already exist the splits will _not_ be recreated.

    Parameters
    ----------
    metadata_filename : str
        The input metadata filename. Should be a Parquet file.
    val_ratio : Optional[float]
        Proportion of the total number of PSMs that should approximately be
        in the validation set. If None, no validation split will be
        generated.
    test_ratio : Optional[float]
        Proportion of the total number of PSMs that should approximately be
        in the test set. If None, no test split will be generated.
    rel_tol : Optional[float]
        Proportion tolerance of the number of PSMs in the validation and test
        splits. Default: 0.1 * val_ratio.
    """
    filename_train = metadata_filename.replace('.parquet', '_train.parquet')
    filename_val = metadata_filename.replace('.parquet', '_val.parquet')
    filename_test = metadata_filename.replace('.parquet', '_test.parquet')
    if (os.path.isfile(filename_train) and
            (val_ratio is None or os.path.isfile(filename_val)) and
            (test_ratio is None or os.path.isfile(filename_test))):
        return
    metadata = pd.read_parquet(metadata_filename).set_index('dataset')
    abs_tol = int((rel_tol if rel_tol is not None else
                   (0.1 * (val_ratio if val_ratio is not None else 0)))
                  * len(metadata))
    num_val = int(val_ratio * len(metadata)) if val_ratio is not None else 0
    num_test = int(test_ratio * len(metadata)) if test_ratio is not None else 0
    # Add datasets to the validation/test splits until they contain a
    # suitable number of PSMs.
    perc_val = (val_ratio if val_ratio is not None else 0) * 100
    perc_test = (test_ratio if test_ratio is not None else 0) * 100
    perc_train = 100 - perc_val - perc_test
    logger.info('Split the metadata file into train (~%.f%%), validation '
                '(~%.f%%), and test (~%.f%%) sets', perc_train, perc_val,
                perc_test)
    datasets = (metadata.groupby('dataset', sort=False)['scan']
                .count().sample(frac=1))
    if num_val > 0:
        selected_val = _select_datasets(datasets, num_val, abs_tol)
        logger.debug('Save validation metadata file to %s', filename_val)
        (metadata.loc[selected_val].reset_index()
         .to_parquet(filename_val, index=False))
        datasets = datasets.drop(selected_val)
    if num_test > 0:
        selected_test = _select_datasets(datasets, num_test, abs_tol)
        logger.debug('Save test metadata file to %s', filename_test)
        (metadata.loc[selected_test].reset_index()
         .to_parquet(filename_test, index=False))
        datasets = datasets.drop(selected_test)
    logger.debug('Save train metadata file to %s', filename_train)
    (metadata.loc[datasets.index].reset_index()
     .to_parquet(filename_train, index=False))
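

# Example usage sketch (ratios chosen purely for illustration): reserve
# roughly 10% of the PSMs for validation and 10% for testing, assigning
# whole datasets to each split.
# split_metadata_train_val_test('massivekb_metadata.parquet',
#                               val_ratio=0.1, test_ratio=0.1)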


def _select_datasets(datasets: pd.Series, num_to_select: int, num_tol: int)\
        -> List[str]:
    """
    Select datasets with the specified number of PSMs until the requested
    number of PSMs is approximately reached.

    Parameters
    ----------
    datasets : pd.Series
        A Series with dataset identifiers as index and number of PSMs as
        values.
    num_to_select : int
        The number of PSMs that should approximately be selected.
    num_tol : int
        The amount of deviation that is allowed from `num_to_select`.

    Returns
    -------
    List[str]
        A list of dataset identifiers that are selected.
    """
    datasets_selected, num_selected = [], 0
    for dataset, dataset_num_psms in datasets.items():
        if (num_selected + dataset_num_psms) - num_to_select < num_tol:
            datasets_selected.append(dataset)
            num_selected += dataset_num_psms
        if abs(num_to_select - num_selected) <= num_tol:
            break
    # noinspection PyTypeChecker
    return datasets_selected


def download_massive_file(massive_filename: str, dir_name: str) -> None:
    """
    Download the given file from MassIVE.

    The file is downloaded using a `wget` subprocess.

    If the file already exists it will _not_ be downloaded again.

    Parameters
    ----------
    massive_filename : str
        The MassIVE file path (relative to the MassIVE FTP server root).
    dir_name : str
        The local directory where the file will be stored.
    """
    peak_filename = os.path.join(dir_name, massive_filename.rsplit('/', 1)[-1])
    if not os.path.isfile(peak_filename):
        if not os.path.isdir(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError:
                # The directory might have been created concurrently by
                # another worker; the download can proceed regardless.
                pass
        logger.debug('Download file %s', massive_filename)
        url = f'ftp://massive.ucsd.edu/{massive_filename}'
        proc = subprocess.run(
            ['wget', '--no-verbose', '--timestamping', '--retry-connrefused',
             f'--directory-prefix={dir_name}', '--passive-ftp', url],
            capture_output=True, text=True)
        if proc.returncode != 0:
            logger.warning('Could not download file %s: wget error %d: %s',
                           peak_filename, proc.returncode, proc.stderr)


def download_massivekb_peaks(massivekb_filename: str, dir_name: str) -> None:
    """
    Download all peak files listed in the given MassIVE-KB metadata file.

    Peak files will be stored in per-dataset subdirectories of the given
    directory.

    Existing peak files will _not_ be downloaded again.

    Parameters
    ----------
    massivekb_filename : str
        The MassIVE-KB metadata file name.
    dir_name : str
        The local directory where the peak files will be stored.
    """
    filenames = (pd.read_csv(massivekb_filename, sep='\t',
                             usecols=['filename'])
                 .drop_duplicates('filename'))
    datasets = filenames['filename'].str.split('/', n=1).str[0]
    filenames['dir_name'] = datasets.apply(
        lambda dataset: os.path.join(dir_name, dataset))
    logger.info('Download peak files from MassIVE')
    joblib.Parallel(n_jobs=-1)(
        joblib.delayed(download_massive_file)(filename, dir_name)
        for filename, dir_name in zip(filenames['filename'],
                                      filenames['dir_name']))
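

# Example usage sketch (hypothetical file and directory names): download all
# peak files referenced in the MassIVE-KB metadata into per-dataset
# subdirectories of a local 'peak_files' directory.
# download_massivekb_peaks('massivekb_library_candidates.tsv', 'peak_files')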


def generate_pairs_positive(metadata_filename: str,
                            charges: Tuple[int]) -> None:
    """
    Generate index pairs for positive training pairs for the given metadata
    file.

    The positive training pairs consist of all pairs with the same peptide
    sequence in the metadata, split by precursor charge. Identity pairs are
    included.

    Pairs of row numbers in the metadata file for each positive pair are
    stored in NumPy file `{metadata_filename}_pairs_pos_{charge}.npy` in the
    same directory as the provided metadata file.

    If these files already exist they will _not_ be recreated.

    Parameters
    ----------
    metadata_filename : str
        The metadata file name. Should be a Parquet file.
    charges : Tuple[int]
        Tuple of minimum and maximum precursor charge (both inclusive) to
        include; spectra with other precursor charges will be omitted.
    """
    metadata = (pd.read_parquet(metadata_filename,
                                columns=['sequence', 'charge'])
                .reset_index().dropna())
    metadata['sequence'] = metadata['sequence'].str.replace('I', 'L')
    for charge in np.arange(charges[0], charges[1] + 1):
        pairs_filename = metadata_filename.replace(
            '.parquet', f'_pairs_pos_{charge}.npy')
        if not os.path.isfile(pairs_filename):
            logger.info('Generate positive pair indexes for charge %d from '
                        'metadata file %s', charge, metadata_filename)
            same_row_nums = metadata[metadata['charge'] == charge].groupby(
                'sequence', sort=False)['index']
            logger.debug('Save positive pair indexes for charge %d to file '
                         '%s', charge, pairs_filename)
            np.save(pairs_filename, np.asarray(
                [[np.uint32(p1), np.uint32(p2)]
                 for p1, p2 in itertools.chain(*(same_row_nums.apply(
                     functools.partial(itertools.combinations, r=2))))],
                dtype=np.uint32))
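

# Example usage sketch (hypothetical file names): each saved
# '..._pairs_pos_{charge}.npy' file holds an (n_pairs, 2) uint32 array of
# row indexes into the metadata file.
# generate_pairs_positive('massivekb_metadata.parquet', charges=(2, 3))
# pairs_pos_2 = np.load('massivekb_metadata_pairs_pos_2.npy')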


def generate_pairs_negative(metadata_filename: str, charges: Tuple[int],
                            mz_tolerance: float, fragment_tolerance: float,
                            matching_fragments_threshold: float) \
        -> None:
    """
    Generate index pairs for negative training pairs for the given metadata
    file.

    The negative training pairs consist of all pairs with a different peptide
    sequence, a precursor m/z difference smaller than the given m/z
    tolerance, and mostly non-overlapping b and y ions, split by precursor
    charge.

    Pairs of row numbers in the metadata file for each negative pair are
    stored in NumPy file `{metadata_filename}_pairs_neg_{charge}.npy` in the
    same directory as the provided metadata file.

    If these files already exist they will _not_ be recreated.

    Parameters
    ----------
    metadata_filename : str
        The metadata file name. Should be a Parquet file.
    charges : Tuple[int]
        Tuple of minimum and maximum precursor charge (both inclusive) to
        include; spectra with other precursor charges will be omitted.
    mz_tolerance : float
        Maximum precursor m/z tolerance in ppm for two PSMs to be considered
        a negative pair.
    fragment_tolerance : float
        Maximum fragment m/z tolerance in Da for two fragments to be
        considered overlapping (to avoid overly similar negative pairs).
    matching_fragments_threshold : float
        Maximum ratio of matching fragments relative to the number of b and y
        ions of the shortest peptide to be considered a negative pair (to
        avoid overly similar negative pairs).
    """
    metadata = (pd.read_parquet(metadata_filename,
                                columns=['sequence', 'charge', 'mz'])
                .reset_index().dropna()
                .sort_values(['charge', 'mz']).reset_index(drop=True))
    for charge in np.arange(charges[0], charges[1] + 1):
        pairs_filename = metadata_filename.replace(
            '.parquet', f'_pairs_neg_{charge}.npy')
        if not os.path.isfile(pairs_filename):
            logger.info('Generate negative pair indexes for charge %d from '
                        'metadata file %s', charge, metadata_filename)
            metadata_charge = metadata[metadata['charge'] == charge]
            # List because Numba can't handle object (string) arrays.
            sequences = nb.typed.List(metadata_charge['sequence']
                                      .str.replace('I', 'L')
                                      .apply(_remove_mod))
            fragments = nb.typed.List(metadata_charge['sequence']
                                      .apply(_get_theoretical_fragment_mzs))
            logger.debug('Save negative pair indexes for charge %d to file '
                         '%s', charge, pairs_filename)
            np.save(pairs_filename, np.fromiter(_generate_pairs_negative(
                metadata_charge['index'].values, metadata_charge['mz'].values,
                sequences, fragments, mz_tolerance, fragment_tolerance,
                matching_fragments_threshold), np.uint32)
                .reshape((-1, 2)))
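

# Example usage sketch (tolerances chosen purely for illustration): generate
# negative pairs whose precursors are within 10 ppm of each other and whose
# theoretical b/y ions (0.05 Da fragment tolerance) overlap for less than
# 25% of the shortest peptide's fragments.
# generate_pairs_negative('massivekb_metadata.parquet', charges=(2, 3),
#                         mz_tolerance=10, fragment_tolerance=0.05,
#                         matching_fragments_threshold=0.25)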


@functools.lru_cache(None)
def _get_theoretical_fragment_mzs(sequence: str) -> np.ndarray:
    """
    Get the theoretical b and y ion m/z values for the given peptide sequence.

    Parameters
    ----------
    sequence : str
        The peptide sequence for which b and y ion m/z values will be
        calculated.

    Returns
    -------
    np.ndarray
        An array of sorted m/z values of the b and y ions for the given
        peptide sequence.
    """
    # Correct for 0-based offsets required by spectrum_utils.
    mods, mod_pos_offset = {}, 1
    for match in re.finditer(regex_mod, sequence):
        mods[match.start() - mod_pos_offset] = float(match.group(0))
        mod_pos_offset += match.end() - match.start()
    # noinspection PyProtectedMember
    return np.asarray([fragment.calc_mz for fragment in
                       sus._get_theoretical_peptide_fragments(
                           _remove_mod(sequence), mods)])


@functools.lru_cache(None)
def _remove_mod(peptide: str) -> str:
    """
    Remove modifications indicated by a delta mass from the peptide sequence.

    Parameters
    ----------
    peptide : str
        The given peptide sequence string.

    Returns
    -------
    str
        The normalized peptide sequence, or None if the input was None.
    """
    return regex_non_alpha.sub('', peptide) if peptide is not None else None


@nb.njit
def _generate_pairs_negative(row_nums: np.ndarray, mzs: np.ndarray,
                             sequences: nb.typed.List,
                             fragments: nb.typed.List,
                             precursor_mz_tol: float, fragment_mz_tol: float,
                             matching_fragments_threshold: float) \
        -> Iterator[int]:
    """
    Numba utility function to efficiently generate row numbers for negative
    pairs.

    Parameters
    ----------
    row_nums : np.ndarray
        A NumPy array of row numbers for each PSM.
    mzs : np.ndarray
        A NumPy array of precursor m/z values for each PSM.
    sequences : nb.typed.List
        A list of peptide sequences for each PSM.
    fragments : nb.typed.List
        Theoretical fragments of the peptides corresponding to each PSM.
    precursor_mz_tol : float
        Maximum precursor m/z tolerance in ppm for two PSMs to be considered
        a negative pair.
    fragment_mz_tol : float
        Maximum fragment m/z tolerance in Da for two fragments to be
        considered overlapping.
    matching_fragments_threshold : float
        Maximum ratio of matching fragments relative to the number of b and y
        ions of the shortest peptide to be considered a negative pair.

    Returns
    -------
    Iterator[int]
        A generator of row numbers for the negative pairs, with row numbers
        `i` and `i + 1` forming pairs.
    """
    for row_num1 in range(len(row_nums)):
        row_num2 = row_num1 + 1
        while (row_num2 < len(mzs) and
               (abs(suu.mass_diff(mzs[row_num1], mzs[row_num2], False))
                <= precursor_mz_tol)):
            if sequences[row_num1] != sequences[row_num2]:
                fragments1 = fragments[row_num1]
                fragments2 = fragments[row_num2]
                num_matching_fragments = 0
                for fragment1_i, fragment2 in zip(
                        np.searchsorted(fragments1, fragments2), fragments2):
                    fragment1_left = fragments1[max(0, fragment1_i - 1)]
                    fragment1_right = fragments1[min(fragment1_i,
                                                     len(fragments1) - 1)]
                    if ((abs(fragment1_left - fragment2) < fragment_mz_tol)
                            or (abs(fragment1_right - fragment2)
                                < fragment_mz_tol)):
                        num_matching_fragments += 1
                if num_matching_fragments < matching_fragments_threshold * min(
                        len(fragments1), len(fragments2)):
                    yield row_nums[row_num1]
                    yield row_nums[row_num2]
            row_num2 += 1
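

# Minimal end-to-end sketch of how the functions above could be chained
# together. File names, charge range, split ratios, and tolerances are
# illustrative placeholders, not values prescribed by this module.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Hypothetical input (downloaded manually from MassIVE) and derived files.
    massivekb_tsv = 'massivekb_library_candidates.tsv'
    metadata_parquet = 'massivekb_metadata.parquet'
    # Convert the MassIVE-KB metadata and split it by dataset.
    convert_massivekb_metadata(massivekb_tsv, metadata_parquet,
                               charges=(2, 3))
    split_metadata_train_val_test(metadata_parquet, val_ratio=0.1,
                                  test_ratio=0.1)
    # Download the referenced peak files into per-dataset subdirectories.
    download_massivekb_peaks(massivekb_tsv, 'peak_files')
    # Generate training pair indexes; the training split is shown here, but
    # any of the generated metadata Parquet files could be used.
    metadata_train = metadata_parquet.replace('.parquet', '_train.parquet')
    generate_pairs_positive(metadata_train, charges=(2, 3))
    generate_pairs_negative(metadata_train, charges=(2, 3),
                            mz_tolerance=10, fragment_tolerance=0.05,
                            matching_fragments_threshold=0.25)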