-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter-seqs.py
executable file
·42 lines (32 loc) · 1.04 KB
/
filter-seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python
import click
import pandas as pd
import numpy as np
import skbio
@click.command()
@click.option('--tsv', help='filepath to nextstrain formatted tsv metadata',
              type=click.File(mode='r'), required=True)
@click.argument('input', type=click.Path())
@click.argument('output', type=click.Path())
def main(input, output, tsv):
    """Filter a FASTA file down to the sequences listed in a metadata TSV.

    INPUT is the FASTA file to read and OUTPUT is the FASTA file to write.
    A sequence is kept when the second '|'-separated field of its header id
    appears in the 'gisaid_epi_isl' column of the --tsv metadata.

    A one-line summary is printed to stdout; if any records were skipped
    because their id has no '|'-separated second field, a warning with the
    count is written to stderr.
    """
    # Only the ID column is needed. Reading it alone, as strings, avoids
    # parsing every other metadata column and keeps the IDs comparable with
    # the string fields taken from the FASTA headers.
    id_column = pd.read_csv(tsv, sep='\t', usecols=['gisaid_epi_isl'],
                            dtype=str)['gisaid_epi_isl']
    # Build the lookup once; empty cells are dropped and duplicate rows
    # collapse, so len(wanted) is the number of distinct ids requested.
    wanted = set(id_column.dropna())

    total = 0
    kept = 0
    skipped = 0

    def matching_records():
        # The counters live in main() so the summary can be printed after
        # the writer has drained this generator.
        nonlocal total, kept, skipped
        for seq in skbio.io.read(input, format='fasta',
                                 constructor=skbio.DNA):
            total += 1
            fields = seq.metadata['id'].split('|')
            if len(fields) < 2:
                # No second field to match against: skip, but keep count so
                # the omission is reported instead of passing silently.
                skipped += 1
                continue
            if fields[1] in wanted:
                kept += 1
                yield seq

    # The generator is consumed lazily by the writer, so sequences are
    # streamed from INPUT to OUTPUT rather than held in memory together.
    skbio.io.write(matching_records(), format='fasta', into=output)

    print("For %d ids to keep, %d were found out of %d total samples"
          % (len(wanted), kept, total))
    if skipped:
        click.echo("Warning: %d records had no '|'-separated id field and "
                   "were skipped" % skipped, err=True)
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()