-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdeduper_script.py
199 lines (155 loc) · 5.23 KB
/
deduper_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
# write in comments for what I think an if statement
# module load python3/3.6.1
import argparse
import re
def get_arguments():
parser = argparse.ArgumentParser(description="Allows users to know how to navigate this program")
parser.add_argument("-f", "--file", help="Absolute file path. Must include .sam in the name", required=True, type=str)
parser.add_argument("-p", "--paired", help="(Currently unsupported) Designates file is paired end (not single-end)", required=False, type=str)
parser.add_argument("-u", "--umi", help="Designates file containing the list of UMIs ", required=True, type=str)
return parser.parse_args()
################################
###### Argparse Variables ######
################################
args = get_arguments()
file = args.file
paired = args.paired
umi = args.umi
if paired != None:
print("I suck and did not take the time to make this script capable of accepting paured end reads.")
sys.exit()
#######################################
######## High Level Function ##########
#######################################
def correct_5prime_start(FLAG, POS, CIGAR):
if((int(FLAG) & 16) != 16):
# search the forward (sense) strand
sense_S = re.findall("^\d+S", CIGAR) # search beginning of string for numbers that preceed S
if sense_S:
# grab any softclipping isolated by findall
sense_clipped = int(sense_S[0][:-1]) # isolate the number
new_pos = int(POS) - sense_clipped # adjust the position for sense strand
return(new_pos)
else:
return(int(POS))
else:
# search the reverse (antisense) strand
new_pos = int(POS)
anti_S = re.findall("\d+S$", CIGAR) # search end of string for numbers that preceed S
if anti_S != []:
sum_S = 0
S_list = []
for soft_clip in anti_S:
# grab any softclipping isolated by findall
soft_clips = soft_clip[:-1]
soft_clips = int(soft_clips)
S_list.append(soft_clips)
sum_S = sum(S_list)
new_pos += sum_S
M = re.findall("\d+M", CIGAR)
if M != []:
sum_M = 0
M_list = []
for match in M:
# grab any matches/mismatches that were isolated by findall
matches = match[:-1]
matches = int(matches)
M_list.append(matches)
sum_M = sum(M_list)
new_pos += sum_M
D = re.findall("\d+D", CIGAR)
if D != []:
sum_deletions = 0
deletion_list = []
for deletion in D:
# grab any deletions that were isolated by findall
deletions = deletion[:-1]
deletions = int(deletions)
deletion_list.append(deletions)
summed_deletions = sum(deletion_list)
new_pos += sum_deletions
N = re.findall("\d+N", CIGAR)
if N != []:
sum_skips = 0
skip_list = []
for skip in N:
# grab any skipped regions due to alternative splicing that were isolated by findall
skips = skip[:-1]
skips = int(skips)
skip_list.append(skips)
summed_skips = sum(skip_list)
new_pos += summed_skips
return(new_pos)
#######################################
########## Open Output File ###########
#######################################
output_name = re.search("([A-Za-z0-9]+\.sam)$", file).group(1)
#prefix name?
deduped = open(output_name+"_deduped", "wt")
#######################################
######## Check for UMI list ###########
#######################################
if umi != None:
with open(umi, "r") as umis:
umi_list = []
for u in umis:
umi_list.append(u.strip())
# assert statement here?
#######################################
##### Open Input, Grab Uniques ########
#######################################
uniq_reads = set()
chroms_seen = set()
total_ct = 0
dup_ct = 0
with open(file, "rt") as f:
while True: # keep looping until the break
full_sam = f.readline() # maintains entire SAM line
sam_list = full_sam.strip().split() # splits into list to isolate components of interest
if sam_list == []:
break
if full_sam.startswith("@"):
# write out header lines
deduped.write(full_sam)
continue
total_ct += 1
if total_ct == 1:
# special case for the very first the time loop runs
chroms_seen.add(sam_list[2])
umi = sam_list[0][-8:]
strand = sam_list[1]
chrom = sam_list[2]
left_most = sam_list[3]
cig_string = sam_list[5]
if umi in umi_list:
new_start = correct_5prime_start(strand, left_most, cig_string) # make these variables
#print(new_start)
set_sam = (umi, strand, chrom, new_start) # QNAME, strand, chromosome, left most position
#print(set_sam)
if set_sam not in uniq_reads:
# must be a unique read - immediately write out to new file
uniq_reads.add(set_sam)
deduped.write(full_sam)
else:
dup_ct +=1
continue
if chrom not in chroms_seen:
# refresh the set to conserve memory
chroms_seen.add(chrom)
uniq_reads = set()
deduped.write(full_sam) # first unique instant
uniq_reads.add(set_sam) # add it to the set immediately
else:
# dont include UMIs that are not legitimate
continue
deduped.close()
################################
## Relevant Output Statements ##
################################
#print(uniq_reads)
#print(total_ct)
#print(dup_ct)
dup_proportion = (dup_ct / total_ct) * 100
dup_proportion = round(dup_proportion, 2)
print("PCR duplicates compose {}% of your file".format(dup_proportion))