-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRezzonico_deduper.py
210 lines (171 loc) · 6.33 KB
/
Rezzonico_deduper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python3
'''
Author: Mitchell Rezzonico
Description: This script takes a SAM file and outputs a properly formatted SAM file that
has removed all PCR duplicates. The first time a unique read is seen, it will be written to
the output file. Subsequent versions of the same read will not be saved. The user must provide
a list of UMIs formatted in the same way as STL96.txt in this repository. Alignments with UMIs
that are not in submitted list are considered illigitimate and will not be considered. The input
SAM file must have already been sorted by chromosome with samtools sort. The input SAM file must
also have .sam in its name.
'''
# module load python3/3.6.1
import argparse
import re
def get_arguments():
parser = argparse.ArgumentParser(description="Allows users to know how to navigate this program")
parser.add_argument("-f", "--file", help="Absolute file path. Must include .sam in the name", required=True, type=str)
parser.add_argument("-p", "--paired", help="(Currently unsupported) Designates file is paired end (not single-end)", required=False, type=str)
parser.add_argument("-u", "--umi", help="Designates file containing the list of UMIs", required=True, type=str)
return parser.parse_args()
################################
###### Argparse Variables ######
################################
args = get_arguments()
file = args.file
paired = args.paired
umi = args.umi
if paired != None:
print("I suck and did not take the time to make this script capable of accepting paired end reads.")
sys.exit()
#######################################
######## High Level Function ##########
#######################################
def correct_5prime_start(FLAG:str, POS:str, CIGAR:str) -> int:
'''
FLAG: Bitwise flag indicative of forward/reverse strand
POS: Left-most mapping position
CIGAR: Cigar string
returns: Corrected left-most mapping position for forward (sense) strand or
right-most mapping position for reverse (antisense) strand. Returns original
position if there are no cigar operations.
'''
if((int(FLAG) & 16) != 16):
# search the forward (sense) strand
sense_S = re.findall("^\d+S", CIGAR) # search beginning of string for numbers that preceed S
if sense_S:
# grab any softclipping isolated by findall
sense_clipped = int(sense_S[0][:-1]) # isolate the number
new_pos = int(POS) - sense_clipped # adjust the position for sense strand
return(new_pos)
else:
return(int(POS))
else:
# search the reverse (antisense) strand
new_pos = int(POS)
anti_S = re.findall("\d+S$", CIGAR) # search end of string for numbers that preceed S
if anti_S != []:
sum_S = 0
S_list = []
for soft_clip in anti_S:
# grab any softclipping isolated by findall
soft_clips = soft_clip[:-1]
soft_clips = int(soft_clips)
S_list.append(soft_clips)
sum_S = sum(S_list)
new_pos += sum_S
M = re.findall("\d+M", CIGAR)
if M != []:
sum_M = 0
M_list = []
for match in M:
# grab any matches/mismatches that were isolated by findall
matches = match[:-1]
matches = int(matches)
M_list.append(matches)
sum_M = sum(M_list)
new_pos += sum_M
D = re.findall("\d+D", CIGAR)
if D != []:
sum_deletions = 0
deletion_list = []
for deletion in D:
# grab any deletions that were isolated by findall
deletions = deletion[:-1]
deletions = int(deletions)
deletion_list.append(deletions)
summed_deletions = sum(deletion_list)
new_pos += sum_deletions
N = re.findall("\d+N", CIGAR)
if N != []:
sum_skips = 0
skip_list = []
for skip in N:
# grab any skipped regions due to alternative splicing that were isolated by findall
skips = skip[:-1]
skips = int(skips)
skip_list.append(skips)
summed_skips = sum(skip_list)
new_pos += summed_skips
return(new_pos)
#######################################
########## Open Output File ###########
#######################################
output_name = re.search("([A-Za-z0-9]+\.sam)$", file).group(1) # removes the full path to preserve original .sam file name
deduped = open(output_name+"_deduped", "wt")
#######################################
######## Check for UMI list ###########
#######################################
if umi != None:
with open(umi, "r") as umis:
umi_list = []
for u in umis:
umi_list.append(u.strip())
else:
print("This code is currently only equipped to handle UMIs. Randomers will be a feature in the next update.")
sys.exit()
#######################################
##### Open Input, Grab Uniques ########
#######################################
uniq_reads = set()
chroms_seen = set()
total_ct = 0
dup_ct = 0
with open(file, "rt") as f:
while True: # keep looping until the break
full_sam = f.readline() # maintains entire SAM line
sam_list = full_sam.strip().split() # splits into list to isolate components of interest
if sam_list == []:
break
if full_sam.startswith("@"):
# write out header lines
deduped.write(full_sam)
continue
total_ct += 1
umi = sam_list[0][-8:]
strand = sam_list[1]
chrom = sam_list[2]
left_most = sam_list[3]
cig_string = sam_list[5]
if total_ct == 1:
# special case for the very first the time loop runs
chroms_seen.add(chrom)
if umi in umi_list:
new_start = correct_5prime_start(strand, left_most, cig_string) # make these variables
#print(new_start)
set_sam = (umi, strand, chrom, new_start) # umi, strand, chromosome, left most position
#print(set_sam)
if set_sam not in uniq_reads:
# must be a unique read - immediately write out to new file
uniq_reads.add(set_sam)
deduped.write(full_sam)
else:
dup_ct +=1
continue
if chrom not in chroms_seen:
# refresh the set to conserve memory once an entire chromosome has been seen
chroms_seen.add(chrom)
uniq_reads = set()
deduped.write(full_sam) # first unique instant
uniq_reads.add(set_sam) # add it to the set immediately
else:
# dont include UMIs that are not legitimate
continue
deduped.close()
################################
## Relevant Output Statements ##
################################
dup_proportion = (dup_ct / total_ct) * 100
dup_proportion = round(dup_proportion, 2)
print("Given {} SAM file alignments, {} were identified as PCR duplicates".format(total_ct, dup_ct))
print("PCR duplicates compose {}% of your file".format(dup_proportion))