-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorfHMM_Viterbi.py
413 lines (326 loc) · 15.7 KB
/
orfHMM_Viterbi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
from math import inf
from math import log
import numpy as np
from scipy.stats import nbinom
from collections import defaultdict
# # log func
# In[ ]:
def log_func(number):
'''
convert to log form
number: scalar
output: scalar
'''
if number > 0:
return log(number)
else:
return -inf
# # log-sum-exp
# In[ ]:
def logSumExp(x):
'''
HMM Notes p17
log sum(exp(x1)+exp(x2)+...)
x: list
result: scalar
'''
m = max(x)
if m == -inf: # infinity
return -inf
else:
minus_m = np.array(x) - m
result = m + log(sum(np.exp(minus_m)))
return result
# # log likelihood f(x)
# In[ ]:
def lnNB(x, alpha, beta, E):
'''
Notes p2
x: scalar. Realization from NB distribution
alpha, beta: scalar. Parameters from Gamma distribution
E: scalar. Normalization factor for n-th sequence
result: scalar. Log likelihood f(x)
'''
r = alpha
p = beta / (E+beta)
result = nbinom.logpmf(x, r, p) # x follows NB(alpha, beta/(E+beta))
return result
# # Viterbi
# In[ ]:
def start_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1):
'''
Calculate Viterbi algorithm given the next codon won't start
i: integer. Current i-th element inside this RNA sequence
prob_current: a list of 21 or 10 probabilites. Indicates current forward algorithm
prob_next: a list of 21 or 10 probabilites. Indicates next forward algorithm, updating this value
observed_data: a list. Indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: scalar. Normalization factor for this specific sequences
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: predict state
'''
if model1 == True:
# state 1 to state 1
temp = prob_current[0] + log(1) + lnNB(observed_data[i+1], alpha_list[0], beta_list[0], E)
prob_next[0] = temp
output[i+1][0] = 1
# state 10 and state 11 to state 11
log_10_11 = prob_current[9] + log(1)
log_11_11 = prob_current[10] + log(1)
if log_10_11 > log_11_11:
output[i+1][10] = 10
else:
output[i+1][10] = 11
# find maximum
temp = max(log_10_11, log_11_11)
prob_next[10] = temp + lnNB(observed_data[i+1], alpha_list[10], beta_list[10], E)
elif model1 == False:
# state 1 and state 10 to state 1
log_1_1 = prob_current[0] + log(1)
log_10_1 = prob_current[9] + log(1)
if log_1_1 > log_10_1:
output[i+1][0] = 1
else:
output[i+1][0] = 10
temp = max(log_1_1, log_10_1)
prob_next[0] = temp + lnNB(observed_data[i+1], alpha_list[0], beta_list[0], E)
return (prob_next, output)
# In[ ]:
def start_codon_true_viterbi(i, prob_current, prob_next, trans, next_codon, observed_data, alpha_list, beta_list, E, output, model1):
'''
Calculate Viterbi algorithm given the next codon is one of the start codons
i: integer. Current i-th element inside this RNA sequence
prob_current: a list of 21 or 10 probabilites. Indicates current forward algorithm
prob_next: a list of 21 or 10 probabilites. Indicates next forward algorithm, updating this value
trans: a dictionary that key is the start codon (string), value is a list of scalars (three transition probability)
next_codon: string. Indicates the specific start codon
observed_data: a list. Indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: scalar. Normalization factor for this specific sequences
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: predict state
'''
if model1 == True:
# state 1 to state 1
log_1_1 = log_func(1 - trans[next_codon][0] - trans[next_codon][1])
temp = prob_current[0] + log_1_1 + lnNB(observed_data[i+1], alpha_list[0], beta_list[0], E)
prob_next[0] = temp
output[i+1][0] = 1
# state 1 to state 2
log_1_2 = log_func(trans[next_codon][0])
temp = prob_current[0] + log_1_2 + lnNB(observed_data[i+1], alpha_list[1], beta_list[1], E)
prob_next[1] = temp
output[i+1][1] = 1
# state 1 and state 11 to state 12
log_1_12 = prob_current[0] + log_func(trans[next_codon][1])
log_11_12 = prob_current[10] + log_func(trans[next_codon][2])
if log_1_12 > log_11_12:
output[i+1][11] = 1
else:
output[i+1][11] = 11
# find maximum
temp = max(log_1_12, log_11_12)
prob_next[11] = temp + lnNB(observed_data[i+1], alpha_list[11], beta_list[11], E)
# state 10 and state 11 to state 11
log_10_11 = prob_current[9] + log(1)
log_11_11 = prob_current[10] + log_func(1 - trans[next_codon][2])
if log_10_11 > log_11_11:
output[i+1][10] = 10
else:
output[i+1][10] = 11
# find maximum
temp = max(log_10_11, log_11_11)
prob_next[10] = temp + lnNB(observed_data[i+1], alpha_list[10], beta_list[10], E)
elif model1 == False:
# state 1 and state 10 to state 1
log_1_1 = prob_current[0] + log_func(1 - trans[next_codon])
log_10_1 = prob_current[9] + log(1)
if log_1_1 > log_10_1:
output[i+1][0] = 1
else:
output[i+1][0] = 10
temp = max(log_1_1, log_10_1)
prob_next[0] = temp + lnNB(observed_data[i+1], alpha_list[0], beta_list[0], E)
# state 1 to state 2
log_1_2 = log_func(trans[next_codon])
temp = prob_current[0] + log_1_2 + lnNB(observed_data[i+1], alpha_list[1], beta_list[1], E)
prob_next[1] = temp
output[i+1][1] = 1
return (prob_next, output)
# In[ ]:
def stop_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1):
'''
Calculate Viterbi algorithm given the next codon won't stop
i: integer. Current i-th element inside this RNA sequence
prob_current: a list of 21 or 10 probabilites. Indicates current forward algorithm
prob_next: a list of 21 or 10 probabilites. Indicates next forward algorithm, updating this value
observed_data: a list. Indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: scalar. Normalization factor for this specific sequences
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: predict state
'''
# state 4 and state 7 to state 5
log_4_5 = prob_current[3] + log(1)
log_7_5 = prob_current[6] + log(1)
if log_4_5 > log_7_5:
output[i+1][4] = 4
else:
output[i+1][4] = 7
# find maximum
temp = max(log_4_5, log_7_5)
prob_next[4] = temp + lnNB(observed_data[i+1], alpha_list[4], beta_list[4], E)
if model1 == True:
# state 14 and state 17 to state 15
log_14_15 = prob_current[13] + log(1)
log_17_15 = prob_current[16] + log(1)
if log_14_15 > log_17_15:
output[i+1][14] = 14
else:
output[i+1][14] = 17
# find maximum
temp = max(log_14_15, log_17_15)
prob_next[14] = temp + lnNB(observed_data[i+1], alpha_list[14], beta_list[14], E)
return (prob_next, output)
# In[ ]:
def stop_codon_true_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1):
'''
Calculate forward algorithm given the next codon is one of the stop codons
i: integer. Current i-th element inside this RNA sequence
prob_current: a list of 21 or 10 probabilites. Indicates current forward algorithm
prob_next: a list of 21 or 10 probabilites. Indicates next forward algorithm, updating this value
observed_data: a list. Indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: scalar. Normalization factor for this specific sequences
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: predict state
'''
# state 4 to state 5
prob_next[4] = prob_current[3] + log(1) + lnNB(observed_data[i+1], alpha_list[4], beta_list[4], E)
output[i+1][4] = 4
# state 7 to state 8
prob_next[7] = prob_current[6] + log(1) + lnNB(observed_data[i+1], alpha_list[7], beta_list[7], E)
output[i+1][7] = 7
if model1 == True:
# state 14 to 15
prob_next[14] = prob_current[13] + log(1) + lnNB(observed_data[i+1], alpha_list[14], beta_list[14], E)
output[i+1][14] = 14
# state 17 to state 18
prob_next[17] = prob_current[16] + log(1) + lnNB(observed_data[i+1], alpha_list[17], beta_list[17], E)
output[i+1][17] = 17
return (prob_next, output)
# In[ ]:
def viterbi(RNA_data, observed_data, alpha_list, beta_list, E, trans, stop_codon_list, num_sequence, model1):
'''
Compute Viterbi algorithm
RNA_data: a list of lists. Each inner list indicates a single RNA sequence and this list contains letters 'A', 'C', 'U', 'G'
observed_data: a list of lists. Each inner list indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: a list of scalars. Normalization factor for all sequences
trans: a dictionary that key is the start codon (string), value is a list of scalars (three transition probability)
stop_codon_list: a list of stop codons (string)
num_sequence: scalar indicates n-th sequence
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: a list of predicted states
'''
# Find E, observed_data, RNA_data according the order of sequences
E = E[num_sequence - 1]
observed_data = observed_data[num_sequence - 1]
RNA_data = RNA_data[num_sequence - 1]
sequence_length = len(RNA_data) # length of data
if model1 == True:
output = np.zeros((sequence_length, 21)) # initialize matrix
prob_current = np.ones(21) * (-inf) # initialize probability, 21 different states
elif model1 == False:
output = np.zeros((sequence_length, 10))
prob_current = np.ones(10) * (-inf) # initialize probability, 10 different states
prob_current[0] = lnNB(observed_data[0], alpha_list[0], beta_list[0], E)
for i in range(sequence_length - 1):
start_codon = False
stop_codon = False
next_codon = ""
if model1 == True:
prob_next = np.ones(21) * (-inf)
elif model1 == False:
prob_next = np.ones(10) * (-inf)
# Check start and stop codon
if i + 3 < sequence_length:
next_codon = RNA_data[i+1] + RNA_data[i+2] + RNA_data[i+3]
if next_codon in trans.keys():
start_codon = True
if next_codon in stop_codon_list:
stop_codon = True
# start codon is false
if start_codon == False:
prob_next = start_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[0]
output = start_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[1]
# start codon is ture
elif start_codon == True:
prob_next = start_codon_true_viterbi(i, prob_current, prob_next, trans, next_codon, observed_data,
alpha_list, beta_list, E, output, model1)[0]
output = start_codon_true_viterbi(i, prob_current, prob_next, trans, next_codon, observed_data,
alpha_list, beta_list, E, output, model1)[1]
# stop codon is ture
if stop_codon == True:
prob_next = stop_codon_true_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[0]
output = stop_codon_true_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[1]
# stop codon is false
elif stop_codon == False:
prob_next = stop_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[0]
output = stop_codon_false_viterbi(i, prob_current, prob_next, observed_data, alpha_list, beta_list, E, output, model1)[1]
# transter to next state with probability 1
if model1 == True:
sure_to_transit = [1, 2, 4, 5, 7, 8, 11, 12, 14, 15, 17, 18]
elif model1 == False:
sure_to_transit = [1, 2, 4, 5, 7, 8]
for k in sure_to_transit:
temp = prob_current[k] + log(1) + lnNB(observed_data[i+1], alpha_list[k+1], beta_list[k+1], E)
prob_next[k+1] = temp
output[i+1][k+1] = k+1
if model1 == True:
# state 20 and state 21 to state 21
log_20_21 = prob_current[19] + log(1)
log_21_21 = prob_current[20] + log(1)
if log_20_21 > log_21_21:
output[i+1][20] = 20
else:
output[i+1][20] = 21
temp = max(log_20_21, log_21_21)
prob_next[20] = temp + lnNB(observed_data[i+1], alpha_list[20], beta_list[20], E)
prob_current = prob_next
# find the index of maximum
output_list = [list(prob_current).index(max(prob_current))+1]
start = len(output) - 1
while start >= 1:
index = int(output_list[-1] - 1)
current = int(output[start][index])
output_list.append(current)
start -= 1
output_list.reverse()
return output_list
# In[ ]:
def viterbi_sequence(RNA_data, observed_data, alpha_list, beta_list, E, trans, stop_codon_list, model1):
'''
Compute predicted states for all RNA sequences by combining result from each single sequence
RNA_data: a list of lists. Each inner list indicates a single RNA sequence and this list contains letters 'A', 'C', 'U', 'G'
observed_data: a list of lists. Each inner list indicates the height of a sequence and this list contains scalars
alpha_list: a list of alpha values for 21 or 10 states (NB parameter)
beta_list: a list of beta values for 21 or 10 states (NB parameter)
E: a list of scalars. Normalization factor for all sequences
trans: a dictionary that key is the start codon (string), value is a list of scalars (three transition probability)
stop_codon_list: a list of stop codons (string)
model1: boolean (True/False). Identify it's model1 (21-states: True) or model2 (10-states: False)
output: matrix stores the Viterbi algorithm for multiple RNA sequences
'''
output_list = []
for i in range(len(RNA_data)):
output_list.append(viterbi(RNA_data, observed_data, alpha_list, beta_list, E, trans, stop_codon_list, i+1, model1))
return output_list