tuning_wordlist.py
### This program tunes the wordlist by solving a linear matrix (least-squares) equation. ###
import sys
import json
import re
from afinn import Afinn
from collections import defaultdict, OrderedDict
import numpy as np
subj = "subject"
topic = "topic"
path_text = 'path_to_data/Data/' + subj + '/data_text/'
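# normalize(): min-max rescaling of every value in mtrx into [new_min, new_max]:
#   x' = (x - min(mtrx)) * (new_max - new_min) / (max(mtrx) - min(mtrx)) + new_min
# e.g. normalize(np.array([[0], [5], [10]]), -3, 3) -> [[-3.], [0.], [3.]]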
def normalize(mtrx, new_min, new_max):
    matrix = []
    for x in mtrx:
        new_x = (x - np.min(mtrx)) * (new_max - new_min) / (np.max(mtrx) - np.min(mtrx)) + new_min
        matrix.append(new_x)
    return np.array(matrix)
if __name__ == '__main__':
    afinn = Afinn(language='id', emoticons=True)
    np.set_printoptions(threshold=np.inf, precision=1)
    #extract the candidate wordlist (one word per line)
    L = []
    with open('tuning_wordlist_candidate.txt') as f:
        for line in f:
            L.append(line.strip())
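    # data_for_training.json format (inferred from the fields read below):
    # one JSON object per line with keys "word" (dict of word -> occurrence count),
    # "text" (the document text), and "totscore" (the algorithm's total score).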
    matrixA1 = []  #for algorithm's score
    matrixA2 = []  #for expert's score
    matrixB1 = []  #for algorithm's result
    matrixB2 = []  #for expert's result
    count = 0
    with open('data_for_training.json') as f:
        for line in f:
            row = json.loads(line)
            word = row["word"]
            text = row["text"]
            totscore = row["totscore"]
            elmA1 = []
            for wd in sorted(L):
                try:
                    elmA1.append(word[wd])  #ex. word = {'ingin': 1}
                except KeyError:
                    elmA1.append(0)
            matrixA1.append(elmA1)
            elmB1 = []
            elmB1.append(totscore)
            matrixB1.append(elmB1)
            count += 1
    mA1 = np.array(matrixA1)
    mB1 = np.array(matrixB1)
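    # mA1: (n_documents x n_candidate_words) matrix of word counts per document
    # mB1: (n_documents x 1) column of the algorithm's total scores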
    #normalize the algorithm's (AFINN-based) total scores into [-3, 3]
    new_mB1 = normalize(mB1, -3, 3)
    #rescale each row of matrix A by that document's normalization factor
    #(assumes no total score in mB1 is exactly 0)
    factor = new_mB1 / mB1
    count = 0
    new_mA1 = []
    for x in mA1:
        new_mA1.append(factor[count] * x)
        count += 1
    new_mA1 = np.array(new_mA1)
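    # new_mA1: mA1 with each document row multiplied by that document's
    # normalization ratio (normalized score / original score)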
    count = 0
    file_expert = "path_to_evaluation_file_from_experts.csv"  #semicolon-separated; text in column 4, expert score in column 5
    with open(file_expert) as f:
        for line in f:
            row = line.split(";")
            doc_id = row[1]
            text = row[3]
            words = afinn.find_all(text)
            totscore = row[4]
            numW = {}
            for word in words:
                numW[word] = words.count(word)  #count each word's occurrences
            elmA2 = []
            for wd in sorted(L):
                try:
                    elmA2.append(numW[wd])
                except KeyError:
                    elmA2.append(0)
            matrixA2.append(elmA2)
            elmB2 = []
            elmB2.append(int(totscore))
            matrixB2.append(elmB2)
            count += 1
    mA2 = np.array(matrixA2)
    mB2 = np.array(matrixB2)
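    # mA2: (n_expert_documents x n_candidate_words) word-count matrix from the expert-scored texts
    # mB2: (n_expert_documents x 1) column of expert scores
    # np.linalg.lstsq below solves mA2 @ w ~= mB2 for a least-squares weight w per candidate word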
    ####begin the tuning process####
    #least-squares fit on the expert matrix
    ls = np.linalg.lstsq(mA2, mB2, rcond=None)
    factor = ls[0]
    count = 0
    new_mA2 = []
    for x in mA2:
        new_mA2.append(factor[count] * x)
        count += 1
    new_mA2 = np.array(new_mA2)
    #update the wordlist scores
    fp1 = open('path_to_wordlist1.txt', 'w')  #the tuned value simply replaces the original value
    fp2 = open('path_to_wordlist2.txt', 'w')  #the tuned value is averaged with the original AFINN value
    dA = new_mA1 - new_mA2
    dB = new_mB1 - mB2
    ls = np.linalg.lstsq(dA, dB, rcond=None)
    factor = normalize(ls[0], -5, 5)
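    # factor now holds one tuned weight per word in sorted(L), rescaled into [-5, 5]
    # (the range used by AFINN word scores)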
    new_L = {}
    new_L2 = {}
    count = 0
    for wd in sorted(L):
        new_L[wd] = factor[count]
        afinn_score = afinn.score_with_pattern(wd)
        new_L2[wd] = new_L[wd] - ((new_L[wd] - afinn_score) / 2)  #i.e. the mean of the tuned and original scores
        fp1.write(wd + "\t" + str(new_L[wd][0]) + "\n")
        fp2.write(wd + "\t" + str(new_L2[wd][0]) + "\n")
        count += 1
    fp1.close()
    fp2.close()
    print("Finished...................")