-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp_context.py
107 lines (102 loc) · 4.48 KB
/
app_context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import ast
import json
from collections import Counter

from gensim.models import Word2Vec

import define
from tool.fen_ci import cut_words, filter_tags
class AppCtx(object):
    """Application-wide state: crawler settings, loaded resources, reader state.

    Resources (word-vector model, stop-word table, comments) start out as
    ``None`` and are populated by :meth:`init` / :meth:`reload` or by the
    individual ``_load_*`` / ``_build_*`` helpers.
    """

    def __init__(self):
        # Answers API URL template: {0} is the question id, {1} the paging offset.
        self.url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={1}&platform=desktop&sort_by=default"
        self.question_id = -1
        self.initial_offset = 0

        # Lazily loaded resources (None until loaded).
        self.model = None                     # Word2Vec model, see _load_model
        self.stop_word_table = None           # list of stop words
        self.comments = None                  # list of parsed comment records
        self.comments_rich_text_split = None  # output of cut_words over all comments
        self.word_count = None                # [(word, count), ...] by descending count
        self.doc_vec_list = None
        self.gender_filter = None

        self.crawler_website = ''  # zhi_hu dou_ban...
        self.is_crawler_finish = False  # whether the crawler has already been run
        self.cur_comment = 0  # index of the comment currently shown in the reader
        self.cur_recommend = 0
        self.recommend_list = []
        self.like_list = []
        self.keyword_include = []
        self.keyword_exclude = []

    def _default_model_path(self):
        """Return the model path for the configured word-vector dimension."""
        return './data/' + str(define.word_vector_dimension) + '/wiki_model'

    def init(self):
        """Unconditionally (re)load the model, stop-word table and comments."""
        self._load_model(self._default_model_path())
        # _build_stop_word_table stores the table on self and returns True;
        # assigning its return value would replace the table with True.
        self._build_stop_word_table(define.stop_word_table_path)
        self._load_comments(define.comments_path)

    def reload(self):
        """Load only the resources that are still ``None``."""
        if self.model is None:
            self._load_model(self._default_model_path())
        if self.stop_word_table is None:
            # Same as in init(): do not assign the boolean return value.
            self._build_stop_word_table(define.stop_word_table_path)
        if self.comments is None:
            self._load_comments(define.comments_path)

    def _build_stop_word_table(self, path):
        """Read one stop word per line from *path* into ``self.stop_word_table``.

        Only the line terminator is stripped from each entry. A literal
        newline is appended as an extra stop word. Always returns ``True``.
        """
        with open(path) as f:
            stop_word_table = [line.strip('\n') for line in f]
        stop_word_table.append('\n')
        self.stop_word_table = stop_word_table
        return True

    def _load_comments(self, file_path):
        """Parse one Python literal per line of *file_path* into ``self.comments``.

        Surrounding whitespace is ignored and blank lines are skipped, so a
        trailing empty line no longer makes ``ast.literal_eval`` fail.

        Raises:
            ValueError, SyntaxError: if a non-blank line is not a valid literal.
        """
        comments_list = []
        with open(file_path) as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                comments_list.append(ast.literal_eval(line))
        self.comments = comments_list

    def _load_model(self, file_path):
        """Load the Word2Vec model stored at *file_path* into ``self.model``."""
        self.model = Word2Vec.load(file_path)

    def _get_comments_rich_text(self):
        """Concatenate all comment contents, tokenize them and count the words.

        The tag-filtered ``'content'`` of every comment is joined into one
        string, passed through ``cut_words`` with the stop-word table, and
        stored in ``self.comments_rich_text_split``; ``self.word_count`` is
        then refreshed. Always returns ``True``.
        """
        if self.stop_word_table is None:
            self._build_stop_word_table(define.stop_word_table_path)
        joined = ''.join(filter_tags(c['content']) for c in self.comments)
        self.comments_rich_text_split = cut_words(joined, self.stop_word_table)
        self._get_word_count()
        return True

    def _get_word_count(self):
        """Count the words in ``self.comments_rich_text_split``.

        The text is split on single spaces and the result is stored in
        ``self.word_count`` as ``(word, count)`` tuples in descending count
        order; ties keep first-occurrence order.

        Returns:
            ``False`` if there is no split text yet, ``True`` otherwise.
        """
        if self.comments_rich_text_split is None:
            return False
        word_list = self.comments_rich_text_split.split(' ')
        # Single pass instead of one list.count() scan per distinct word.
        self.word_count = Counter(word_list).most_common()
        return True

    def get_word_count_result(self, n):
        """Format the *n* most frequent words as ``"word count\\n"`` lines.

        Returns:
            The formatted string (empty when *n* <= 0 or there are no words),
            or ``False`` if the word counts have not been computed yet.
        """
        if self.word_count is None:
            return False
        # Clamp so a negative n yields nothing rather than slicing from the end.
        top = self.word_count[:max(n, 0)]
        return ''.join(word + ' ' + str(count) + '\n' for word, count in top)
if __name__ == "__main__":
    # Manual smoke run: load the crawled comments from the local data folder
    # and build the tokenized text plus word counts on the context object.
    app_ctx = AppCtx()
    app_ctx._load_comments("./data/comments.txt")
    # The call returns True; the tokenized text itself is stored on
    # app_ctx.comments_rich_text_split.
    text = app_ctx._get_comments_rich_text()