wiki_processor.py
import os
from argparse import ArgumentParser
import subprocess
import re
from pathlib2 import Path
from random import shuffle, seed, uniform
import math
from shutil import move
import utils
import wiki_utils
import text_manipulation
import wiki_thresholds
import json
logger = utils.setup_logger(__name__, 'processor_log.log', True)
doc_split_delimiter = "</doc>"
id_parts = 7
# minimal number of sentences in a document (used to filter non-informative documents such as
# https://en.wikipedia.org/wiki?curid=32283)
seed(1234)
wikipedia_namespaces = ['Category', 'File', 'Ru', 'Wikipedia', 'Talk', 'User', 'MediaWiki', 'Template', 'Help', 'Portal', 'Book', 'Draft',
                        'Education Program', 'TimedText', 'Module', 'Gadget', 'Gadget definition', 'Media', 'Special']
disambiguation_pattern = '(disambiguation)'
num_sentences_for_avg = 0
sum_sentences_for_avg = 0
def count_str_occurrences(text, find_str):
    return len(text.split(find_str)) - 1
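# e.g. count_str_occurrences("a, b, c", ", ") returns 2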
def get_file_path(id):
    # zero-pad the id to id_parts digits and split it into 2-character sub-folders
    chopped_id = []
    id_str = str(id).zfill(id_parts)
    for _ in range(3):
        chopped_id.append(id_str[:2])
        id_str = id_str[2:]
    path = ""
    for sub_path in chopped_id:
        path = os.path.join(path, sub_path)
    return path
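# e.g. get_file_path(1234) -> os.path.join("00", "01", "23"): the zero-padded id "0001234"
# is sliced into three 2-character sub-folders (the trailing digit is unused).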
def process_header(header):
    id_match = re.search(r'<doc id="(\d+)" url', header)
    id = id_match.groups()[0]
    title_match = re.search(r'title="(.*)">', header)
    title = title_match.groups()[0]
    # a header is invalid if the title is purely numeric, belongs to a non-article
    # namespace, or marks a disambiguation page
    not_valid = (title.isdigit()
                 or any(title.startswith(prefix + ':') or title.startswith(prefix + ' talk:')
                        for prefix in wikipedia_namespaces)
                 or title.endswith(disambiguation_pattern))
    return id, not not_valid
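# e.g. process_header('<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">')
# returns ("12", True), while titles such as "Talk:Anarchism" or "2007 (disambiguation)" are rejected.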
def get_sections(content):
lines = content.split('\n')
section = ""
# sections include headers
sections = []
sections.append(wiki_utils.get_segment_seperator(1,"preface."))
for line in lines:
if (wiki_utils.is_seperator_line(line)):
if len(section) > 0:
sections.append(section)
section = ""
sections.append(line)
else:
section += line
section += '\n'
if len(section) > 0:
sections.append(section)
return sections
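# The returned list alternates separator lines (as produced by wiki_utils.get_segment_seperator)
# with section bodies; a synthetic level-1 "preface." separator is prepended so the text before
# the first real header is treated as its own section.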
def process_section(section, id):
    global num_sentences_for_avg
    global sum_sentences_for_avg
sentences = text_manipulation.split_sentences(section, id)
section_sentences = []
num_lists = 0
num_sentences = 0
num_formulas = 0
num_codes = 0
last_sentence_was_list = False
for sentence in sentences:
is_list_sentence = wiki_utils.get_list_token() + "." == sentence.encode('utf-8')
        if '\n' in sentence:
            logger.info("DocId: " + str(id) + " newline in sentence: " + sentence)
        if (wiki_utils.get_list_token() in sentence) and not is_list_sentence:
            # TODO: delete this branch, since it is not supposed to happen any more - but it still does
            num_lists += 1
            last_sentence_was_list = True
            logger.info("DocId: " + str(id) + " Special case 1: " + sentence)
            continue
elif is_list_sentence:
if (last_sentence_was_list):
continue
last_sentence_was_list = True
num_lists += 1
else:
last_sentence_was_list = False
sentence_words = text_manipulation.extract_sentence_words(sentence)
if len(sentence_words) < wiki_thresholds.min_words_in_sentence:
# ignore this sentence
continue
        sum_sentences_for_avg += len(sentence_words)
        num_sentences_for_avg += 1
num_formulas += count_str_occurrences(sentence, wiki_utils.get_formula_token())
num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnipet_token())
num_sentences += 1
section_sentences.append(sentence)
valid_section = True
error_message = None
if (num_sentences < wiki_thresholds.min_sentence_in_section):
valid_section = False
error_message = "sentences count in section is too low"
    if (num_sentences > 0):
        lists_percentage = float(num_lists) / float(num_sentences)
        if lists_percentage >= wiki_thresholds.max_list_in_section_percentage:
            valid_section = False
            error_message = "list percentage in section is too high: " + str(lists_percentage)
section_text = ''.join(section_sentences)
if len(re.findall('[a-zA-Z]', section_text)) < wiki_thresholds.min_section_char_count:
valid_section = False
error_message = "char count in section is too low"
if num_formulas >= wiki_thresholds.max_section_formulas_count:
valid_section = False
error_message = "number of formulas in section is too high: " + str(num_formulas)
if num_codes >= wiki_thresholds.max_section_code_snipet_count:
valid_section = False
error_message = "number of code snippets in section is too high: " + str(num_codes)
return valid_section, section_sentences, error_message
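# A section survives only if it clears every wiki_thresholds limit above: enough sentences,
# a bounded share of list items, enough alphabetic characters, and not too many formula or
# code-snippet placeholder tokens.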
def is_valid_article(valid_section_count, section_count):
if valid_section_count < wiki_thresholds.min_valid_section_count:
return False, "Valid section count is too low: " + str(valid_section_count)
    valid_section_percentage = float(valid_section_count) / float(section_count)
if valid_section_percentage < wiki_thresholds.min_valid_section_percentage:
return False, "Valid section percentage is too low: " + str(valid_section_percentage)
return True,""
def max_level_in_article(content):
    max_level = -1
    for line in content:
        if (wiki_utils.is_seperator_line(line)):
            current_level = wiki_utils.get_segment_level(line)
            if current_level > max_level:
                max_level = current_level
    return max_level
def delete_empty_segment_headers(content):
num_of_deletions = 0
max_level = max_level_in_article(content)
for handle_level in range(max_level,0,-1):
last_section_level = -1
last_section_header = True
for i in range(len(content) -1 , -1 , -1):
section = content[i]
if (wiki_utils.is_seperator_line(section)):
section_level = wiki_utils.get_segment_level(section)
if (section_level == handle_level):
                    # empty section if the last section was also a header
                    is_empty = last_section_header
                    if is_empty and (last_section_level <= section_level):
del content[i]
num_of_deletions += 1
last_section_level = section_level
last_section_header = True
else:
last_section_header = False
return content, num_of_deletions
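# Headers are swept bottom-up, from the deepest level to level 1: scanning the article in
# reverse, a header is "empty" when the element after it is another header at the same or a
# shallower level, i.e. its segment contains no sentences, so it is dropped.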
def vec_to_text(sections_with_headers):
adjusted_content = ""
for section in sections_with_headers:
adjusted_content += section + '\n'
return adjusted_content
def process_content(content, id):
sections_with_headers = get_sections(content)
    adjusted_content_text = ""
article_lines = []
section_count = 0
valid_section_count = 0
for i in range(len(sections_with_headers)):
section = sections_with_headers[i]
if wiki_utils.is_seperator_line(section):
article_lines.append(section)
else:
is_valid_section, section_sentences, message = process_section(section, id)
section_count += 1
if (is_valid_section):
valid_section_count += 1
article_lines.extend(section_sentences)
else:
logger.info('Invalid section in article id: ' + id +
' Reason: ' + message + ' Content: ' + vec_to_text(section_sentences).strip('\n') )
    is_valid, reason = is_valid_article(valid_section_count, section_count)
if is_valid:
        article_content, _ = delete_empty_segment_headers(article_lines)
        adjusted_content_text = vec_to_text(article_content)
    return is_valid, adjusted_content_text, reason
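# process_content returns (is_valid, text, reason): the text is the surviving separator and
# sentence lines re-joined with newlines, and it stays an empty string whenever the article
# as a whole fails is_valid_article.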
# old process_content, kept for comparison
# def process_content(content):
#
#     # keep only sections with a minimal number of characters
#     sections = [s.strip('\n') for s in content.strip().split(section_delimiter) if
#                 len(re.findall('[a-zA-Z]', s)) > min_section_length]
#
#     # article must have at least 3 sections, to avoid articles with only one section, which is a summarization. E.g.:
#     # https://en.wikipedia.org/wiki?curid=821470
#     sections_count = len(sections)
#     if sections_count < min_article_sections_count or sections_count >= max_article_sections_count:
#         return content, False, 'Sections count is: ' + str(sections_count)
#
#     # remove the first section since it is usually the summary of the whole article
#     adjusted_content = ('\n' + section_delimiter + '\n').join(sections[1:])
#
#     return adjusted_content, True, ""
def process_article(article):
non_empty_lines = [l for l in article.strip().split("\n") if l != ""]
header = non_empty_lines[0]
id, is_valid_header = process_header(header)
if not is_valid_header:
logger.info('Invalid header in doc id: ' + str(id)+ ' header: ' + header)
return "", id, False
content = "\n".join(non_empty_lines[2:])
is_valid_content, processed_content , debug = process_content(content, id)
    if not is_valid_content:
logger.info('Invalid article in doc id: ' + str(id) + '. ' + debug +'\n\n')
else:
logger.info('Valid article , id: ' + str(id) +'\n\n')
return processed_content, id, is_valid_content
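# Note: non_empty_lines[1] is skipped on purpose - with wiki_extractor output the first line is
# the <doc ...> header and the second is presumably the repeated article title.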
def process_wiki_file(path, output_folder, train_ratio, test_ratio, forbidden_train_ids):
train_size = 0
dev_size = 0
test_size = 0
with open(path, "r") as file:
raw_content = file.read()
articles = [s for s in raw_content.decode('utf-8').strip().split(doc_split_delimiter) if len(s) > 0]
created_articles_count = 0
processed_articles_count = 0
for article in articles:
processed_article, id, is_valid = process_article(article)
processed_articles_count+=1
if not is_valid:
            continue
random_num = uniform(0, 1)
if (random_num > train_ratio and random_num <= train_ratio + test_ratio) or int(id) in forbidden_train_ids:
partition = "test"
test_size += 1
elif (random_num > train_ratio + test_ratio):
partition = "dev"
dev_size += 1
else:
partition = "train"
train_size += 1
output_sub_folder = os.path.join(output_folder,partition, get_file_path(id))
if not os.path.exists(output_sub_folder):
os.makedirs(output_sub_folder)
output_file_path = os.path.join(output_sub_folder, str(id))
with open(output_file_path, "w") as output_file:
            output_file.write(processed_article.encode('utf-8'))
created_articles_count+=1
    return created_articles_count, processed_articles_count, train_size, dev_size, test_size
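# Partitioning draws a single uniform sample per article: with train_ratio=0.8 and
# test_ratio=0.1, a draw of 0.75 lands in train, 0.85 in test and 0.95 in dev; articles whose
# id appears in forbidden_train_ids are forced into test regardless of the draw.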
def get_forbidden_train_ids():
    # Return ids of articles that must be in the test set (and not train/dev)
with open('wikicities_article_names_to_ids') as f:
wiki_cities = json.load(f)
with open('wikielements_article_names_to_ids') as f:
wiki_elements = json.load(f)
forbidden_train_ids = []
for k,v in wiki_cities.iteritems():
forbidden_train_ids.append(int(v))
for k,v in wiki_elements.iteritems():
forbidden_train_ids.append(int(v))
unique_ids = set(forbidden_train_ids)
    return unique_ids
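# Both JSON files are expected in the working directory and are assumed to map article names
# to Wikipedia ids; only the id values are kept, de-duplicated via the set.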
def get_wiki_files(path):
all_objects = Path(path).glob('**/*')
files = (str(p) for p in all_objects if p.is_file())
return files
def process_wiki_folder(input_folder, output_folder, train_ratio, test_ratio):
total_train_size = 0
total_dev_size = 0
total_test_size = 0
folders = [o for o in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, o))]
total_created_articles = 0
total_processed_articles = 0
previous_debug = 0
forbidden_train_ids = get_forbidden_train_ids()
for folder in folders:
full_folder_path = os.path.join(input_folder, folder)
if not os.path.exists(output_folder):
os.makedirs(output_folder)
files = get_wiki_files(full_folder_path)
for file in files:
created_articles, processed_articles, train_size, dev_size, test_size = process_wiki_file(file, output_folder, float(train_ratio), float(test_ratio), forbidden_train_ids)
total_train_size += train_size
total_dev_size += dev_size
total_test_size += test_size
total_created_articles += created_articles
total_processed_articles += processed_articles
if (total_created_articles - previous_debug > 2500):
previous_debug = total_created_articles
print ('Created ' + str(total_created_articles) + ' wiki articles, out of ' + str(total_processed_articles) + ' processed articles')
total_samples = total_train_size + total_dev_size + total_test_size
print 'total_samples = ', str(total_samples)
print "#train = ",total_train_size,"ratio: ","{:.2f}".format(total_train_size / float(total_samples))
print "#dev = ", total_dev_size,"ratio: ","{:.2f}".format(total_dev_size/ float(total_samples))
print "#test = ", total_test_size,"ratio: ","{:.2f}".format(total_test_size / float(total_samples))
def move_wiki_file(src, folder, partition):
# get relative path to inputFolder
file = os.path.relpath(src, folder)
# extract file path in train folder
dstFile = os.path.join(folder, partition, file)
dstdir = os.path.dirname(dstFile)
if not os.path.exists(dstdir):
os.makedirs(dstdir)
move(src, dstFile)
def removeEmptyFolders(path, removeRoot=True):
if not os.path.isdir(path):
return
# remove empty subfolders
files = os.listdir(path)
for f in files:
fullpath = os.path.join(path, f)
if os.path.isdir(fullpath):
removeEmptyFolders(fullpath)
# if folder empty, delete it
files = os.listdir(path)
if len(files) == 0 and removeRoot:
#print "Removing empty folder:", path
os.rmdir(path)
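# Depth-first: sub-folders are pruned before the parent re-checks its own listing, so a chain
# of empty directories collapses in a single call.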
def trainTestDev(destFolder, train_size, test_size):
train_size_ratio = float(train_size)
test_size_ratio = float(test_size)
dev_size_ratio = 1 - train_size_ratio - test_size_ratio
print (destFolder,train_size,test_size)
allFiles = []
if not os.path.exists(destFolder):
print ("Output folder does not exist")
return
folders = [o for o in os.listdir(destFolder) if os.path.isdir(os.path.join(destFolder, o))]
for folder in folders:
full_folder_path = os.path.join(destFolder, folder)
files = get_wiki_files(full_folder_path)
allFiles.extend(files)
shuffle(allFiles)
trainSize = int(math.floor(len(allFiles) * train_size_ratio))
devSize = int(math.floor(len(allFiles) * dev_size_ratio))
for i in range(0,trainSize):
move_wiki_file(allFiles[i], destFolder, partition="train")
if devSize > 0:
for i in range(trainSize, trainSize + devSize):
move_wiki_file(allFiles[i], destFolder, partition="dev")
for i in range(trainSize + devSize,len(allFiles)):
move_wiki_file(allFiles[i], destFolder, partition="test")
print ("#train = ",trainSize)
print ("#dev = ", devSize)
print ("#test = ", len(allFiles) - trainSize -devSize)
removeEmptyFolders(destFolder)
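# trainTestDev is a standalone helper for re-splitting an already-generated output folder into
# train/dev/test by file count; it is not invoked from main() below.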
def main(args):
    global num_sentences_for_avg
    global sum_sentences_for_avg
if not os.path.exists(args.temp):
os.makedirs(args.temp)
# execute extraction of wikipedia dump
cmd = ['python', str(Path(__file__).parent / 'wiki_extractor.py'), '-s', '-o', args.temp, '--article_count', str(args.article_count),'--lists']
print cmd
if args.processes:
cmd += ['--processes', args.processes]
cmd += [args.input]
if not args.no_extractor:
subprocess.call(cmd)
print ("Finisehd extractor")
if not os.path.exists(args.output):
os.makedirs(args.output)
    # create one file per wiki article from the extracted dump
process_wiki_folder(args.temp, args.output,args.train, args.test)
print ("Number of processed sentences: " + str(num_sentneces_for_avg))
print "avg len sentence = " + str(sum_sentneces_for_avg / float(num_sentneces_for_avg))
print ('done')
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--input', help='Path to wikipedia dump', required=True)
parser.add_argument('--processes', help='Num of processors to use in wiki_extractor')
parser.add_argument('--no_extractor', help='Skip wiki-extractor', action='store_true')
parser.add_argument('--temp', help='folder to save temporal files', required=True)
parser.add_argument('--output', help='output folder', required=True)
parser.add_argument('--train', help='train size ratio', required=True)
parser.add_argument('--test', help='test size ratio', required=True)
parser.add_argument("--article_count", help = 'max number of wikipedia articles to extract', default=1000000)
main(parser.parse_args())
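# Example invocation (paths are illustrative):
#   python wiki_processor.py --input enwiki-latest-pages-articles.xml.bz2 \
#       --temp /tmp/wiki_extracted --output data/wiki --train 0.8 --test 0.1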