#import codecs
#import io
from os import makedirs
from os.path import exists, basename, dirname, join, expanduser, expandvars
import re
import logging
import pickle
import fileinput
from argparse import ArgumentParser
from multiprocessing import Pool
from bs4 import BeautifulSoup
from yatetradki.reader.demangle_dsl import _clean_tags
FORMAT = '%(asctime)-15s %(levelname)s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.DEBUG)
SHORT_ARTICLE_LENGTH = 60
RE_SHORT_REFERENCE = re.compile(r'= (\w+)')
RE_REF_DICT = re.compile(r'\[ref dict="[^"]*"\]')
RE_A_HREF = re.compile(r'<a href="(\w+)">')
RE_SEE_OTHER = re.compile(r'^See (\w+)\.?$')
RUSSIAN_TRANSLATION = re.compile(u" — [\u0400-\u0500]+")
STR_MAIN_ENTRY = 'Main entry:'
STR_SEE_MAIN_ENTRY = 'See main entry: ↑'
EXAMPLES_PER_DICT = 3
MAX_ARTICLE_LEN = 100000
class DSLRawReader(object):
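    """Sequential reader for a DSL dictionary file.

    Walks the (by default UTF-16) source file headword by headword and,
    optionally, converts each article body from DSL markup via _clean_tags.
    """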
def __init__(self, filename, encoding='utf-16',
article_header='<meta charset="utf-8">'):
self._filename = filename
self._article_header = [article_header]
self._file = open(filename, 'r', encoding=encoding)
self._file.seek(0)
def __repr__(self):
return '%s(%s)' % (self.__class__, self._filename)
@property
def filename(self):
return self._filename
def tell(self):
return self._file.tell()
def seek(self, offset, from_what=0):
return self._file.seek(offset, from_what)
def __len__(self):
pos = self.tell()
self.seek(0, 2)
size = self.tell()
self.seek(pos)
return size
def read_header(self):
while True:
pos = self._file.tell()
line = self._file.readline()
if line == '':
# unexpected EOF
break
elif line.startswith('#'):
continue # header
elif len(line.strip()) == 0:
continue # empty line delimiter
else:
self._file.seek(pos)
break
def _skip_until_article_or_eof(self):
initial_pos = self._file.tell()
while True:
saved_pos = self._file.tell()
line = self._file.readline()
if line == '': # eof
break
if line[0] in ' \t': # article body
self._file.seek(saved_pos)
break
return initial_pos != self._file.tell()
def _read_article_lines(self, convert=True):
lines = []
while True:
saved_pos = self._file.tell()
line = self._file.readline()
if line == '': # eof
break
elif line[0] in ' \t': # article body
if convert:
line = _clean_tags(line.strip(), None)
lines.append(line)
else:
# we've reached next word title, probably
self._file.seek(saved_pos)
break
return lines
def get_next_word(self, convert=True):
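        """Return the next (headword, article) pair, or (None, None) at EOF.

        When several headword lines share one article body, the file position
        is rewound so each headword is returned with the shared article.
        """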
word = self._file.readline()
if word == '': # eof
return None, None
saved_pos = self._file.tell()
skipped_anything = self._skip_until_article_or_eof()
article = self._read_article_lines(convert)
if skipped_anything:
self._file.seek(saved_pos)
            # Note that one article may have several headword lines, e.g. in
# En-En_American_Heritage_Dictionary.dsl:
# 'preeminence', 'preeminently'
# 'predominately', 'predomination', 'predominator'
# 'Eurocentrism', 'Eurocentrist'
#
article = '\n'.join(self._article_header + article)
return word.strip(), article
class DSLIndexer(object):
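    """Pickled word -> file-offset index for a DSL file.

    Built once by scanning the whole dictionary and cached on disk, so later
    lookups can seek straight to a headword instead of rescanning the file.
    """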
def __init__(self, filename, dsl_raw_reader):
self._index = dict()
if exists(filename):
with open(filename, 'rb') as index_file:
self._index = pickle.load(index_file)
return
size = len(dsl_raw_reader)
logging.info('Indexing to file %s (dict size %s)', filename, size)
base_filename = basename(filename)
dsl_raw_reader.read_header()
last_percent = 0
while True:
pos = dsl_raw_reader.tell()
current_word, _article = dsl_raw_reader.get_next_word(convert=False)
if current_word is None: # eof
break
self._index[current_word] = pos
percent = float(pos) / size * 100.
if percent - last_percent > 10:
last_percent = percent
logging.info('Indexing %s... %%%d', base_filename, percent)
try:
makedirs(dirname(filename))
except OSError:
pass
with open(filename, 'wb') as index_file:
pickle.dump(self._index, index_file)
logging.info('Indexing done (%s entries, %s)',
len(self._index), filename)
def __len__(self):
return len(self._index)
def get_pos(self, word):
return self._index.get(word)
class DSLLookuper(object):
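    """Looks up single words in a DSL dictionary using the on-disk index."""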
def __init__(self, filename, dsl_raw_reader=None, dsl_indexer=None):
self._filename = filename
self._dsl_raw_reader = dsl_raw_reader
if self._dsl_raw_reader is None:
self._dsl_raw_reader = DSLRawReader(filename)
self._dsl_indexer = dsl_indexer
if self._dsl_indexer is None:
index_path = expanduser(expandvars('~/.cache/dsl_index/'))
index_path = join(index_path, basename(filename) + '.index')
self._dsl_indexer = DSLIndexer(index_path, self._dsl_raw_reader)
self._dsl_raw_reader.seek(0)
def __repr__(self):
return '%s(%s)' % (self.__class__, self._filename)
def _find_word(self, word):
while True:
current_word, article = self._dsl_raw_reader.get_next_word()
if word == current_word:
return article
elif current_word is None:
logging.info('Could not find word "%s"', word)
return None
def lookup(self, word):
self._dsl_raw_reader.seek(0, 0)
self._dsl_raw_reader.read_header()
pos = self._dsl_indexer.get_pos(word)
if pos is None:
return None
self._dsl_raw_reader.seek(pos)
result = self._find_word(word)
return result
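# Example (hypothetical dictionary path): look up one word directly.
#   lookuper = DSLLookuper('/path/to/En-En-Longman_DOCE5.dsl')
#   article = lookuper.lookup('abrade')  # article text or None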
def check_reference(dsl_lookuper, word, article, depth):
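    """Follow cross-reference stubs ("see main entry" style articles).

    Returns an (article, examples) tuple; examples is None unless a
    reference was followed via lookup_word.
    """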
# Special case for articles in En-En-Longman_DOCE5.dsl
text = BeautifulSoup(article, 'html.parser').text
if text.startswith(STR_SEE_MAIN_ENTRY):
referenced_word = text[len(STR_SEE_MAIN_ENTRY):].strip()
logging.info('Detected reference from "%s" to "%s" (LongmanDOCE5)', word, referenced_word)
return lookup_word(dsl_lookuper, referenced_word, depth)
# Special case for CambridgeAdvancedLearners
main_entry_start = article.find(STR_MAIN_ENTRY)
if main_entry_start != -1:
article_rest = article[main_entry_start + len(STR_MAIN_ENTRY):]
match = RE_A_HREF.search(article_rest)
if match:
referenced_word = match.group(1)
if referenced_word != word:
logging.info('Detected reference from "%s" to "%s" (CambridgeAdvancedLearners)', word, referenced_word)
                more_article, more_examples = lookup_word(dsl_lookuper, referenced_word, depth)
                if more_article is not None:
                    return article + more_article, more_examples
# Special case for LingvoUniversal
if len(text) < SHORT_ARTICLE_LENGTH:
match = RE_SHORT_REFERENCE.search(text)
if match:
referenced_word = match.group(1)
if word == referenced_word:
logging.warning('Self reference from "%s" to "%s", skipping (LingvoUniversal)', word, referenced_word)
else:
logging.info('Detected reference from "%s" to "%s" (LingvoUniversal)', word, referenced_word)
return lookup_word(dsl_lookuper, referenced_word, depth)
# Special case for En-En_American_Heritage_Dictionary.dsl
match = RE_SEE_OTHER.search(text)
if match:
referenced_word = match.group(1)
if referenced_word != word:
logging.info('Detected reference from "%s" to "%s" (AmericanHeritageDictionary)', word, referenced_word)
return lookup_word(dsl_lookuper, referenced_word, depth)
return article, None
def cleanup_article(article):
article = article.replace('\t', ' ')
article = article.replace('\n', '')
article = RE_REF_DICT.sub('', article)
return article
def strip_russian_translation(text):
match = re.search(RUSSIAN_TRANSLATION, text)
if match is not None:
text = text[:match.start(0)]
return text
def strip_dots(text):
return text.strip('.')
def extract_examples(article):
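    """Collect usage examples ('sec ex' elements) from an article, dropping
    trailing Russian translations and surrounding dots."""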
result = []
soup = BeautifulSoup(article, 'html.parser')
for tag in ('div', 'span'):
for element in soup.findAll(tag, class_='sec ex'):
text = strip_russian_translation(element.text.strip())
text = strip_dots(text)
if text:
result.append(text)
return result
def lookup_word(dsl_lookuper, word, depth):
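    """Look up a word and return (article, examples).

    depth bounds how many cross-references may be followed recursively;
    both values are None when the word cannot be found.
    """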
if depth == 0:
        logging.info('Exceeded recursion limit for word "%s"', word)
return None, None
article = dsl_lookuper.lookup(word)
if article is None:
return None, None
# print(dsl_lookuper, file=stderr)
# print(article, file=stderr)
article = cleanup_article(article)
article, _examples = check_reference(dsl_lookuper, word, article, depth-1)
# print('----------------', file=stderr)
examples = None
if article is not None:
examples = extract_examples(article)
# print('EXAMPLES', examples, file=stderr)
return article, examples
def _init_index(filename):
"""
It does not work if this function is a closure. Keep it on
the module level.
"""
DSLLookuper(filename)
def _ensure_indexes_present(dsl_filenames):
"""
This functions is only called for its side effects. It creates DSLLookupers
for each DSL file in parallel processes to make user DSLIndexers are also
created in parallel.
"""
with Pool(processes=None) as pool:
pool.map(_init_index, dsl_filenames)
def _uniq_at(current_chunk, all_words):
uniq = set(current_chunk) - set(all_words)
return [word for word in current_chunk if word in uniq]
def lookup(dsls, words):
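    """Look up every word in each DSL dictionary and return the concatenated
    HTML of all articles that were found (each word's articles joined with
    <br> and truncated to MAX_ARTICLE_LEN)."""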
_ensure_indexes_present(dsls)
dsl_lookupers = [DSLLookuper(dsl) for dsl in dsls]
# print(dsl_reader.lookup('abrade'))
words_found = 0
words_missing = 0
result = []
for word in words:
found = 0
articles = []
examples = []
word = word.strip()
for dsl_reader in dsl_lookupers:
depth = 5
article, current_examples = lookup_word(dsl_reader, word, depth)
if article is not None:
articles.append(article)
uniq_examples = _uniq_at(current_examples, examples)
examples.extend(uniq_examples[:EXAMPLES_PER_DICT])
found = 1
if found:
articles = '<br>'.join(articles)
articles = articles[:MAX_ARTICLE_LEN]
examples = ''.join(['<li>%s</li>' % ex for ex in examples])
if examples:
examples = '<ul>%s</ul>' % examples
result.append(articles)
# print('%s\t%s\t%s' % (word, examples, articles))
#print(examples, file=stderr)
words_found += 1
else:
words_missing += 1
# logging.info('Missing word: %s', word)
# logging.info('Found %d words, %d missing words, %d total',
# words_found, words_missing, words_found + words_missing)
return ''.join(result)
def main():
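    """Command-line entry point: read words from stdin, print the articles.

    Example invocation (hypothetical paths):
        echo abrade | python dsl.py --dsl dict/En-En-Longman_DOCE5.dsl
    """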
    parser = ArgumentParser(description='Extract word articles from a DSL file')
parser.add_argument('--dsl', dest='dsl', type=str, action='append',
help='path to a dsl dictionary file')
args = parser.parse_args()
#path = '/mnt/big_ntfs/distrib/lang/dictionaries/LDOCE5 for Lingvo/dsl/long-8.dsl'
#path = '/mnt/big_ntfs/distrib/lang/dictionaries/LDOCE5 for Lingvo/dsl/En-En-Longman_DOCE5.dsl'
words = [word for word in fileinput.input('-')]
print(lookup(args.dsl, words))
if __name__ == '__main__':
main()