-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen-locale-data.py
403 lines (359 loc) · 20.1 KB
/
gen-locale-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import os
import re
import sys
import xml.etree.ElementTree as ET
# Native ("alternative") digit glyphs, keyed by locale-code prefix.
#
# Digits are not included inside the locale distributions, which means we have
# to list them ourselves. Each key is compared against the locale code with
# str.startswith() (so "ar_" applies to every "ar_*" locale), and each value is
# a ten-character string in which index i holds the glyph for the digit i.
# Locales that match no key are given the plain ASCII digits by the generator.
altdigits_map = {
    "ar_": "٠١٢٣٤٥٦٧٨٩",
    "bn_": "০১২৩৪৫৬৭৮৯",
    # Disabled (too complicated): "zh_CN": "〇一二三四五六七八九",
    # Disabled (too complicated): "zh_TW": "零壹貳參肆伍陸柒捌玖",
    # Disabled (not present in GNU locales): "san": "०१२३४५६७८९",
    # Disabled (too complicated): "am_": "0፩፪፫፬፭፮፯፰፱",
    "gu_": "૦૧૨૩૪૫૬૭૮૯",
    "pa_": "੦੧੨੩੪੫੬੭੮੯",
    "kn_": "೦೧೨೩೪೫೬೭೮೯",
    "km_": "០១២៣៤៥៦៧៨៩",
    "lo_": "໐໑໒໓໔໕໖໗໘໙",
    "ml_": "൦൧൨൩൪൫൬൭൮൯",
    "mn_": "᠐᠑᠒᠓᠔᠕᠖᠗᠘᠙",
    "or_": "୦୧୨୩୪୫୬୭୮୯",
    "ta_": "௦௧௨௩௪௫௬௭௮௯",
    "te_": "౦౧౨౩౪౫౬౭౮౯",
    "th_": "๐๑๒๓๔๕๖๗๘๙",
    "bo_": "༠༡༢༣༤༥༦༧༨༩",
    "ur_": "۰۱۲۳۴۵۶۷۸۹",
    "fa_": "۰۱۲۳۴۵۶۷۸۹",
}
def decode_utf8(encoded_string):
    """Replace ``<Uxxxx>`` / ``<Uxxxxxxxx>`` escapes with the characters they name.

    Despite its name this function performs no UTF-8 decoding: it expands the
    symbolic code point notation used in locale source files, where a character
    is written as ``<U`` + hexadecimal code point + ``>``.

    Args:
        encoded_string: Text that may contain code point escapes.

    Returns:
        A copy of ``encoded_string`` with every well-formed escape replaced by
        the corresponding character. Anything that is not a well-formed escape
        (wrong digit count, non-hex digits, missing ``>``) is left untouched.
    """
    def _expand(match):
        code_point = int(match.group(1), 16)
        # An eight-digit escape can name a value beyond the Unicode range;
        # chr() would raise on it, so such an escape is kept verbatim.
        if code_point > 0x10FFFF:
            return match.group(0)
        return chr(code_point)

    # The eight-digit alternative is listed first so that supplementary-plane
    # escapes such as <U0001F600> are expanded rather than left as literal text.
    return re.sub(r'<U([0-9A-Fa-f]{8}|[0-9A-Fa-f]{4})>', _expand, encoded_string)
def read_locale_file(file_name, language_map, code):
    """Parse one locale source file and record it under ``language_map[code]``.

    The file is read line by line. A line without a space starts a new section
    (e.g. ``LC_TIME``); a line with a space is a ``key value`` pair whose value
    is split on ``;``; a line ending in the escape character ``/`` continues on
    the next line. Blank lines, ``END ...`` lines and (outside a continuation)
    lines starting with the comment character ``%`` are ignored.

    Args:
        file_name: Path of the locale source file to read.
        language_map: Dict of locale code -> data dict. ``language_map[code]``
            must already exist.
        code: Locale code under which the parsed data is stored.

    Returns:
        ``language_map``. If the file has an ``LC_TIME`` section,
        ``language_map[code]["locale_info"]`` is set to a dict of
        section -> key -> list of value strings; otherwise the map is returned
        unchanged. When ``LC_TIME`` contains a ``copy`` directive, the
        referenced locale is loaded as well and its ``LC_TIME`` replaces this
        one.

    Raises:
        KeyError: If a locale referenced by ``copy`` has no ``LC_TIME`` section.
        OSError: If a file cannot be opened.
    """
    comment_char = '%'
    escape_char = '/'

    # Decode explicitly as UTF-8 and substitute undecodable bytes. Some files
    # carry invalid bytes in comments; replacing them keeps every line instead
    # of losing whatever the decoder had buffered when it failed, and makes the
    # result independent of the platform's default encoding.
    with open(file_name, encoding="utf-8", errors="replace") as f:
        all_lines = f.readlines()

    # Keys seen before the first section header are stored under "".
    current_section = ""
    kv_pairs = {"": {}}
    continue_line = False

    for line in all_lines:
        line = decode_utf8(line.replace('\t', ' ').strip())

        if line == '' or line.startswith("END"):
            continue

        if continue_line:
            # Continuation of the previous key's value. Comment detection is
            # deliberately skipped here: a continued value may itself start
            # with '%' (e.g. a strftime-style format).
            continue_line = line[-1] == escape_char
            if continue_line:
                line = line[:-1]
            value = line.replace('//', '/').split(';')
            kv_pairs[current_section][key] += [v for v in value if v != '']
            continue

        if line.startswith(comment_char):
            continue

        if ' ' in line:
            key = line.split(' ')[0]
            value = line[len(key) + 1:]
            # A trailing escape character means the value continues on the next
            # line, except for the "escape_char" key, whose value IS that
            # character.
            if len(value) > 0 and value[-1] == escape_char and key != "escape_char":
                continue_line = True
                value = value[:-1]
            else:
                continue_line = False
            key = key.strip()
            value = value.strip().replace('//', '/').split(';')
            kv_pairs[current_section].setdefault(key, []).extend(
                v for v in value if v != ''
            )
        else:
            # No space at all: the line is a section header.
            current_section = line.strip()
            kv_pairs[current_section] = {}

    # Only locales that define LC_TIME are of interest to the generator.
    if "LC_TIME" not in kv_pairs:
        return language_map

    language_map[code]["locale_info"] = kv_pairs

    if "copy" in kv_pairs["LC_TIME"]:
        ref_code = kv_pairs["LC_TIME"]["copy"][0].replace('"', '')
        # Load the referenced locale only if it has not been parsed yet, and
        # never discard data already stored for it.
        if "locale_info" not in language_map.get(ref_code, {}):
            language_map.setdefault(ref_code, {})
            # Prefer a sibling of the file being read so that a non-default
            # locale directory works; fall back to the system directory.
            ref_path = os.path.join(os.path.dirname(file_name), ref_code)
            if not os.path.isfile(ref_path):
                ref_path = "/usr/share/i18n/locales/{}".format(ref_code)
            language_map = read_locale_file(ref_path, language_map, ref_code)
        language_map[code]["locale_info"]["LC_TIME"] = language_map[ref_code]["locale_info"]["LC_TIME"]

    return language_map
def parse_ldml_language_map(file_name):
    """Build a map of language code -> ``{"name": UPPERCASE DISPLAY NAME}`` from an LDML file.

    Every ``<language type="...">`` element found below
    ``localeDisplayNames/languages`` is considered. Codes containing an
    underscore are ignored, as are elements without any text (they carry no
    display name and would otherwise crash the upper-casing step). If a code
    occurs more than once, the last occurrence wins.

    Args:
        file_name: Path (or open file object) of the LDML XML document.

    Returns:
        A new dict mapping each accepted language code to a dict with a single
        ``"name"`` entry holding the upper-cased display name.
    """
    root = ET.parse(file_name).getroot()
    language_map = {}
    for display_names in root.iter('localeDisplayNames'):
        for languages in display_names.iter('languages'):
            for language in languages.iter('language'):
                short_name = language.attrib['type']
                long_name = language.text
                if "_" in short_name or long_name is None:
                    continue
                language_map[short_name] = {"name": long_name.upper()}
    return language_map
def _collect_format_names(calendar, group_tag, context_tag, width_tag, item_tag, wide, abbreviated):
    """Copy the 'format'-context names of one kind (months or days) into ``wide`` / ``abbreviated``."""
    for group in calendar.iter(group_tag):
        for context in group.iter(context_tag):
            if context.attrib['type'] != 'format':
                continue
            for width in context.iter(width_tag):
                width_type = width.attrib['type']
                if width_type == 'wide':
                    target = wide
                elif width_type == 'abbreviated':
                    target = abbreviated
                else:
                    continue
                for item in width.iter(item_tag):
                    target[item.attrib['type']] = item.text


def parse_ldml_locales(language_map, file_name, iso_code):
    """Extract Gregorian month and weekday names from an LDML file.

    Only ``<calendar type="gregorian">`` elements are read, and within them
    only the ``format`` context in the ``wide`` and ``abbreviated`` widths.
    ``<dates>`` elements that have no ``<calendars>`` child are skipped.

    Args:
        language_map: Dict of locale code -> data dict. ``language_map[iso_code]``
            must already exist.
        file_name: Path (or open file object) of the LDML XML document.
        iso_code: Key of the entry in ``language_map`` to fill in.

    Returns:
        ``language_map``, with ``months_long``, ``months_short``, ``weeks_long``
        and ``weeks_short`` set on ``language_map[iso_code]``. Each is a dict
        keyed by the element's ``type`` attribute; a dict is empty when the
        file provides no matching names.
    """
    root = ET.parse(file_name).getroot()
    months_long = {}
    months_short = {}
    weeks_long = {}
    weeks_short = {}

    for dates in root.iter('dates'):
        calendars = dates.find('calendars')
        if calendars is None:
            # Nothing calendar-related under this <dates> element.
            continue
        for calendar in calendars.iter('calendar'):
            # We only support gregorian calendars for now.
            if calendar.attrib['type'] != 'gregorian':
                continue
            _collect_format_names(
                calendar, 'months', 'monthContext', 'monthWidth', 'month',
                months_long, months_short,
            )
            _collect_format_names(
                calendar, 'days', 'dayContext', 'dayWidth', 'day',
                weeks_long, weeks_short,
            )

    language_map[iso_code]["months_long"] = months_long
    language_map[iso_code]["months_short"] = months_short
    language_map[iso_code]["weeks_long"] = weeks_long
    language_map[iso_code]["weeks_short"] = weeks_short
    return language_map
def _has_all_name_tables(language):
    """Return True when all four month/weekday name tables exist and are non-empty."""
    return all(
        language.get(field)
        for field in ("months_long", "months_short", "weeks_long", "weeks_short")
    )


def _print_name_tables(code, language):
    """Print the assignments for the four name tables of one locale, separated by blank lines."""
    tables = (
        ("long_months", "months_long"),
        ("short_months", "months_short"),
        ("long_weeks", "weeks_long"),
        ("short_weeks", "weeks_short"),
    )
    for position, (member, field) in enumerate(tables):
        if position:
            print("")  # empty line between tables
        for identifier, data in language[field].items():
            print(" data.{}[\"{}\"][\"{}\"] = u8\"{}\";".format(member, code, identifier, data))


def print_xdatetime_macros(language_map):
    """Print preprocessor-guarded month/weekday name assignments for every locale.

    Two variants are written to standard output: under
    ``#ifndef X_DATETIME_NO_LOCALES`` the names of every locale, and under
    ``#else`` the same names wrapped per locale in
    ``#ifdef X_DATETIME_ONLY_LOCALE_<NAME>``. Locales lacking any of
    ``months_long``, ``months_short``, ``weeks_long`` or ``weeks_short`` (or
    having one of them empty) are skipped.

    Abbreviated weekday names are assigned to ``data.short_weeks`` in both
    variants; the unguarded variant used to assign them to
    ``data.short_months``.

    Args:
        language_map: Dict of locale code -> data dict holding the four name
            tables plus a ``"name"`` entry used to build the macro name.

    Raises:
        KeyError: If a locale with complete name tables has no ``"name"`` entry.
    """
    print("// You must define an X_DATETIME_ONLY_LOCALE_* macro, which is only read if you don't want locales.")
    print("// English is set by default in the event that locales are disabled - which are also enabled by default.")
    print("// This means if you want to disable the English locale, you must undefine this macro before including this file.")
    print("#define X_DATETIME_ONLY_LOCALE_ENGLISH\n")

    print("#ifndef X_DATETIME_NO_LOCALES")
    for code, language in language_map.items():
        if not _has_all_name_tables(language):
            continue
        _print_name_tables(code, language)
        print("\n")  # two empty lines

    print("#else")
    for code, language in language_map.items():
        if not _has_all_name_tables(language):
            continue
        # Turn the display name into something usable inside a macro name.
        name = language["name"].replace(',', '_').replace(' ', '_').replace('-', '_')
        print("#ifdef X_DATETIME_ONLY_LOCALE_{}".format(name))
        _print_name_tables(code, language)
        print("#endif\n")
        print("\n")  # two empty lines
    print("#endif\n")
# The collective GNU C library community wisdom regarding abday, day, week, first_weekday, and first_workday states at https://sourceware.org/glibc/wiki/Locales the following:
#
# * The value of the second week list item specifies the base of the abday and day lists.
#
# * first_weekday specifies the offset of the first day-of-week in the abday and day lists.
#
# * For compatibility reasons, all glibc locales should set the value of the second week list item to 19971130 (Sunday) and base the abday and day lists appropriately, and set first_weekday and first_workday to
# 1 or 2, depending on whether the week and work week actually starts on Sunday or Monday for the locale.
def print_xdatetime_macros2(language_map):
    """Print the per-locale ``data.*`` assignments, each wrapped in ``#ifdef X_DATETIME_WITH_LOCALE_<CODE>``.

    For every entry of ``language_map`` that has a non-empty ``"locale_info"``
    (the ``"i18n"`` entry excepted) the ``LC_TIME`` section is turned into
    assignments for AM/PM strings, date/time formats, week parameters, month
    and weekday names, and alternative digits. The macro name is the
    upper-cased locale code with ``@`` replaced by ``_``.

    Defaults used when ``LC_TIME`` lacks a key:
      * ``date_fmt``      -> ``%a %b %e %H:%M:%S %Z %Y``
      * ``t_fmt_ampm``    -> the value of ``t_fmt``
      * ``week``          -> 7 days, reference day 19971130, 4 minimum days
      * ``first_weekday`` -> 1

    Args:
        language_map: Dict of locale code -> data dict whose ``"locale_info"``
            maps section names to key -> list-of-strings dicts.

    Raises:
        KeyError: If ``LC_TIME`` lacks one of the keys without a default
            (``am_pm``, ``d_t_fmt``, ``d_fmt``, ``t_fmt``, ``mon``, ``abmon``,
            ``day``, ``abday``).
    """
    string_stmt = " data.{}[\"{}\"] = reinterpret_cast<const char*>(u8\"{}\");"
    indexed_stmt = " data.{}[\"{}\"][{}] = reinterpret_cast<const char*>(u8\"{}\");"
    int_stmt = " data.{}[\"{}\"] = {};"

    def unquote(value):
        # Values keep the double quotes they had in the locale file.
        return value.replace('"', '')

    name_tables = (
        ("long_months", "mon"),
        ("short_months", "abmon"),
        ("long_weekdays", "day"),
        ("short_weekdays", "abday"),
    )

    for code, language in language_map.items():
        if not language.get("locale_info"):
            continue
        if code == "i18n":
            continue  # "i18n" is a garbage locale. It is not used by anything.

        name = code.upper().replace("@", "_")
        print("#ifdef X_DATETIME_WITH_LOCALE_{}".format(name))
        lc_time = language["locale_info"]["LC_TIME"]

        print(string_stmt.format("am", code, unquote(lc_time["am_pm"][0])))
        print(string_stmt.format("pm", code, unquote(lc_time["am_pm"][1])))

        if "date_fmt" in lc_time:
            print(string_stmt.format("date1_format", code, unquote(lc_time["date_fmt"][0])))
        else:
            print(string_stmt.format("date1_format", code, "%a %b %e %H:%M:%S %Z %Y"))
        print(string_stmt.format("date_time_format", code, unquote(lc_time["d_t_fmt"][0])))
        print(string_stmt.format("date_format", code, unquote(lc_time["d_fmt"][0])))
        print(string_stmt.format("time24_format", code, unquote(lc_time["t_fmt"][0])))
        # Without a dedicated 12-hour format, reuse the 24-hour one.
        time12_key = "t_fmt_ampm" if "t_fmt_ampm" in lc_time else "t_fmt"
        print(string_stmt.format("time12_format", code, unquote(lc_time[time12_key][0])))

        if "week" in lc_time:
            week = lc_time["week"]
            print(int_stmt.format("days_in_week", code, int(week[0])))
            print(int_stmt.format("first_weekday_ref", code, int(week[1])))
            print(int_stmt.format("first_week_year_min_days", code, int(week[2])))
        else:
            print(int_stmt.format("days_in_week", code, 7))
            # 19971130 is the Sunday that glibc locales use as the base of the
            # day lists (this default used to be mistyped as 11971130).
            print(int_stmt.format("first_weekday_ref", code, 19971130))
            print(int_stmt.format("first_week_year_min_days", code, 4))

        if "first_weekday" in lc_time:
            print(int_stmt.format("first_weekday", code, int(lc_time["first_weekday"][0])))
        else:
            print(int_stmt.format("first_weekday", code, 1))

        for position, (member, key) in enumerate(name_tables):
            for i, data in enumerate(lc_time[key]):
                print(indexed_stmt.format(member, code, i, unquote(data)))
            if position < len(name_tables) - 1:
                print("")  # empty line between tables, none after the last

        # The first prefix in altdigits_map that matches the locale code wins.
        foundkey = next((altkey for altkey in altdigits_map if code.startswith(altkey)), "")
        if foundkey:
            for i in range(10):
                print(indexed_stmt.format("alt_digits", code, i, altdigits_map[foundkey][i]))
        else:
            for i in range(10):
                print(" data.alt_digits[\"{}\"][{}] = \"{}\";".format(code, i, str(i)))

        print("#endif /* X_DATETIME_WITH_LOCALE_{} */".format(name))
        print("")
def print_autogenerated_code(language_map):
    """Write the complete generated C++ header to standard output.

    The output consists of an include guard, the ``_LocaleData`` struct
    declaration inside namespace ``xDateTime``, and an
    ``InitializeLocaleData`` function whose body is produced by
    ``print_xdatetime_macros2(language_map)``.

    Args:
        language_map: Locale data, passed through unchanged to
            ``print_xdatetime_macros2``.
    """
    prologue = (
        "// Automatically generated by cldr-gen-locale-data.py. DO NOT MODIFY.\n",
        "#ifndef X_DATETIME_LOCALE_DATA_H",
        "#define X_DATETIME_LOCALE_DATA_H",
        "#include <map>",
        "#include <string>\n",
        "namespace xDateTime {",
        "struct _LocaleData {",
        " std::map<std::string, std::string> am;",
        " std::map<std::string, std::string> pm;",
        " std::map<std::string, std::string> date1_format;",
        " std::map<std::string, std::string> date_time_format;",
        " std::map<std::string, std::string> date_format;",
        " std::map<std::string, std::string> time24_format;",
        " std::map<std::string, std::string> time12_format;",
        " std::map<std::string, int> days_in_week;",
        " std::map<std::string, int> first_weekday_ref;",
        " std::map<std::string, int> first_weekday;",
        " std::map<std::string, int> first_week_year_min_days;",
        " std::map<std::string, std::map<int, std::string>> long_months;",
        " std::map<std::string, std::map<int, std::string>> short_months;",
        " std::map<std::string, std::map<int, std::string>> long_weekdays;",
        " std::map<std::string, std::map<int, std::string>> short_weekdays;",
        " std::map<std::string, std::map<int, std::string>> alt_digits;",
        "};\n",
        "static inline void InitializeLocaleData(_LocaleData& data) {",
    )
    epilogue = (
        "}\n",
        "}",  # closes the namespace
        "#endif /* X_DATETIME_LOCALE_DATA_H */",
    )

    for text in prologue:
        print(text)
    # Body of InitializeLocaleData: one guarded group of assignments per locale.
    print_xdatetime_macros2(language_map)
    for text in epilogue:
        print(text)
import pprint
def print_locale_info(language_map):
    """Debug helper: pretty-print the ``LC_TIME`` section of every locale that has one.

    For each entry with a ``"locale_info"`` dict containing ``"LC_TIME"``, a
    marker line ``<code> $$$$$$$`` is printed, followed by the pretty-printed
    section.

    Args:
        language_map: Dict of locale code -> data dict.
    """
    for locale_code, entry in language_map.items():
        if "locale_info" not in entry:
            continue
        sections = entry["locale_info"]
        if "LC_TIME" not in sections:
            continue
        print(locale_code, "$$$$$$$")
        pprint.pprint(sections["LC_TIME"])
def main():
    """Read every file in the system locale directory and print the generated header.

    Each file name in ``/usr/share/i18n/locales`` is used as the locale code
    (the OS locale files have no extension). Files are processed in sorted
    order because ``os.listdir`` returns names in arbitrary order, which would
    otherwise make the order of the generated code vary between runs.
    """
    # Search the OS-specific locales.
    locales_folder = "/usr/share/i18n/locales"
    language_map = {}
    for locale_name in sorted(os.listdir(locales_folder)):
        language_map[locale_name] = {}
        language_map = read_locale_file(
            os.path.join(locales_folder, locale_name), language_map, locale_name
        )
    print_autogenerated_code(language_map)


if __name__ == '__main__':
    main()