mk_hyb_file.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. #!/usr/bin/env python
  2. # Copyright (C) 2015 The Android Open Source Project
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the 'License');
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an 'AS IS' BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """
  16. Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
  17. into binary format. See doc/hyb_file_format.md for more information.
  18. Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
  19. Optional -v parameter turns on verbose debugging.
  20. """
  21. from __future__ import print_function
  22. import io
  23. import sys
  24. import struct
  25. import math
  26. import getopt
# Global verbosity flag, set by the -v command-line option in main().
VERBOSE = False

# U+00DF is LATIN SMALL LETTER SHARP S
# U+1E9E is LATIN CAPITAL LETTER SHARP S
SHARP_S_TO_DOUBLE = u'\u00dfSS'
SHARP_S_TO_CAPITAL = u'\u00df\u1e9e'

# Python 3 removed unichr; alias it to chr so the rest of this file runs
# unchanged on both Python 2 and Python 3.
if sys.version_info[0] >= 3:
    def unichr(x):
        return chr(x)
  35. # number of bits required to represent numbers up to n inclusive
  36. def num_bits(n):
  37. return 1 + int(math.log(n, 2)) if n > 0 else 0
  38. class Node:
  39. def __init__(self):
  40. self.succ = {}
  41. self.res = None
  42. self.fsm_pat = None
  43. self.fail = None
  44. # List of free slots, implemented as doubly linked list
  45. class Freelist:
  46. def __init__(self):
  47. self.first = None
  48. self.last = None
  49. self.pred = []
  50. self.succ = []
  51. def grow(self):
  52. this = len(self.pred)
  53. self.pred.append(self.last)
  54. self.succ.append(None)
  55. if self.last is None:
  56. self.first = this
  57. else:
  58. self.succ[self.last] = this
  59. self.last = this
  60. def next(self, cursor):
  61. if cursor == 0:
  62. cursor = self.first
  63. if cursor is None:
  64. self.grow()
  65. result = self.last
  66. else:
  67. result = cursor
  68. return result, self.succ[result]
  69. def is_free(self, ix):
  70. while ix >= len(self.pred):
  71. self.grow()
  72. return self.pred[ix] != -1
  73. def use(self, ix):
  74. if self.pred[ix] is None:
  75. self.first = self.succ[ix]
  76. else:
  77. self.succ[self.pred[ix]] = self.succ[ix]
  78. if self.succ[ix] is None:
  79. self.last = self.pred[ix]
  80. else:
  81. self.pred[self.succ[ix]] = self.pred[ix]
  82. if self.pred[ix] == -1:
  83. assert self.pred[ix] != -1, 'double free!'
  84. self.pred[ix] = -1
  85. def combine(a, b):
  86. if a is None: return b
  87. if b is None: return a
  88. if len(b) < len(a): a, b = b, a
  89. res = b[:len(b) - len(a)]
  90. for i in range(len(a)):
  91. res.append(max(a[i], b[i + len(b) - len(a)]))
  92. return res
  93. def trim(pattern):
  94. for ix in range(len(pattern)):
  95. if pattern[ix] != 0:
  96. return pattern[ix:]
  97. def pat_to_binary(pattern):
  98. return b''.join(struct.pack('B', x) for x in pattern)
class Hyph:
    """Trie of hyphenation patterns and exceptions, with support for
    suffix compression (dedup) and double-array packing (pack)."""

    def __init__(self):
        self.root = Node()
        self.root.str = '<root>'
        # every node ever created, in insertion order
        self.node_list = [self.root]

    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
    def add_pat(self, pat):
        """Split a TeX pattern into its letters (the trie key) and the list
        of inter-letter hyphenation codes (the result), then insert it."""
        lastWasLetter = False
        haveSeenNumber = False
        result = []
        word = ''
        for c in pat:
            if c.isdigit():
                result.append(int(c))
                lastWasLetter = False
                haveSeenNumber = True
            else:
                word += c
                # implicit 0 between adjacent letters once any digit appeared
                if lastWasLetter and haveSeenNumber:
                    result.append(0)
                lastWasLetter = True
        if lastWasLetter:
            result.append(0)
        self.add_word_res(word, result)

    # Add an exception (word with hyphens, such as "ta-ble")
    def add_exception(self, hyph_word):
        """Encode an exception word: code 11 marks an explicit hyphen, code
        10 marks 'no break here'; the word is wrapped in '.' anchors."""
        res = []
        word = ['.']
        need_10 = False
        for c in hyph_word:
            if c == '-':
                res.append(11)
                need_10 = False
            else:
                if need_10:
                    res.append(10)
                word.append(c)
                need_10 = True
        word.append('.')
        res.append(0)
        res.append(0)
        if VERBOSE:
            print(word, res)
        self.add_word_res(''.join(word), res)

    def add_word_res(self, word, result):
        """Insert word into the trie, storing result at its final node."""
        if VERBOSE:
            print(word, result)
        t = self.root
        s = ''
        for c in word:
            s += c
            if c not in t.succ:
                new_node = Node()
                new_node.str = s
                self.node_list.append(new_node)
                t.succ[c] = new_node
            t = t.succ[c]
        t.res = result

    def pack(self, node_list, ch_map, use_node=False):
        """Assign an array index (node.ix) to every node so that the node
        slot and all of its edge slots (ix + char value) land on free
        positions -- first-fit double-array trie packing.

        Returns the total array size needed to hold every slot.
        """
        size = 0
        self.node_map = {}
        nodes = Freelist()
        edges = Freelist()
        edge_start = 1 if use_node else 0
        for node in node_list:
            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
            if len(succ):
                # first-fit: walk free edge slots until ix and all ix + s fit
                cursor = 0
                while True:
                    edge_ix, cursor = edges.next(cursor)
                    ix = edge_ix - succ[0]
                    if (ix >= 0 and nodes.is_free(ix) and
                            all(edges.is_free(ix + s) for s in succ) and
                            ((not use_node) or edges.is_free(ix))):
                        break
            elif use_node:
                ix, _ = edges.next(0)
                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
                # but keep it happy
            else:
                ix, _ = nodes.next(0)
            node.ix = ix
            self.node_map[ix] = node
            nodes.use(ix)
            size = max(size, ix)
            if use_node:
                edges.use(ix)
            for s in succ:
                edges.use(ix + s)
        # leave room for the largest possible edge offset past the last slot
        size += max(ch_map.values()) + 1
        return size

    # return list of nodes in bfs order
    def bfs(self, ch_map):
        """Breadth-first traversal; records node.bfs_ix and self.bfs_order
        as side effects and returns the ordered node list."""
        result = [self.root]
        ix = 0
        while ix < len(result):
            node = result[ix]
            node.bfs_ix = ix
            mapped = {}
            for c, next in node.succ.items():
                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
                mapped[ch_map[c]] = next
            # children are enqueued in alphabet-index order
            for i in sorted(mapped.keys()):
                result.append(mapped[i])
            ix += 1
        self.bfs_order = result
        return result

    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
    # the subtries are identical
    def dedup(self):
        """Return (dedup_ix, uniques).

        dedup_ix maps each bfs index to the bfs index of its canonical
        representative; uniques lists the representatives in bfs order.
        """
        uniques = []
        dupmap = {}
        dedup_ix = [0] * len(self.bfs_order)
        # walk bottom-up so children are canonicalized before their parents
        for ix in reversed(range(len(self.bfs_order))):
            # construct string representation of node
            node = self.bfs_order[ix]
            if node.res is None:
                s = ''
            else:
                s = ''.join(str(c) for c in node.res)
            for c in sorted(node.succ.keys()):
                succ = node.succ[c]
                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
            if s in dupmap:
                dedup_ix[ix] = dupmap[s]
            else:
                uniques.append(node)
                dedup_ix[ix] = ix
                dupmap[s] = dedup_ix[ix]
        uniques.reverse()
        print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
        return dedup_ix, uniques
  231. # load the ".pat" file, which contains patterns such as a1b2c3
  232. def load(fn):
  233. hyph = Hyph()
  234. with io.open(fn, encoding='UTF-8') as f:
  235. for l in f:
  236. pat = l.strip()
  237. hyph.add_pat(pat)
  238. return hyph
  239. # load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
  240. def load_chr(fn):
  241. ch_map = {'.': 0}
  242. with io.open(fn, encoding='UTF-8') as f:
  243. for i, l in enumerate(f):
  244. l = l.strip()
  245. if len(l) > 2:
  246. if l == SHARP_S_TO_DOUBLE:
  247. # replace with lowercasing from capital letter sharp s
  248. l = SHARP_S_TO_CAPITAL
  249. else:
  250. # lowercase maps to multi-character uppercase sequence, ignore uppercase for now
  251. l = l[:1]
  252. else:
  253. assert len(l) == 2, 'expected 2 chars in chr'
  254. for c in l:
  255. ch_map[c] = i + 1
  256. return ch_map
  257. # load exceptions with explicit hyphens
  258. def load_hyp(hyph, fn):
  259. with io.open(fn, encoding='UTF-8') as f:
  260. for l in f:
  261. hyph.add_exception(l.strip())
  262. def generate_header(alphabet, trie, pattern):
  263. alphabet_off = 6 * 4
  264. trie_off = alphabet_off + len(alphabet)
  265. pattern_off = trie_off + len(trie)
  266. file_size = pattern_off + len(pattern)
  267. data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
  268. return struct.pack('<6I', *data)
  269. def generate_alphabet(ch_map):
  270. ch_map = ch_map.copy()
  271. del ch_map['.']
  272. min_ch = ord(min(ch_map))
  273. max_ch = ord(max(ch_map))
  274. if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
  275. # generate format 0
  276. data = [0] * (max_ch - min_ch + 1)
  277. for c, val in ch_map.items():
  278. data[ord(c) - min_ch] = val
  279. result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
  280. for b in data:
  281. result.append(struct.pack('<B', b))
  282. else:
  283. # generate format 1
  284. assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
  285. result = [struct.pack('<2I', 1, len(ch_map))]
  286. for c, val in sorted(ch_map.items()):
  287. data = (ord(c) << 11) | val
  288. result.append(struct.pack('<I', data))
  289. binary = b''.join(result)
  290. if len(binary) % 4 != 0:
  291. binary += b'\x00' * (4 - len(binary) % 4)
  292. return binary
# assumes hyph structure has been packed, ie node.ix values have been set
def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
    """Serialize the packed trie: a 24-byte header of masks/shifts followed
    by one 32-bit word per slot, each packing (pattern, link, char)."""
    ch_array = [0] * n_trie
    link_array = [0] * n_trie
    pat_array = [0] * n_trie
    # bit layout per slot: [pattern | link | char], char in the low bits
    link_shift = num_bits(max(ch_map.values()))
    char_mask = (1 << link_shift) - 1
    pattern_shift = link_shift + num_bits(n_trie - 1)
    link_mask = (1 << pattern_shift) - (1 << link_shift)
    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]
    for node in dedup_nodes:
        ix = node.ix
        if node.res is not None:
            pat_array[ix] = patmap[pat_to_binary(node.res)]
        for c, next in node.succ.items():
            c_num = ch_map[c]
            link_ix = ix + c_num
            ch_array[link_ix] = c_num
            # edges point at the canonical (deduplicated) target node
            if dedup_ix is None:
                dedup_next = next
            else:
                dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
            link_array[link_ix] = dedup_next.ix
    for i in range(n_trie):
        #print((pat_array[i], link_array[i], ch_array[i]))
        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
        result.append(struct.pack('<I', packed))
    return b''.join(result)
def generate_pattern(pats):
    """Serialize the pattern table.

    Patterns are deduplicated; trailing zeros of each pattern are encoded
    as a shift count instead of raw bytes, and identical raw byte strings
    share a single copy. Returns (patmap, binary) where patmap maps each
    packed pattern byte string to its index in the table.
    """
    pat_array = [0]
    patmap = {b'': 0}

    raw_pat_array = []
    raw_pat_size = 0
    raw_patmap = {}

    for pat in pats:
        if pat is None:
            continue
        pat_str = pat_to_binary(pat)
        if pat_str not in patmap:
            # count trailing zeros; they go into the shift field
            shift = 0
            while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
                shift += 1
            rawpat = pat_str[:len(pat) - shift]
            if rawpat not in raw_patmap:
                raw_patmap[rawpat] = raw_pat_size
                raw_pat_array.append(rawpat)
                raw_pat_size += len(rawpat)
            # entry layout: len << 26 | shift << 20 | offset into raw data
            data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
            patmap[pat_str] = len(pat_array)
            pat_array.append(data)

    # section header: version 0, entry count, raw-data offset, raw-data size
    data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
    result = [struct.pack('<4I', *data)]
    for x in pat_array:
        result.append(struct.pack('<I', x))
    result.extend(raw_pat_array)
    return patmap, b''.join(result)
  349. def generate_hyb_file(hyph, ch_map, hyb_fn):
  350. bfs = hyph.bfs(ch_map)
  351. dedup_ix, dedup_nodes = hyph.dedup()
  352. n_trie = hyph.pack(dedup_nodes, ch_map)
  353. alphabet = generate_alphabet(ch_map)
  354. patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
  355. trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
  356. header = generate_header(alphabet, trie, pattern)
  357. with open(hyb_fn, 'wb') as f:
  358. f.write(header)
  359. f.write(alphabet)
  360. f.write(trie)
  361. f.write(pattern)
  362. # Verify that the file contains the same lines as the lines argument, in arbitrary order
  363. def verify_file_sorted(lines, fn):
  364. file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')]
  365. line_set = set(lines)
  366. file_set = set(file_lines)
  367. if SHARP_S_TO_DOUBLE in file_set:
  368. # ignore difference of double capital letter s and capital letter sharp s
  369. file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL])
  370. if line_set == file_set:
  371. return True
  372. for line in line_set - file_set:
  373. print(repr(line) + ' in reconstruction, not in file')
  374. for line in file_set - line_set:
  375. print(repr(line) + ' in file, not in reconstruction')
  376. return False
  377. def map_to_chr(alphabet_map):
  378. result = []
  379. ch_map = {}
  380. for val in alphabet_map.values():
  381. chs = [ch for ch in alphabet_map if alphabet_map[ch] == val]
  382. # non-cased characters (like Ethopic) are in both, matching chr file
  383. lowercase = [ch for ch in chs if not ch.isupper()]
  384. uppercase = [ch for ch in chs if not ch.islower()]
  385. # print(val, `lowercase`, `uppercase`)
  386. assert len(lowercase) == 1, 'expected 1 lowercase character'
  387. assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
  388. ch_map[val] = lowercase[0]
  389. result.append(''.join(lowercase + uppercase))
  390. ch_map[0] = '.'
  391. return (ch_map, result)
  392. def get_pattern(pattern_data, ix):
  393. pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
  394. entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
  395. pat_len = entry >> 26
  396. pat_shift = (entry >> 20) & 0x1f
  397. offset = pattern_offset + (entry & 0xfffff)
  398. return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift
def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
    """Recursively walk the packed trie starting at slot ix with prefix s,
    appending each reconstructed pattern line to patterns and each
    exception word to exceptions."""
    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
    pattern = node_entry >> pattern_shift
    if pattern:
        result = []
        is_exception = False
        pat = get_pattern(pattern_data, pattern)
        # interleave the codes (right-aligned against s) with the letters
        for i in range(len(s) + 1):
            pat_off = i - 1 + len(pat) - len(s)
            if pat_off < 0:
                code = 0
            else:
                code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
            if 1 <= code <= 9:
                result.append('%d' % code)
            elif code == 10:
                # 10 marks "no break here" -- only occurs in exception words
                is_exception = True
            elif code == 11:
                # 11 marks an explicit hyphen in an exception word
                result.append('-')
                is_exception = True
            else:
                assert code == 0, 'unexpected code'
            if i < len(s):
                result.append(s[i])
        pat_str = ''.join(result)
        #print(`pat_str`, `pat`)
        if is_exception:
            assert pat_str[0] == '.', "expected leading '.'"
            assert pat_str[-1] == '.', "expected trailing '.'"
            exceptions.append(pat_str[1:-1]) # strip leading and trailing '.'
        else:
            patterns.append(pat_str)
    # follow each valid outgoing edge; an edge is valid when the slot at
    # ix + ch stores ch itself in its char field
    for ch in ch_map:
        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
        link = (edge_entry & link_mask) >> link_shift
        if link != 0 and ch == (edge_entry & char_mask):
            sch = s + ch_map[ch]
            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
# Verify the generated binary file by reconstructing the textual representations
# from the binary hyb file, then checking that they're identical (mod the order of
# lines within the file, which is irrelevant). This function makes assumptions that
# are stronger than absolutely necessary (in particular, that the patterns are in
# lowercase as defined by python islower).
def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
    """Reconstruct the chr/pat/hyp text from the binary hyb file and assert
    each matches the corresponding original input file."""
    with open(hyb_fn, 'rb') as f:
        hyb_data = f.read()
    header = hyb_data[0: 6 * 4]
    (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
    alphabet_data = hyb_data[alphabet_off:trie_off]
    trie_data = hyb_data[trie_off:pattern_off]
    pattern_data = hyb_data[pattern_off:file_size]

    # reconstruct alphabet table
    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
    alphabet_map = {}
    if alphabet_version == 0:
        # format 0: dense byte table over the codepoint range [min_ch, max_ch)
        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
        for ch in range(min_ch, max_ch):
            offset = 12 + ch - min_ch
            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
            if b != 0:
                alphabet_map[unichr(ch)] = b
    else:
        # format 1: sparse entries packed as codepoint << 11 | value
        assert alphabet_version == 1
        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
        for i in range(n_entries):
            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff

    ch_map, reconstructed_chr = map_to_chr(alphabet_map)

    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
    if u'\u0587' in reconstructed_chr:
        reconstructed_chr.remove(u'\u0587')
        reconstructed_chr.append(u'\u0587\u0535\u0552')

    assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'

    # reconstruct trie
    patterns = []
    exceptions = []
    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)

    # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0>
    if u'\u044c' in patterns:
        patterns.remove(u'\u044c')
        patterns.append(u'0\u044c0')

    assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
    assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
  483. def main():
  484. global VERBOSE
  485. try:
  486. opts, args = getopt.getopt(sys.argv[1:], 'v')
  487. except getopt.GetoptError as err:
  488. print(str(err))
  489. sys.exit(1)
  490. for o, _ in opts:
  491. if o == '-v':
  492. VERBOSE = True
  493. pat_fn, out_fn = args
  494. hyph = load(pat_fn)
  495. if pat_fn.endswith('.pat.txt'):
  496. chr_fn = pat_fn[:-8] + '.chr.txt'
  497. ch_map = load_chr(chr_fn)
  498. hyp_fn = pat_fn[:-8] + '.hyp.txt'
  499. load_hyp(hyph, hyp_fn)
  500. generate_hyb_file(hyph, ch_map, out_fn)
  501. verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)
  502. if __name__ == '__main__':
  503. main()