#!/usr/bin/python
|
# -*- coding: utf-8 -*-
|
# make_unicode_wb_data.py
|
# Copyright (c) 2019 K.Kosako
|
|
import sys
|
import re
|
|
MAX_CODE_POINT = 0x10ffff
|
|
PR_TOTAL_REG = re.compile("#\s*Total\s+(?:code\s+points|elements):")
|
PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)")
|
PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)")
|
PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
|
BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
|
VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")
|
|
VERSION_INFO = [-1, -1, -1]
|
DIC = { }
|
PROPS = []
|
PropIndex = { }
|
|
def check_version_info(s):
|
m = VERSION_REG.match(s)
|
if m is not None:
|
VERSION_INFO[0] = int(m.group(1))
|
VERSION_INFO[1] = int(m.group(2))
|
VERSION_INFO[2] = int(m.group(3))
|
|
def print_ranges(ranges):
|
for (start, end) in ranges:
|
print "0x%06x, 0x%06x" % (start, end)
|
|
def print_prop_and_index(prop, i):
|
print "%-35s %3d" % (prop + ',', i)
|
PropIndex[prop] = i
|
|
def dic_find_by_value(dic, v):
|
for key, val in dic.items():
|
if val == v:
|
return key
|
|
return None
|
|
|
def normalize_ranges(in_ranges, sort=False):
|
if sort:
|
ranges = sorted(in_ranges)
|
else:
|
ranges = in_ranges
|
|
r = []
|
prev = None
|
for (start, end) in ranges:
|
if prev >= start - 1:
|
(pstart, pend) = r.pop()
|
end = max(pend, end)
|
start = pstart
|
|
r.append((start, end))
|
prev = end
|
|
return r
|
|
def inverse_ranges(in_ranges):
|
r = []
|
prev = 0x000000
|
for (start, end) in in_ranges:
|
if prev < start:
|
r.append((prev, start - 1))
|
|
prev = end + 1
|
|
if prev < MAX_CODE_POINT:
|
r.append((prev, MAX_CODE_POINT))
|
|
return r
|
|
def add_ranges(r1, r2):
|
r = r1 + r2
|
return normalize_ranges(r, True)
|
|
def sub_one_range(one_range, rs):
|
r = []
|
(s1, e1) = one_range
|
n = len(rs)
|
for i in range(0, n):
|
(s2, e2) = rs[i]
|
if s2 >= s1 and s2 <= e1:
|
if s2 > s1:
|
r.append((s1, s2 - 1))
|
if e2 >= e1:
|
return r
|
|
s1 = e2 + 1
|
elif s2 < s1 and e2 >= s1:
|
if e2 < e1:
|
s1 = e2 + 1
|
else:
|
return r
|
|
r.append((s1, e1))
|
return r
|
|
def sub_ranges(r1, r2):
|
r = []
|
for one_range in r1:
|
rs = sub_one_range(one_range, r2)
|
r.extend(rs)
|
|
return r
|
|
def add_ranges_in_dic(dic):
|
r = []
|
for k, v in dic.items():
|
r = r + v
|
|
return normalize_ranges(r, True)
|
|
def normalize_ranges_in_dic(dic, sort=False):
|
for k, v in dic.items():
|
r = normalize_ranges(v, sort)
|
dic[k] = r
|
|
def merge_dic(to_dic, from_dic):
|
to_keys = to_dic.keys()
|
from_keys = from_dic.keys()
|
common = list(set(to_keys) & set(from_keys))
|
if len(common) != 0:
|
print >> sys.stderr, "merge_dic: collision: %s" % sorted(common)
|
|
to_dic.update(from_dic)
|
|
def merge_props(to_props, from_props):
|
common = list(set(to_props) & set(from_props))
|
if len(common) != 0:
|
print >> sys.stderr, "merge_props: collision: %s" % sorted(common)
|
|
to_props.extend(from_props)
|
|
def add_range_into_dic(dic, name, start, end):
|
d = dic.get(name, None)
|
if d is None:
|
d = [(start, end)]
|
dic[name] = d
|
else:
|
d.append((start, end))
|
|
def list_sub(a, b):
|
x = set(a) - set(b)
|
return list(x)
|
|
def parse_properties(path):
|
with open(path, 'r') as f:
|
dic = { }
|
prop = None
|
props = []
|
for line in f:
|
s = line.strip()
|
if len(s) == 0:
|
continue
|
|
if s[0] == '#':
|
if VERSION_INFO[0] < 0:
|
check_version_info(s)
|
|
m = PR_LINE_REG.match(s)
|
if m:
|
prop = m.group(3)
|
if m.group(2):
|
start = int(m.group(1), 16)
|
end = int(m.group(2), 16)
|
add_range_into_dic(dic, prop, start, end)
|
else:
|
start = int(m.group(1), 16)
|
add_range_into_dic(dic, prop, start, start)
|
|
elif PR_TOTAL_REG.match(s) is not None:
|
props.append(prop)
|
|
normalize_ranges_in_dic(dic)
|
return (dic, props)
|
|
|
### main ###
|
argv = sys.argv
|
argc = len(argv)
|
|
dic, props = parse_properties('WordBreakProperty.txt')
|
merge_dic(DIC, dic)
|
merge_props(PROPS, props)
|
|
PROPS = sorted(PROPS)
|
|
print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */'
|
COPYRIGHT = '''
|
/*-
|
* Copyright (c) 2019 K.Kosako
|
* All rights reserved.
|
*
|
* Redistribution and use in source and binary forms, with or without
|
* modification, are permitted provided that the following conditions
|
* are met:
|
* 1. Redistributions of source code must retain the above copyright
|
* notice, this list of conditions and the following disclaimer.
|
* 2. Redistributions in binary form must reproduce the above copyright
|
* notice, this list of conditions and the following disclaimer in the
|
* documentation and/or other materials provided with the distribution.
|
*
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
* SUCH DAMAGE.
|
*/
|
'''.strip()
|
|
print COPYRIGHT
|
print ''
|
if VERSION_INFO[0] < 0:
|
raise RuntimeError("Version is not found.")
|
|
print "#define WORD_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2])
|
print ''
|
|
ranges = []
|
for prop in PROPS:
|
rs = DIC[prop]
|
for (start, end) in rs:
|
ranges.append((start, end, prop))
|
|
ranges = sorted(ranges, key=lambda x: x[0])
|
|
prev = -1
|
for (start, end, prop) in ranges:
|
if prev >= start:
|
raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))
|
|
|
print '/*'
|
for prop in PROPS:
|
print "%s" % prop
|
print '*/'
|
print ''
|
|
num_ranges = len(ranges)
|
print "static int WB_RANGE_NUM = %d;" % num_ranges
|
|
print 'static WB_RANGE_TYPE WB_RANGES[] = {'
|
for i, (start, end, prop) in enumerate(ranges):
|
if i == num_ranges - 1:
|
comma = ''
|
else:
|
comma = ','
|
|
type_name = 'WB_' + prop
|
print " {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma)
|
|
print '};'
|
|
sys.exit(0)
|