RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.
This commit is contained in:
212
zeroidc/vendor/unicode-segmentation/scripts/unicode_gen_breaktests.py
vendored
Executable file
212
zeroidc/vendor/unicode-segmentation/scripts/unicode_gen_breaktests.py
vendored
Executable file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8
|
||||
#
|
||||
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
|
||||
# file at the top-level directory of this distribution and at
|
||||
# http://rust-lang.org/COPYRIGHT.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
# This script uses the following Unicode tables:
|
||||
# - auxiliary/GraphemeBreakTest.txt
|
||||
# - auxiliary/WordBreakTest.txt
|
||||
#
|
||||
# Since this should not require frequent updates, we just store this
|
||||
# out-of-line and check the unicode.rs file into git.
|
||||
from __future__ import print_function
|
||||
|
||||
import unicode, re, os, fileinput
|
||||
|
||||
def load_test_data(f, optsplit=[]):
|
||||
testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
|
||||
|
||||
unicode.fetch(f)
|
||||
data = []
|
||||
for line in fileinput.input(os.path.basename(f)):
|
||||
# lines that include a test start with the ÷ character
|
||||
if len(line) < 2 or not line.startswith('÷'):
|
||||
continue
|
||||
|
||||
m = testRe1.match(line)
|
||||
if not m:
|
||||
print("error: no match on line where test was expected: %s" % line)
|
||||
continue
|
||||
|
||||
# process the characters in this test case
|
||||
chars = process_split_string(m.group(1))
|
||||
# skip test case if it contains invalid characters (viz., surrogates)
|
||||
if not chars:
|
||||
continue
|
||||
|
||||
# now process test cases
|
||||
(chars, info) = process_split_info(m.group(2), chars, optsplit)
|
||||
|
||||
# make sure that we have break info for each break!
|
||||
assert len(chars) - 1 == len(info)
|
||||
|
||||
data.append((chars, info))
|
||||
|
||||
return data
|
||||
|
||||
def process_split_info(s, c, o):
|
||||
outcs = []
|
||||
outis = []
|
||||
workcs = c.pop(0)
|
||||
|
||||
# are we on a × or a ÷?
|
||||
isX = False
|
||||
if s.startswith('×'):
|
||||
isX = True
|
||||
|
||||
# find each instance of '(÷|×) [x.y] '
|
||||
while s:
|
||||
# find the currently considered rule number
|
||||
sInd = s.index('[') + 1
|
||||
eInd = s.index(']')
|
||||
|
||||
# if it's '× [a.b]' where 'a.b' is in o, then
|
||||
# we consider it a split even though it's not
|
||||
# marked as one
|
||||
# if it's ÷ then it's always a split
|
||||
if not isX or s[sInd:eInd] in o:
|
||||
outis.append(s[sInd:eInd])
|
||||
outcs.append(workcs)
|
||||
workcs = c.pop(0)
|
||||
else:
|
||||
workcs.extend(c.pop(0))
|
||||
|
||||
idx = 1
|
||||
while idx < len(s):
|
||||
if s[idx:].startswith('×'):
|
||||
isX = True
|
||||
break
|
||||
if s[idx:].startswith('÷'):
|
||||
isX = False
|
||||
break
|
||||
idx += 1
|
||||
s = s[idx:]
|
||||
|
||||
outcs.append(workcs)
|
||||
return (outcs, outis)
|
||||
|
||||
def process_split_string(s):
|
||||
outls = []
|
||||
workls = []
|
||||
|
||||
inls = s.split()
|
||||
|
||||
for i in inls:
|
||||
if i == '÷' or i == '×':
|
||||
outls.append(workls)
|
||||
workls = []
|
||||
continue
|
||||
|
||||
ival = int(i,16)
|
||||
|
||||
if unicode.is_surrogate(ival):
|
||||
return []
|
||||
|
||||
workls.append(ival)
|
||||
|
||||
if workls:
|
||||
outls.append(workls)
|
||||
|
||||
return outls
|
||||
|
||||
def showfun(x):
|
||||
outstr = '("'
|
||||
for c in x[0]:
|
||||
outstr += "\\u{%x}" % c
|
||||
outstr += '",&['
|
||||
xfirst = True
|
||||
for xx in x[1:]:
|
||||
if not xfirst:
|
||||
outstr += '],&['
|
||||
xfirst = False
|
||||
sfirst = True
|
||||
for sp in xx:
|
||||
if not sfirst:
|
||||
outstr += ','
|
||||
sfirst = False
|
||||
outstr += '"'
|
||||
for c in sp:
|
||||
outstr += "\\u{%x}" % c
|
||||
outstr += '"'
|
||||
outstr += '])'
|
||||
return outstr
|
||||
|
||||
def create_grapheme_data(f):
|
||||
# rules 9.1 and 9.2 are for extended graphemes only
|
||||
optsplits = ['9.1','9.2']
|
||||
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
|
||||
|
||||
test_same = []
|
||||
test_diff = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
extgraphs = []
|
||||
extwork = []
|
||||
|
||||
extwork.extend(c[0])
|
||||
for n in range(0,len(i)):
|
||||
if i[n] in optsplits:
|
||||
extwork.extend(c[n+1])
|
||||
else:
|
||||
extgraphs.append(extwork)
|
||||
extwork = []
|
||||
extwork.extend(c[n+1])
|
||||
|
||||
# these are the extended grapheme clusters
|
||||
extgraphs.append(extwork)
|
||||
|
||||
if extgraphs == c:
|
||||
test_same.append((allchars, c))
|
||||
else:
|
||||
test_diff.append((allchars, extgraphs, c))
|
||||
|
||||
stype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
|
||||
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
|
||||
|
||||
def create_words_data(f):
|
||||
d = load_test_data("auxiliary/WordBreakTest.txt")
|
||||
|
||||
test = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
test.append((allchars, c))
|
||||
|
||||
wtype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
|
||||
|
||||
def create_sentence_data(f):
|
||||
d = load_test_data("auxiliary/SentenceBreakTest.txt")
|
||||
|
||||
test = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
test.append((allchars, c))
|
||||
|
||||
wtype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
with open("testdata.rs", "w") as rf:
|
||||
rf.write(unicode.preamble)
|
||||
create_grapheme_data(rf)
|
||||
create_words_data(rf)
|
||||
create_sentence_data(rf)
|
||||
Reference in New Issue
Block a user