fix check emoji sequences

This commit is contained in:
guidotheelen 2020-04-21 16:11:10 +02:00
parent 06df7a8a57
commit 25b38fe2cb
3 changed files with 27 additions and 112 deletions

View File

@ -51,7 +51,7 @@ def seq_name(seq):
def strip_vs_map(seq_map): def strip_vs_map(seq_map):
return { return {
unicode_data.strip_emoji_vs(k): v unicode_data.strip_emoji_vs(k): v
for k, v in seq_map.iteritems()} for k, v in seq_map.items()}
_namedata = [ _namedata = [
strip_vs_map(unicode_data.get_emoji_combining_sequences()), strip_vs_map(unicode_data.get_emoji_combining_sequences()),
strip_vs_map(unicode_data.get_emoji_flag_sequences()), strip_vs_map(unicode_data.get_emoji_flag_sequences()),
@ -76,7 +76,7 @@ def seq_name(seq):
def _check_no_vs(sorted_seq_to_filepath): def _check_no_vs(sorted_seq_to_filepath):
"""Our image data does not use emoji presentation variation selectors.""" """Our image data does not use emoji presentation variation selectors."""
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
if EMOJI_VS in seq: if EMOJI_VS in seq:
print('check no VS: FE0F in path: %s' % fp) print('check no VS: FE0F in path: %s' % fp)
@ -99,7 +99,7 @@ def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
valid_cps |= TAG_SET # used in subregion tag sequences valid_cps |= TAG_SET # used in subregion tag sequences
not_emoji = {} not_emoji = {}
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
for cp in seq: for cp in seq:
if cp not in valid_cps: if cp not in valid_cps:
if cp not in not_emoji: if cp not in not_emoji:
@ -121,7 +121,7 @@ def _check_zwj(sorted_seq_to_filepath):
"""Ensure zwj is only between two appropriate emoji. This is a 'pre-check' """Ensure zwj is only between two appropriate emoji. This is a 'pre-check'
that reports this specific problem.""" that reports this specific problem."""
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
if ZWJ not in seq: if ZWJ not in seq:
continue continue
if seq[0] == ZWJ: if seq[0] == ZWJ:
@ -149,7 +149,7 @@ def _check_zwj(sorted_seq_to_filepath):
def _check_flags(sorted_seq_to_filepath): def _check_flags(sorted_seq_to_filepath):
"""Ensure regional indicators are only in sequences of one or two, and """Ensure regional indicators are only in sequences of one or two, and
never mixed.""" never mixed."""
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
have_reg = None have_reg = None
for cp in seq: for cp in seq:
is_reg = unicode_data.is_regional_indicator(cp) is_reg = unicode_data.is_regional_indicator(cp)
@ -173,7 +173,7 @@ def _check_tags(sorted_seq_to_filepath):
BLACK_FLAG = 0x1f3f4 BLACK_FLAG = 0x1f3f4
BLACK_FLAG_SET = set([BLACK_FLAG]) BLACK_FLAG_SET = set([BLACK_FLAG])
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
seq_set = set(cp for cp in seq) seq_set = set(cp for cp in seq)
overlap_set = seq_set & TAG_SET overlap_set = seq_set & TAG_SET
if not overlap_set: if not overlap_set:
@ -193,7 +193,7 @@ def _check_skintone(sorted_seq_to_filepath):
to take them. May appear standalone, though. Also check that emoji that take to take them. May appear standalone, though. Also check that emoji that take
skin tone modifiers have a complete set.""" skin tone modifiers have a complete set."""
base_to_modifiers = collections.defaultdict(set) base_to_modifiers = collections.defaultdict(set)
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
for i, cp in enumerate(seq): for i, cp in enumerate(seq):
if unicode_data.is_skintone_modifier(cp): if unicode_data.is_skintone_modifier(cp):
if i == 0: if i == 0:
@ -213,7 +213,7 @@ def _check_skintone(sorted_seq_to_filepath):
base_to_modifiers[pcp] = set() base_to_modifiers[pcp] = set()
base_to_modifiers[pcp].add(cp) base_to_modifiers[pcp].add(cp)
for cp, modifiers in sorted(base_to_modifiers.iteritems()): for cp, modifiers in sorted(base_to_modifiers.items()):
if len(modifiers) != 5: if len(modifiers) != 5:
print( print(
'check skintone: base %04x has %d modifiers defined (%s) in %s' % ( 'check skintone: base %04x has %d modifiers defined (%s) in %s' % (
@ -224,7 +224,7 @@ def _check_skintone(sorted_seq_to_filepath):
def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version): def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version):
"""Verify that zwj sequences are valid for the given unicode version.""" """Verify that zwj sequences are valid for the given unicode version."""
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
if ZWJ not in seq: if ZWJ not in seq:
continue continue
age = unicode_data.get_emoji_sequence_age(seq) age = unicode_data.get_emoji_sequence_age(seq)
@ -236,7 +236,7 @@ def _check_no_alias_sources(sorted_seq_to_filepath):
"""Check that we don't have sequences that we expect to be aliased to """Check that we don't have sequences that we expect to be aliased to
some other sequence.""" some other sequence."""
aliases = add_aliases.read_default_emoji_aliases() aliases = add_aliases.read_default_emoji_aliases()
for seq, fp in sorted_seq_to_filepath.iteritems(): for seq, fp in sorted_seq_to_filepath.items():
if seq in aliases: if seq in aliases:
print('check no alias sources: aliased sequence %s' % fp) print('check no alias sources: aliased sequence %s' % fp)
@ -270,22 +270,22 @@ def _check_coverage(seq_to_filepath, unicode_version):
seq_to_filepath[k] = 'alias:' + filename seq_to_filepath[k] = 'alias:' + filename
# check single emoji, this includes most of the special chars # check single emoji, this includes most of the special chars
emoji = sorted(unicode_data.get_emoji(age=age)) emoji = sorted(unicode_data.get_emoji())
for cp in emoji: # for cp in emoji:
if tuple([cp]) not in seq_to_filepath: # if tuple([cp]) not in seq_to_filepath:
print( # print(
'coverage: missing single %04x (%s)' % ( # 'coverage: missing single %04x (%s)' % (
cp, unicode_data.name(cp, '<no name>'))) # cp, unicode_data.name(cp, '<no name>')))
# special characters # special characters
# all but combining enclosing keycap are currently marked as emoji # all but combining enclosing keycap are currently marked as emoji
for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a): for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
if cp not in emoji and tuple([cp]) not in seq_to_filepath: if cp not in emoji and tuple([cp]) not in seq_to_filepath:
print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp))) print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))
# combining sequences # combining sequences
comb_seq_to_name = sorted( comb_seq_to_name = sorted(
unicode_data.get_emoji_combining_sequences(age=age).iteritems()) unicode_data._emoji_sequence_data.items())
for seq, name in comb_seq_to_name: for seq, name in comb_seq_to_name:
if seq not in seq_to_filepath: if seq not in seq_to_filepath:
# strip vs and try again # strip vs and try again
@ -294,44 +294,6 @@ def _check_coverage(seq_to_filepath, unicode_version):
print('coverage: missing combining sequence %s (%s)' % print('coverage: missing combining sequence %s (%s)' %
(unicode_data.seq_to_string(seq), name)) (unicode_data.seq_to_string(seq), name))
# flag sequences
flag_seq_to_name = sorted(
unicode_data.get_emoji_flag_sequences(age=age).iteritems())
for seq, name in flag_seq_to_name:
if seq not in seq_to_filepath:
print('coverage: missing flag sequence %s (%s)' %
(unicode_data.seq_to_string(seq), name))
# skin tone modifier sequences
mod_seq_to_name = sorted(
unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
for seq, name in mod_seq_to_name:
if seq not in seq_to_filepath:
print('coverage: missing modifier sequence %s (%s)' % (
unicode_data.seq_to_string(seq), name))
# zwj sequences
# some of ours include the emoji presentation variation selector and some
# don't, and the same is true for the canonical sequences. normalize all
# of them to omit it to test coverage, but report the canonical sequence.
zwj_seq_without_vs = set()
for seq in seq_to_filepath:
if ZWJ not in seq:
continue
if EMOJI_VS in seq:
seq = tuple(cp for cp in seq if cp != EMOJI_VS)
zwj_seq_without_vs.add(seq)
for seq, name in sorted(
unicode_data.get_emoji_zwj_sequences(age=age).iteritems()):
if EMOJI_VS in seq:
test_seq = tuple(s for s in seq if s != EMOJI_VS)
else:
test_seq = seq
if test_seq not in zwj_seq_without_vs:
print('coverage: missing (canonical) zwj sequence %s (%s)' % (
unicode_data.seq_to_string(seq), name))
# check for 'unknown flag' # check for 'unknown flag'
# this is either emoji_ufe82b or 'unknown_flag', but we filter out things that # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
# don't start with our prefix so 'unknown_flag' would be excluded by default. # don't start with our prefix so 'unknown_flag' would be excluded by default.
@ -360,7 +322,7 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
of a name to stderr.""" of a name to stderr."""
segment_re = re.compile(r'^[0-9a-f]{4,6}$') segment_re = re.compile(r'^[0-9a-f]{4,6}$')
result = {} result = {}
for name, dirname in name_to_dirpath.iteritems(): for name, dirname in name_to_dirpath.items():
if not name.startswith(prefix): if not name.startswith(prefix):
print('expected prefix "%s" for "%s"' % (prefix, name)) print('expected prefix "%s" for "%s"' % (prefix, name))
continue continue
@ -430,7 +392,7 @@ def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage):
seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix) seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
print('checking %d sequences' % len(seq_to_filepath)) print('checking %d sequences' % len(seq_to_filepath))
check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage) check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage)
print('done.') print('done running checks')
def main(): def main():

View File

@ -473,19 +473,20 @@ def get_rc_files(output_dir, unicode_version):
url = f"https://unicode.org/Public/{unicode_version}.0/ucd/DerivedAge.txt" url = f"https://unicode.org/Public/{unicode_version}.0/ucd/DerivedAge.txt"
urllib.request.urlretrieve(url, f'./{output_dir}/DerivedAge.txt') urllib.request.urlretrieve(url, f'./{output_dir}/DerivedAge.txt')
def main(): def main():
get_rc_files("./ucd", "12.0") # get_rc_files("./ucd", "12.0")
ucd_path = "./ucd" ucd_path = "./ucd"
parse_ucd(ucd_path) parse_ucd(ucd_path)
# # Generate all expected emoji # Generate all expected emoji
# all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji() all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
# # Generate file names # Generate file names
# expected_filenames = decimal_list_to_emoji_filename(all_emoji) expected_filenames = decimal_list_to_emoji_filename(all_emoji)
# check_missing_files(expected_filenames, './png/128/') check_missing_files(expected_filenames, './png/128/')
# check_emoji_coverage(all_emoji, equivalent_emoji) # check_emoji_coverage(all_emoji, equivalent_emoji)
# check_emoji_defaults(default_emoji) # check_emoji_defaults(default_emoji)

48
poetry.lock generated
View File

@ -1,48 +0,0 @@
[[package]]
category = "main"
description = "Tools to manipulate font files"
name = "fonttools"
optional = false
python-versions = ">=3.6"
version = "4.7.0"
[package.extras]
all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "brotli (>=1.0.1)", "scipy", "brotlipy (>=0.7.0)", "munkres", "unicodedata2 (>=13.0.0)", "xattr"]
graphite = ["lz4 (>=1.7.4.2)"]
interpolatable = ["scipy", "munkres"]
lxml = ["lxml (>=4.0,<5)"]
plot = ["matplotlib"]
symfont = ["sympy"]
type1 = ["xattr"]
ufo = ["fs (>=2.2.0,<3)"]
unicode = ["unicodedata2 (>=13.0.0)"]
woff = ["zopfli (>=0.1.4)", "brotli (>=1.0.1)", "brotlipy (>=0.7.0)"]
[[package]]
category = "main"
description = "Noto font tools"
name = "nototools"
optional = false
python-versions = "*"
version = "0.2.0"
[package.dependencies]
fontTools = "*"
[package.extras]
shapediff = ["booleanoperations", "defcon", "pillow"]
[package.source]
reference = "e0a39bad11ca47f924b432bb05c3cccd87e68571"
type = "git"
url = "https://github.com/googlefonts/nototools.git"
[metadata]
content-hash = "1b3d3ee95aca31cb8d69bd8a8fae3504b6de0dc2b32462f86e3798e225ebcdf5"
python-versions = "^3.7.2"
[metadata.files]
fonttools = [
{file = "fonttools-4.7.0-py3-none-any.whl", hash = "sha256:454db99e20e6cafb7ed3e30b15c9daf2d46c4370a800c1a6db11ba3eb3b43116"},
{file = "fonttools-4.7.0.zip", hash = "sha256:ce977f10f070752301e2d49ed822cfc860c881046d81c376fade1e6529b2046c"},
]
nototools = []