Fix emoji sequence checks

This commit is contained in:
guidotheelen 2020-04-21 16:11:10 +02:00
parent 06df7a8a57
commit 25b38fe2cb
3 changed files with 27 additions and 112 deletions

View File

@ -51,7 +51,7 @@ def seq_name(seq):
def strip_vs_map(seq_map):
return {
unicode_data.strip_emoji_vs(k): v
for k, v in seq_map.iteritems()}
for k, v in seq_map.items()}
_namedata = [
strip_vs_map(unicode_data.get_emoji_combining_sequences()),
strip_vs_map(unicode_data.get_emoji_flag_sequences()),
@ -76,7 +76,7 @@ def seq_name(seq):
def _check_no_vs(sorted_seq_to_filepath):
"""Our image data does not use emoji presentation variation selectors."""
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
if EMOJI_VS in seq:
print('check no VS: FE0F in path: %s' % fp)
@ -99,7 +99,7 @@ def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
valid_cps |= TAG_SET # used in subregion tag sequences
not_emoji = {}
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
for cp in seq:
if cp not in valid_cps:
if cp not in not_emoji:
@ -121,7 +121,7 @@ def _check_zwj(sorted_seq_to_filepath):
"""Ensure zwj is only between two appropriate emoji. This is a 'pre-check'
that reports this specific problem."""
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
if ZWJ not in seq:
continue
if seq[0] == ZWJ:
@ -149,7 +149,7 @@ def _check_zwj(sorted_seq_to_filepath):
def _check_flags(sorted_seq_to_filepath):
"""Ensure regional indicators are only in sequences of one or two, and
never mixed."""
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
have_reg = None
for cp in seq:
is_reg = unicode_data.is_regional_indicator(cp)
@ -173,7 +173,7 @@ def _check_tags(sorted_seq_to_filepath):
BLACK_FLAG = 0x1f3f4
BLACK_FLAG_SET = set([BLACK_FLAG])
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
seq_set = set(cp for cp in seq)
overlap_set = seq_set & TAG_SET
if not overlap_set:
@ -193,7 +193,7 @@ def _check_skintone(sorted_seq_to_filepath):
to take them. May appear standalone, though. Also check that emoji that take
skin tone modifiers have a complete set."""
base_to_modifiers = collections.defaultdict(set)
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
for i, cp in enumerate(seq):
if unicode_data.is_skintone_modifier(cp):
if i == 0:
@ -213,7 +213,7 @@ def _check_skintone(sorted_seq_to_filepath):
base_to_modifiers[pcp] = set()
base_to_modifiers[pcp].add(cp)
for cp, modifiers in sorted(base_to_modifiers.iteritems()):
for cp, modifiers in sorted(base_to_modifiers.items()):
if len(modifiers) != 5:
print(
'check skintone: base %04x has %d modifiers defined (%s) in %s' % (
@ -224,7 +224,7 @@ def _check_skintone(sorted_seq_to_filepath):
def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version):
"""Verify that zwj sequences are valid for the given unicode version."""
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
if ZWJ not in seq:
continue
age = unicode_data.get_emoji_sequence_age(seq)
@ -236,7 +236,7 @@ def _check_no_alias_sources(sorted_seq_to_filepath):
"""Check that we don't have sequences that we expect to be aliased to
some other sequence."""
aliases = add_aliases.read_default_emoji_aliases()
for seq, fp in sorted_seq_to_filepath.iteritems():
for seq, fp in sorted_seq_to_filepath.items():
if seq in aliases:
print('check no alias sources: aliased sequence %s' % fp)
@ -270,22 +270,22 @@ def _check_coverage(seq_to_filepath, unicode_version):
seq_to_filepath[k] = 'alias:' + filename
# check single emoji, this includes most of the special chars
emoji = sorted(unicode_data.get_emoji(age=age))
for cp in emoji:
if tuple([cp]) not in seq_to_filepath:
print(
'coverage: missing single %04x (%s)' % (
cp, unicode_data.name(cp, '<no name>')))
emoji = sorted(unicode_data.get_emoji())
# for cp in emoji:
# if tuple([cp]) not in seq_to_filepath:
# print(
# 'coverage: missing single %04x (%s)' % (
# cp, unicode_data.name(cp, '<no name>')))
# special characters
# all but combining enclosing keycap are currently marked as emoji
for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a):
for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
if cp not in emoji and tuple([cp]) not in seq_to_filepath:
print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))
# combining sequences
comb_seq_to_name = sorted(
unicode_data.get_emoji_combining_sequences(age=age).iteritems())
unicode_data._emoji_sequence_data.items())
for seq, name in comb_seq_to_name:
if seq not in seq_to_filepath:
# strip vs and try again
@ -294,44 +294,6 @@ def _check_coverage(seq_to_filepath, unicode_version):
print('coverage: missing combining sequence %s (%s)' %
(unicode_data.seq_to_string(seq), name))
# flag sequences
flag_seq_to_name = sorted(
unicode_data.get_emoji_flag_sequences(age=age).iteritems())
for seq, name in flag_seq_to_name:
if seq not in seq_to_filepath:
print('coverage: missing flag sequence %s (%s)' %
(unicode_data.seq_to_string(seq), name))
# skin tone modifier sequences
mod_seq_to_name = sorted(
unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
for seq, name in mod_seq_to_name:
if seq not in seq_to_filepath:
print('coverage: missing modifier sequence %s (%s)' % (
unicode_data.seq_to_string(seq), name))
# zwj sequences
# some of ours include the emoji presentation variation selector and some
# don't, and the same is true for the canonical sequences. normalize all
# of them to omit it to test coverage, but report the canonical sequence.
zwj_seq_without_vs = set()
for seq in seq_to_filepath:
if ZWJ not in seq:
continue
if EMOJI_VS in seq:
seq = tuple(cp for cp in seq if cp != EMOJI_VS)
zwj_seq_without_vs.add(seq)
for seq, name in sorted(
unicode_data.get_emoji_zwj_sequences(age=age).iteritems()):
if EMOJI_VS in seq:
test_seq = tuple(s for s in seq if s != EMOJI_VS)
else:
test_seq = seq
if test_seq not in zwj_seq_without_vs:
print('coverage: missing (canonical) zwj sequence %s (%s)' % (
unicode_data.seq_to_string(seq), name))
# check for 'unknown flag'
# this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
# don't start with our prefix so 'unknown_flag' would be excluded by default.
@ -360,7 +322,7 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
of a name to stderr."""
segment_re = re.compile(r'^[0-9a-f]{4,6}$')
result = {}
for name, dirname in name_to_dirpath.iteritems():
for name, dirname in name_to_dirpath.items():
if not name.startswith(prefix):
print('expected prefix "%s" for "%s"' % (prefix, name))
continue
@ -430,7 +392,7 @@ def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage):
seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
print('checking %d sequences' % len(seq_to_filepath))
check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage)
print('done.')
print('done running checks')
def main():

View File

@ -473,19 +473,20 @@ def get_rc_files(output_dir, unicode_version):
url = f"https://unicode.org/Public/{unicode_version}.0/ucd/DerivedAge.txt"
urllib.request.urlretrieve(url, f'./{output_dir}/DerivedAge.txt')
def main():
get_rc_files("./ucd", "12.0")
# get_rc_files("./ucd", "12.0")
ucd_path = "./ucd"
parse_ucd(ucd_path)
# # Generate all expected emoji
# all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
# Generate all expected emoji
all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
# # Generate file names
# expected_filenames = decimal_list_to_emoji_filename(all_emoji)
# Generate file names
expected_filenames = decimal_list_to_emoji_filename(all_emoji)
# check_missing_files(expected_filenames, './png/128/')
check_missing_files(expected_filenames, './png/128/')
# check_emoji_coverage(all_emoji, equivalent_emoji)
# check_emoji_defaults(default_emoji)

48
poetry.lock generated
View File

@ -1,48 +0,0 @@
[[package]]
category = "main"
description = "Tools to manipulate font files"
name = "fonttools"
optional = false
python-versions = ">=3.6"
version = "4.7.0"
[package.extras]
all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "brotli (>=1.0.1)", "scipy", "brotlipy (>=0.7.0)", "munkres", "unicodedata2 (>=13.0.0)", "xattr"]
graphite = ["lz4 (>=1.7.4.2)"]
interpolatable = ["scipy", "munkres"]
lxml = ["lxml (>=4.0,<5)"]
plot = ["matplotlib"]
symfont = ["sympy"]
type1 = ["xattr"]
ufo = ["fs (>=2.2.0,<3)"]
unicode = ["unicodedata2 (>=13.0.0)"]
woff = ["zopfli (>=0.1.4)", "brotli (>=1.0.1)", "brotlipy (>=0.7.0)"]
[[package]]
category = "main"
description = "Noto font tools"
name = "nototools"
optional = false
python-versions = "*"
version = "0.2.0"
[package.dependencies]
fontTools = "*"
[package.extras]
shapediff = ["booleanoperations", "defcon", "pillow"]
[package.source]
reference = "e0a39bad11ca47f924b432bb05c3cccd87e68571"
type = "git"
url = "https://github.com/googlefonts/nototools.git"
[metadata]
content-hash = "1b3d3ee95aca31cb8d69bd8a8fae3504b6de0dc2b32462f86e3798e225ebcdf5"
python-versions = "^3.7.2"
[metadata.files]
fonttools = [
{file = "fonttools-4.7.0-py3-none-any.whl", hash = "sha256:454db99e20e6cafb7ed3e30b15c9daf2d46c4370a800c1a6db11ba3eb3b43116"},
{file = "fonttools-4.7.0.zip", hash = "sha256:ce977f10f070752301e2d49ed822cfc860c881046d81c376fade1e6529b2046c"},
]
nototools = []