Skip to content

Commit 5ba540d

Browse files
authored
build updates to bin/update.py (#152)
- Update URL_EMOJI_ZWJ, upstream URL has changed - do not make any "modify only timestamps" changes to generated files From https://unicode.org/Public/emoji/ReadMe.txt > This directory contains data files for versions 1.0 to 16.0 > of Unicode Emoji. > > Starting with Unicode 17.0.0, the data files for Unicode Emoji are published in > https://www.unicode.org/Public/<version>/emoji/ > together with the related files in > https://www.unicode.org/Public/<version>/ucd/emoji/ I checked, and this file does not change in value between 15.1.0 and 17.0.0; and, new code has been added to *verify* that no change is done. This is because we hardcode *all* vs16 tables as a single "version 9.0.0" table,
1 parent ed5fb46 commit 5ba540d

File tree

2 files changed

+52
-5
lines changed

2 files changed

+52
-5
lines changed

bin/update-tables.py

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import re
1616
import sys
1717
import string
18+
import difflib
1819
import datetime
1920
import functools
2021
import unicodedata
@@ -406,8 +407,9 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
406407
match unicode releases 8, 9, and 10, these specifications were mostly
407408
implemented only in Terminals supporting Unicode 9.0 or later.
408409
409-
For that reason, and that these values are not expected to change,
410-
only this single shared table is exported.
410+
For that reason, and because **these values are not expected to change**, only this single shared table is exported.
411+
If they do, a noticeable change would occur in `wcwidth/table_vs16.py`
412+
falsely labeled under version 9.0; an assertion prevents this.
411413
412414
One example, where v3.2 became v1.1 ("-" 12.0, "+" 15.1)::
413415
@@ -562,7 +564,7 @@ class UnicodeDataFile:
562564
URL_DERIVED_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/extracted/DerivedGeneralCategory.txt'
563565
URL_EMOJI_VARIATION = 'https://unicode.org/Public/{version}/ucd/emoji/emoji-variation-sequences.txt'
564566
URL_LEGACY_VARIATION = 'https://unicode.org/Public/emoji/{version}/emoji-variation-sequences.txt'
565-
URL_EMOJI_ZWJ = 'https://unicode.org/Public/emoji/{version}/emoji-zwj-sequences.txt'
567+
URL_EMOJI_ZWJ = 'https://unicode.org/Public/{version}/emoji/emoji-zwj-sequences.txt'
566568

567569
@classmethod
568570
def DerivedAge(cls) -> str:
@@ -666,6 +668,44 @@ def filenames() -> list[str]:
666668
return [os.path.join(PATH_DATA, match.string) for match in filename_matches]
667669

668670

671+
def replace_if_modified(new_filename: str, original_filename: str) -> bool:
    """Replace original file with new file only if there are significant changes.

    Lines containing the text 'This code generated' (the generation timestamp)
    are ignored when comparing the two files.  If the files are otherwise
    identical, the new file is deleted and the original is left untouched.
    If anything else differs, or the original does not exist, the new file
    is moved over the original.

    :param new_filename: path of the freshly generated file.
    :param original_filename: path of the file to be replaced.
    :returns: True if the original was replaced (or created), False if the
        new file was discarded because only timestamp lines changed.
    """
    timestamp_marker = 'This code generated'

    def significant_lines(filename: str) -> list[str]:
        # Every line except the generated-timestamp line(s).
        with open(filename, 'r', encoding='utf-8') as fin:
            return [line for line in fin if timestamp_marker not in line]

    # Compare file contents directly rather than parsing unified-diff output:
    # matching on '---'/'+++' diff prefixes misreads removed lines that begin
    # with '--' (such as an RST heading underline) or added lines that begin
    # with '++' as diff headers, silently discarding a real change.
    if (os.path.exists(original_filename)
            and significant_lines(original_filename) == significant_lines(new_filename)):
        # only the code-generated timestamp changed, remove the .new file
        os.remove(new_filename)
        return False

    # Significant changes found (or no original yet), replace the original
    os.replace(new_filename, original_filename)
    return True
707+
708+
669709
def main() -> None:
670710
"""Update east-asian, combining and zero width tables."""
671711
# This defines which jinja source templates map to which output filenames,
@@ -682,10 +722,16 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
682722
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
683723

684724
for render_def in get_codegen_definitions():
685-
with open(render_def.output_filename, 'w', encoding='utf-8', newline='\n') as fout:
686-
print(f'write {render_def.output_filename}: ', flush=True, end='')
725+
new_filename = render_def.output_filename + '.new'
726+
with open(new_filename, 'w', encoding='utf-8', newline='\n') as fout:
727+
print(f'write {new_filename}: ', flush=True, end='')
687728
for data in render_def.generate():
688729
fout.write(data)
730+
731+
if not replace_if_modified(new_filename, render_def.output_filename):
732+
print(f'discarded {new_filename} (timestamp-only change)')
733+
else:
734+
assert render_def.output_filename != 'table_vs16.py', ('table_vs16 not expected to change!')
689735
print('ok')
690736

691737
# fetch latest test data files

tests/test_core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def test_zero_wide_conflict():
393393
assert wcwidth.wcwidth(chr(0x0309a), unicode_version='4.1.0') == 0
394394
assert wcwidth.wcwidth(chr(0x0309b), unicode_version='4.1.0') == 2
395395

396+
396397
def test_soft_hyphen():
397398
# Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most
398399
# implementations agree to draw it as '1' cell, visually

0 commit comments

Comments
 (0)