build updates to bin/update.py (#152)

jquast · web-flow · commit 5ba540df3386 · 2025-10-14T19:36:53.000-04:00
- Update URL_EMOJI_ZWJ, upstream URL has changed
- do not make any "modify only timestamps" changes to generated files

From https://unicode.org/Public/emoji/ReadMe.txt

> This directory contains data files for versions 1.0 to 16.0
> of Unicode Emoji.
>
> Starting with Unicode 17.0.0, the data files for Unicode Emoji are published in
> https://www.unicode.org/Public/<version>/emoji/
> together with the related files in
> https://www.unicode.org/Public/<version>/ucd/emoji/

I checked, and this file does not change in value between 15.1.0 and 17.0.0; and, new code has been added to *verify* that no change is made. This is because we hardcode *all* vs16 tables as a single "version 9.0.0" table,
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -15,6 +15,7 @@
 import re
 import sys
 import string
+import difflib
 import datetime
 import functools
 import unicodedata
@@ -406,8 +407,9 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     match unicode releases 8, 9, and 10, these specifications were mostly
     implemented only in Terminals supporting Unicode 9.0 or later.
 
-    For that reason, and that these values are not expected to change,
-    only this single shared table is exported.
+    For that reason, and because **these values are not expected to change**, only
+    this single shared table is exported. If they did change, `wcwidth/table_vs16.py`
+    would carry new values falsely labeled as version 9.0; an assertion prevents that.
 
     One example, where v3.2 became v1.1 ("-" 12.0, "+" 15.1)::
 
@@ -562,7 +564,7 @@ class UnicodeDataFile:
     URL_DERIVED_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/extracted/DerivedGeneralCategory.txt'
     URL_EMOJI_VARIATION = 'https://unicode.org/Public/{version}/ucd/emoji/emoji-variation-sequences.txt'
     URL_LEGACY_VARIATION = 'https://unicode.org/Public/emoji/{version}/emoji-variation-sequences.txt'
-    URL_EMOJI_ZWJ = 'https://unicode.org/Public/emoji/{version}/emoji-zwj-sequences.txt'
+    URL_EMOJI_ZWJ = 'https://unicode.org/Public/{version}/emoji/emoji-zwj-sequences.txt'
 
     @classmethod
     def DerivedAge(cls) -> str:
@@ -666,6 +668,44 @@ def filenames() -> list[str]:
         return [os.path.join(PATH_DATA, match.string) for match in filename_matches]
 
 
+def replace_if_modified(new_filename: str, original_filename: str) -> bool:
+    """Replace original file with new file only if there are significant changes.
+
+    If only 'This code generated' timestamp lines differ (or nothing differs),
+    the new file is deleted; if the original is missing or other lines changed,
+    the new file is moved over the original.
+
+    :param new_filename: path of the freshly generated file.
+    :param original_filename: path of the existing file it may replace.
+    :returns: True if the original was replaced, False if the new file was discarded.
+    """
+    if os.path.exists(original_filename):
+        with open(original_filename, 'r', encoding='utf-8') as f1, \
+                open(new_filename, 'r', encoding='utf-8') as f2:
+            old_lines = f1.readlines()
+            new_lines = f2.readlines()
+
+        diff_lines = list(difflib.unified_diff(old_lines, new_lines,
+                                               fromfile=original_filename,
+                                               tofile=new_filename,
+                                               lineterm=''))
+
+        # The first two diff lines are the '---'/'+++' file headers; slice them off
+        # rather than prefix-matching, so that a removed line such as '-----' (seen
+        # in the diff as '------') is not mistaken for a header and ignored.
+        significant_changes = any(
+            line.startswith(('-', '+')) and 'This code generated' not in line
+            for line in diff_lines[2:])
+
+        if not significant_changes:
+            # only the code-generated timestamp changed, remove the .new file
+            os.remove(new_filename)
+            return False
+    # Significant changes found (or no original exists), replace the original
+    os.replace(new_filename, original_filename)
+    return True
+
+
 def main() -> None:
     """Update east-asian, combining and zero width tables."""
     # This defines which jinja source templates map to which output filenames,
@@ -682,10 +722,16 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
         yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
 
     for render_def in get_codegen_definitions():
-        with open(render_def.output_filename, 'w', encoding='utf-8', newline='\n') as fout:
-            print(f'write {render_def.output_filename}: ', flush=True, end='')
+        new_filename = render_def.output_filename + '.new'
+        with open(new_filename, 'w', encoding='utf-8', newline='\n') as fout:
+            print(f'write {new_filename}: ', flush=True, end='')
             for data in render_def.generate():
                 fout.write(data)
+
+        if not replace_if_modified(new_filename, render_def.output_filename):
+            print(f'discarded {new_filename} (timestamp-only change)')
+        else:
+            assert render_def.output_filename != 'table_vs16.py', ('table_vs16 not expected to change!')
             print('ok')
 
     # fetch latest test data files
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -393,6 +393,7 @@ def test_zero_wide_conflict():
     assert wcwidth.wcwidth(chr(0x0309a), unicode_version='4.1.0') == 0
     assert wcwidth.wcwidth(chr(0x0309b), unicode_version='4.1.0') == 2
 
+
 def test_soft_hyphen():
     # Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most
     # implementations agree to draw it was '1' cell, visually