Fix: Prevent auto-splitting of French accented words in text recognition

Ihebdhouibi · Ihebdhouibi · commit c37b052a5364 · 2025-11-07T09:21:52.000+01:00
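Added support for Latin letters with diacritics (é, è, à, ç, etc.) and for apostrophes inside French contractions (n'êtes) to the word-grouping logic of BaseRecLabelDecode.get_word_info(): a new helper, is_latin_char(), classifies such letters as "en&num", and an ASCII apostrophe that directly follows an "en&num" character stays in the same word.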

This fix ensures that French words are no longer split at accented characters during OCR text recognition.
diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py
@@ -18,6 +18,26 @@
 from paddle.nn import functional as F
 import re
 import json
+import unicodedata
+
+
+def is_latin_char(char):
+    """
+    Return True if ``char`` is one Latin-script letter, accented or not.
+
+    A character qualifies when its Unicode general category is a letter
+    (``L*``) and its Unicode name starts with ``LATIN`` (é is "LATIN SMALL
+    LETTER E WITH ACUTE"). Digits, punctuation, non-Latin letters and any
+    argument that is not exactly one character all return False.
+    """
+    try:
+        # name() is only reached for letters because ``and`` short-circuits;
+        # ValueError = unnamed code point, TypeError = not a single character.
+        return unicodedata.category(char).startswith("L") and unicodedata.name(
+            char
+        ).startswith("LATIN")
+    except (TypeError, ValueError):
+        return False
 
 
 class BaseRecLabelDecode(object):
@@ -95,11 +115,16 @@ def get_word_info(self, text, selection):
         for c_i, char in enumerate(text):
             if "\u4e00" <= char <= "\u9fff":
                 c_state = "cn"
-            elif bool(re.search("[a-zA-Z0-9]", char)):
+            # Modified condition to include accented characters used in French and other Latin-based languages
+            elif bool(re.search("[a-zA-Z0-9]", char)) or is_latin_char(char):
                 c_state = "en&num"
             else:
                 c_state = "splitter"
 
+            # Handle apostrophes in French words like "n'êtes"
+            if char == "'" and state == "en&num":
+                c_state = "en&num"
+
             if (
                 char == "."
                 and state == "en&num"
diff --git a/test_french_accents.py b/test_french_accents.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Test script to verify French accented character handling in OCR text recognition.
+
+This script tests that French words with accented characters (é, è, à, ç, etc.)
+and contractions (n'êtes, l'été) are properly grouped as single words and not
+split at each accented character.
+"""
+
+import sys
+import os
+import numpy as np
+
+# Add the project root to the path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode
+
+
+def test_french_word_grouping():
+    """
+    Run the French word-grouping cases and print a report for each one.
+
+    Every sample is passed to BaseRecLabelDecode.get_word_info together with
+    an all-True selection mask. The word count, the state list and then the
+    individual words are compared with the expectations, in that order; only
+    the first mismatch of a case is reported.
+
+    Returns:
+        bool: True if every case matched its expectations, otherwise False.
+    """
+    decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)
+
+    # Sample texts paired with the word/state grouping expected for each.
+    test_cases = [
+        {
+            "name": "Simple accented word: été (summer)",
+            "text": "été",
+            "expected_words": [["é", "t", "é"]],
+            "expected_states": ["en&num"],
+        },
+        {
+            "name": "Word with ç: français (French)",
+            "text": "français",
+            "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
+            "expected_states": ["en&num"],
+        },
+        {
+            "name": "Contraction: n'êtes (you are)",
+            "text": "n'êtes",
+            "expected_words": [["n", "'", "ê", "t", "e", "s"]],
+            "expected_states": ["en&num"],
+        },
+        {
+            "name": "Multiple accents: élève (student)",
+            "text": "élève",
+            "expected_words": [["é", "l", "è", "v", "e"]],
+            "expected_states": ["en&num"],
+        },
+        {
+            "name": "Word with à: à demain (see you tomorrow)",
+            "text": "à demain",
+            "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
+            "expected_states": ["en&num", "en&num"],
+        },
+        {
+            "name": "Complex: C'était très français (It was very French)",
+            "text": "C'était très français",
+            "expected_words": [
+                ["C", "'", "é", "t", "a", "i", "t"],
+                ["t", "r", "è", "s"],
+                ["f", "r", "a", "n", "ç", "a", "i", "s"],
+            ],
+            "expected_states": ["en&num", "en&num", "en&num"],
+        },
+    ]
+
+    banner = "=" * 70
+    print(banner)
+    print("Testing French Accented Character Word Grouping")
+    print(banner)
+
+    all_passed = True
+    for case in test_cases:
+        label = case["name"]
+        sample = case["text"]
+        want_words = case["expected_words"]
+        want_states = case["expected_states"]
+
+        # Mock selection mask: one True entry per character of the sample.
+        mask = np.ones(len(sample), dtype=bool)
+        word_list, _, state_list = decoder.get_word_info(sample, mask)
+
+        # Detail lines for the first mismatch found; None means the case passed.
+        mismatch = None
+        if len(word_list) != len(want_words):
+            mismatch = [f"   Expected {len(want_words)} words, got {len(word_list)}"]
+        elif state_list != want_states:
+            mismatch = [
+                f"   Expected states: {want_states}",
+                f"   Got states: {state_list}",
+            ]
+        else:
+            for i, (expected, actual) in enumerate(zip(want_words, word_list)):
+                if expected != actual:
+                    mismatch = [f"   Word {i}: Expected {expected}, got {actual}"]
+                    break
+
+        if mismatch is None:
+            print(f"\n✅ PASSED: {label}")
+            print(f"   Text: '{sample}'")
+            print(f"   Words: {[''.join(w) for w in word_list]}")
+            print(f"   States: {state_list}")
+            continue
+
+        all_passed = False
+        print(f"\n❌ FAILED: {label}")
+        for line in mismatch:
+            print(line)
+        print(f"   Text: '{sample}'")
+        print(f"   Expected words: {[''.join(w) for w in want_words]}")
+        print(f"   Got words: {[''.join(w) for w in word_list]}")
+
+    print("\n" + banner)
+    if all_passed:
+        print("✅ All tests PASSED! French accented words are properly grouped.")
+    else:
+        print("❌ Some tests FAILED. Please review the output above.")
+    print(banner)
+
+    return all_passed
+
+
+if __name__ == "__main__":
+    # Exit status 0 when every case passed, 1 otherwise.
+    sys.exit(0 if test_french_word_grouping() else 1)