Skip to content

Commit c37b052

Browse files
committed
Fix: Prevent auto-splitting of French accented words in text recognition
Added support for Latin characters with diacritics (é, è, à, ç, etc.) and French contractions (n'êtes) in word grouping logic of BaseRecLabelDecode.get_word_info(). This fix ensures that French words are no longer split at accented characters during OCR text recognition.
1 parent 0f82618 commit c37b052

File tree

2 files changed

+163
-1
lines changed

2 files changed

+163
-1
lines changed

ppocr/postprocess/rec_postprocess.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,26 @@
1818
from paddle.nn import functional as F
1919
import re
2020
import json
21+
import unicodedata
22+
23+
24+
def is_latin_char(char):
    """
    Return True if ``char`` is a single Latin-script letter.

    Accented letters such as é, è, à and ç are accepted because the check
    is based on Unicode metadata rather than on the ASCII range.

    Args:
        char: The value to classify. Expected to be a one-character string.

    Returns:
        bool: True when ``char`` is exactly one character, its Unicode
        general category is a letter category (Lu, Ll, Lt, Lm or Lo) and
        its Unicode name starts with "LATIN" or "FRENCH"; False otherwise,
        including for empty strings, longer strings and non-string values.
    """
    # unicodedata.category()/name() raise TypeError for anything that is not
    # a one-character string, so reject such input up front instead of
    # letting the exception reach the caller.
    if not isinstance(char, str) or len(char) != 1:
        return False

    # Every letter category starts with "L"; digits, punctuation and
    # symbols are filtered out here.
    if not unicodedata.category(char).startswith("L"):
        return False

    # Supplying a default name avoids the ValueError that unicodedata.name()
    # raises for code points that have no name.
    return unicodedata.name(char, "").startswith(("LATIN", "FRENCH"))
2141

2242

2343
class BaseRecLabelDecode(object):
@@ -95,11 +115,16 @@ def get_word_info(self, text, selection):
95115
for c_i, char in enumerate(text):
96116
if "\u4e00" <= char <= "\u9fff":
97117
c_state = "cn"
98-
elif bool(re.search("[a-zA-Z0-9]", char)):
118+
# Modified condition to include accented characters used in French and other Latin-based languages
119+
elif bool(re.search("[a-zA-Z0-9]", char)) or is_latin_char(char):
99120
c_state = "en&num"
100121
else:
101122
c_state = "splitter"
102123

124+
# Handle apostrophes in French words like "n'êtes"
125+
if char == "'" and state == "en&num":
126+
c_state = "en&num"
127+
103128
if (
104129
char == "."
105130
and state == "en&num"

test_french_accents.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Test script to verify French accented character handling in OCR text recognition.
5+
6+
This script tests that French words with accented characters (é, è, à, ç, etc.)
7+
and contractions (n'êtes, l'été) are properly grouped as single words and not
8+
split at each accented character.
9+
"""
10+
11+
import sys
12+
import os
13+
import numpy as np
14+
15+
# Add the project root to the path
16+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
17+
18+
from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode
19+
20+
21+
def test_french_word_grouping():
    """
    Check that get_word_info keeps accented French words in one piece.

    Each sample text is passed to BaseRecLabelDecode.get_word_info together
    with an all-True selection mask, and the returned word and state lists
    are compared with the expected grouping. A report is printed for every
    sample.

    Returns:
        bool: True if every sample matched its expectation, False otherwise.
    """
    decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

    # (label, input text, expected words as character lists, expected states)
    cases = (
        (
            "Simple accented word: été (summer)",
            "été",
            [["é", "t", "é"]],
            ["en&num"],
        ),
        (
            "Word with ç: français (French)",
            "français",
            [["f", "r", "a", "n", "ç", "a", "i", "s"]],
            ["en&num"],
        ),
        (
            "Contraction: n'êtes (you are)",
            "n'êtes",
            [["n", "'", "ê", "t", "e", "s"]],
            ["en&num"],
        ),
        (
            "Multiple accents: élève (student)",
            "élève",
            [["é", "l", "è", "v", "e"]],
            ["en&num"],
        ),
        (
            "Word with à: à demain (see you tomorrow)",
            "à demain",
            [["à"], ["d", "e", "m", "a", "i", "n"]],
            ["en&num", "en&num"],
        ),
        (
            "Complex: C'était très français (It was very French)",
            "C'était très français",
            [
                ["C", "'", "é", "t", "a", "i", "t"],
                ["t", "r", "è", "s"],
                ["f", "r", "a", "n", "ç", "a", "i", "s"],
            ],
            ["en&num", "en&num", "en&num"],
        ),
    )

    banner = "=" * 70
    print(banner)
    print("Testing French Accented Character Word Grouping")
    print(banner)

    failure_count = 0

    for label, sample, want_words, want_states in cases:
        # Mock selection mask: every character position is marked valid.
        mask = np.ones(len(sample), dtype=bool)
        got_words, _columns, got_states = decoder.get_word_info(sample, mask)

        # Work out the detail lines for the first check that fails, if any.
        if len(got_words) != len(want_words):
            details = [f" Expected {len(want_words)} words, got {len(got_words)}"]
        elif got_states != want_states:
            details = [
                f" Expected states: {want_states}",
                f" Got states: {got_states}",
            ]
        else:
            # Only the first mismatching word is reported.
            details = next(
                (
                    [f" Word {idx}: Expected {want}, got {got}"]
                    for idx, (want, got) in enumerate(zip(want_words, got_words))
                    if want != got
                ),
                None,
            )

        if details is None:
            print(f"\n✅ PASSED: {label}")
            print(f" Text: '{sample}'")
            print(f" Words: {[''.join(w) for w in got_words]}")
            print(f" States: {got_states}")
            continue

        failure_count += 1
        print(f"\n❌ FAILED: {label}")
        for line in details:
            print(line)
        print(f" Text: '{sample}'")
        print(f" Expected words: {[''.join(w) for w in want_words]}")
        print(f" Got words: {[''.join(w) for w in got_words]}")

    all_passed = failure_count == 0

    print("\n" + "=" * 70)
    if all_passed:
        print("✅ All tests PASSED! French accented words are properly grouped.")
    else:
        print("❌ Some tests FAILED. Please review the output above.")
    print(banner)

    return all_passed
133+
134+
135+
if __name__ == "__main__":
    # Turn the boolean test result into a process exit status:
    # 0 when every case passed, 1 otherwise.
    exit_status = 0 if test_french_word_grouping() else 1
    sys.exit(exit_status)

0 commit comments

Comments
 (0)