|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +Test script to verify French accented character handling in OCR text recognition. |
| 5 | +
|
| 6 | +This script tests that French words with accented characters (é, è, à, ç, etc.) |
| 7 | +and contractions (n'êtes, l'été) are properly grouped as single words and not |
| 8 | +split at each accented character. |
| 9 | +""" |
| 10 | + |
| 11 | +import sys |
| 12 | +import os |
| 13 | +import numpy as np |
| 14 | + |
| 15 | +# Add this script's own directory (assumed to be the project root) to the import path |
| 16 | +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| 17 | + |
| 18 | +from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode |
| 19 | + |
| 20 | + |
def _first_failure(expected_words, expected_states, word_list, state_list):
    """Return the detail lines for the first check that fails, or [] if none do.

    Checks run in a fixed order and stop at the first failure:
    1. the number of words,
    2. the list of states,
    3. each word, position by position.
    """
    if len(word_list) != len(expected_words):
        return [f" Expected {len(expected_words)} words, got {len(word_list)}"]

    if state_list != expected_states:
        return [
            f" Expected states: {expected_states}",
            f" Got states: {state_list}",
        ]

    for idx, (want, got) in enumerate(zip(expected_words, word_list)):
        if want != got:
            # Only the first differing word is reported.
            return [f" Word {idx}: Expected {want}, got {got}"]

    return []


def test_french_word_grouping():
    """Run the French word-grouping cases and report the results on stdout.

    Each sample text is passed to ``BaseRecLabelDecode.get_word_info`` together
    with an all-True boolean mask of the same length. The returned word list
    and state list are compared with the expectations of the case.

    Returns:
        True when every case matched its expectations, False otherwise.
    """
    decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

    # Each case: a label, the input text, and the expected grouping / states.
    test_cases = [
        {
            "name": "Simple accented word: été (summer)",
            "text": "été",
            "expected_words": [["é", "t", "é"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with ç: français (French)",
            "text": "français",
            "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Contraction: n'êtes (you are)",
            "text": "n'êtes",
            "expected_words": [["n", "'", "ê", "t", "e", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Multiple accents: élève (student)",
            "text": "élève",
            "expected_words": [["é", "l", "è", "v", "e"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with à: à demain (see you tomorrow)",
            "text": "à demain",
            "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
            "expected_states": ["en&num", "en&num"],
        },
        {
            "name": "Complex: C'était très français (It was very French)",
            "text": "C'était très français",
            "expected_words": [
                ["C", "'", "é", "t", "a", "i", "t"],
                ["t", "r", "è", "s"],
                ["f", "r", "a", "n", "ç", "a", "i", "s"],
            ],
            "expected_states": ["en&num", "en&num", "en&num"],
        },
    ]

    banner = "=" * 70
    print(banner)
    print("Testing French Accented Character Word Grouping")
    print(banner)

    failed_cases = 0

    for case in test_cases:
        label = case["name"]
        sample = case["text"]

        # Mock selection mask: one True entry per character of the sample.
        mask = np.ones(len(sample), dtype=bool)

        # The middle return value is not used by any check below.
        word_list, _word_cols, state_list = decoder.get_word_info(sample, mask)

        problems = _first_failure(
            case["expected_words"], case["expected_states"], word_list, state_list
        )

        if not problems:
            print(f"\n✅ PASSED: {label}")
            print(f" Text: '{sample}'")
            print(f" Words: {[''.join(chars) for chars in word_list]}")
            print(f" States: {state_list}")
            continue

        failed_cases += 1
        print(f"\n❌ FAILED: {label}")
        for detail in problems:
            print(detail)
        print(f" Text: '{sample}'")
        print(f" Expected words: {[''.join(chars) for chars in case['expected_words']]}")
        print(f" Got words: {[''.join(chars) for chars in word_list]}")

    print("\n" + banner)
    if failed_cases == 0:
        print("✅ All tests PASSED! French accented words are properly grouped.")
    else:
        print("❌ Some tests FAILED. Please review the output above.")
    print(banner)

    return failed_cases == 0
| 133 | + |
| 134 | + |
if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 if every case passed, else 1.
    success = test_french_word_grouping()
    sys.exit(int(not success))
0 commit comments