34
34
35
35
#include "mysql_general_ci_table.h"
36
36
#include "mysql_unicode_ci_table.h"
37
+ #include "mysql_unicode_520_ci_table.h"
37
38
#include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
38
39
39
40
#ifdef __GNUC__
@@ -155,7 +156,7 @@ utf8_to_unichar(const char *utf8, int byte_size)
155
156
156
157
static inline void
157
158
decompose_character (const char * rest , int character_length ,
158
- int * page , uint32_t * low_code )
159
+ size_t * page , uint32_t * low_code )
159
160
{
160
161
switch (character_length ) {
161
162
case 1 :
@@ -175,10 +176,27 @@ decompose_character(const char *rest, int character_length,
175
176
((rest [0 ] & 0x07 ) << 10 ) +
176
177
((rest [1 ] & 0x3f ) << 4 ) +
177
178
((rest [2 ] & 0x3c ) >> 2 );
178
- * low_code = ((rest [1 ] & 0x03 ) << 6 ) + (rest [2 ] & 0x3f );
179
+ * low_code = ((rest [2 ] & 0x03 ) << 6 ) + (rest [3 ] & 0x3f );
180
+ break ;
181
+ case 5 :
182
+ * page =
183
+ ((rest [0 ] & 0x03 ) << 16 ) +
184
+ ((rest [1 ] & 0x3f ) << 10 ) +
185
+ ((rest [2 ] & 0x3f ) << 4 ) +
186
+ ((rest [3 ] & 0x3c ) >> 2 );
187
+ * low_code = ((rest [3 ] & 0x03 ) << 6 ) + (rest [4 ] & 0x3f );
188
+ break ;
189
+ case 6 :
190
+ * page =
191
+ ((rest [0 ] & 0x01 ) << 22 ) +
192
+ ((rest [1 ] & 0x3f ) << 16 ) +
193
+ ((rest [2 ] & 0x3f ) << 10 ) +
194
+ ((rest [3 ] & 0x3f ) << 4 ) +
195
+ ((rest [4 ] & 0x3c ) >> 2 );
196
+ * low_code = ((rest [4 ] & 0x03 ) << 6 ) + (rest [5 ] & 0x3f );
179
197
break ;
180
198
default :
181
- * page = -1 ;
199
+ * page = ( size_t ) -1 ;
182
200
* low_code = 0x00 ;
183
201
break ;
184
202
}
@@ -187,15 +205,16 @@ decompose_character(const char *rest, int character_length,
187
205
static inline void
188
206
normalize_character (const char * utf8 , int character_length ,
189
207
uint32_t * * normalize_table ,
208
+ size_t normalize_table_size ,
190
209
char * normalized ,
191
210
unsigned int * normalized_character_length ,
192
211
unsigned int * normalized_length_in_bytes ,
193
212
unsigned int * normalized_n_characters )
194
213
{
195
- int page ;
214
+ size_t page ;
196
215
uint32_t low_code ;
197
216
decompose_character (utf8 , character_length , & page , & low_code );
198
- if (( 0x00 <= page && page <= 0xff ) && normalize_table [page ]) {
217
+ if (page < normalize_table_size && normalize_table [page ]) {
199
218
uint32_t normalized_code ;
200
219
unsigned int n_bytes ;
201
220
normalized_code = normalize_table [page ][low_code ];
@@ -304,6 +323,7 @@ static void
304
323
normalize (grn_ctx * ctx , grn_obj * string ,
305
324
const char * normalizer_type_label ,
306
325
uint32_t * * normalize_table ,
326
+ size_t normalize_table_size ,
307
327
normalizer_func custom_normalizer )
308
328
{
309
329
const char * original , * rest ;
@@ -371,7 +391,8 @@ normalize(grn_ctx *ctx, grn_obj *string,
371
391
& normalized_n_characters );
372
392
}
373
393
if (!custom_normalized ) {
374
- normalize_character (rest , character_length , normalize_table ,
394
+ normalize_character (rest , character_length ,
395
+ normalize_table , normalize_table_size ,
375
396
normalized ,
376
397
& normalized_character_length ,
377
398
& normalized_length_in_bytes ,
@@ -448,7 +469,9 @@ mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx,
448
469
grn_encoding_to_string (encoding ));
449
470
return NULL ;
450
471
}
451
- normalize (ctx , string , normalizer_type_label , general_ci_table , NULL );
472
+ normalize (ctx , string , normalizer_type_label ,
473
+ general_ci_table , sizeof (general_ci_table ) / sizeof (uint32_t * ),
474
+ NULL );
452
475
return NULL ;
453
476
}
454
477
@@ -472,7 +495,9 @@ mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx,
472
495
grn_encoding_to_string (encoding ));
473
496
return NULL ;
474
497
}
475
- normalize (ctx , string , normalizer_type_label , unicode_ci_table , NULL );
498
+ normalize (ctx , string , normalizer_type_label ,
499
+ unicode_ci_table , sizeof (unicode_ci_table ) / sizeof (uint32_t * ),
500
+ NULL );
476
501
return NULL ;
477
502
}
478
503
@@ -638,10 +663,38 @@ mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
638
663
normalize (ctx , string ,
639
664
normalizer_type_label ,
640
665
unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table ,
666
+ sizeof (unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table ) / sizeof (uint32_t * ),
641
667
normalize_halfwidth_katakana_with_voiced_sound_mark );
642
668
return NULL ;
643
669
}
644
670
671
+ static grn_obj *
672
+ mysql_unicode_520_ci_next (GNUC_UNUSED grn_ctx * ctx ,
673
+ GNUC_UNUSED int nargs ,
674
+ grn_obj * * args ,
675
+ GNUC_UNUSED grn_user_data * user_data )
676
+ {
677
+ grn_obj * string = args [0 ];
678
+ grn_encoding encoding ;
679
+ const char * normalizer_type_label = "mysql-unicode-520-ci" ;
680
+
681
+ encoding = grn_string_get_encoding (ctx , string );
682
+ if (encoding != GRN_ENC_UTF8 ) {
683
+ GRN_PLUGIN_ERROR (ctx ,
684
+ GRN_FUNCTION_NOT_IMPLEMENTED ,
685
+ "[normalizer][%s] "
686
+ "UTF-8 encoding is only supported: %s" ,
687
+ normalizer_type_label ,
688
+ grn_encoding_to_string (encoding ));
689
+ return NULL ;
690
+ }
691
+ normalize (ctx , string , normalizer_type_label ,
692
+ unicode_520_ci_table ,
693
+ sizeof (unicode_520_ci_table ) / sizeof (uint32_t * ),
694
+ NULL );
695
+ return NULL ;
696
+ }
697
+
645
698
grn_rc
646
699
GRN_PLUGIN_INIT (grn_ctx * ctx )
647
700
{
@@ -664,6 +717,8 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
664
717
NULL ,
665
718
mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next ,
666
719
NULL );
720
+ grn_normalizer_register (ctx , "NormalizerMySQLUnicode520CI" , -1 ,
721
+ NULL , mysql_unicode_520_ci_next , NULL );
667
722
return GRN_SUCCESS ;
668
723
}
669
724
0 commit comments