Skip to content

Commit abc4a8d

Browse files
committed
Add NormalizerMySQLUnicode520CI
It's a utf8mb4_unicode_520_ci-compatible normalizer.
1 parent f5934cd commit abc4a8d

31 files changed

+5290
-34
lines changed

normalizers/Makefile.am

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,11 @@ ensure-mysql-source-dir:
3939
exit 1; \
4040
fi
4141

42-
UPDATE_TABLES_TARGETS = \
43-
update-general-ci-table \
44-
update-unicode-ci-table \
45-
update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table
42+
UPDATE_TABLES_TARGETS = \
43+
update-general-ci-table \
44+
update-unicode-ci-table \
45+
update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table \
46+
update-unicode-520-ci-table
4647

4748
update-tables: $(UPDATE_TABLES_TARGETS)
4849

@@ -67,3 +68,10 @@ update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table: ensure-mysql
6768
--suffix _except_kana_ci_kana_with_voiced_sound_mark \
6869
$(MYSQL_SOURCE_DIR)/strings/ctype-uca.c > \
6970
$(srcdir)/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h
71+
72+
update-unicode-520-ci-table: ensure-mysql-source-dir
73+
$(RUBY) \
74+
$(top_srcdir)/tool/generate_uca_table.rb \
75+
--version=520 \
76+
$(MYSQL_SOURCE_DIR)/strings/ctype-uca.c > \
77+
$(srcdir)/mysql_unicode_520_ci_table.h

normalizers/mysql.c

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
#include "mysql_general_ci_table.h"
3636
#include "mysql_unicode_ci_table.h"
37+
#include "mysql_unicode_520_ci_table.h"
3738
#include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
3839

3940
#ifdef __GNUC__
@@ -155,7 +156,7 @@ utf8_to_unichar(const char *utf8, int byte_size)
155156

156157
static inline void
157158
decompose_character(const char *rest, int character_length,
158-
int *page, uint32_t *low_code)
159+
size_t *page, uint32_t *low_code)
159160
{
160161
switch (character_length) {
161162
case 1 :
@@ -175,10 +176,27 @@ decompose_character(const char *rest, int character_length,
175176
((rest[0] & 0x07) << 10) +
176177
((rest[1] & 0x3f) << 4) +
177178
((rest[2] & 0x3c) >> 2);
178-
*low_code = ((rest[1] & 0x03) << 6) + (rest[2] & 0x3f);
179+
*low_code = ((rest[2] & 0x03) << 6) + (rest[3] & 0x3f);
180+
break;
181+
case 5 :
182+
*page =
183+
((rest[0] & 0x03) << 16) +
184+
((rest[1] & 0x3f) << 10) +
185+
((rest[2] & 0x3f) << 4) +
186+
((rest[3] & 0x3c) >> 2);
187+
*low_code = ((rest[3] & 0x03) << 6) + (rest[4] & 0x3f);
188+
break;
189+
case 6 :
190+
*page =
191+
((rest[0] & 0x01) << 22) +
192+
((rest[1] & 0x3f) << 16) +
193+
((rest[2] & 0x3f) << 10) +
194+
((rest[3] & 0x3f) << 4) +
195+
((rest[4] & 0x3c) >> 2);
196+
*low_code = ((rest[4] & 0x03) << 6) + (rest[5] & 0x3f);
179197
break;
180198
default :
181-
*page = -1;
199+
*page = (size_t)-1;
182200
*low_code = 0x00;
183201
break;
184202
}
@@ -187,15 +205,16 @@ decompose_character(const char *rest, int character_length,
187205
static inline void
188206
normalize_character(const char *utf8, int character_length,
189207
uint32_t **normalize_table,
208+
size_t normalize_table_size,
190209
char *normalized,
191210
unsigned int *normalized_character_length,
192211
unsigned int *normalized_length_in_bytes,
193212
unsigned int *normalized_n_characters)
194213
{
195-
int page;
214+
size_t page;
196215
uint32_t low_code;
197216
decompose_character(utf8, character_length, &page, &low_code);
198-
if ((0x00 <= page && page <= 0xff) && normalize_table[page]) {
217+
if (page < normalize_table_size && normalize_table[page]) {
199218
uint32_t normalized_code;
200219
unsigned int n_bytes;
201220
normalized_code = normalize_table[page][low_code];
@@ -304,6 +323,7 @@ static void
304323
normalize(grn_ctx *ctx, grn_obj *string,
305324
const char *normalizer_type_label,
306325
uint32_t **normalize_table,
326+
size_t normalize_table_size,
307327
normalizer_func custom_normalizer)
308328
{
309329
const char *original, *rest;
@@ -371,7 +391,8 @@ normalize(grn_ctx *ctx, grn_obj *string,
371391
&normalized_n_characters);
372392
}
373393
if (!custom_normalized) {
374-
normalize_character(rest, character_length, normalize_table,
394+
normalize_character(rest, character_length,
395+
normalize_table, normalize_table_size,
375396
normalized,
376397
&normalized_character_length,
377398
&normalized_length_in_bytes,
@@ -448,7 +469,9 @@ mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx,
448469
grn_encoding_to_string(encoding));
449470
return NULL;
450471
}
451-
normalize(ctx, string, normalizer_type_label, general_ci_table, NULL);
472+
normalize(ctx, string, normalizer_type_label,
473+
general_ci_table, sizeof(general_ci_table) / sizeof(uint32_t *),
474+
NULL);
452475
return NULL;
453476
}
454477

@@ -472,7 +495,9 @@ mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx,
472495
grn_encoding_to_string(encoding));
473496
return NULL;
474497
}
475-
normalize(ctx, string, normalizer_type_label, unicode_ci_table, NULL);
498+
normalize(ctx, string, normalizer_type_label,
499+
unicode_ci_table, sizeof(unicode_ci_table) / sizeof(uint32_t *),
500+
NULL);
476501
return NULL;
477502
}
478503

@@ -638,10 +663,38 @@ mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
638663
normalize(ctx, string,
639664
normalizer_type_label,
640665
unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
666+
sizeof(unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
641667
normalize_halfwidth_katakana_with_voiced_sound_mark);
642668
return NULL;
643669
}
644670

671+
static grn_obj *
672+
mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx *ctx,
673+
GNUC_UNUSED int nargs,
674+
grn_obj **args,
675+
GNUC_UNUSED grn_user_data *user_data)
676+
{
677+
grn_obj *string = args[0];
678+
grn_encoding encoding;
679+
const char *normalizer_type_label = "mysql-unicode-520-ci";
680+
681+
encoding = grn_string_get_encoding(ctx, string);
682+
if (encoding != GRN_ENC_UTF8) {
683+
GRN_PLUGIN_ERROR(ctx,
684+
GRN_FUNCTION_NOT_IMPLEMENTED,
685+
"[normalizer][%s] "
686+
"UTF-8 encoding is only supported: %s",
687+
normalizer_type_label,
688+
grn_encoding_to_string(encoding));
689+
return NULL;
690+
}
691+
normalize(ctx, string, normalizer_type_label,
692+
unicode_520_ci_table,
693+
sizeof(unicode_520_ci_table) / sizeof(uint32_t *),
694+
NULL);
695+
return NULL;
696+
}
697+
645698
grn_rc
646699
GRN_PLUGIN_INIT(grn_ctx *ctx)
647700
{
@@ -664,6 +717,8 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
664717
NULL,
665718
mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
666719
NULL);
720+
grn_normalizer_register(ctx, "NormalizerMySQLUnicode520CI", -1,
721+
NULL, mysql_unicode_520_ci_next, NULL);
667722
return GRN_SUCCESS;
668723
}
669724

normalizers/mysql_general_ci_table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
MA 02110-1301, USA
1818
1919
This file uses normalization table defined in
20-
mysql-5.5.29/strings/ctype-utf8.c.
20+
mysql-5.6.23/strings/ctype-utf8.c.
2121
The following is the header of the file:
2222
2323
Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.

0 commit comments

Comments
 (0)