HashidaTKS
diff --git a/normalizers/Makefile.am b/normalizers/Makefile.am
Lines changed: 12 additions & 4 deletions
diff --git a/normalizers/mysql.c b/normalizers/mysql.c
Lines changed: 63 additions & 8 deletions
diff --git a/normalizers/mysql_general_ci_table.h b/normalizers/mysql_general_ci_table.h
Lines changed: 1 addition & 1 deletion
@@ -39,10 +39,11 @@ ensure-mysql-source-dir:
 	  exit 1;					\
 	fi
 
-UPDATE_TABLES_TARGETS =			\
-	update-general-ci-table		\
-	update-unicode-ci-table		\
-	update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table
+UPDATE_TABLES_TARGETS =								\
+	update-general-ci-table							\
+	update-unicode-ci-table							\
+	update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table	\
+	update-unicode-520-ci-table
 
 update-tables: $(UPDATE_TABLES_TARGETS)
 
@@ -67,3 +68,10 @@ update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table: ensure-mysql
 	  --suffix _except_kana_ci_kana_with_voiced_sound_mark	\
 	  $(MYSQL_SOURCE_DIR)/strings/ctype-uca.c >		\
 	  $(srcdir)/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h
+
+update-unicode-520-ci-table: ensure-mysql-source-dir
+	$(RUBY)						\
+	  $(top_srcdir)/tool/generate_uca_table.rb	\
+	  --version=520					\
+	  $(MYSQL_SOURCE_DIR)/strings/ctype-uca.c >	\
+	  $(srcdir)/mysql_unicode_520_ci_table.h
@@ -34,6 +34,7 @@
 
 #include "mysql_general_ci_table.h"
 #include "mysql_unicode_ci_table.h"
+#include "mysql_unicode_520_ci_table.h"
 #include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
 
 #ifdef __GNUC__
@@ -155,7 +156,7 @@ utf8_to_unichar(const char *utf8, int byte_size)
 
 static inline void
 decompose_character(const char *rest, int character_length,
-                    int *page, uint32_t *low_code)
+                    size_t *page, uint32_t *low_code)
 {
   switch (character_length) {
   case 1 :
@@ -175,10 +176,27 @@ decompose_character(const char *rest, int character_length,
       ((rest[0] & 0x07) << 10) +
       ((rest[1] & 0x3f) << 4) +
       ((rest[2] & 0x3c) >> 2);
-    *low_code = ((rest[1] & 0x03) << 6) + (rest[2] & 0x3f);
+    *low_code = ((rest[2] & 0x03) << 6) + (rest[3] & 0x3f);
+    break;
+  case 5 :
+    *page =
+      ((rest[0] & 0x03) << 16) +
+      ((rest[1] & 0x3f) << 10) +
+      ((rest[2] & 0x3f) << 4) +
+      ((rest[3] & 0x3c) >> 2);
+    *low_code = ((rest[3] & 0x03) << 6) + (rest[4] & 0x3f);
+    break;
+  case 6 :
+    *page =
+      ((rest[0] & 0x01) << 22) +
+      ((rest[1] & 0x3f) << 16) +
+      ((rest[2] & 0x3f) << 10) +
+      ((rest[3] & 0x3f) << 4) +
+      ((rest[4] & 0x3c) >> 2);
+    *low_code = ((rest[4] & 0x03) << 6) + (rest[5] & 0x3f);
     break;
   default :
-    *page = -1;
+    *page = (size_t)-1;
     *low_code = 0x00;
     break;
   }
@@ -187,15 +205,16 @@ decompose_character(const char *rest, int character_length,
 static inline void
 normalize_character(const char *utf8, int character_length,
                     uint32_t **normalize_table,
+                    size_t normalize_table_size,
                     char *normalized,
                     unsigned int *normalized_character_length,
                     unsigned int *normalized_length_in_bytes,
                     unsigned int *normalized_n_characters)
 {
-  int page;
+  size_t page;
   uint32_t low_code;
   decompose_character(utf8, character_length, &page, &low_code);
-  if ((0x00 <= page && page <= 0xff) && normalize_table[page]) {
+  if (page < normalize_table_size && normalize_table[page]) {
     uint32_t normalized_code;
     unsigned int n_bytes;
     normalized_code = normalize_table[page][low_code];
@@ -304,6 +323,7 @@ static void
 normalize(grn_ctx *ctx, grn_obj *string,
           const char *normalizer_type_label,
           uint32_t **normalize_table,
+          size_t normalize_table_size,
           normalizer_func custom_normalizer)
 {
   const char *original, *rest;
@@ -371,7 +391,8 @@ normalize(grn_ctx *ctx, grn_obj *string,
                                               &normalized_n_characters);
       }
       if (!custom_normalized) {
-        normalize_character(rest, character_length, normalize_table,
+        normalize_character(rest, character_length,
+                            normalize_table, normalize_table_size,
                             normalized,
                             &normalized_character_length,
                             &normalized_length_in_bytes,
@@ -448,7 +469,9 @@ mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx,
                      grn_encoding_to_string(encoding));
     return NULL;
   }
-  normalize(ctx, string, normalizer_type_label, general_ci_table, NULL);
+  normalize(ctx, string, normalizer_type_label,
+            general_ci_table, sizeof(general_ci_table) / sizeof(uint32_t *),
+            NULL);
   return NULL;
 }
 
@@ -472,7 +495,9 @@ mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx,
                      grn_encoding_to_string(encoding));
     return NULL;
   }
-  normalize(ctx, string, normalizer_type_label, unicode_ci_table, NULL);
+  normalize(ctx, string, normalizer_type_label,
+            unicode_ci_table, sizeof(unicode_ci_table) / sizeof(uint32_t *),
+            NULL);
   return NULL;
 }
 
@@ -638,10 +663,38 @@ mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
   normalize(ctx, string,
             normalizer_type_label,
             unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
+            sizeof(unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
             normalize_halfwidth_katakana_with_voiced_sound_mark);
   return NULL;
 }
 
+static grn_obj *
+mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx *ctx,
+                          GNUC_UNUSED int nargs,
+                          grn_obj **args,
+                          GNUC_UNUSED grn_user_data *user_data)
+{
+  grn_obj *string = args[0];
+  grn_encoding encoding;
+  const char *normalizer_type_label = "mysql-unicode-520-ci";
+
+  encoding = grn_string_get_encoding(ctx, string);
+  if (encoding != GRN_ENC_UTF8) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_FUNCTION_NOT_IMPLEMENTED,
+                     "[normalizer][%s] "
+                     "UTF-8 encoding is only supported: %s",
+                     normalizer_type_label,
+                     grn_encoding_to_string(encoding));
+    return NULL;
+  }
+  normalize(ctx, string, normalizer_type_label,
+            unicode_520_ci_table,
+            sizeof(unicode_520_ci_table) / sizeof(uint32_t *),
+            NULL);
+  return NULL;
+}
+
 grn_rc
 GRN_PLUGIN_INIT(grn_ctx *ctx)
 {
@@ -664,6 +717,8 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
                           NULL,
                           mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
                           NULL);
+  grn_normalizer_register(ctx, "NormalizerMySQLUnicode520CI", -1,
+                          NULL, mysql_unicode_520_ci_next, NULL);
   return GRN_SUCCESS;
 }
 
 
@@ -17,7 +17,7 @@
   MA 02110-1301, USA
 
   This file uses normalization table defined in
-  mysql-5.5.29/strings/ctype-utf8.c.
+  mysql-5.6.23/strings/ctype-utf8.c.
   The following is the header of the file:
 
     Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.