Skip to content

Commit

Permalink
Fixed: Start bytes 0xF5 and above are invalid for UTF-8
Browse files Browse the repository at this point in the history
Signed-off-by: Zafer Balkan <[email protected]>
  • Loading branch information
zbalkan committed Dec 5, 2024
1 parent c3fbb30 commit 1d50e3e
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions src/common/utf8_op/src/utf8_op.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,29 @@

/* Two bytes: 110xxxxx 10xxxxxx */
/* Starting bytes 0xC0 and 0xC1 are forbidden (overlong) */
/*
 * valid_2(x): non-zero when x points at a well-formed two-byte sequence,
 * i.e. a lead byte in 0xC2..0xDF followed by one continuation byte.
 *
 * Every byte is converted to unsigned char before it is masked or compared,
 * so the result does not depend on the signedness of plain char nor on
 * whether x points to char or unsigned char data.
 *
 * x[1] is only read after x[0] has matched a (non-zero) lead byte, so a
 * NUL-terminated string is never read past its terminator.
 * x is expanded several times: pass an expression without side effects.
 */
#define valid_2(x) (((unsigned char)(x)[0] & 0xE0) == 0xC0 && \
                    (unsigned char)(x)[0] >= 0xC2 && \
                    ((unsigned char)(x)[1] & 0xC0) == 0x80)

/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
/* 0xE0 could start overlong encodings */
/* 0xED (range U+D800–U+DFFF) is reserved for UTF-16 surrogate halves */
/*
 * valid_3(x): non-zero when x points at a well-formed three-byte sequence.
 *
 *   - lead byte 1110xxxx followed by two continuation bytes;
 *   - lead 0xE0 requires a second byte >= 0xA0 (rejects overlong forms);
 *   - lead 0xED requires a second byte <  0xA0 (rejects U+D800..U+DFFF).
 *
 * Every byte is converted to unsigned char before it is masked or compared,
 * so the result does not depend on the signedness of plain char nor on
 * whether x points to char or unsigned char data.
 *
 * Bytes are tested left to right and each one must be non-zero to pass, so a
 * NUL-terminated string is never read past its terminator.
 * x is expanded several times: pass an expression without side effects.
 */
#define valid_3(x) (((unsigned char)(x)[0] & 0xF0) == 0xE0 && \
                    ((unsigned char)(x)[1] & 0xC0) == 0x80 && \
                    ((unsigned char)(x)[2] & 0xC0) == 0x80 && \
                    ((unsigned char)(x)[0] != 0xE0 || (unsigned char)(x)[1] >= 0xA0) && \
                    ((unsigned char)(x)[0] != 0xED || (unsigned char)(x)[1] < 0xA0))

/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
/* 0xF0 could start overlong encodings */
/* Starting byte 0xF4 only allows second bytes 0x80-0x8F (Unicode limit U+10FFFF) */
/* Start bytes 0xF5 and above are invalid for UTF-8 */
/*
 * valid_4(x): non-zero when x points at a well-formed four-byte sequence.
 *
 *   - lead byte 11110xxx, restricted to 0xF0..0xF4;
 *   - three continuation bytes;
 *   - lead 0xF0 requires a second byte >= 0x90 (rejects overlong forms);
 *   - lead 0xF4 requires a second byte <= 0x8F (rejects values > U+10FFFF).
 *
 * Every byte is converted to unsigned char before it is masked or compared,
 * so the result does not depend on the signedness of plain char nor on
 * whether x points to char or unsigned char data.
 *
 * Bytes are tested left to right and each one must be non-zero to pass, so a
 * NUL-terminated string is never read past its terminator.
 * x is expanded several times: pass an expression without side effects.
 */
#define valid_4(x) (((unsigned char)(x)[0] & 0xF8) == 0xF0 && \
                    (unsigned char)(x)[0] <= 0xF4 && \
                    ((unsigned char)(x)[1] & 0xC0) == 0x80 && \
                    ((unsigned char)(x)[2] & 0xC0) == 0x80 && \
                    ((unsigned char)(x)[3] & 0xC0) == 0x80 && \
                    ((unsigned char)(x)[0] != 0xF0 || (unsigned char)(x)[1] >= 0x90) && \
                    ((unsigned char)(x)[0] != 0xF4 || (unsigned char)(x)[1] <= 0x8F))

/* Return whether a string is UTF-8 */
bool w_utf8_valid(const char * string) {
Expand Down

0 comments on commit 1d50e3e

Please sign in to comment.