Skip to content

Commit a1089b4

Browse files
committed
Bug: 'utf8.codes' accepts spurious continuation bytes
1 parent f8c4c4f commit a1089b4

File tree

2 files changed

+27
-12
lines changed

2 files changed

+27
-12
lines changed

lutf8lib.c

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525

2626
#define MAXUTF 0x7FFFFFFFu
2727

28+
29+
#define MSGInvalid "invalid UTF-8 code"
30+
2831
/*
2932
** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
3033
*/
@@ -35,7 +38,8 @@ typedef unsigned long utfint;
3538
#endif
3639

3740

38-
#define iscont(p) ((*(p) & 0xC0) == 0x80)
41+
#define iscont(c) (((c) & 0xC0) == 0x80)
42+
#define iscontp(p) iscont(*(p))
3943

4044

4145
/* from strlib */
@@ -65,7 +69,7 @@ static const char *utf8_decode (const char *s, utfint *val, int strict) {
6569
int count = 0; /* to count number of continuation bytes */
6670
for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
6771
unsigned int cc = (unsigned char)s[++count]; /* read next byte */
68-
if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
72+
if (!iscont(cc)) /* not a continuation byte? */
6973
return NULL; /* invalid byte sequence */
7074
res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
7175
}
@@ -140,7 +144,7 @@ static int codepoint (lua_State *L) {
140144
utfint code;
141145
s = utf8_decode(s, &code, !lax);
142146
if (s == NULL)
143-
return luaL_error(L, "invalid UTF-8 code");
147+
return luaL_error(L, MSGInvalid);
144148
lua_pushinteger(L, code);
145149
n++;
146150
}
@@ -190,16 +194,16 @@ static int byteoffset (lua_State *L) {
190194
"position out of bounds");
191195
if (n == 0) {
192196
/* find beginning of current byte sequence */
193-
while (posi > 0 && iscont(s + posi)) posi--;
197+
while (posi > 0 && iscontp(s + posi)) posi--;
194198
}
195199
else {
196-
if (iscont(s + posi))
200+
if (iscontp(s + posi))
197201
return luaL_error(L, "initial position is a continuation byte");
198202
if (n < 0) {
199203
while (n < 0 && posi > 0) { /* move back */
200204
do { /* find beginning of previous character */
201205
posi--;
202-
} while (posi > 0 && iscont(s + posi));
206+
} while (posi > 0 && iscontp(s + posi));
203207
n++;
204208
}
205209
}
@@ -208,7 +212,7 @@ static int byteoffset (lua_State *L) {
208212
while (n > 0 && posi < (lua_Integer)len) {
209213
do { /* find beginning of next character */
210214
posi++;
211-
} while (iscont(s + posi)); /* (cannot pass final '\0') */
215+
} while (iscontp(s + posi)); /* (cannot pass final '\0') */
212216
n--;
213217
}
214218
}
@@ -226,15 +230,15 @@ static int iter_aux (lua_State *L, int strict) {
226230
const char *s = luaL_checklstring(L, 1, &len);
227231
lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
228232
if (n < len) {
229-
while (iscont(s + n)) n++; /* skip continuation bytes */
233+
while (iscontp(s + n)) n++; /* go to next character */
230234
}
231235
if (n >= len) /* (also handles original 'n' being negative) */
232236
return 0; /* no more codepoints */
233237
else {
234238
utfint code;
235239
const char *next = utf8_decode(s + n, &code, strict);
236-
if (next == NULL)
237-
return luaL_error(L, "invalid UTF-8 code");
240+
if (next == NULL || iscontp(next))
241+
return luaL_error(L, MSGInvalid);
238242
lua_pushinteger(L, n + 1);
239243
lua_pushinteger(L, code);
240244
return 2;
@@ -253,7 +257,8 @@ static int iter_auxlax (lua_State *L) {
253257

254258
static int iter_codes (lua_State *L) {
255259
int lax = lua_toboolean(L, 2);
256-
luaL_checkstring(L, 1);
260+
const char *s = luaL_checkstring(L, 1);
261+
luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
257262
lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
258263
lua_pushvalue(L, 1);
259264
lua_pushinteger(L, 0);

testes/utf8.lua

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,15 @@ do -- error indication in utf8.len
9797
assert(not a and b == p)
9898
end
9999
check("abc\xE3def", 4)
100-
check("汉字\x80", #("汉字") + 1)
101100
check("\xF4\x9F\xBF", 1)
102101
check("\xF4\x9F\xBF\xBF", 1)
102+
-- spurious continuation bytes
103+
check("汉字\x80", #("汉字") + 1)
104+
check("\x80hello", 1)
105+
check("hel\x80lo", 4)
106+
check("汉字\xBF", #("汉字") + 1)
107+
check("\xBFhello", 1)
108+
check("hel\xBFlo", 4)
103109
end
104110

105111
-- errors in utf8.codes
@@ -112,12 +118,16 @@ do
112118
end
113119
errorcodes("ab\xff")
114120
errorcodes("\u{110000}")
121+
errorcodes("in\x80valid")
122+
errorcodes("\xbfinvalid")
123+
errorcodes("αλφ\xBFα")
115124

116125
-- calling iteration function with invalid arguments
117126
local f = utf8.codes("")
118127
assert(f("", 2) == nil)
119128
assert(f("", -1) == nil)
120129
assert(f("", math.mininteger) == nil)
130+
121131
end
122132

123133
-- error in initial position for offset

0 commit comments

Comments
 (0)