Skip to content

Commit

Permalink
overlong characters are now invalid
Browse files Browse the repository at this point in the history
  • Loading branch information
cs127 committed Jun 9, 2024
1 parent 04ed311 commit 240ced4
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 12 deletions.
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@ unlike the standard functions which are locale-dependent, unicorn does not suppo
* multibyte strings (used in `mbstowcs` and the like) are assumed to be encoded in UTF-8.
* surrogates (`U+D800`-`U+DFFF`) are considered invalid in multibyte strings.
* characters of length 5-8 are considered invalid, and so are 4-byte characters that exceed `U+10FFFF`.

> [!WARNING]
> do not put overlong characters (characters encoded in a larger number of bytes than needed) in your multibyte strings!
> currently, unicorn does not consider them invalid, but **this will change**.
* overlong characters (characters encoded in a larger number of bytes than necessary) are considered invalid.

everything that unicorn implements uses the same name as its counterpart in standard C, except with a `UC_` prefix.
the only exception is the `wchar_t` type: unicorn uses the standard `wchar_t`.
Expand Down
57 changes: 49 additions & 8 deletions src/unicorn.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
/*
* unicorn
* version 0.001
* 2024-06-09
*/

/* I wish I could use BCPL-style comments. damn you C89.. */


Expand Down Expand Up @@ -39,6 +45,11 @@ typedef int bool;

/* nonzero when c passes either surrogate test (UC_IS_HIGHSUR or UC_IS_LOWSUR).
   note: c is expanded more than once, so pass an expression without side effects. */
#define UC_IS_SUR(c) (UC_IS_HIGHSUR(c) || UC_IS_LOWSUR(c))

/*
 * UC_IS_OVERLONG(c, l)
 * nonzero when the character value c, decoded from a sequence of l bytes,
 * does not exceed UC_MAX_nBYTE for some n smaller than l
 * (UC_MAX_nBYTE is presumably the largest value encodable in n bytes;
 * it is defined elsewhere in this file).
 *
 * the whole expansion is parenthesized so that the macro acts as a single
 * operand: without the outer parentheses, !UC_IS_OVERLONG(c, l) would
 * negate only the first clause.
 *
 * note: both c and l are expanded more than once,
 * so pass expressions without side effects.
 */
#define UC_IS_OVERLONG(c, l) \
(((l) > 1 && (c) <= UC_MAX_1BYTE) || \
 ((l) > 2 && (c) <= UC_MAX_2BYTE) || \
 ((l) > 3 && (c) <= UC_MAX_3BYTE))

/* bit patterns with the top n bits of a byte set. */
#define UC_TOP1 0x80 /* 1000 0000 */
#define UC_TOP2 0xC0 /* 1100 0000 */
#define UC_TOP3 0xE0 /* 1110 0000 */
Expand All @@ -50,12 +61,12 @@ typedef int bool;
#define UC_BOTTOM5 0x1F /* 0001 1111: low 5 bits of a byte */
#define UC_BOTTOM6 0x3F /* 0011 1111: low 6 bits of a byte */

#define UC_IS_1BYTE(c) (!((c) & UC_TOP1))
#define UC_IS_2BYTE(c) (((c) & UC_TOP3) == UC_TOP2)
#define UC_IS_3BYTE(c) (((c) & UC_TOP4) == UC_TOP3)
#define UC_IS_4BYTE(c) (((c) & UC_TOP5) == UC_TOP4)
/*
 * lead-byte tests: keep only the high bits of byte b and compare them
 * with the marker pattern that announces a sequence of the given length.
 *   UC_IS_1BYTE: the high bit is clear (0xxxxxxx).
 *   UC_IS_2BYTE: the top three bits are 110.
 *   UC_IS_3BYTE, UC_IS_4BYTE: the same scheme, one bit wider each time,
 *   using UC_TOP4 / UC_TOP5 (defined elsewhere in this file).
 */
#define UC_IS_1BYTE(b) (((b) & UC_TOP1) == 0)
#define UC_IS_2BYTE(b) (UC_TOP2 == ((b) & UC_TOP3))
#define UC_IS_3BYTE(b) (UC_TOP3 == ((b) & UC_TOP4))
#define UC_IS_4BYTE(b) (UC_TOP4 == ((b) & UC_TOP5))

#define UC_IS_CONT(c) (((c) & UC_TOP2) == UC_TOP1)
/* continuation-byte test: the top two bits of byte b are 10 (10xxxxxx). */
#define UC_IS_CONT(b) (UC_TOP1 == ((b) & UC_TOP2))


size_t UC_wcslen(const wchar_t* s)
Expand Down Expand Up @@ -444,7 +455,8 @@ size_t UC_mbstowcs(wchar_t* dest, const char* src, size_t n)
{
size_t isrc,
idest,
trail;
trail,
clen;
bool lowsur;
unsigned long int csrc;

Expand Down Expand Up @@ -501,6 +513,8 @@ size_t UC_mbstowcs(wchar_t* dest, const char* src, size_t n)
return -1;
}

clen = 1 + trail;

/* read the remaining bytes. */

while (trail)
Expand All @@ -516,6 +530,13 @@ size_t UC_mbstowcs(wchar_t* dest, const char* src, size_t n)
(src[isrc++] & UC_BOTTOM6) << (6 * (--trail));
}

if (UC_IS_OVERLONG(csrc, clen))
{
/* overlong character
(encoded in more bytes than necessary). */
return -1;
}

if (UC_IS_SUR(csrc))
{
/* UTF-16 surrogate (invalid in UTF-8). */
Expand Down Expand Up @@ -694,7 +715,8 @@ int UC_wcstomb(char* s, const wchar_t* pc)
int UC_mbtowc(wchar_t* pc, const char* s, size_t n)
{
size_t i,
trail;
trail,
clen;
unsigned long int c;

if (!s)
Expand Down Expand Up @@ -758,11 +780,20 @@ int UC_mbtowc(wchar_t* pc, const char* s, size_t n)
(nonstandard). */
return -1;
}

clen = 1 + trail;
}
else if (!trail)
{
/* the character has been fully parsed. */

if (UC_IS_OVERLONG(c, clen))
{
/* overlong character
(encoded in more bytes than necessary). */
return -1;
}

if (UC_IS_SUR(c))
{
/* UTF-16 surrogate (invalid in UTF-8). */
Expand Down Expand Up @@ -831,7 +862,8 @@ UC_wint_t UC_btowc(int c)
int UC_mblen(const char* s, size_t n)
{
size_t i,
trail;
trail,
clen;
unsigned long int c;

trail = UC_INVALID_SIZE;
Expand Down Expand Up @@ -887,11 +919,20 @@ int UC_mblen(const char* s, size_t n)
(nonstandard). */
return -1;
}

clen = 1 + trail;
}
else if (!trail)
{
/* the character has been fully parsed. */

if (UC_IS_OVERLONG(c, clen))
{
/* overlong character
(encoded in more bytes than necessary). */
return -1;
}

if (UC_IS_SUR(c))
{
/* UTF-16 surrogate (invalid in UTF-8). */
Expand Down
7 changes: 7 additions & 0 deletions src/unicorn.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
/*
* unicorn
* version 0.001
* 2024-06-09
*/


#ifndef UC_H_UNICORN
#define UC_H_UNICORN

Expand Down

0 comments on commit 240ced4

Please sign in to comment.