Skip to content

Commit

Permalink
Add support for escaped UTF-16 surrogate pairs
Browse files Browse the repository at this point in the history
I'm new to this code base so have likely implemented this in a way that isn't ideal, but hopefully it's enough of a starting point.

References:

* https://russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
* https://mathiasbynens.be/notes/javascript-unicode
* readerc.go

Fixes go-yaml#279
  • Loading branch information
Steve Hoeksema committed Apr 8, 2024
1 parent f6f7691 commit 6e0a9b4
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
16 changes: 16 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,22 @@ var unmarshalTests = []struct {
M{"ñoño": "very yes 🟔"},
},

// UTF-16 surrogate pair.
{
`"\ud83e\udd23"`,
"🤣",
},
// UTF-16 multiple surrogate pairs.
{
`"\uD83D\uDE00\uD83D\uDE01"`,
"😀😁",
},
// UTF-16 non-surrogate pair character in between.
{
`"\uD83D\uDE00a\uD83D\uDE01"`,
"😀a😁",
},

// This *is* in fact a float number, per the spec. #171 was a mistake.
{
"a: 123456e1\n",
Expand Down
42 changes: 39 additions & 3 deletions scannerc.go
Original file line number Diff line number Diff line change
Expand Up @@ -2570,12 +2570,48 @@ func yaml_parser_scan_flow_scalar(parser *yaml_parser_t, token *yaml_token_t, si
}

// Check the value and write the character.
if (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF {
if (value >= 0xDC00 && value <= 0xDFFF) || value > 0x10FFFF {
yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar",
start_mark, "found invalid Unicode character escape code")
return false
}
if value <= 0x7F {
} else if value >= 0xD800 && value <= 0xDBFF {
high := value

// we need the remaining 4 characters in the escape code, plus \u and the next escape code
if parser.unread < 10 && !yaml_parser_update_buffer(parser, 10) {
yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar",
start_mark, "incomplete surrogate pair")
return false
}

// expect \u
if parser.buffer[parser.buffer_pos+code_length] != '\\' || parser.buffer[parser.buffer_pos+code_length+1] != 'u' {
yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar",
start_mark, "expected surrogate pair to follow with \\u")
return false
}

// parse the second escape code into low
offset := parser.buffer_pos + code_length + 2
var low int
for k := 0; k < code_length; k++ {
if !is_hex(parser.buffer, offset+k) {
yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar",
start_mark, "did not find expected hexdecimal number")
return false
}
low = (low << 4) + as_hex(parser.buffer, offset+k)
}

result := rune(((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000)

// skip the second escape code (the first is skipped later)
for k := 0; k < 6; k++ {
skip(parser)
}

s = append(s, []byte(string(result))...)
} else if value <= 0x7F {
s = append(s, byte(value))
} else if value <= 0x7FF {
s = append(s, byte(0xC0+(value>>6)))
Expand Down

0 comments on commit 6e0a9b4

Please sign in to comment.