diff --git a/decode_test.go b/decode_test.go index 0364b0bb..8b58da05 100644 --- a/decode_test.go +++ b/decode_test.go @@ -761,6 +761,22 @@ var unmarshalTests = []struct { M{"ñoño": "very yes 🟔"}, }, + // UTF-16 surrogate pair. + { + `"\ud83e\udd23"`, + "🤣", + }, + // UTF-16 multiple surrogate pairs. + { + `"\uD83D\uDE00\uD83D\uDE01"`, + "😀😁", + }, + // UTF-16 non-surrogate pair character in between. + { + `"\uD83D\uDE00a\uD83D\uDE01"`, + "😀a😁", + }, + // This *is* in fact a float number, per the spec. #171 was a mistake. { "a: 123456e1\n", diff --git a/scannerc.go b/scannerc.go index ca007010..3efefcce 100644 --- a/scannerc.go +++ b/scannerc.go @@ -2570,12 +2570,48 @@ func yaml_parser_scan_flow_scalar(parser *yaml_parser_t, token *yaml_token_t, si } // Check the value and write the character. - if (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF { + if (value >= 0xDC00 && value <= 0xDFFF) || value > 0x10FFFF { yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar", start_mark, "found invalid Unicode character escape code") return false - } - if value <= 0x7F { + } else if value >= 0xD800 && value <= 0xDBFF { + high := value + + // we need the remaining 4 characters in the escape code, plus \u and the next escape code + if parser.unread < 10 && !yaml_parser_update_buffer(parser, 10) { + yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar", + start_mark, "incomplete surrogate pair") + return false + } + + // expect \u + if parser.buffer[parser.buffer_pos+code_length] != '\\' || parser.buffer[parser.buffer_pos+code_length+1] != 'u' { + yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar", + start_mark, "expected surrogate pair to follow with \\u") + return false + } + + // parse the second escape code into low + offset := parser.buffer_pos + code_length + 2 + var low int + for k := 0; k < code_length; k++ { + if !is_hex(parser.buffer, offset+k) { + yaml_parser_set_scanner_error(parser, "while parsing a quoted scalar", + start_mark, "did not find expected hexdecimal number") + return false + } + low = (low << 4) + as_hex(parser.buffer, offset+k) + } + + result := rune(((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000) + + // skip the second escape code (the first is skipped later) + for k := 0; k < 6; k++ { + skip(parser) + } + + s = append(s, []byte(string(result))...) + } else if value <= 0x7F { s = append(s, byte(value)) } else if value <= 0x7FF { s = append(s, byte(0xC0+(value>>6)))