Skip to content

Reading past input end #47

Description

@captainurist

In __utf16_with::skip_input_error:

	if (__it == __last) {
		break;
	}
	auto __second_surrogate = __it;
	++__second_surrogate;
	if (__ztd_idk_detail_is_trail_surrogate(
		    static_cast<ztd_char32_t>(*__second_surrogate))) {
		break;
	}
	__it = ::std::move(__second_surrogate);

That *__second_surrogate reads past end.

AI-generated repro:

/**
 * Reproducer for ztd::text UTF-16 bugs.
 *
 * Bug 1: In utf16.hpp's skip_input_error function, when handling an unpaired surrogate at the end
 * of input, the code dereferences __second_surrogate without checking if it has reached __last.
 *
 * Bug 2: In utf16.hpp's decode_one function, the bounds check for incomplete surrogate pairs
 * happens BEFORE advancing the iterator, so it doesn't catch the case where a lead surrogate
 * is the last character in the input.
 *
 * To reproduce: run this test WITHOUT the fixes applied to utf16.hpp.
 */

#include <cassert>
#include <cstddef>
#include <iterator>
#include <string>

#include <gtest/gtest.h>
#include <ztd/text.hpp>

/**
 * A string_view-like class with checked iterators that assert on past-end dereference.
 * This is used to detect when ztd::text reads past the end of input.
 */
template<class CharT>
class AssertingStringView {
 public:
    using value_type = CharT;
    using size_type = std::size_t;
    using difference_type = std::ptrdiff_t;
    using pointer = const CharT *;
    using const_pointer = const CharT *;
    using reference = const CharT &;
    using const_reference = const CharT &;

    class iterator {
     public:
        using iterator_category = std::contiguous_iterator_tag;
        using value_type = CharT;
        using difference_type = std::ptrdiff_t;
        using pointer = const CharT *;
        using reference = const CharT &;

        iterator() = default;
        iterator(const CharT *ptr, const CharT *end) : _ptr(ptr), _end(end) {}

        reference operator*() const {
            assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
            return *_ptr;
        }

        pointer operator->() const {
            assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
            return _ptr;
        }

        iterator &operator++() { ++_ptr; return *this; }
        iterator operator++(int) { iterator tmp = *this; ++_ptr; return tmp; }
        iterator &operator--() { --_ptr; return *this; }
        iterator operator--(int) { iterator tmp = *this; --_ptr; return tmp; }

        iterator &operator+=(difference_type n) { _ptr += n; return *this; }
        iterator &operator-=(difference_type n) { _ptr -= n; return *this; }

        iterator operator+(difference_type n) const { return iterator(_ptr + n, _end); }
        iterator operator-(difference_type n) const { return iterator(_ptr - n, _end); }
        difference_type operator-(const iterator &other) const { return _ptr - other._ptr; }

        reference operator[](difference_type n) const {
            assert(_ptr + n < _end && "AssertingStringView: index out of bounds");
            return _ptr[n];
        }

        bool operator==(const iterator &other) const { return _ptr == other._ptr; }
        bool operator!=(const iterator &other) const { return _ptr != other._ptr; }
        bool operator<(const iterator &other) const { return _ptr < other._ptr; }
        bool operator<=(const iterator &other) const { return _ptr <= other._ptr; }
        bool operator>(const iterator &other) const { return _ptr > other._ptr; }
        bool operator>=(const iterator &other) const { return _ptr >= other._ptr; }

        friend iterator operator+(difference_type n, const iterator &it) { return it + n; }

     private:
        const CharT *_ptr = nullptr;
        const CharT *_end = nullptr;
    };

    using const_iterator = iterator;

    AssertingStringView() = default;
    AssertingStringView(const CharT *data, size_type size) : _data(data), _size(size) {}

    const_iterator begin() const { return iterator(_data, _data + _size); }
    const_iterator end() const { return iterator(_data + _size, _data + _size); }
    const_iterator cbegin() const { return begin(); }
    const_iterator cend() const { return end(); }

    size_type size() const { return _size; }
    bool empty() const { return _size == 0; }
    const_pointer data() const { return _data; }

    reference operator[](size_type pos) const {
        assert(pos < _size && "AssertingStringView: index out of bounds");
        return _data[pos];
    }

 private:
    const CharT *_data = nullptr;
    size_type _size = 0;
};

/**
 * Bug reproducer: Unpaired lead surrogate at end of input causes past-end dereference.
 *
 * Input: L"\xDC00\x0028" (unpaired low surrogate followed by '(')
 * Expected: Should transcode to UTF-8 with replacement character for the unpaired surrogate.
 * Actual (before fix): Asserts "cannot dereference end iterator" because skip_input_error
 *                      tries to check if the next code unit is a trail surrogate without
 *                      first checking if we've reached the end of input.
 */
TEST(ZtdTextBug, UnpairedSurrogateAtEnd) {
    // L"\xDC00" is an unpaired low surrogate (invalid UTF-16).
    // L"\x0028" is '(' (valid).
    // When transcoding, the replacement_handler is invoked for the invalid surrogate,
    // which calls skip_input_error. The bug is in skip_input_error's surrogate pair detection.
    const wchar_t data[] = L"\xDC00\x0028";
    AssertingStringView<wchar_t> input(data, 2);

    // This should not assert - it should produce replacement character + '('
    std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
                                              ztd::text::replacement_handler);

    // U+FFFD in UTF-8 is 0xEF 0xBF 0xBD, followed by '(' (0x28)
    EXPECT_EQ(result, "\xef\xbf\xbd(");
}

/**
 * Additional test: Lead surrogate at end of input (incomplete surrogate pair).
 */
TEST(ZtdTextBug, LeadSurrogateAtEnd) {
    // L"\xD800" is a lead surrogate without a trailing surrogate.
    const wchar_t data[] = L"\xD800";
    AssertingStringView<wchar_t> input(data, 1);

    // This should produce a replacement character, not crash.
    std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
                                              ztd::text::replacement_handler);

    // U+FFFD in UTF-8 is 0xEF 0xBF 0xBD
    EXPECT_EQ(result, "\xef\xbf\xbd");
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions