Skip to content

Commit

Permalink
Merge pull request #8 from JuliaData/td-docs-tweaks
Browse files Browse the repository at this point in the history
[NFC] Add more comments
  • Loading branch information
nickrobinson251 authored Nov 22, 2023
2 parents e1bfb0c + e9327b9 commit a297b35
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# NewlineLexers.jl

Quote-aware newline finder.
Quote-aware newline finder. By default it uses a branchless algorithm to find newlines, and is able to skip those which appear inside string fields. This is useful for parsing CSV files, for example, where we want to quickly find all newlines that separate individual records.

```julia
julia> data = collect(codeunits(""" abc\n "efg\n" \n """));
Expand Down
110 changes: 99 additions & 11 deletions src/NewlineLexers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ let # Feature detection -- copied from ScanByte.jl
features = split(unsafe_string(features_cstring), ',')
Libc.free(features_cstring)

# prefix_xor works like this: it goes through the bits of the input UInt from least significant to most significant
# and it xors the current bit with the previous one. This means that it starts producing 0 until it meets the first 1, then
# it starts producing 1s until it meets the next 1, then it starts producing 0s again, etc.
# Example:
# 0b00001000010
# -> 0b00000111110
@eval if _AVOID_PLATFORM_SPECIFIC_LLVM_CODE || !any(x->occursin("clmul", x), $(features))
@inline function prefix_xor(q)
mask = q (q << 1)
Expand All @@ -59,6 +65,7 @@ let # Feature detection -- copied from ScanByte.jl
return mask
end
else
# Cool explainer on carryless multiplication: https://wunkolo.github.io/post/2020/05/pclmulqdq-tricks/
function carrylessmul(a::NTuple{2,VecElement{UInt64}}, b::NTuple{2,VecElement{UInt64}})
ccall("llvm.x86.pclmulqdq", llvmcall, NTuple{2,VecElement{UInt64}}, (NTuple{2,VecElement{UInt64}}, NTuple{2,VecElement{UInt64}}, UInt8), a, b, 0)
end
Expand Down Expand Up @@ -123,13 +130,38 @@ end
# These must be respected in the `_find_newlines_kernel!` and `_find_newlines_generic!`
# All other cases are unambiguous, i.e. we can tell if we are inside a string or not,
# and if we are, we can tell if we are on an escape or not.
"""
Lexer{E,OQ,CQ,NL,IO_t}
Lexer(io, escapechar, openquotechar, closequotechar, newline) -> Lexer{E,OQ,CQ,NL,IO_t}
Lexer(io, nothing, newline) -> Lexer{Nothing,Nothing,Nothing,NL,IO_t}
A stateful lexer type for newline detection. Use with the `find_newlines!` function.
The type parameters are:
- `E`: the escape character
- `OQ`: the open quote character
- `CQ`: the close quote character
- `NL`: the newline character
- `IO_t`: the type of the IO object, e.g. `IOBuffer` or `IOStream`
Either `E`, `OQ`, and `CQ` are all `Nothing`, or they are all single-byte characters.
When `E`, `OQ`, and `CQ` are not `Nothing`, the lexer will find all newlines in the input,
that are not inside a string (between two quotes). This is useful for finding record separators
in CSVs.
If they are all `Nothing`, the lexer will be quote-unaware, and find all newlines in the input,
regardless of whether they are inside a string or not. You can construct such a lexer with
`Lexer(io, nothing, newline)`.
"""
mutable struct Lexer{E,OQ,CQ,NL,IO_t}
@constfield io::IO_t
@constfield escape::Vec{64, UInt8}
@constfield quotechar::Vec{64, UInt8}
@constfield newline::Vec{64, UInt8}
prev_escaped::UInt # 0 or 1
prev_in_string::UInt # 0 or typemax(UInt)
prev_escaped::UInt # 0 or 1, see the tables above
prev_in_string::UInt # 0 or typemax(UInt), see the tables above
done::Bool # Right now, this is not used but could be set by the caller

function Lexer(
Expand Down Expand Up @@ -192,6 +224,28 @@ end
possibly_not_in_string(l::Lexer{Q,Q,Q}) where {Q} = (l.prev_in_string & UInt(1)) == l.prev_escaped
possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0

# This is where we process 64 byte input when the quotechar and escapechar are identical.
# This is our adaptation of the original `simdjson` implementation which handles the escaping rules
# common in CSVs.
#
# An example showing intermediate results when parsing 64 bytes with one quoted newline in a
# string and one unquoted newline:
# Note all the bits are reversed for readability. `*` marks the newlines.
#
# "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*"
# X 0b0000100000000000010100000000000000101000000011011110111110000000
# F 0b0000010000000000001010000000000000010100000001101111011111000000
# SEQ 0b0000100000000000010100000000000000101000000010010000100000000000
# EB 0b1010101010101010101010101010101010101010101010101010101010101010
# OS 0b0000000000000000010100000000000000000000000000010000000000000000
# ES 0b0000100000000000000000000000000000101000000010000000100000000000
# OC 0b0000000000000000001010000000000000000000000000000000000000000000
# EC 0b0000010000000000000000000000000000010100000000000000000001000000
# Q 0b0000100000000000010100000000000000101000000000000000000010000000
# PX 0b0000111111111111100111111111111111001111111111111111111100000000
# STR 0b0000111111111111100111111111111111001111111111111111111100000000
# NL 0b0000000000000000000000000000000000000000000000000000000000000001
# "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*
@inline function _find_newlines_kernel!(l::Lexer{Q,Q,Q}, input::Vec{64, UInt8}) where {Q}
escape_chars = compress_escapes(l, input)
follows_escape = escape_chars << 1
Expand Down Expand Up @@ -224,7 +278,9 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
# This ignores strings that are entirely made up of quotes, e.g. "", """", etc.
# But those cannot contain newlines so we don't care
# Shift by one as carries are always one bit off due to the addition 0b0001 + 0b0001 = 0b0010
quotes = ((even_string_starts | odd_string_starts) >> 1) l.prev_escaped # TODO: explain the xor and how it works with the prev_escaped
# When `l.prev_escaped` is set, it means we ended on an unescaped quote, so we need to add
# it here.
quotes = ((even_string_starts | odd_string_starts) >> 1) l.prev_escaped
in_string = prefix_xor(quotes) l.prev_in_string
newlines = compress_newlines(l, input) & ~in_string

Expand Down Expand Up @@ -253,6 +309,25 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
return newlines
end

# This is where we process 64 byte input when the quotechar and escapechar are different characters.
# In this case we follow the implementation from `simdjson`.
# See section "3.1.1 Identification of the quoted substrings" in https://arxiv.org/pdf/1902.08318.pdf
#
# An example showing intermediate results when parsing 64 bytes with one quoted newline in a
# string and one unquoted newline:
# Note all the bits are reversed for readability. `*` marks the newlines.
#
# "abc,"quoted,field","quoted*newline","escaped\\ \\\\ \"\"",01234*"
# X 0b0000000000000000000000000000000000000000000011011110101000000000
# F 0b0000000000000000000000000000000000000000000001101111010100000000 X << 1 | l.prev_escaped
# EB 0b1010101010101010101010101010101010101010101010101010101010101010
# OS 0b0000000000000000000000000000000000000000000000010000000000000000 X & ~EB & ~F
# EC 0b0000000000000000000000000000000000000000000011000001101000000000 X + OS
# IM 0b0000000000000000000000000000000000000000000001100000110100000000 EC << 1
# E 0b0000000000000000000000000000000000000000000001001010010100000000 (EB ⊻ IM) & F
# Q 0b0000100000000000010100000000000000101000000000000000000010000000 quotes & ~E
# STR 0b0000111111111111100111111111111111001111111111111111111100000000 CLMUL(Q)
# NL 0b0000000000000000000000000000000000000000000000000000000000000001
@inline function _find_newlines_kernel!(l::Lexer{E,Q,Q}, input::Vec{64, UInt8}) where {E,Q}
escape_chars = compress_escapes(l, input) & ~l.prev_escaped
follows_escape = escape_chars << 1 | l.prev_escaped
Expand All @@ -277,8 +352,8 @@ end
# "\nIM 0b", bitstring(SIMD.Intrinsics.bitreverse(invert_mask)), " EC << 1",
# "\nE 0b", bitstring(SIMD.Intrinsics.bitreverse(escaped)), " (EB ⊻ IM) & F",
# "\nQ 0b", bitstring(SIMD.Intrinsics.bitreverse(quotes)), " quotes & ~E",
# "\nS 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
# "\nL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
# "\nSTR 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
# "\nNL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
# "\n \"", replace(join(map(x->Char(x.value), collect(input.data))), "\n" => "*"), "\"",
# "\n[E] 0b", bitstring(SIMD.Intrinsics.bitreverse(UInt(_overflowed_odd))),
# "\n[Q] 0b", bitstring((in_string >> 63) * typemax(UInt)),
Expand All @@ -289,8 +364,9 @@ end
return newlines
end

# Generic fallback for when open and close quote differs and when buffer, or its last trailing bytes are too small for SIMD, i.e. < 64 bytes).
function _find_newlines_generic!(l::Lexer{E,OQ,CQ}, buf, out, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ}
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
structural_characters = _scanbyte_bytes(l)

ptr = pointer(buf)
Expand Down Expand Up @@ -388,8 +464,9 @@ function _find_newlines_generic!(l::Lexer{E,OQ,CQ}, buf, out, curr_pos::Int=firs
return nothing
end

# Quote-unaware lexer we use for trailing bytes of inputs with length that is not a multiple of 64.
function _find_newlines_quote_unaware_scanbyte!(l::Lexer{E,OQ,CQ,NL}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ,NL}
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
ptr = pointer(buf, curr_pos)
bytes_to_search = end_pos - curr_pos + 1
base = Int32(curr_pos - 1)
Expand All @@ -409,8 +486,9 @@ function _find_newlines_quote_unaware_scanbyte!(l::Lexer{E,OQ,CQ,NL}, buf::Vecto
end
end

# Quote-unaware lexer which handles 64-byte aligned buffers and leaves the rest to `_find_newlines_quote_unaware_scanbyte!`.
function _find_newlines_quote_unaware_simd!(l::Lexer, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
base = unsafe_trunc(Int32, curr_pos)
@inbounds while curr_pos <= (end_pos - 63)
input = vload(Vec{64, UInt8}, buf, curr_pos)
Expand All @@ -429,22 +507,32 @@ function _find_newlines_quote_unaware_simd!(l::Lexer, buf::Vector{UInt8}, out::A
end
end

"""
find_newlines!(l::Lexer, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
Find newlines in `buf[curr_pos:end_pos]` and push their positions to `out`. The newline positions are relative to the beginning of `buf`.
The type of the `Lexer` determines the rules for handling quotes and escapes. See `Lexer` for details.
`end_pos` must be less than `typemax(Int32)` and `1 <= curr_pos <= end_pos`.
"""
function find_newlines! end

# Generic fallback for when open and close quote differs (should be also used in case the buffer is too small for SIMD).
function find_newlines!(l::Lexer{E,OQ,CQ}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ}
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
_find_newlines_generic!(l, buf, out, curr_pos, end_pos)
return nothing
end
# Fast path for when no newlines may appear inside quotes.
function find_newlines!(l::Lexer{Nothing,Nothing,Nothing}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
_find_newlines_quote_unaware_simd!(l, buf, out, curr_pos, end_pos)
return nothing
end

# Path for when open and close quote are the same (escape might be different or the same as the quote)
function find_newlines!(l::Lexer{E,Q,Q}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,Q}
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
base = unsafe_trunc(Int32, curr_pos)
@inbounds while curr_pos <= (end_pos - 63)
input = vload(Vec{64, UInt8}, buf, curr_pos)
Expand Down

0 comments on commit a297b35

Please sign in to comment.