-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tweak code organization, add comments and docstrings
- Loading branch information
Showing
12 changed files
with
271 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# When splitting the work among multiple tasks, each task should have at least this many bytes of input.
# This is to avoid having too many tasks with too little work to do.
# Used by `estimate_task_size` to bump the per-task row count when rows are small.
# TODO: make this configurable and find a good default (the current 16 KiB is a guess)
const MIN_TASK_SIZE_IN_BYTES = 16 * 1024
|
||
# Normalize the user-provided `comment` prefix into its canonical form:
# a `Vector{UInt8}` of raw bytes, or `nothing` when no comment prefix is set.
_comment_to_bytes(s::AbstractString) = Vector{UInt8}(s)
# Single-code-unit chars become one byte; multi-byte chars go through `string` first.
_comment_to_bytes(c::Char) = ncodeunits(c) == 1 ? _comment_to_bytes(UInt8(c)) : _comment_to_bytes(string(c))
_comment_to_bytes(b::UInt8) = UInt8[b]
_comment_to_bytes(bytes::Vector{UInt8}) = bytes
_comment_to_bytes(::Nothing) = nothing
|
||
# Holds a byte buffer and newline positions for a single chunk of the input file.
# The newline positions are used to split the chunk into units of work for parallel parsing.
# Two of these are used in tandem for double-buffering (see the `ChunkingContext(ctx)` constructor).
struct ChunkingContext
    id::Int # id of the chunking context (1 or 2)
    counter::TaskCounter # synchronization mechanism to coordinate parsing
    newline_positions::BufferedVector{Int32} # positions of newlines in the bytes
    bytes::Vector{UInt8} # raw bytes ingested from the input
    nworkers::Int # number of worker tasks
    limit::Int # maximum number of rows to parse, see `limit_eols!`; 0 appears to mean "no limit" -- TODO confirm
    # byte prefix marking rows to skip, used in `skip_rows_init!` and handed to `populate_result_buffer!`
    # for the user to handle consistently (`_startswith` could be used to do the check)
    comment::Union{Nothing,Vector{UInt8}}
    # number of times we refilled the buffer, can be combined with `id` to uniquely identify a chunk
    buffer_refills::Base.RefValue{Int}
end
"""
    ChunkingContext(buffersize::Integer, nworkers::Integer, limit::Integer, comment) -> ChunkingContext

Create a fresh `ChunkingContext` (with `id == 1`) for chunked ingestion.

# Arguments:
* `buffersize`: capacity of the internal byte buffer; must satisfy `4 <= buffersize <= typemax(Int32)`
* `nworkers`: number of worker tasks per chunk; must satisfy `0 < nworkers < 256`
* `limit`: maximum number of rows to parse (see `limit_eols!`); must be non-negative
* `comment`: row prefix to skip (`nothing`, `UInt8`, `String`, `Char`, or `Vector{UInt8}`);
  normalized to bytes via `_comment_to_bytes`

# Throws:
* `ArgumentError` when any of the numeric arguments is out of range
"""
function ChunkingContext(buffersize::Integer, nworkers::Integer, limit::Integer, comment::Union{Nothing,UInt8,String,Char,Vector{UInt8}})
    # NOTE: messages now match the inclusive/exclusive bounds actually checked below
    (4 <= buffersize <= typemax(Int32)) || throw(ArgumentError("`buffersize` argument must be at least 4 and at most 2_147_483_647 bytes."))
    (0 < nworkers < 256) || throw(ArgumentError("`nworkers` argument must be larger than 0 and smaller than 256."))
    (0 <= limit <= typemax(Int)) || throw(ArgumentError("`limit` argument must be non-negative and smaller than 9_223_372_036_854_775_808."))
    # TRACING # clear_traces!(nworkers)
    return ChunkingContext(
        1,                                  # first context always gets id 1
        TaskCounter(),
        BufferedVector{Int32}(Int32[0], 1), # sentinel "newline" at position 0
        Vector{UInt8}(undef, buffersize),
        nworkers,
        limit,
        _comment_to_bytes(comment),
        Ref(0),                             # buffer_refills starts at zero
    )
end
# Convenience for double-buffering: derive a secondary context from `ctx`, sharing
# its configuration (worker count, limit, comment bytes) but owning fresh buffers
# and fresh synchronization state.
function ChunkingContext(ctx::ChunkingContext)
    newline_capacity = max(1, length(ctx.newline_positions))
    eols = BufferedVector{Int32}(Vector{Int32}(undef, newline_capacity), 1)
    twin = ChunkingContext(
        ctx.id + 1,
        TaskCounter(),
        eols,
        similar(ctx.bytes),
        ctx.nworkers,
        ctx.limit,
        ctx.comment,
        Ref(0),
    )
    # Mirror the primary context's sentinel "newline" at position 0.
    twin.newline_positions.elements[1] = 0
    return twin
end
# Number of units of work a single chunk is split into (one per worker task).
function tasks_per_chunk(ctx::ChunkingContext)
    return ctx.nworkers
end
# Total number of result buffers needed across both double-buffered contexts.
function total_result_buffers_count(ctx::ChunkingContext)
    return 2 * tasks_per_chunk(ctx)
end
# Position of the last detected newline in the current chunk's bytes.
function last_newline_at(ctx::ChunkingContext)
    return Int(last(ctx.newline_positions))
end
# Decide whether this chunk should be parsed by multiple tasks.
# `_force === :serial` always opts out and `_force === :parallel` always opts in;
# otherwise parallelism requires multiple Julia threads, multiple workers, and
# enough bytes in the chunk to be worth splitting (MIN_TASK_SIZE_IN_BYTES).
function should_use_parallel(ctx::ChunkingContext, _force)
    _force === :serial && return false
    _force === :parallel && return true
    return Threads.nthreads() != 1 &&
        ctx.nworkers != 1 &&
        last_newline_at(ctx) >= MIN_TASK_SIZE_IN_BYTES
end
|
||
# We split the detected newlines equally among the nworkers parsing tasks, but each
# unit of work should contain at least 16 KiB of raw bytes (MIN_TASK_SIZE_IN_BYTES).
# Returns the number of newline positions (i.e. roughly rows) each task should receive,
# clamped to the total number available.
function estimate_task_size(ctx::ChunkingContext)
    eols = ctx.newline_positions
    length(eols) == 1 && return 1 # empty file (only the position-0 sentinel is present)
    bytes_to_parse = last(eols)
    rows = length(eols) # actually rows + 1, because of the sentinel at position 0
    buffersize = length(ctx.bytes)
    # There are 2*nworkers result buffers total, but there are nworkers tasks per chunk;
    # prorate by how full the buffer actually is so a short final chunk spawns fewer tasks
    prorated_maxtasks = ceil(Int, tasks_per_chunk(ctx) * (bytes_to_parse / buffersize))
    # Lower bound is 2 because length(eols) == 2 => 1 row
    # bump min rows if average row is much smaller than MIN_TASK_SIZE_IN_BYTES
    min_rows = max(2, cld(MIN_TASK_SIZE_IN_BYTES, cld(bytes_to_parse, rows)))
    return min(max(min_rows, cld(rows, prorated_maxtasks)), rows)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
""" | ||
populate_result_buffer!( | ||
result_buf::AbstractResultBuffer, | ||
newline_segment:AbstractVector{Int32}, | ||
parsing_ctx::AbstractParsingContext, | ||
bytes::Vector{UInt8}, | ||
comment::Union{Nothing,Vector{UInt8}}=nothing, | ||
::Type{CT}=Tuple{} | ||
) where {CT} | ||
Override with your `AbstractParsingContext` to provide a custom logic for parsing the input bytes | ||
in `parsing_ctx.bytes` between the newline positions in `newline_segment` into `result_buf`. | ||
The method is called from multiple tasks in parallel, each having a different `newline_segment`, | ||
some sharing the same `parsing_ctx.bytes`. The `result_buf` is only accessed by one task at a time. | ||
# Arguments: | ||
* `result_buf`: a user-provided object which is meant to store the parsing results from this function | ||
* `newline_segment`: a vector of newline positions in `bytes` which delimit the rows of the input. | ||
* `parsing_ctx`: a user-provided object which is used to dispatch to this method and carry parsing specific config | ||
* `bytes`: the raw bytes ingested from the input | ||
* `comment`: the comment prefix to skip, if any | ||
* `CT`: an optional, compile-time known object which was passed to `parse_file_parallel` / `parse_file_serial` | ||
# Notes: | ||
Each consecutive pair of `newline_segment` values defines and exclusive range of bytes in `bytes` which | ||
constitutes a single row. | ||
The range needs to be treated as exclusive because we add a fictional newline at the beginning at the chunk | ||
at position 0 and past the end of the file if it doesn't end on a newline. | ||
A safe way of processing each row would be e.g.: | ||
``` | ||
start_index = first(newline_segment) | ||
for i in 2:length(newline_segment) | ||
end_index = newline_segment[i] | ||
row_bytes = view(bytes, start_index+1:end_index-1) # +/- 1 is needed! | ||
# ... actually populate the result_buf | ||
start_index = end_index | ||
end | ||
``` | ||
""" | ||
function populate_result_buffer! end | ||
|
||
# Users should subtype this to create custom parsing context variables which are
# then used in `parse_file_parallel` / `parse_file_serial`, to dispatch on their
# `populate_result_buffer!` method.
abstract type AbstractParsingContext end
# Users should subtype this to create custom result buffer objects to store the
# parsed results in `populate_result_buffer!`. Each buffer is handed to only one
# task at a time (see `populate_result_buffer!`).
abstract type AbstractResultBuffer end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.