diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index fc1b22e3c2..c6de34764b 100755 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -42,13 +42,17 @@ jobs: git config user.email "70000568+nvauto@users.noreply.github.com " git fetch origin ${HEAD} ${BASE} git checkout -b ${INTERMEDIATE_HEAD} origin/${HEAD} - OUT=$(git --no-pager diff --name-only origin/${BASE} | grep "${FILE_USE_BASE}" || true) - [[ ! -z "${OUT}" ]] && git checkout origin/${BASE} -- ${FILE_USE_BASE} && \ - git commit -s -am "Auto-merge use submodule in BASE ref" + # Sync the $BASE branch with the commits from the $HEAD branch, + # excluding the paths defined as $FILE_USE_BASE (located under ./thirdparty). + git checkout origin/${BASE} -- ${FILE_USE_BASE} + # If any submodule file is updated in the HEAD branch, + # always change it to the corresponding one from the BASE branch. + [ ! -z "$(git status --porcelain=v1 --untracked=no)" ] && \ + git commit -s -am "Auto-merge use ${BASE} versions" git push origin ${INTERMEDIATE_HEAD} -f env: INTERMEDIATE_HEAD: bot-auto-merge-${{ env.HEAD }} - FILE_USE_BASE: thirdparty/cudf + FILE_USE_BASE: thirdparty/cudf thirdparty/cudf-pins - name: auto-merge job uses: ./.github/workflows/action-helper diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index e944337861..c7c6c242b8 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -50,6 +50,1060 @@ namespace spark_rapids_jni { namespace detail {
+/**
+ * Style used by evaluate_path when writing a matched value:
+ *  - raw_style:     a matched string with no remaining path is written via
+ *                   json_generator::write_raw (see case path 1).
+ *  - quoted_style:  style selected for the children of a wildcard match
+ *                   (see case paths 6, 7 and 8).
+ *  - flatten_style: an array with no remaining path is flattened into its
+ *                   parent (see case path 2).
+ */
+enum class write_style { raw_style, quoted_style, flatten_style };
+
+/**
+ * One step of a parsed JSON path. `type` selects the kind of step; `name` and
+ * `index` carry the operand for the step kinds that need one.
+ */
+struct path_instruction {
+  __device__ inline path_instruction(path_instruction_type _type) : type(_type) {}
+
+  // kind of this path step
+  path_instruction_type type;
+
+  // field name; only meaningful when type is path_instruction_type::NAMED
+  cudf::string_view name;
+
+  // array index; only meaningful when type is path_instruction_type::INDEX,
+  // -1 otherwise
+  int index{-1};
+};
+
+/**
+ * JSON generator is used to write out JSON
content.
+ * Because get_json_object only outputs a JSON object as a whole item,
+ * there is no need to store internal state for JSON objects when outputting;
+ * only the state of the enclosing JSON arrays is tracked.
+ *
+ * When constructed without a buffer (output == nullptr) nothing is written and
+ * only output_len is updated.
+ */
+class json_generator {
+ public:
+  __device__ json_generator(char* _output) : output(_output), output_len(0) {}
+  __device__ json_generator() : output(nullptr), output_len(0) {}
+
+  // create a nested child generator based on this parent generator,
+  // child generator is a view, parent and child share the same byte array:
+  // the child starts writing at the parent's current output position.
+  __device__ json_generator new_child_generator()
+  {
+    if (nullptr == output) {
+      return json_generator();
+    } else {
+      return json_generator(output + output_len);
+    }
+  }
+
+  // write [
+  // add an extra comma if needed,
+  // e.g.: when JSON content is: [[1,2,3]
+  // writing a new [ should result: [[1,2,3],[
+  __device__ void write_start_array()
+  {
+    try_write_comma();
+
+    if (output) { *(output + output_len) = '['; }
+
+    output_len++;
+    array_depth++;
+    // new array is empty
+    is_curr_array_empty = true;
+  }
+
+  // write ]
+  __device__ void write_end_array()
+  {
+    if (output) { *(output + output_len) = ']'; }
+    output_len++;
+
+    // point to parent array
+    array_depth--;
+
+    // set parent array as non-empty because already had a closed child item.
+    is_curr_array_empty = false;
+  }
+
+  // write first start array without output, only update internal state
+  __device__ void write_first_start_array_without_output()
+  {
+    // hide the outer start array token
+    // Note: do not inc output_len
+    array_depth++;
+    // new array is empty
+    is_curr_array_empty = true;
+  }
+
+  // return true if it's in an array context and it's not writing the first item.
+  __device__ inline bool need_comma() { return (array_depth > 0 && !is_curr_array_empty); }
+
+  /**
+   * write comma according to current generator state
+   */
+  __device__ void try_write_comma()
+  {
+    if (need_comma()) {
+      // in array context and the item being written is NOT the first one
+      if (output) { *(output + output_len) = ','; }
+      output_len++;
+    }
+  }
+
+  /**
+   * copy current structure when parsing. If current token is start
+   * object/array, then copy to corresponding matched end object/array.
+   * return false if JSON format is invalid, return true if JSON format is valid
+   */
+  __device__ bool copy_current_structure(json_parser& parser)
+  {
+    // first try add comma
+    try_write_comma();
+
+    if (array_depth > 0) { is_curr_array_empty = false; }
+
+    // without an output buffer the parser is handed a null destination;
+    // the length it reports is added in both cases.
+    char* copy_to      = (nullptr != output) ? output + output_len : nullptr;
+    auto [b, copy_len] = parser.copy_current_structure(copy_to);
+    output_len += copy_len;
+    return b;
+  }
+
+  /**
+   * Get current text from JSON parser and then write the text
+   * Note: Because JSON strings contains '\' to do escape,
+   * JSON parser should do unescape to remove '\' and JSON parser
+   * then can not return a pointer and length pair (char *, len),
+   * For number token, JSON parser can return a pair (char *, len)
+   */
+  __device__ void write_raw(json_parser& parser)
+  {
+    if (array_depth > 0) { is_curr_array_empty = false; }
+
+    if (nullptr != output) {
+      auto copied = parser.write_unescaped_text(output + output_len);
+      output_len += copied;
+    } else {
+      auto len = parser.compute_unescaped_len();
+      output_len += len;
+    }
+  }
+
+  /**
+   * write child raw value
+   * e.g.:
+   *
+   * write_outer_array_tokens = false
+   * need_comma = true
+   * [1,2,3]1,2,3
+   *        ^
+   *        |
+   *    child pointer
+   * ==>>
+   * [1,2,3],1,2,3
+   *
+   *
+   * write_outer_array_tokens = true
+   * need_comma = true
+   *   [12,3,4
+   *     ^
+   *     |
+   * child pointer
+   * ==>>
+   *   [1,[2,3,4]
+   *
+   * For more information about param write_outer_array_tokens, refer to
+   * `write_first_start_array_without_output`
+   * @param child_block_begin start of the bytes written by the child generator
+   * @param child_block_len number of bytes written by the child generator
+   * @param write_outer_array_tokens whether write outer array tokens for child
+   * block
+   */
+  __device__ void write_child_raw_value(char* child_block_begin,
+                                        size_t child_block_len,
+                                        bool write_outer_array_tokens)
+  {
+    bool insert_comma = need_comma();
+
+    if (array_depth > 0) { is_curr_array_empty = false; }
+
+    if (nullptr != output) {
+      if (write_outer_array_tokens) {
+        if (insert_comma) {
+          // shift child block by 2 to make room for ",[" and append "]"
+          *(child_block_begin + child_block_len + 2) = ']';
+          move_forward(child_block_begin, child_block_len, 2);
+          *(child_block_begin + 1) = '[';
+          *(child_block_begin)     = ',';
+        } else {
+          // shift child block by 1 to make room for "[" and append "]"
+          *(child_block_begin + child_block_len + 1) = ']';
+          move_forward(child_block_begin, child_block_len, 1);
+          *(child_block_begin) = '[';
+        }
+      } else {
+        if (insert_comma) {
+          move_forward(child_block_begin, child_block_len, 1);
+          *(child_block_begin) = ',';
+        } else {
+          // do not need comma && do not need write outer array tokens
+          // do nothing, because child generator buff is directly after the
+          // parent generator
+        }
+      }
+    }
+
+    // update length
+    if (insert_comma) { output_len++; }
+    if (write_outer_array_tokens) { output_len += 2; }
+    output_len += child_block_len;
+  }
+
+  // move memory block forward by specified bytes
+  // e.g.:  memory is: 1 2 0 0, begin is 1, len is 2, forward is 2, after moving,
+  // memory is: 1 2 1 2.
+  // e.g.:  memory is: 1 2 0 0, begin is 1, len is 2, forward is 1, after moving,
+  // memory is: 1 1 2 0.
+  // Note: should move from end to begin to avoid overwrite buffer
+  __device__ void move_forward(char* begin, size_t len, int forward)
+  {
+    // TODO copy by 8 bytes
+    char* pos = begin + len + forward - 1;
+    char* e   = begin + forward - 1;
+    while (pos > e) {
+      *pos = *(pos - forward);
+      pos--;
+    }
+  }
+
+  __device__ void reset() { output_len = 0; }
+
+  __device__ inline size_t get_output_len() const { return output_len; }
+  __device__ inline char* get_output_start_position() const { return output; }
+  __device__ inline char* get_current_output_position() const { return output + output_len; }
+
+  /**
+   * generator may contain trash output, e.g.: generator writes some output,
+   * then JSON format is invalid, the previous output becomes trash.
+   */
+  __device__ inline void set_output_len_zero() { output_len = 0; }
+
+  __device__ inline void set_output_len(size_t len) { output_len = len; }
+
+ private:
+  char* output;
+  size_t output_len;
+
+  // whether already wrote an item in current array
+  // used to decide whether add a comma before writing out a new item.
+  // Initialized so that copying a freshly constructed generator never reads
+  // an indeterminate value; it is only consulted while array_depth > 0.
+  bool is_curr_array_empty = true;
+  int array_depth          = 0;
+};
+
+/**
+ * path matching helpers used by evaluate_path
+ */
+__device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; }
+
+// true if the path has at least one instruction and the first one has the given type
+__device__ inline bool path_match_element(path_instruction const* path_ptr,
+                                          size_t path_size,
+                                          path_instruction_type path_type0)
+{
+  if (path_size < 1) { return false; }
+  return path_ptr[0].type == path_type0;
+}
+
+// true if the path starts with the two given instruction types
+__device__ inline bool path_match_elements(path_instruction const* path_ptr,
+                                           size_t path_size,
+                                           path_instruction_type path_type0,
+                                           path_instruction_type path_type1)
+{
+  if (path_size < 2) { return false; }
+  return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1;
+}
+
+// true if the path starts with the four given instruction types
+__device__ inline bool path_match_elements(path_instruction const* path_ptr,
+                                           size_t path_size,
+                                           path_instruction_type path_type0,
+                                           path_instruction_type path_type1,
+                                           path_instruction_type path_type2,
+                                           path_instruction_type path_type3)
+{
+  if (path_size < 4) { return false; }
+  return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 &&
+         path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3;
+}
+
+// matches SUBSCRIPT :: INDEX; returns (matched, index), index is 0 when not matched
+__device__ inline thrust::tuple<bool, int> path_match_subscript_index(
+  path_instruction const* path_ptr, size_t path_size)
+{
+  auto match = path_match_elements(
+    path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX);
+  if (match) {
+    return thrust::make_tuple(true, path_ptr[1].index);
+  } else {
+    return thrust::make_tuple(false, 0);
+  }
+}
+
+// matches NAMED; returns (matched, name), name is an empty view when not matched
+__device__ inline thrust::tuple<bool, cudf::string_view> path_match_named(
+  path_instruction const* path_ptr, size_t path_size)
+{
+  auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED);
+  if (match) {
+    return thrust::make_tuple(true, path_ptr[0].name);
+  } else {
+    return thrust::make_tuple(false, cudf::string_view());
+  }
+}
+
+// matches SUBSCRIPT :: INDEX :: SUBSCRIPT :: WILDCARD; returns (matched, index)
+__device__ inline thrust::tuple<bool, int> path_match_subscript_index_subscript_wildcard(
+  path_instruction const* path_ptr, size_t path_size)
+{
+  auto match = path_match_elements(path_ptr,
+                                   path_size,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::INDEX,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::WILDCARD);
+  if (match) {
+    return thrust::make_tuple(true, path_ptr[1].index);
+  } else {
+    return thrust::make_tuple(false, 0);
+  }
+}
+
+/**
+ *
+ * The following commented function is the recursive version,
+ * the next function below is the rewritten (iterative) version.
+ * The recursive version is kept here for review purposes, because the
+ * rewritten version is not human friendly.
+ *
+ */
+// __device__ bool evaluate_path(json_parser& p,
+//                               json_generator& g,
+//                               write_style style,
+//                               path_instruction const* path_ptr,
+//                               int path_size)
+// {
+//   auto token = p.get_current_token();
+
+//   // case (VALUE_STRING, Nil) if style == RawStyle
+//   // case path 1
+//   if (json_token::VALUE_STRING == token && path_is_empty(path_size) &&
+//       style == write_style::raw_style) {
+//     // there is no array wildcard or slice parent, emit this string without
+//     // quotes write current string in parser to generator
+//     g.write_raw(p);
+//     return true;
+//   }
+//   // case (START_ARRAY, Nil) if style == FlattenStyle
+//   // case path 2
+//   else if (json_token::START_ARRAY == token && path_is_empty(path_size) &&
+//            style == write_style::flatten_style) {
+//     // flatten this array into the parent
+//     bool dirty = false;
+//     while (json_token::END_ARRAY != p.next_token()) {
+//       // JSON validation check
+//       if (json_token::ERROR == p.get_current_token()) { return false; }
+
+//       dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0);
+//     }
+//     return dirty;
+//   }
+//   // case (_, Nil)
+//   // case path 3
+//   else if (path_is_empty(path_size)) {
+//     // general case: just copy the child tree verbatim
+//     return g.copy_current_structure(p);
+//   }
+//   // case (START_OBJECT, Key :: xs)
+//   // case path 4
+//   else if (json_token::START_OBJECT == token &&
+//            path_match_element(path_ptr, path_size,
path_instruction_type::KEY)) { +// bool dirty = false; +// while (json_token::END_OBJECT != p.next_token()) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// if (dirty) { +// // once a match has been found we can skip other fields +// if (!p.try_skip_children()) { +// // JSON validation check +// return false; +// } +// } else { +// dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); +// } +// } +// return dirty; +// } +// // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) +// // case path 5 +// else if (json_token::START_ARRAY == token && +// path_match_elements(path_ptr, +// path_size, +// path_instruction_type::SUBSCRIPT, +// path_instruction_type::WILDCARD, +// path_instruction_type::SUBSCRIPT, +// path_instruction_type::WILDCARD)) { +// // special handling for the non-structure preserving double wildcard +// // behavior in Hive +// bool dirty = false; +// g.write_start_array(); +// while (p.next_token() != json_token::END_ARRAY) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// dirty |= path_evaluator::evaluate_path( +// p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); +// } +// g.write_end_array(); +// return dirty; +// } +// // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle +// // case path 6 +// else if (json_token::START_ARRAY == token && +// path_match_elements(path_ptr, +// path_size, +// path_instruction_type::SUBSCRIPT, +// path_instruction_type::WILDCARD) && +// style != write_style::quoted_style) { +// // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array +// write_style next_style = write_style::raw_style; +// switch (style) { +// case write_style::raw_style: next_style = write_style::quoted_style; break; +// case write_style::flatten_style: next_style = write_style::flatten_style; break; +// case write_style::quoted_style: next_style = write_style::quoted_style; // never happen +// } + +// // temporarily buffer child matches, the emitted json will need to be +// // modified slightly if there is only a single element written + +// int dirty = 0; +// // create a child generator with hide outer array tokens mode. +// auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); + +// // Note: child generator does not actually write the outer start array +// // token into buffer it only updates internal nested state +// child_g.write_start_array(); + +// while (p.next_token() != json_token::END_ARRAY) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// // track the number of array elements and only emit an outer array if +// // we've written more than one element, this matches Hive's behavior +// dirty += +// (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 
1 +// : +// 0); +// } + +// // Note: child generator does not actually write the outer end array token +// // into buffer it only updates internal nested state +// child_g.write_end_array(); + +// char* child_g_start = child_g.get_output_start_position(); +// size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] + +// if (dirty > 1) { +// // add outer array tokens +// g.write_child_raw_value(child_g_start, child_g_len, true); +// } else if (dirty == 1) { +// // remove outer array tokens +// g.write_child_raw_value(child_g_start, child_g_len, false); +// } // else do not write anything + +// return dirty > 0; +// } +// // case (START_ARRAY, Subscript :: Wildcard :: xs) +// // case path 7 +// else if (json_token::START_ARRAY == token && +// path_match_elements(path_ptr, +// path_size, +// path_instruction_type::SUBSCRIPT, +// path_instruction_type::WILDCARD)) { +// bool dirty = false; +// g.write_start_array(); +// while (p.next_token() != json_token::END_ARRAY) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// // wildcards can have multiple matches, continually update the dirty +// // count +// dirty |= path_evaluator::evaluate_path( +// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); +// } +// g.write_end_array(); + +// return dirty; +// } +// /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ +// // case path 8 +// else if (json_token::START_ARRAY == token && +// thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) +// { +// int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, +// path_size)); p.next_token(); +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// int i = idx; +// while (i >= 0) { +// if (p.get_current_token() == json_token::END_ARRAY) { +// // terminate, nothing has been written +// return false; +// } +// if 
(0 == i) { +// bool dirty = path_evaluator::evaluate_path( +// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); +// while (p.next_token() != json_token::END_ARRAY) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// // advance the token stream to the end of the array +// if (!p.try_skip_children()) { return false; } +// } +// return dirty; +// } else { +// // i > 0 +// if (!p.try_skip_children()) { return false; } + +// p.next_token(); +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } +// } +// --i; +// } +// // path parser guarantees idx >= 0 +// // will never reach to here +// return false; +// } +// // case (START_ARRAY, Subscript :: Index(idx) :: xs) +// // case path 9 +// else if (json_token::START_ARRAY == token && +// thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { +// int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); +// p.next_token(); +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// int i = idx; +// while (i >= 0) { +// if (p.get_current_token() == json_token::END_ARRAY) { +// // terminate, nothing has been written +// return false; +// } +// if (0 == i) { +// bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); +// while (p.next_token() != json_token::END_ARRAY) { +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } + +// // advance the token stream to the end of the array +// if (!p.try_skip_children()) { return false; } +// } +// return dirty; +// } else { +// // i > 0 +// if (!p.try_skip_children()) { return false; } + +// p.next_token(); +// // JSON validation check +// if (json_token::ERROR == p.get_current_token()) { return false; } +// } +// --i; +// } +// // path parser guarantees idx >= 0 +// // will never reach to here +// return false; +// } +// // 
case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name
+//   // case path 10
+//   else if (json_token::FIELD_NAME == token &&
+//            thrust::get<0>(path_match_named(path_ptr, path_size)) &&
+//            p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) {
+//     if (p.next_token() != json_token::VALUE_NULL) {
+//       // JSON validation check
+//       if (json_token::ERROR == p.get_current_token()) { return false; }
+
+//       return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1);
+//     } else {
+//       return false;
+//     }
+//   }
+//   // case (FIELD_NAME, Wildcard :: xs)
+//   // case path 11
+//   else if (json_token::FIELD_NAME == token &&
+//            path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) {
+//     p.next_token();
+//     // JSON validation check
+//     if (json_token::ERROR == p.get_current_token()) { return false; }
+
+//     return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1);
+//   }
+//   // case _ =>
+//   // case path 12
+//   else {
+//     if (!p.try_skip_children()) { return false; }
+//     return false;
+//   }
+// }
+
+/**
+ * Iterative rewrite of the commented recursive function above: recursion is
+ * replaced by an explicit stack of `context` tasks. A task that needs to
+ * evaluate a child pushes a new context; when the child is done it is popped
+ * and its result (dirty count, generator state) is merged into the parent
+ * according to the case path that created it.
+ *
+ * @param p JSON parser; its current token is the first token to evaluate
+ * @param root_g generator receiving the output; only its output length is
+ *   updated, and only when the function runs to completion (not on the early
+ *   `return false` exits)
+ * @param root_style write style for the root task
+ * @param root_path_ptr path instructions to evaluate
+ * @param root_path_size number of path instructions
+ * @return true if the root task wrote any output, false otherwise (including
+ *   invalid JSON)
+ *
+ * NOTE(review): the early `return false` exits for END_ARRAY (case paths 8/9)
+ * and VALUE_NULL (case path 10) abort the whole evaluation, whereas in the
+ * recursive version they only fail the current sub-evaluation - confirm this
+ * difference is intended.
+ */
+__device__ bool evaluate_path(json_parser& p,
+                              json_generator& root_g,
+                              write_style root_style,
+                              path_instruction const* root_path_ptr,
+                              int root_path_size)
+{
+  // manually maintained context stack in lieu of calling evaluate_path recursively.
+  struct context {
+    // current token
+    json_token token;
+
+    // which case path that this task is from
+    int case_path;
+
+    // used to save current generator
+    json_generator g;
+
+    write_style style;
+    path_instruction const* path_ptr;
+    int path_size;
+
+    // is this context task is done
+    bool task_is_done;
+
+    // whether written output
+    // if dirty > 0, indicates success
+    int dirty;
+
+    // for some case paths
+    bool is_first_enter;
+
+    // used to save child JSON generator for case path 6
+    json_generator child_g;
+  };
+
+  // path max depth limitation
+  // There is a same constant in JSONUtil.java, keep them consistent when changing
+  // Note: Spark-Rapids should guarantee the path depth is less or equal to this limit
+  constexpr int max_path_depth = 16;
+
+  // define stack; plus 1 indicates root context task needs an extra memory
+  context stack[max_path_depth + 1];
+  int stack_pos = 0;
+
+  // set when a push would exceed the stack capacity; checked by the main loop
+  bool stack_overflow = false;
+
+  // push context function
+  auto push_context = [&stack, &stack_pos, &stack_overflow](json_token _token,
+                                                            int _case_path,
+                                                            json_generator _g,
+                                                            write_style _style,
+                                                            path_instruction const* _path_ptr,
+                                                            int _path_size) {
+    // The path length is limited by the caller, but case path 2 (flatten)
+    // pushes one context per nested array level of the input document, so the
+    // depth is not bounded by the path length alone. Refuse to write past the
+    // end of the stack; the main loop then fails the evaluation.
+    if (stack_pos > max_path_depth) {
+      stack_overflow = true;
+      return;
+    }
+
+    auto& ctx          = stack[stack_pos];
+    ctx.token          = _token;
+    ctx.case_path      = _case_path;
+    ctx.g              = _g;
+    ctx.style          = _style;
+    ctx.path_ptr       = _path_ptr;
+    ctx.path_size      = _path_size;
+    ctx.task_is_done   = false;
+    ctx.dirty          = 0;
+    ctx.is_first_enter = true;
+
+    stack_pos++;
+  };
+
+  // put the first context task
+  push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size);
+
+  while (stack_pos > 0) {
+    // a child task could not be pushed: give up instead of corrupting memory
+    if (stack_overflow) { return false; }
+
+    auto& ctx = stack[stack_pos - 1];
+    if (!ctx.task_is_done) {
+      // task is not done.
+
+      // case (VALUE_STRING, Nil) if style == RawStyle
+      // case path 1
+      if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) &&
+          ctx.style == write_style::raw_style) {
+        // there is no array wildcard or slice parent, emit this string without
+        // quotes write current string in parser to generator
+        ctx.g.write_raw(p);
+        ctx.dirty        = 1;
+        ctx.task_is_done = true;
+      }
+      // case (START_ARRAY, Nil) if style == FlattenStyle
+      // case path 2
+      else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) &&
+               ctx.style == write_style::flatten_style) {
+        // flatten this array into the parent
+        if (json_token::END_ARRAY != p.next_token()) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+          // push back task
+          // add child task
+          push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0);
+        } else {
+          // END_ARRAY
+          ctx.task_is_done = true;
+        }
+      }
+      // case (_, Nil)
+      // case path 3
+      else if (path_is_empty(ctx.path_size)) {
+        // general case: just copy the child tree verbatim
+        if (!(ctx.g.copy_current_structure(p))) {
+          // JSON validation check
+          return false;
+        }
+        ctx.dirty        = 1;
+        ctx.task_is_done = true;
+      }
+      // case (START_OBJECT, Key :: xs)
+      // case path 4
+      else if (json_token::START_OBJECT == ctx.token &&
+               path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) {
+        if (json_token::END_OBJECT != p.next_token()) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+
+          if (ctx.dirty > 0) {
+            // once a match has been found we can skip other fields
+            if (!p.try_skip_children()) {
+              // JSON validation check
+              return false;
+            }
+          } else {
+            // need to try more children
+            push_context(
+              p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1);
+          }
+        } else {
+          ctx.task_is_done = true;
+        }
+      }
+      // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs)
+      // case path 5
+      else if (json_token::START_ARRAY == ctx.token &&
+               path_match_elements(ctx.path_ptr,
+                                   ctx.path_size,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::WILDCARD,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::WILDCARD)) {
+        // special handling for the non-structure preserving double wildcard
+        // behavior in Hive
+        if (ctx.is_first_enter) {
+          ctx.is_first_enter = false;
+          ctx.g.write_start_array();
+        }
+
+        if (p.next_token() != json_token::END_ARRAY) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+          push_context(p.get_current_token(),
+                       5,
+                       ctx.g,
+                       write_style::flatten_style,
+                       ctx.path_ptr + 4,
+                       ctx.path_size - 4);
+        } else {
+          ctx.g.write_end_array();
+          ctx.task_is_done = true;
+        }
+      }
+      // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle
+      // case path 6
+      else if (json_token::START_ARRAY == ctx.token &&
+               path_match_elements(ctx.path_ptr,
+                                   ctx.path_size,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::WILDCARD) &&
+               ctx.style != write_style::quoted_style) {
+        // retain Flatten, otherwise use Quoted... cannot use Raw within an array
+        write_style next_style = write_style::raw_style;
+        switch (ctx.style) {
+          case write_style::raw_style: next_style = write_style::quoted_style; break;
+          case write_style::flatten_style: next_style = write_style::flatten_style; break;
+          case write_style::quoted_style: next_style = write_style::quoted_style;  // never happen
+        }
+
+        // temporarily buffer child matches, the emitted json will need to be
+        // modified slightly if there is only a single element written
+
+        json_generator child_g;
+        if (ctx.is_first_enter) {
+          ctx.is_first_enter = false;
+          // create a child generator with hide outer array tokens mode.
+          child_g = ctx.g.new_child_generator();
+          // write first [ without output, without update len, only update internal state
+          child_g.write_first_start_array_without_output();
+        } else {
+          // ctx.child_g was stored when the previous child task was popped
+          child_g = ctx.child_g;
+        }
+
+        if (p.next_token() != json_token::END_ARRAY) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+          // track the number of array elements and only emit an outer array if
+          // we've written more than one element, this matches Hive's behavior
+          push_context(
+            p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2);
+        } else {
+          char* child_g_start = child_g.get_output_start_position();
+          size_t child_g_len  = child_g.get_output_len();
+
+          if (ctx.dirty > 1) {
+            // add outer array tokens
+            ctx.g.write_child_raw_value(
+              child_g_start, child_g_len, /* write_outer_array_tokens */ true);
+          } else if (ctx.dirty == 1) {
+            // remove outer array tokens
+            ctx.g.write_child_raw_value(
+              child_g_start, child_g_len, /* write_outer_array_tokens */ false);
+          }  // else do not write anything
+
+          // END_ARRAY has been consumed, so the task is complete whether or
+          // not any child matched. Leaving it open when dirty == 0 would
+          // re-enter this branch and advance the parser past the array.
+          ctx.task_is_done = true;
+        }
+      }
+      // case (START_ARRAY, Subscript :: Wildcard :: xs)
+      // case path 7
+      else if (json_token::START_ARRAY == ctx.token &&
+               path_match_elements(ctx.path_ptr,
+                                   ctx.path_size,
+                                   path_instruction_type::SUBSCRIPT,
+                                   path_instruction_type::WILDCARD)) {
+        if (ctx.is_first_enter) {
+          ctx.is_first_enter = false;
+          ctx.g.write_start_array();
+        }
+
+        if (p.next_token() != json_token::END_ARRAY) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+
+          // wildcards can have multiple matches, continually update the dirty
+          // count
+          push_context(p.get_current_token(),
+                       7,
+                       ctx.g,
+                       write_style::quoted_style,
+                       ctx.path_ptr + 2,
+                       ctx.path_size - 2);
+        } else {
+          ctx.g.write_end_array();
+          ctx.task_is_done = true;
+        }
+      }
+      /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */
+      // case path 8
+      else if (json_token::START_ARRAY == ctx.token &&
+               thrust::get<0>(
+                 path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) {
+        int idx = thrust::get<1>(
+          path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size));
+
+        p.next_token();
+        // JSON validation check
+        if (json_token::ERROR == p.get_current_token()) { return false; }
+        ctx.is_first_enter = false;
+
+        // skip the first idx elements
+        int i = idx;
+        while (i > 0) {
+          if (p.get_current_token() == json_token::END_ARRAY) {
+            // terminate, nothing has been written
+            return false;
+          }
+
+          if (!p.try_skip_children()) { return false; }
+
+          p.next_token();
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+
+          --i;
+        }
+
+        // i == 0
+        // the recursive version also checks END_ARRAY at this point: the array
+        // has no element at idx (e.g. empty array with idx == 0)
+        if (p.get_current_token() == json_token::END_ARRAY) { return false; }
+
+        push_context(p.get_current_token(),
+                     8,
+                     ctx.g,
+                     write_style::quoted_style,
+                     ctx.path_ptr + 2,
+                     ctx.path_size - 2);
+      }
+      // case (START_ARRAY, Subscript :: Index(idx) :: xs)
+      // case path 9
+      else if (json_token::START_ARRAY == ctx.token &&
+               thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) {
+        int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size));
+
+        p.next_token();
+        // JSON validation check
+        if (json_token::ERROR == p.get_current_token()) { return false; }
+
+        // skip the first idx elements
+        int i = idx;
+        while (i > 0) {
+          if (p.get_current_token() == json_token::END_ARRAY) {
+            // terminate, nothing has been written
+            return false;
+          }
+
+          if (!p.try_skip_children()) { return false; }
+
+          p.next_token();
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+
+          --i;
+        }
+
+        // i == 0
+        // the recursive version also checks END_ARRAY at this point: the array
+        // has no element at idx (e.g. empty array with idx == 0)
+        if (p.get_current_token() == json_token::END_ARRAY) { return false; }
+
+        push_context(
+          p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2);
+      }
+      // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name
+      // case path 10
+      else if (json_token::FIELD_NAME == ctx.token &&
+               thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) &&
+               p.match_current_field_name(
+                 thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) {
+        if (p.next_token() != json_token::VALUE_NULL) {
+          // JSON validation check
+          if (json_token::ERROR == p.get_current_token()) { return false; }
+          push_context(
+            p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1);
+        } else {
+          return false;
+        }
+      }
+      // case (FIELD_NAME, Wildcard :: xs)
+      // case path 11
+      else if (json_token::FIELD_NAME == ctx.token &&
+               path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) {
+        p.next_token();
+        // JSON validation check
+        if (json_token::ERROR == p.get_current_token()) { return false; }
+        push_context(
+          p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1);
+      }
+      // case _ =>
+      // case path 12
+      else {
+        if (!p.try_skip_children()) { return false; }
+        // default case path, return false for this task
+        ctx.dirty        = 0;
+        ctx.task_is_done = true;
+      }
+    } else {
+      // current context is done.
+
+      // pop current top context
+      // (ctx still refers to the popped slot, which stays valid until the next push)
+      stack_pos--;
+
+      // pop parent task
+      // update parent task info according to current task result
+      if (stack_pos > 0) {
+        // peek parent context task
+        auto& p_ctx = stack[stack_pos - 1];
+
+        // case (VALUE_STRING, Nil) if style == RawStyle
+        // case path 1
+        if (1 == ctx.case_path) {
+          // never happen
+        }
+        // path 2: case (START_ARRAY, Nil) if style == FlattenStyle
+        // path 5: case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs)
+        // path 7: case (START_ARRAY, Subscript :: Wildcard :: xs)
+        else if (2 == ctx.case_path || 5 == ctx.case_path || 7 == ctx.case_path) {
+          // collect result from child task
+          p_ctx.dirty += ctx.dirty;
+          // copy generator states to parent task;
+          p_ctx.g = ctx.g;
+        }
+        // case (START_OBJECT, Key :: xs)
+        // case path 4
+        else if (4 == ctx.case_path) {
+          if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; }
+          // copy generator states to parent task;
+          p_ctx.g = ctx.g;
+        }
+        // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle
+        // case path 6
+        else if (6 == ctx.case_path) {
+          // collect result from child task
+          p_ctx.dirty += ctx.dirty;
+          // update child generator for parent task
+          p_ctx.child_g = ctx.g;
+        }
+        /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */
+        // case path 8
+        // case (START_ARRAY, Subscript :: Index(idx) :: xs)
+        // case path 9
+        else if (8 == ctx.case_path || 9 == ctx.case_path) {
+          // collect result from child task
+          p_ctx.dirty += ctx.dirty;
+
+          // post logic:
+          while (p.next_token() != json_token::END_ARRAY) {
+            // JSON validation check
+            if (json_token::ERROR == p.get_current_token()) { return false; }
+            // advance the token stream to the end of the array
+            if (!p.try_skip_children()) { return false; }
+          }
+          // task is done
+          p_ctx.task_is_done = true;
+          // copy generator states to parent task;
+          p_ctx.g = ctx.g;
+        }
+        // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name
+        // case path 10
+        else if (10 == ctx.case_path) {
+          // collect result from child task
+          p_ctx.dirty += ctx.dirty;
+          // task is done
+          p_ctx.task_is_done = true;
+          // copy generator states to parent task;
+          p_ctx.g = ctx.g;
+        }
+        // case (FIELD_NAME, Wildcard :: xs)
+        // case path 11
+        else if (11 == ctx.case_path) {
+          // collect result from child task
+          p_ctx.dirty += ctx.dirty;
+          // task is done
+          p_ctx.task_is_done = true;
+          // copy generator states to parent task;
+          p_ctx.g = ctx.g;
+        }
+        // case path 3: case (_, Nil)
+        // case path 12: case _ =>
+        // others
+        else {
+          // never happen
+        }
+      } else {
+        // has no parent task, stack is empty, will exit
+      }
+    }
+  }
+
+  // copy output len
+  root_g.set_output_len(stack[0].g.get_output_len());
+  return stack[0].dirty > 0;
+}
+
 rmm::device_uvector construct_path_commands(
   std::vector> const& instructions,
   cudf::string_scalar const& all_names_scalar,
@@ -92,28 +1146,6 @@ rmm::device_uvector construct_path_commands(
   return
cudf::detail::make_device_uvector_sync(path_commands, stream, mr); } -/** - * @brief Parse a single json string using the provided command buffer - * - * @param j_parser The incoming json string and associated parser - * @param path_ptr The command buffer to be applied to the string. - * @param path_size Command buffer size - * @param output Buffer used to store the results of the query - * @returns A result code indicating success/fail/empty. - */ -__device__ inline bool parse_json_path(json_parser<>& j_parser, - path_instruction const* path_ptr, - size_t path_size, - json_generator<>& output) -{ - j_parser.next_token(); - // JSON validation check - if (json_token::ERROR == j_parser.get_current_token()) { return false; } - - return path_evaluator::evaluate_path( - j_parser, output, write_style::raw_style, path_ptr, path_size); -} - /** * @brief Parse a single json string using the provided command buffer * @@ -127,7 +1159,7 @@ __device__ inline bool parse_json_path(json_parser<>& j_parser, * @param out_buf_size Size of the output buffer * @returns A pair containing the result code and the output buffer. */ -__device__ thrust::pair> get_json_object_single( +__device__ thrust::pair get_json_object_single( char const* input, cudf::size_type input_len, path_instruction const* path_commands_ptr, @@ -135,35 +1167,28 @@ __device__ thrust::pair> get_json_object_single( char* out_buf, size_t out_buf_size) { - char* actual_output; - if (nullptr == out_buf) { - // First step: preprocess sizes - actual_output = out_buf; - } else { - // Second step: writes output - // if output buf size is zero, pass in nullptr to avoid generator writing trash output - actual_output = (0 == out_buf_size) ? 
nullptr : out_buf; - } - json_parser j_parser(input, input_len); - json_generator generator(actual_output); + j_parser.next_token(); + // JSON validation check + if (json_token::ERROR == j_parser.get_current_token()) { return {false, 0}; } - if (!out_buf) { - // First step: preprocess sizes - bool success = parse_json_path(j_parser, path_commands_ptr, path_commands_size, generator); + // First pass: preprocess sizes. + // Second pass: writes output. + // The generator automatically determines which pass based on `out_buf`. + // If `out_buf_size` is zero, pass in `nullptr` to avoid generator writing trash output. + json_generator generator((out_buf == nullptr || out_buf_size == 0) ? nullptr : out_buf); - if (!success) { - // generator may contain trash output, e.g.: generator writes some output, - // then JSON format is invalid, the previous output becomes trash. - // set output as zero to tell second step - generator.set_output_len_zero(); - } - return {success, std::move(generator)}; - } else { - // Second step: writes output - bool success = parse_json_path(j_parser, path_commands_ptr, path_commands_size, generator); - return {success, std::move(generator)}; + bool const success = evaluate_path( + j_parser, generator, write_style::raw_style, path_commands_ptr, path_commands_size); + + if (nullptr == out_buf && !success) { + // generator may contain trash output, e.g.: generator writes some output, + // then JSON format is invalid, the previous output becomes trash. + // set output as zero to tell second step + generator.set_output_len_zero(); } + + return {success, generator.get_output_len()}; } /** @@ -202,22 +1227,24 @@ __launch_bounds__(block_size) CUDF_KERNEL while (tid < col.size()) { bool is_valid = false; cudf::string_view const str = col.element(tid); - cudf::size_type output_size = 0; if (str.size_bytes() > 0) { char* dst = out_buf != nullptr ? out_buf + output_offsets[tid] : nullptr; size_t const dst_size = out_buf != nullptr ? 
output_offsets[tid + 1] - output_offsets[tid] : 0; // process one single row - auto [result, out] = get_json_object_single( + auto [result, output_size] = get_json_object_single( str.data(), str.size_bytes(), path_commands_ptr, path_commands_size, dst, dst_size); - output_size = out.get_output_len(); if (result) { is_valid = true; } - } - // filled in only during the precompute step. during the compute step, the - // offsets are fed back in so we do -not- want to write them out - if (out_buf == nullptr) { d_sizes[tid] = output_size; } + // filled in only during the precompute step. during the compute step, the + // offsets are fed back in so we do -not- want to write them out + if (out_buf == nullptr) { d_sizes[tid] = static_cast(output_size); } + } else { + // valid JSON length is always greater than 0 + // if `str` size len is zero, output len is 0 and `is_valid` is false + if (out_buf == nullptr) { d_sizes[tid] = 0; } + } // validity filled in only during the output step if (out_validity != nullptr) { @@ -305,16 +1332,11 @@ std::unique_ptr get_json_object( static_cast(validity.data()), d_valid_count.data()); - auto result = make_strings_column(input.size(), - std::move(offsets), - chars.release(), - input.size() - d_valid_count.value(stream), - std::move(validity)); - // unmatched array query may result in unsanitized '[' value in the result - if (auto const result_cv = result->view(); cudf::detail::has_nonempty_nulls(result_cv, stream)) { - result = cudf::detail::purge_nonempty_nulls(result_cv, stream, mr); - } - return result; + return make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.size() - d_valid_count.value(stream), + std::move(validity)); } } // namespace detail diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index 0522cabee5..cf1f0c3470 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -14,7 +14,9 @@ * limitations under the License. 
*/ -#include "json_parser.hpp" +#pragma once + +#include "json_parser.cuh" #include #include @@ -35,1146 +37,6 @@ namespace spark_rapids_jni { */ enum class path_instruction_type { SUBSCRIPT, WILDCARD, KEY, INDEX, NAMED }; -namespace detail { - -/** - * write JSON style - */ -enum class write_style { raw_style, quoted_style, flatten_style }; - -/** - * path instruction - */ -struct path_instruction { - __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} - - path_instruction_type type; - - // used when type is named type - cudf::string_view name; - - // used when type is index - int index{-1}; -}; - -/** - * JSON generator is used to write out JSON content. - * Because of get_json_object only outputs JSON object as a whole item, - * it's no need to store internal state for JSON object when outputing, - * only need to store internal state for JSON array. - */ -template -class json_generator { - public: - __device__ json_generator(char* _output) : output(_output), output_len(0) {} - __device__ json_generator() : output(nullptr), output_len(0) {} - - __device__ json_generator<>& operator=(json_generator<> const& other) - { - this->output = other.output; - this->output_len = other.output_len; - this->array_depth = other.array_depth; - for (size_t i = 0; i < max_json_nesting_depth; i++) { - this->is_first_item[i] = other.is_first_item[i]; - } - - return *this; - } - - // create a nested child generator based on this parent generator, - // child generator is a view, parent and child share the same byte array - __device__ json_generator new_child_generator() - { - if (nullptr == output) { - return json_generator(); - } else { - return json_generator(output + output_len); - } - } - - // write [ - // add an extra comma if needed, - // e.g.: when JSON content is: [[1,2,3] - // writing a new [ should result: [[1,2,3],[ - __device__ void write_start_array() - { - try_write_comma(); - - // update internal state - if (array_depth > 0) { 
is_first_item[array_depth - 1] = false; } - - if (output) { *(output + output_len) = '['; } - - output_len++; - is_first_item[array_depth] = true; - array_depth++; - } - - // write ] - __device__ void write_end_array() - { - if (output) { *(output + output_len) = ']'; } - output_len++; - array_depth--; - } - - // write first start array without output, only update internal state - __device__ void write_first_start_array_without_output() - { - // hide the outer start array token - // Note: do not inc output_len - is_first_item[array_depth] = true; - array_depth++; - } - - // return true if it's in a array context and it's not writing the first item. - __device__ bool need_comma() { return (array_depth > 0 && !is_first_item[array_depth - 1]); } - - /** - * write comma accroding to current generator state - */ - __device__ void try_write_comma() - { - if (need_comma()) { - // in array context and writes first item - if (output) { *(output + output_len) = ','; } - output_len++; - } - } - - /** - * copy current structure when parsing. If current token is start - * object/array, then copy to corresponding matched end object/array. 
return - * false if JSON format is invalid return true if JSON format is valid - */ - __device__ bool copy_current_structure(json_parser<>& parser) - { - // first try add comma - try_write_comma(); - - if (array_depth > 0) { is_first_item[array_depth - 1] = false; } - - if (nullptr != output) { - auto copy_to = output + output_len; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } else { - char* copy_to = nullptr; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } - } - - /** - * Get current text from JSON parser and then write the text - * Note: Because JSON strings contains '\' to do escape, - * JSON parser should do unescape to remove '\' and JSON parser - * then can not return a pointer and length pair (char *, len), - * For number token, JSON parser can return a pair (char *, len) - */ - __device__ void write_raw(json_parser<>& parser) - { - if (array_depth > 0) { is_first_item[array_depth - 1] = false; } - - if (nullptr != output) { - auto copied = parser.write_unescaped_text(output + output_len); - output_len += copied; - } else { - auto len = parser.compute_unescaped_len(); - output_len += len; - } - } - - /** - * write child raw value - * e.g.: - * - * write_outer_array_tokens = false - * need_comma = true - * [1,2,3]1,2,3 - * ^ - * | - * child pointer - * ==>> - * [1,2,3],1,2,3 - * - * - * write_outer_array_tokens = true - * need_comma = true - * [12,3,4 - * ^ - * | - * child pointer - * ==>> - * [1,[2,3,4] - * - * For more information about param write_outer_array_tokens, refer to - * `write_first_start_array_without_output` - * @param child_block_begin - * @param child_block_len - * @param write_outer_array_tokens whether write outer array tokens for child block - */ - __device__ void write_child_raw_value(char* child_block_begin, - size_t child_block_len, - bool write_outer_array_tokens) - { - bool insert_comma = need_comma(); - - 
is_first_item[array_depth - 1] = false; - - if (nullptr != output) { - if (write_outer_array_tokens) { - if (insert_comma) { - *(child_block_begin + child_block_len + 2) = ']'; - move_forward(child_block_begin, child_block_len, 2); - *(child_block_begin + 1) = '['; - *(child_block_begin) = ','; - } else { - *(child_block_begin + child_block_len + 1) = ']'; - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = '['; - } - } else { - if (insert_comma) { - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = ','; - } else { - // do not need comma && do not need write outer array tokens - // do nothing, because child generator buff is directly after the - // parent generator - } - } - } - - // update length - if (insert_comma) { output_len++; } - if (write_outer_array_tokens) { output_len += 2; } - output_len += child_block_len; - } - - // move memory block forward by specified bytes - // e.g.: memory is: 1 2 0 0, begin is 1, len is 2, after moving, - // memory is: 1 2 1 2. - // e.g.: memory is: 1 2 0 0, begin is 1, len is 1, after moving, - // memory is: 1 1 2 0. - // Note: should move from end to begin to avoid overwrite buffer - __device__ void move_forward(char* begin, size_t len, int forward) - { - char* pos = begin + len + forward - 1; - char* e = begin + forward - 1; - while (pos > e) { - *pos = *(pos - forward); - pos--; - } - } - - __device__ void reset() { output_len = 0; } - - __device__ inline size_t get_output_len() const { return output_len; } - __device__ inline char* get_output_start_position() const { return output; } - __device__ inline char* get_current_output_position() const { return output + output_len; } - - /** - * generator may contain trash output, e.g.: generator writes some output, - * then JSON format is invalid, the previous output becomes trash. 
- */ - __device__ inline void set_output_len_zero() { output_len = 0; } - - __device__ inline void set_output_len(size_t len) { output_len = len; } - - private: - char* output; - size_t output_len; - - bool is_first_item[max_json_nesting_depth]; - int array_depth = 0; -}; - -/** - * path evaluator which can run on both CPU and GPU - */ -struct path_evaluator { - static __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } - - static __device__ inline bool path_match_element(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0) - { - if (path_size < 1) { return false; } - return path_ptr[0].type == path_type0; - } - - static __device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1) - { - if (path_size < 2) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; - } - - static __device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1, - path_instruction_type path_type2, - path_instruction_type path_type3) - { - if (path_size < 4) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && - path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; - } - - static __device__ inline thrust::tuple path_match_subscript_index( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_elements( - path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX); - if (match) { - return thrust::make_tuple(true, path_ptr[1].index); - } else { - return thrust::make_tuple(false, 0); - } - } - - static __device__ inline thrust::tuple path_match_named( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_element(path_ptr, path_size, 
path_instruction_type::NAMED); - if (match) { - return thrust::make_tuple(true, path_ptr[0].name); - } else { - return thrust::make_tuple(false, cudf::string_view()); - } - } - - static __device__ inline thrust::tuple path_match_subscript_index_subscript_wildcard( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_elements(path_ptr, - path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::INDEX, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD); - if (match) { - return thrust::make_tuple(true, path_ptr[1].index); - } else { - return thrust::make_tuple(false, 0); - } - } - - /** - * - * The following commented function is recursive version, - * The next function below is the rewritten version, - * Keep version here is for review purpuse, because rewritten version(iterative) - * is not human friendly. - * - */ - // static __device__ bool evaluate_path(json_parser<>& p, - // json_generator<>& g, - // write_style style, - // path_instruction const* path_ptr, - // int path_size) - // { - // auto token = p.get_current_token(); - - // // case (VALUE_STRING, Nil) if style == RawStyle - // // case path 1 - // if (json_token::VALUE_STRING == token && path_is_empty(path_size) && - // style == write_style::raw_style) { - // // there is no array wildcard or slice parent, emit this string without - // // quotes write current string in parser to generator - // g.write_raw(p); - // return true; - // } - // // case (START_ARRAY, Nil) if style == FlattenStyle - // // case path 2 - // else if (json_token::START_ARRAY == token && path_is_empty(path_size) && - // style == write_style::flatten_style) { - // // flatten this array into the parent - // bool dirty = false; - // while (json_token::END_ARRAY != p.next_token()) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0); - // } - // return 
dirty; - // } - // // case (_, Nil) - // // case path 3 - // else if (path_is_empty(path_size)) { - // // general case: just copy the child tree verbatim - // return g.copy_current_structure(p); - // } - // // case (START_OBJECT, Key :: xs) - // // case path 4 - // else if (json_token::START_OBJECT == token && - // path_match_element(path_ptr, path_size, path_instruction_type::KEY)) { - // bool dirty = false; - // while (json_token::END_OBJECT != p.next_token()) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // if (dirty) { - // // once a match has been found we can skip other fields - // if (!p.try_skip_children()) { - // // JSON validation check - // return false; - // } - // } else { - // dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } - // } - // return dirty; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // // case path 5 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD)) { - // // special handling for the non-structure preserving double wildcard - // // behavior in Hive - // bool dirty = false; - // g.write_start_array(); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // dirty |= path_evaluator::evaluate_path( - // p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); - // } - // g.write_end_array(); - // return dirty; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // // case path 6 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // 
path_instruction_type::WILDCARD) && - // style != write_style::quoted_style) { - // // retain Flatten, otherwise use Quoted... cannot use Raw within an array - // write_style next_style = write_style::raw_style; - // switch (style) { - // case write_style::raw_style: next_style = write_style::quoted_style; break; - // case write_style::flatten_style: next_style = write_style::flatten_style; break; - // case write_style::quoted_style: next_style = write_style::quoted_style; // never happen - // } - - // // temporarily buffer child matches, the emitted json will need to be - // // modified slightly if there is only a single element written - - // int dirty = 0; - // // create a child generator with hide outer array tokens mode. - // auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); - - // // Note: child generator does not actually write the outer start array - // // token into buffer it only updates internal nested state - // child_g.write_start_array(); - - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // track the number of array elements and only emit an outer array if - // // we've written more than one element, this matches Hive's behavior - // dirty += - // (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 
1 - // : - // 0); - // } - - // // Note: child generator does not actually write the outer end array token - // // into buffer it only updates internal nested state - // child_g.write_end_array(); - - // char* child_g_start = child_g.get_output_start_position(); - // size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] - - // if (dirty > 1) { - // // add outer array tokens - // g.write_child_raw_value(child_g_start, child_g_len, true); - // } else if (dirty == 1) { - // // remove outer array tokens - // g.write_child_raw_value(child_g_start, child_g_len, false); - // } // else do not write anything - - // return dirty > 0; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: xs) - // // case path 7 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD)) { - // bool dirty = false; - // g.write_start_array(); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // wildcards can have multiple matches, continually update the dirty - // // count - // dirty |= path_evaluator::evaluate_path( - // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); - // } - // g.write_end_array(); - - // return dirty; - // } - // /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // // case path 8 - // else if (json_token::START_ARRAY == token && - // thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) - // { - // int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, - // path_size)); p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // int i = idx; - // while (i >= 0) { - // if (p.get_current_token() == json_token::END_ARRAY) { - // // terminate, 
nothing has been written - // return false; - // } - // if (0 == i) { - // bool dirty = path_evaluator::evaluate_path( - // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // advance the token stream to the end of the array - // if (!p.try_skip_children()) { return false; } - // } - // return dirty; - // } else { - // // i > 0 - // if (!p.try_skip_children()) { return false; } - - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - // } - // --i; - // } - // // path parser guarantees idx >= 0 - // // will never reach to here - // return false; - // } - // // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // // case path 9 - // else if (json_token::START_ARRAY == token && - // thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { - // int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // int i = idx; - // while (i >= 0) { - // if (p.get_current_token() == json_token::END_ARRAY) { - // // terminate, nothing has been written - // return false; - // } - // if (0 == i) { - // bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // advance the token stream to the end of the array - // if (!p.try_skip_children()) { return false; } - // } - // return dirty; - // } else { - // // i > 0 - // if (!p.try_skip_children()) { return false; } - - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - // } - // --i; 
- // } - // // path parser guarantees idx >= 0 - // // will never reach to here - // return false; - // } - // // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // // case path 10 - // else if (json_token::FIELD_NAME == token && - // thrust::get<0>(path_match_named(path_ptr, path_size)) && - // p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) { - // if (p.next_token() != json_token::VALUE_NULL) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } else { - // return false; - // } - // } - // // case (FIELD_NAME, Wildcard :: xs) - // // case path 11 - // else if (json_token::FIELD_NAME == token && - // path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) { - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } - // // case _ => - // // case path 12 - // else { - // if (!p.try_skip_children()) { return false; } - // return false; - // } - // } - - /** - * - * This function is rewritten from above commented recursive function. - * this function is equivalent to the above commented recursive function. - */ - static __device__ bool evaluate_path(json_parser<>& p, - json_generator<>& root_g, - write_style root_style, - path_instruction const* root_path_ptr, - int root_path_size) - { - // manually maintained context stack in lieu of calling evaluate_path recursively. 
- struct context { - // current token - json_token token; - - // which case path that this task is from - int case_path; - - // used to save current generator - json_generator<> g; - - write_style style; - path_instruction const* path_ptr; - int path_size; - - // is this context task is done - bool task_is_done = false; - - // whether written output - // if dirty > 0, indicates success - int dirty = 0; - - // for some case paths - bool is_first_enter = true; - - // used to save child JSON generator for case path 8 - json_generator<> child_g; - - __device__ context() - : token(json_token::INIT), - case_path(-1), - g(json_generator<>()), - style(write_style::raw_style), - path_ptr(nullptr), - path_size(0) - { - } - - __device__ context(json_token _token, - int _case_path, - json_generator<> _g, - write_style _style, - path_instruction const* _path_ptr, - int _path_size) - : token(_token), - case_path(_case_path), - g(_g), - style(_style), - path_ptr(_path_ptr), - path_size(_path_size) - { - } - - __device__ context& operator=(context const&) = default; - }; - - // path max depth limitation - constexpr int max_path_depth = 32; - - // stack - context stack[max_path_depth]; - int stack_pos = 0; - - // push context function - auto push_context = [&stack, &stack_pos](json_token _token, - int _case_path, - json_generator<> _g, - write_style _style, - path_instruction const* _path_ptr, - int _path_size) { - if (stack_pos == max_path_depth - 1) { return false; } - stack[stack_pos++] = context(_token, _case_path, _g, _style, _path_ptr, _path_size); - return true; - }; - - // push context function - auto push_ctx = [&stack, &stack_pos](context ctx) { - if (stack_pos == max_path_depth - 1) { return false; } - stack[stack_pos++] = ctx; - return true; - }; - - // pop context function - auto pop_context = [&stack, &stack_pos](context& c) { - if (stack_pos > 0) { - c = stack[--stack_pos]; - return true; - } - return false; - }; - - // put the first context task - 
push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size); - - // current context task - context ctx; - - // parent context task - context p_ctx; - - while (pop_context(ctx)) { - if (!ctx.task_is_done) { - // task is not done. - - // case (VALUE_STRING, Nil) if style == RawStyle - // case path 1 - if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::raw_style) { - // there is no array wildcard or slice parent, emit this string without - // quotes write current string in parser to generator - ctx.g.write_raw(p); - ctx.dirty = 1; - ctx.task_is_done = true; - push_ctx(ctx); - } - // case (START_ARRAY, Nil) if style == FlattenStyle - // case path 2 - else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::flatten_style) { - // flatten this array into the parent - if (json_token::END_ARRAY != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - // push back task - push_ctx(ctx); - // add child task - push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0); - } else { - // END_ARRAY - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (_, Nil) - // case path 3 - else if (path_is_empty(ctx.path_size)) { - // general case: just copy the child tree verbatim - if (!(ctx.g.copy_current_structure(p))) { - // JSON validation check - return false; - } - ctx.dirty = 1; - ctx.task_is_done = true; - push_ctx(ctx); - } - // case (START_OBJECT, Key :: xs) - // case path 4 - else if (json_token::START_OBJECT == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) { - if (json_token::END_OBJECT != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - if (ctx.dirty > 0) { - // once a match has been found we can skip other fields - if (!p.try_skip_children()) { - // 
JSON validation check - return false; - } - push_ctx(ctx); - } else { - // need to try more children - push_ctx(ctx); - push_context( - p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } - } else { - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // case path 5 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { - // special handling for the non-structure preserving double wildcard - // behavior in Hive - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - ctx.g.write_start_array(); - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context(p.get_current_token(), - 5, - ctx.g, - write_style::flatten_style, - ctx.path_ptr + 4, - ctx.path_size - 4); - } else { - ctx.g.write_end_array(); - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD) && - ctx.style != write_style::quoted_style) { - // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array - write_style next_style = write_style::raw_style; - switch (ctx.style) { - case write_style::raw_style: next_style = write_style::quoted_style; break; - case write_style::flatten_style: next_style = write_style::flatten_style; break; - case write_style::quoted_style: next_style = write_style::quoted_style; // never happen - } - - // temporarily buffer child matches, the emitted json will need to be - // modified slightly if there is only a single element written - - json_generator<> child_g; - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - // create a child generator with hide outer array tokens mode. - child_g = ctx.g.new_child_generator(); - // write first [ without output, without update len, only update internal state - child_g.write_first_start_array_without_output(); - } else { - child_g = ctx.child_g; - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - push_ctx(ctx); - // track the number of array elements and only emit an outer array if - // we've written more than one element, this matches Hive's behavior - push_context( - p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2); - } else { - char* child_g_start = child_g.get_output_start_position(); - size_t child_g_len = child_g.get_output_len(); - - if (ctx.dirty > 1) { - // add outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ true); - ctx.task_is_done = true; - push_ctx(ctx); - } else if (ctx.dirty == 1) { - // remove outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ false); - ctx.task_is_done = true; - push_ctx(ctx); - } // else do not write anything - } - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) - // case path 7 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, 
- ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - ctx.g.write_start_array(); - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - // wildcards can have multiple matches, continually update the dirty - // count - push_ctx(ctx); - push_context(p.get_current_token(), - 7, - ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); - } else { - ctx.g.write_end_array(); - ctx.task_is_done = true; - push_ctx(ctx); - } - } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // case path 8 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - ctx.is_first_enter = false; - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return false; - } - - if (!p.try_skip_children()) { return false; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - --i; - } - - // i == 0 - push_ctx(ctx); - push_context(p.get_current_token(), - 8, - ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); - } - // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // case path 9 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == 
p.get_current_token()) { return false; } - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return false; - } - - if (!p.try_skip_children()) { return false; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - --i; - } - - // i == 0 - push_ctx(ctx); - push_context( - p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (json_token::FIELD_NAME == ctx.token && - thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) && - p.match_current_field_name( - thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) { - if (p.next_token() != json_token::VALUE_NULL) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context( - p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } else { - return false; - } - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (json_token::FIELD_NAME == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context( - p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } - // case _ => - // case path 12 - else { - if (!p.try_skip_children()) { return false; } - // default case path, return false for this task - ctx.dirty = 0; - ctx.task_is_done = true; - push_ctx(ctx); - } - } else { - // current context is done. 
- - // pop parent task - // update parent task info according to current task result - if (pop_context(p_ctx)) { - // case (VALUE_STRING, Nil) if style == RawStyle - // case path 1 - if (1 == ctx.case_path) { - // never happen - } - // case (START_ARRAY, Nil) if style == FlattenStyle - // case path 2 - else if (2 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (_, Nil) - // case path 3 - else if (3 == ctx.case_path) { - // never happen - } - // case (START_OBJECT, Key :: xs) - // case path 4 - else if (4 == ctx.case_path) { - if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; } - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // case path 5 - else if (5 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (6 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // update child generator for parent task - p_ctx.child_g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) - // case path 7 - else if (7 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // case path 8 - // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // case path 9 - else if (8 == ctx.case_path || 9 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - - // post logic: - while (p.next_token() != json_token::END_ARRAY) { - // 
JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - // advance the token stream to the end of the array - if (!p.try_skip_children()) { return false; } - } - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (10 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (11 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case _ => - // case path 12 - else { - // never happen - } - } else { - // has no parent task, stack is empty, will exit - } - } - } - - // copy output len - root_g.set_output_len(ctx.g.get_output_len()); - return ctx.dirty > 0; - } -}; - -} // namespace detail - /** * Extracts json object from a json string based on json path specified, and * returns json string of the extracted json object. 
It will return null if the diff --git a/src/main/cpp/src/json_parser.hpp b/src/main/cpp/src/json_parser.cuh similarity index 91% rename from src/main/cpp/src/json_parser.hpp rename to src/main/cpp/src/json_parser.cuh index 65c735d1ff..ec0790aa6b 100644 --- a/src/main/cpp/src/json_parser.hpp +++ b/src/main/cpp/src/json_parser.cuh @@ -16,6 +16,7 @@ #pragma once #include "ftos_converter.cuh" +#include "string_to_float_cudf.cuh" #include #include @@ -40,7 +41,7 @@ enum class write_style { // allow single quotes to represent strings in JSON // e.g.: {'k': 'v'} is valid when it's true -constexpr bool curr_allow_single_quotes = true; +constexpr bool allow_single_quotes = true; // Whether allow unescaped control characters in JSON Strings. // Unescaped control characters are ASCII characters with value less than 32, @@ -50,17 +51,20 @@ constexpr bool curr_allow_single_quotes = true; // e.g., how to represent carriage return and newline characters: // if true, allow "\n\r" two control characters without escape directly // if false, "\n\r" are not allowed, should use escape characters: "\\n\\r" -constexpr bool curr_allow_unescaped_control_chars = true; +constexpr bool allow_unescaped_control_chars = true; -// deep JSON nesting depth will consume more memory, we can tuning this in -// future. we ever run into a limit of 254, here use a small value 64. -constexpr int curr_max_json_nesting_depth = 64; +/** + * @brief Maximum JSON nesting depth + * JSON with a greater depth is invalid + * If set this to be a greater value, should update `context_stack` + */ +constexpr int max_json_nesting_depth = 64; // Define the maximum JSON String length, counts utf8 bytes. // By default, maximum JSON String length is negative one, means no // limitation. e.g.: The length of String "\\n" is 1, JSON parser does not // count escape characters. 
-constexpr int curr_max_string_utf8_bytes = 20000000; +constexpr int max_string_utf8_bytes = 20000000; // /** @@ -73,7 +77,7 @@ constexpr int curr_max_string_utf8_bytes = 20000000; * e.g.: The length of number -123.45e-67 is 7. if maximum JSON number length * is 6, then this number is a invalid number. */ -constexpr int curr_max_num_len = 1000; +constexpr int max_num_len = 1000; /** * whether allow tailing useless sub-string in JSON. @@ -82,7 +86,7 @@ constexpr int curr_max_num_len = 1000; * 'v'} is valid. * {'k' : 'v'}_extra_tail_sub_string */ -constexpr bool curr_allow_tailing_sub_string = true; +constexpr bool allow_tailing_sub_string = true; /** * JSON token enum @@ -175,12 +179,6 @@ enum class json_token { * range: [0, 32) * */ -template class json_parser { public: __device__ inline json_parser(char const* const _json_start_pos, cudf::size_type const _json_len) @@ -191,6 +189,37 @@ class json_parser { } private: + /** + * @brief get the bit value for specified bit from a int64 number + */ + __device__ inline bool get_bit_value(int64_t number, int bitIndex) + { + // Shift the number right by the bitIndex to bring the desired bit to the rightmost position + long shifted = number >> bitIndex; + + // Extract the rightmost bit by performing a bitwise AND with 1 + bool bit_value = shifted & 1; + + return bit_value; + } + + /** + * @brief set the bit value for specified bit to a int64 number + */ + __device__ inline void set_bit_value(int64_t& number, int bit_index, bool bit_value) + { + // Create a mask with a 1 at the desired bit index + long mask = 1L << bit_index; + + if (bit_value) { + // Set the bit to 1 by performing a bitwise OR with the mask + number |= mask; + } else { + // Set the bit to 0 by performing a bitwise AND with the complement of the mask + number &= ~mask; + } + } + /** * is current position EOF */ @@ -258,8 +287,9 @@ class json_parser { */ __device__ inline void push_context(json_token token) { - bool v = json_token::START_OBJECT == token ? 
true : false; - context_stack[stack_size++] = v; + bool v = json_token::START_OBJECT == token ? true : false; + set_bit_value(context_stack, stack_size, v); + stack_size++; } /** @@ -267,7 +297,10 @@ class json_parser { * true is object, false is array * only has two contexts: object or array */ - __device__ inline bool is_object_context() { return context_stack[stack_size - 1]; } + __device__ inline bool is_object_context() + { + return get_bit_value(context_stack, stack_size - 1); + } /** * pop top context from stack @@ -1041,24 +1074,16 @@ class json_parser { */ __device__ inline void parse_number() { - // reset the float parts - float_integer_len = 0; - float_fraction_len = 0; - float_exp_len = 0; - float_exp_has_sign = false; - // parse sign - if (try_skip(curr_pos, '-')) { - float_sign = false; - } else { - float_sign = true; - } - float_integer_pos = curr_pos; + try_skip(curr_pos, '-'); // parse unsigned number bool is_float = false; - if (try_unsigned_number(is_float)) { - if (check_max_num_len()) { + // store number digits length + // e.g.: +1.23e-45 length is 5 + int number_digits_length = 0; + if (try_unsigned_number(is_float, number_digits_length)) { + if (check_max_num_len(number_digits_length)) { curr_token = (is_float ? 
json_token::VALUE_NUMBER_FLOAT : json_token::VALUE_NUMBER_INT); // success parsed a number, update the token length number_token_len = curr_pos - current_token_start_pos; @@ -1071,21 +1096,16 @@ class json_parser { } /** - * verify max number length if enabled - * e.g.: -1.23e-456, int len is 1, fraction len is 2, exp digits len is 3 + * verify max number digits length if enabled + * e.g.: +1.23e-45 length is 5 */ - __device__ inline bool check_max_num_len() + __device__ inline bool check_max_num_len(int number_digits_length) { - // exp part contains + or - sign char, do not count the exp sign - int exp_digit_len = float_exp_len; - if (float_exp_len > 0 && float_exp_has_sign) { exp_digit_len--; } - - int sum_len = float_integer_len + float_fraction_len + exp_digit_len; return // disabled num len check max_num_len <= 0 || // enabled num len check - (max_num_len > 0 && sum_len <= max_num_len); + (max_num_len > 0 && number_digits_length <= max_num_len); } /** @@ -1106,20 +1126,20 @@ class json_parser { * * @param[out] is_float, if contains `.` or `e`, set true */ - __device__ inline bool try_unsigned_number(bool& is_float) + __device__ inline bool try_unsigned_number(bool& is_float, int& number_digits_length) { if (!eof(curr_pos)) { char c = *curr_pos; if (c >= '1' && c <= '9') { curr_pos++; - float_integer_len++; + number_digits_length++; // first digit is [1-9] // path: INT = [1-9] [0-9]* - float_integer_len += skip_zero_or_more_digits(); - return parse_number_from_fraction(is_float); + number_digits_length += skip_zero_or_more_digits(); + return parse_number_from_fraction(is_float, number_digits_length); } else if (c == '0') { curr_pos++; - float_integer_len++; + number_digits_length++; // check leading zeros if (!eof(curr_pos)) { @@ -1132,7 +1152,7 @@ class json_parser { // first digit is [0] // path: INT = '0' - return parse_number_from_fraction(is_float); + return parse_number_from_fraction(is_float, number_digits_length); } else { // first digit is non [0-9] 
return false; @@ -1147,22 +1167,21 @@ class json_parser { * parse: ('.' [0-9]+)? EXP? * @param[is_float] is float */ - __device__ inline bool parse_number_from_fraction(bool& is_float) + __device__ inline bool parse_number_from_fraction(bool& is_float, int& number_digits_length) { // parse fraction if (try_skip(curr_pos, '.')) { // has fraction - float_fraction_pos = curr_pos; - is_float = true; + is_float = true; // try pattern: [0-9]+ - if (!try_skip_one_or_more_digits(float_fraction_len)) { return false; } + if (!try_skip_one_or_more_digits(number_digits_length)) { return false; } } // parse exp if (!eof(curr_pos) && (*curr_pos == 'e' || *curr_pos == 'E')) { curr_pos++; is_float = true; - return try_parse_exp(); + return try_parse_exp(number_digits_length); } return true; @@ -1192,12 +1211,12 @@ class json_parser { * try skip one or more [0-9] * @param[out] len: skipped num of digits */ - __device__ inline bool try_skip_one_or_more_digits(int& len) + __device__ inline bool try_skip_one_or_more_digits(int& number_digits_length) { if (!eof(curr_pos) && is_digit(*curr_pos)) { curr_pos++; - len++; - len += skip_zero_or_more_digits(); + number_digits_length++; + number_digits_length += skip_zero_or_more_digits(); return true; } else { return false; @@ -1208,21 +1227,15 @@ class json_parser { * parse [eE][+-]?[0-9]+ * @param[out] exp_len exp len */ - __device__ inline bool try_parse_exp() + __device__ inline bool try_parse_exp(int& number_digits_length) { // already parsed [eE] - float_exp_pos = curr_pos; - // parse [+-]? 
- if (!eof(curr_pos) && (*curr_pos == '+' || *curr_pos == '-')) { - float_exp_len++; - curr_pos++; - float_exp_has_sign = true; - } + if (!eof(curr_pos) && (*curr_pos == '+' || *curr_pos == '-')) { curr_pos++; } // parse [0-9]+ - return try_skip_one_or_more_digits(float_exp_len); + return try_skip_one_or_more_digits(number_digits_length); } // =========== Parse number end =========== @@ -1500,12 +1513,12 @@ class json_parser { return number_token_len; case json_token::VALUE_NUMBER_FLOAT: { // number normalization: - // 0.03E-2 => 0.3E-5; infinity; - // 200.000 => 200.0, 351.980 => 351.98, 12345678900000000000.0 - // => 1.23456789E19 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 - // => 3.0E-4 leverage function: `get_current_float_parts` - double d_value = - cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + // 0.03E-2 => 0.3E-5, 200.000 => 200.0, 351.980 => 351.98, + // 12345678900000000000.0 => 1.23456789E19, 1E308 => 1.0E308 + // 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 => 3.0E-4 + // 1.0E309 => "Infinity", -1E309 => "-Infinity" + double d_value = spark_rapids_jni::detail::stod( + cudf::string_view(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1589,8 +1602,9 @@ class json_parser { } return number_token_len; case json_token::VALUE_NUMBER_FLOAT: { - double d_value = - cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + // number normalization: + double d_value = spark_rapids_jni::detail::stod( + cudf::string_view(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1654,21 +1668,6 @@ class json_parser { stack_size = 0; } - /** - * get float parts, current token should be VALUE_NUMBER_FLOAT. 
- */ - __device__ thrust::tuple - get_current_float_parts() - { - return thrust::make_tuple(float_sign, - float_integer_pos, - float_integer_len, - float_fraction_pos, - float_fraction_len, - float_exp_pos, - float_exp_len); - } - /** * match field name string when current token is FIELD_NAME, * return true if current token is FIELD_NAME and match successfully. @@ -1789,44 +1788,27 @@ class json_parser { char const* curr_pos; json_token curr_token{json_token::INIT}; - // saves the nested contexts: JSON object context or JSON array context - // true is JSON object context; false is JSON array context - // When encounter EOF and this stack is non-empty, means non-closed JSON - // object/array, then parsing will fail. - bool context_stack[max_json_nesting_depth]; + // 64 bits long saves the nested object/array contexts + // true(bit value 1) is JSON object context + // false(bit value 0) is JSON array context + // JSON parser checks array/object are mached, e.g.: [1,2) are wrong + int64_t context_stack; int stack_size = 0; - // save current token start pos, used by coping current row text + // save current token start pos, used by coping current token text char const* current_token_start_pos; - // used to copy int/float string verbatim, note: int/float have no escape - // chars + // used to store number token length cudf::size_type number_token_len; - // The following variables record number token informations. - // if current token is int/float, use the following variables to save - // float parts e.g.: -123.000456E-000789, sign is false; integer part is 123; - // fraction part is 000456; exp part is -000789. 
The following parts is used - // by normalization, e.g.: 0.001 => 1E-3 - bool float_sign; - char const* float_integer_pos; - int float_integer_len; - char const* float_fraction_pos; - int float_fraction_len; - char const* float_exp_pos; - int float_exp_len; - // true indicates has '-' or '+' in the exp part; - // the exp sign char is not counted when checking the max number length - bool float_exp_has_sign; - // Records string/field name token utf8 bytes size after unescaped - // e.g.: For JSON string "\\n", after unescaped, it ues 1 byte '\n' - // used by `write_unescaped_text` and `write_escaped_text` bytes + // e.g.: For JSON 4 chars string "\\n", after unescaped, get 1 char '\n' // used by checking the max string length int string_token_utf8_bytes; - // Records bytes diff for escape writing - // e.g.: "\\n" string_token_utf8_bytes is 1, - // when `write_escaped_text` bytes is 4: " \ n " - // this diff will be 4 - 1 = 3; + + // Records bytes diff between escape writing and unescape writing + // e.g.: 4 chars string "\\n", string_token_utf8_bytes is 1, + // when `write_escaped_text`, will write out 4 chars: " \ n ", + // then this diff will be 4 - 1 = 3 int bytes_diff_for_escape_writing; }; diff --git a/src/main/cpp/src/string_to_float_cudf.cuh b/src/main/cpp/src/string_to_float_cudf.cuh new file mode 100644 index 0000000000..5a7824d495 --- /dev/null +++ b/src/main/cpp/src/string_to_float_cudf.cuh @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace spark_rapids_jni { +namespace detail { + +/** + * @brief This function converts the given string into a + * floating point double value. + * + * This will also map strings containing "NaN", "Inf", etc. + * to the appropriate float values. + * + * This function will also handle scientific notation format. + * + * This function is a copy of cudf::strings::detail::stod with + * the namespace changed to spark_rapids_jni::detail and fixed + * an overflow bug of `exp_ten`. It is a short-term solution to + * resolve a bug in get_json_object. We should remove this file + * once the bug is fixed in cudf in long term. + * This diff is `if (exp_ten >= 1e8) break;` + */ +__device__ inline double stod(cudf::string_view const& d_str) +{ + char const* in_ptr = d_str.data(); + char const* end = in_ptr + d_str.size_bytes(); + if (end == in_ptr) return 0.0; + double sign{1.0}; + if (*in_ptr == '-' || *in_ptr == '+') { + sign = (*in_ptr == '-' ? 
-1 : 1); + ++in_ptr; + } + + constexpr double infinity = std::numeric_limits::infinity(); + constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; + + // special strings: NaN, Inf + if ((in_ptr < end) && *in_ptr > '9') { + auto const inf_nan = cudf::string_view(in_ptr, static_cast(end - in_ptr)); + if (cudf::strings::detail::is_nan_str(inf_nan)) { return nan(""); } + if (cudf::strings::detail::is_inf_str(inf_nan)) { return sign * infinity; } + } + + // Parse and store the mantissa as much as we can, + // until we are about to exceed the limit of uint64_t + uint64_t digits = 0; + int exp_off = 0; + bool decimal = false; + while (in_ptr < end) { + char ch = *in_ptr; + if (ch == '.') { + decimal = true; + ++in_ptr; + continue; + } + if (ch < '0' || ch > '9') break; + if (digits > max_holding) + exp_off += (int)!decimal; + else { + digits = (digits * 10L) + static_cast(ch - '0'); + if (digits > max_holding) { + digits = digits / 10L; + exp_off += (int)!decimal; + } else + exp_off -= (int)decimal; + } + ++in_ptr; + } + if (digits == 0) { return sign * static_cast(0); } + + // check for exponent char + int exp_ten = 0; + int exp_sign = 1; + if (in_ptr < end) { + char ch = *in_ptr++; + if (ch == 'e' || ch == 'E') { + if (in_ptr < end) { + ch = *in_ptr; + if (ch == '-' || ch == '+') { + exp_sign = (ch == '-' ? -1 : 1); + ++in_ptr; + } + while (in_ptr < end) { + ch = *in_ptr++; + if (ch < '0' || ch > '9') break; + exp_ten = (exp_ten * 10) + (int)(ch - '0'); + if (exp_ten >= 1e8) break; + } + } + } + } + + int const num_digits = static_cast(log10(static_cast(digits))) + 1; + exp_ten *= exp_sign; + exp_ten += exp_off; + exp_ten += num_digits - 1; + if (exp_ten > std::numeric_limits::max_exponent10) { + return sign > 0 ? 
infinity : -infinity; + } + + double base = sign * static_cast(digits); + + exp_ten += 1 - num_digits; + // If 10^exp_ten would result in a subnormal value, the base and + // exponent should be adjusted so that 10^exp_ten is a normal value + auto const subnormal_shift = std::numeric_limits::min_exponent10 - exp_ten; + if (subnormal_shift > 0) { + // Handle subnormal values. Ensure that both base and exponent are + // normal values before computing their product. + base = base / exp10(static_cast(num_digits - 1 + subnormal_shift)); + exp_ten += num_digits - 1; // adjust exponent + auto const exponent = exp10(static_cast(exp_ten + subnormal_shift)); + return base * exponent; + } + + double const exponent = exp10(static_cast(std::abs(exp_ten))); + return exp_ten < 0 ? base / exponent : base * exponent; +} + +} // namespace detail +} // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index bd034651b7..4ff9c91a3f 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -23,6 +23,9 @@ public class JSONUtils { NativeDepsLoader.loadNativeDeps(); } + // Keep the same with `max_path_depth` in `get_json_object.cu' + public static final int MAX_PATH_DEPTH = 16; + public enum PathInstructionType { SUBSCRIPT, WILDCARD, diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index a5a519363b..ea23c4c9ba 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -96,13 +96,11 @@ void getJsonObjectTest4() { keyPath(), namedPath("k5"), keyPath(), namedPath("k6"), keyPath(), namedPath("k7"), - keyPath(), namedPath("k8"), - keyPath(), namedPath("k9"), - keyPath(), namedPath("k10") + keyPath(), namedPath("k8") }; - String 
JSON = "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":{\"k9\":{\"k10\":\"v10\"}}}}}}}}}}"; - String expectedStr = "v10"; + String JSON = "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":\"v8\"}}}}}}}}"; + String expectedStr = "v8"; try ( ColumnVector jsonCv = ColumnVector.fromStrings( JSON, JSON, JSON, JSON, JSON, JSON, JSON); @@ -202,6 +200,8 @@ void getJsonObjectTest_Number_Normalization() { "[-0.0]", "[-0]", "[12345678999999999999999999]", + "[9.299999257686047e-0005603333574677677]", + "9.299999257686047e0005603333574677677", "[1E308]", "[1.0E309,-1E309,1E5000]", "0.3", @@ -216,6 +216,8 @@ void getJsonObjectTest_Number_Normalization() { "[-0.0]", "[0]", "[12345678999999999999999999]", + "[0.0]", + "\"Infinity\"", "[1.0E308]", "[\"Infinity\",\"-Infinity\",\"Infinity\"]", "0.3", diff --git a/thirdparty/cudf b/thirdparty/cudf index e6cfd4503a..578c240a20 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e6cfd4503af063d3bba28954ab7ec67dbbb44e71 +Subproject commit 578c240a20049a5c4b83b08f54c235aad5318f1a diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index fd9b77a7c7..aee5915dc4 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -096ae3c0a6b2c593f8fdb38468be527027bf79d7 +69f5222465ec3c8c54f107fcf8750f040034e156 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index cf97df89ae..4261acab1a 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "745b1847f56c8f4b0c4e094f93837c2a91e18318", + "git_tag" : "5e69e97c42504c17a333a36e1796dce4b83150a4", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.04" }, @@ -131,7 +131,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "0651edf0fce5ebf53528382b475fc29a2f3afa67", + 
"git_tag" : "e14a2291301ce9c8ef76b2b2404eb02336584724", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.04" },