From 89b05dc1d2107b8b066b5ebc2c95a5b4dff59bb3 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Fri, 22 Mar 2024 17:54:07 +0800 Subject: [PATCH 001/124] Init version 24.06.0-SNAPSHOT Bump up JNI version to 24.06.0-SNAPSHOT Signed-off-by: Tim Liu --- .gitmodules | 2 +- CONTRIBUTING.md | 2 +- pom.xml | 2 +- src/main/cpp/CMakeLists.txt | 2 +- thirdparty/cudf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitmodules b/.gitmodules index 103a678946..12b07c5b18 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "thirdparty/cudf"] path = thirdparty/cudf url = https://github.com/rapidsai/cudf.git - branch = branch-24.04 + branch = branch-24.06 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 60dd78e5d8..9f5c5be5c0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -160,7 +160,7 @@ $ ./build/build-in-docker install ... ``` Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from -[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.04/CONTRIBUTING.md#building-from-source). +[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.06/CONTRIBUTING.md#building-from-source). ```bash $ ./build/buildall diff --git a/pom.xml b/pom.xml index 8dca17cf3e..745f8127d1 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 24.04.0-SNAPSHOT + 24.06.0-SNAPSHOT jar RAPIDS Accelerator JNI for Apache Spark diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 704a5a308b..b0badf950a 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI) project( SPARK_RAPIDS_JNI - VERSION 24.04.00 + VERSION 24.06.00 LANGUAGES C CXX CUDA ) diff --git a/thirdparty/cudf b/thirdparty/cudf index 23aad9ec76..80a02c6f9a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 23aad9ec76ca0367be994a551a9b0a4838839883 +Subproject commit 80a02c6f9a6ca6a6bfc20a25553426026e0d4be4 From d696211860b9f76e5a2011fceead55632dc10b35 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 26 Mar 2024 01:54:35 +0000 Subject: [PATCH 002/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 23aad9ec76..80a02c6f9a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 23aad9ec76ca0367be994a551a9b0a4838839883 +Subproject commit 80a02c6f9a6ca6a6bfc20a25553426026e0d4be4 From 48dae597f6f2af8e59aff0229a0c6c26a63e26ec Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:03:52 +0000 Subject: [PATCH 003/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 23aad9ec76..80a02c6f9a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 23aad9ec76ca0367be994a551a9b0a4838839883 +Subproject commit 80a02c6f9a6ca6a6bfc20a25553426026e0d4be4 From 3fdc1c6b9c6e19c520f54fd858afe4bdc24cef6b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:02:12 +0800 Subject: [PATCH 004/124] 
Update submodule cudf to 933e32ab9ad8e5057282c48129ddbd745c538967 (#1900) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 80a02c6f9a..933e32ab9a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 80a02c6f9a6ca6a6bfc20a25553426026e0d4be4 +Subproject commit 933e32ab9ad8e5057282c48129ddbd745c538967 From 6d178a4c54012a864c60bbface8db009e506e8b9 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:00:17 +0000 Subject: [PATCH 005/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e3cbf62fce..933e32ab9a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e3cbf62fcef479a051d116c451e69ddaa4568b57 +Subproject commit 933e32ab9ad8e5057282c48129ddbd745c538967 From 7bce1ca901a93dd5c97f721441a5cabae78a6df9 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:12:09 +0800 Subject: [PATCH 006/124] Update submodule cudf to a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 (#1904) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 933e32ab9a..a7ceedecbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 933e32ab9ad8e5057282c48129ddbd745c538967 +Subproject commit a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 From 764f30277b1a430f1fd18304e0372badb24788e8 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:36:28 +0000 Subject: [PATCH 007/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e3cbf62fce..a7ceedecbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e3cbf62fcef479a051d116c451e69ddaa4568b57 +Subproject commit a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 From 09c94957dee33aa505e5c42b273281892ea661e4 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:40:08 +0000 Subject: [PATCH 008/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e3cbf62fce..a7ceedecbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e3cbf62fcef479a051d116c451e69ddaa4568b57 +Subproject commit a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 From 990e261d06883ad2dc035853e0a4d936a1f0fccd Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 28 Mar 2024 04:33:01 +0800 Subject: [PATCH 009/124] Update submodule cudf to 7c69e6666cec5fa444bb43005973f3bfa495575b (#1912) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a7ceedecbb..7c69e6666c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 +Subproject commit 7c69e6666cec5fa444bb43005973f3bfa495575b From 9e4650bedf459393cb1d7d0f32bdfba21b2166f5 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Mar 2024 22:00:04 +0000 Subject: [PATCH 010/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 35f818b3e4..7c69e6666c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 35f818b3e4bef8e331f083dadc9a4c45e2987a78 +Subproject commit 7c69e6666cec5fa444bb43005973f3bfa495575b From 4f1c2bb5460c0082c5c5238917cf089c257383cb Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 1 Apr 2024 20:29:44 +0800 Subject: [PATCH 011/124] Update submodule cudf to aab6137c80c50eccc5007120f7140cfe6646b5e0 (#1916) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7c69e6666c..aab6137c80 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7c69e6666cec5fa444bb43005973f3bfa495575b +Subproject commit aab6137c80c50eccc5007120f7140cfe6646b5e0 From 69e10521f6171d57092fba91f0c97affbd6b2ced Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 1 Apr 2024 18:28:18 +0000 Subject: [PATCH 012/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 35f818b3e4..aab6137c80 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 35f818b3e4bef8e331f083dadc9a4c45e2987a78 +Subproject commit aab6137c80c50eccc5007120f7140cfe6646b5e0 From 6cba28fc4abf16141327adbab670da67d3502350 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 2 Apr 2024 12:51:20 +0800 Subject: [PATCH 013/124] Move device code in get_json_object to cu or cuh (#1915) Signed-off-by: Haoyang Li --- src/main/cpp/src/get_json_object.cu | 1137 ++++++++++++++++ src/main/cpp/src/get_json_object.hpp | 1142 +---------------- .../src/{json_parser.hpp => json_parser.cuh} | 0 .../nvidia/spark/rapids/jni/JSONUtils.java | 2 + 4 files changed, 1140 insertions(+), 1141 deletions(-) rename src/main/cpp/src/{json_parser.hpp => json_parser.cuh} (100%) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index e944337861..27285eef4f 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -50,6 +50,1143 @@ namespace spark_rapids_jni { namespace detail { +/** + * write JSON style + */ +enum class write_style { raw_style, quoted_style, flatten_style }; + +/** + * path instruction + */ +struct path_instruction { + __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} + + path_instruction_type type; + + // used when type is named type + cudf::string_view name; + + // used when type is index + int index{-1}; +}; + +/** + * JSON 
generator is used to write out JSON content. + * Because get_json_object only outputs a JSON object as a whole item, + * there is no need to store internal state for JSON objects when outputting, + * only the internal state for JSON arrays is needed. + */ +template +class json_generator { + public: + __device__ json_generator(char* _output) : output(_output), output_len(0) {} + __device__ json_generator() : output(nullptr), output_len(0) {} + + __device__ json_generator<>& operator=(json_generator<> const& other) + { + this->output = other.output; + this->output_len = other.output_len; + this->array_depth = other.array_depth; + for (size_t i = 0; i < max_json_nesting_depth; i++) { + this->is_first_item[i] = other.is_first_item[i]; + } + + return *this; + } + + // create a nested child generator based on this parent generator, + // child generator is a view, parent and child share the same byte array + __device__ json_generator new_child_generator() + { + if (nullptr == output) { + return json_generator(); + } else { + return json_generator(output + output_len); + } + } + + // write [ + // add an extra comma if needed, + // e.g.: when JSON content is: [[1,2,3] + // writing a new [ should result in: [[1,2,3],[ + __device__ void write_start_array() + { + try_write_comma(); + + // update internal state + if (array_depth > 0) { is_first_item[array_depth - 1] = false; } + + if (output) { *(output + output_len) = '['; } + + output_len++; + is_first_item[array_depth] = true; + array_depth++; + } + + // write ] + __device__ void write_end_array() + { + if (output) { *(output + output_len) = ']'; } + output_len++; + array_depth--; + } + + // write first start array without output, only update internal state + __device__ void write_first_start_array_without_output() + { + // hide the outer start array token + // Note: do not inc output_len + is_first_item[array_depth] = true; + array_depth++; + } + + // return true if it's in an array context and it's not writing the first item. + __device__ bool need_comma() { return (array_depth > 0 && !is_first_item[array_depth - 1]); } + + /** + * write a comma according to the current generator state + */ + __device__ void try_write_comma() + { + if (need_comma()) { + // in an array context and not the first item, so a comma is needed + if (output) { *(output + output_len) = ','; } + output_len++; + } + } + + /** + * copy current structure when parsing. If current token is start + * object/array, then copy to the corresponding matched end object/array.
return + * false if JSON format is invalid return true if JSON format is valid + */ + __device__ bool copy_current_structure(json_parser<>& parser) + { + // first try add comma + try_write_comma(); + + if (array_depth > 0) { is_first_item[array_depth - 1] = false; } + + if (nullptr != output) { + auto copy_to = output + output_len; + auto [b, copy_len] = parser.copy_current_structure(copy_to); + output_len += copy_len; + return b; + } else { + char* copy_to = nullptr; + auto [b, copy_len] = parser.copy_current_structure(copy_to); + output_len += copy_len; + return b; + } + } + + /** + * Get current text from JSON parser and then write the text + * Note: Because JSON strings contains '\' to do escape, + * JSON parser should do unescape to remove '\' and JSON parser + * then can not return a pointer and length pair (char *, len), + * For number token, JSON parser can return a pair (char *, len) + */ + __device__ void write_raw(json_parser<>& parser) + { + if (array_depth > 0) { is_first_item[array_depth - 1] = false; } + + if (nullptr != output) { + auto copied = parser.write_unescaped_text(output + output_len); + output_len += copied; + } else { + auto len = parser.compute_unescaped_len(); + output_len += len; + } + } + + /** + * write child raw value + * e.g.: + * + * write_outer_array_tokens = false + * need_comma = true + * [1,2,3]1,2,3 + * ^ + * | + * child pointer + * ==>> + * [1,2,3],1,2,3 + * + * + * write_outer_array_tokens = true + * need_comma = true + * [12,3,4 + * ^ + * | + * child pointer + * ==>> + * [1,[2,3,4] + * + * For more information about param write_outer_array_tokens, refer to + * `write_first_start_array_without_output` + * @param child_block_begin + * @param child_block_len + * @param write_outer_array_tokens whether write outer array tokens for child block + */ + __device__ void write_child_raw_value(char* child_block_begin, + size_t child_block_len, + bool write_outer_array_tokens) + { + bool insert_comma = need_comma(); + + is_first_item[array_depth - 1] = false; + + if (nullptr != output) { + if (write_outer_array_tokens) { + if (insert_comma) { + *(child_block_begin + child_block_len + 2) = ']'; + move_forward(child_block_begin, child_block_len, 2); + *(child_block_begin + 1) = '['; + *(child_block_begin) = ','; + } else { + *(child_block_begin + child_block_len + 1) = ']'; + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = '['; + } + } else { + if (insert_comma) { + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = ','; + } else { + // do not need comma && do not need write outer array tokens + // do nothing, because child generator buff is directly after the + // parent generator + } + } + } + + // update length + if (insert_comma) { output_len++; } + if (write_outer_array_tokens) { output_len += 2; } + output_len += child_block_len; + } + + // move memory block forward by specified bytes + // e.g.: memory is: 1 2 0 0, begin is 1, len is 2, after moving, + // memory is: 1 2 1 2. + // e.g.: memory is: 1 2 0 0, begin is 1, len is 1, after moving, + // memory is: 1 1 2 0. 
+ // Note: should move from end to begin to avoid overwriting the buffer + __device__ void move_forward(char* begin, size_t len, int forward) + { + char* pos = begin + len + forward - 1; + char* e = begin + forward - 1; + while (pos > e) { + *pos = *(pos - forward); + pos--; + } + } + + __device__ void reset() { output_len = 0; } + + __device__ inline size_t get_output_len() const { return output_len; } + __device__ inline char* get_output_start_position() const { return output; } + __device__ inline char* get_current_output_position() const { return output + output_len; } + + /** + * the generator may contain trash output, e.g.: the generator writes some output, + * then the JSON format turns out to be invalid, so the previous output becomes trash. + */ + __device__ inline void set_output_len_zero() { output_len = 0; } + + __device__ inline void set_output_len(size_t len) { output_len = len; } + + private: + char* output; + size_t output_len; + + bool is_first_item[max_json_nesting_depth]; + int array_depth = 0; +}; + +/** + * path evaluator which can run on both CPU and GPU + */ +struct path_evaluator { + static __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } + + static __device__ inline bool path_match_element(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0) + { + if (path_size < 1) { return false; } + return path_ptr[0].type == path_type0; + } + + static __device__ inline bool path_match_elements(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0, + path_instruction_type path_type1) + { + if (path_size < 2) { return false; } + return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; + } + + static __device__ inline bool path_match_elements(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0, + path_instruction_type path_type1, + path_instruction_type path_type2, + path_instruction_type path_type3) + { + if (path_size < 4) { return false; } + return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && + path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; + } + + static __device__ inline thrust::tuple<bool, int> path_match_subscript_index( + path_instruction const* path_ptr, size_t path_size) + { + auto match = path_match_elements( + path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX); + if (match) { + return thrust::make_tuple(true, path_ptr[1].index); + } else { + return thrust::make_tuple(false, 0); + } + } + + static __device__ inline thrust::tuple<bool, cudf::string_view> path_match_named( + path_instruction const* path_ptr, size_t path_size) + { + auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED); + if (match) { + return thrust::make_tuple(true, path_ptr[0].name); + } else { + return thrust::make_tuple(false, cudf::string_view()); + } + } + + static __device__ inline thrust::tuple<bool, int> path_match_subscript_index_subscript_wildcard( + path_instruction const* path_ptr, size_t path_size) + { + auto match = path_match_elements(path_ptr, + path_size, + path_instruction_type::SUBSCRIPT, + path_instruction_type::INDEX, + path_instruction_type::SUBSCRIPT, + path_instruction_type::WILDCARD); + if (match) { + return thrust::make_tuple(true, path_ptr[1].index); + } else { + return thrust::make_tuple(false, 0); + } + } + + /** + * + * The following commented-out function is the recursive version, + * The next function below is the rewritten version, + * It is kept here for review purposes, because the rewritten
version(iterative) + * is not human friendly. + * + */ + // static __device__ bool evaluate_path(json_parser<>& p, + // json_generator<>& g, + // write_style style, + // path_instruction const* path_ptr, + // int path_size) + // { + // auto token = p.get_current_token(); + + // // case (VALUE_STRING, Nil) if style == RawStyle + // // case path 1 + // if (json_token::VALUE_STRING == token && path_is_empty(path_size) && + // style == write_style::raw_style) { + // // there is no array wildcard or slice parent, emit this string without + // // quotes write current string in parser to generator + // g.write_raw(p); + // return true; + // } + // // case (START_ARRAY, Nil) if style == FlattenStyle + // // case path 2 + // else if (json_token::START_ARRAY == token && path_is_empty(path_size) && + // style == write_style::flatten_style) { + // // flatten this array into the parent + // bool dirty = false; + // while (json_token::END_ARRAY != p.next_token()) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0); + // } + // return dirty; + // } + // // case (_, Nil) + // // case path 3 + // else if (path_is_empty(path_size)) { + // // general case: just copy the child tree verbatim + // return g.copy_current_structure(p); + // } + // // case (START_OBJECT, Key :: xs) + // // case path 4 + // else if (json_token::START_OBJECT == token && + // path_match_element(path_ptr, path_size, path_instruction_type::KEY)) { + // bool dirty = false; + // while (json_token::END_OBJECT != p.next_token()) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // if (dirty) { + // // once a match has been found we can skip other fields + // if (!p.try_skip_children()) { + // // JSON validation check + // return false; + // } + // } else { + // dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); + // } + // } + // return dirty; + // } + // // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) + // // case path 5 + // else if (json_token::START_ARRAY == token && + // path_match_elements(path_ptr, + // path_size, + // path_instruction_type::SUBSCRIPT, + // path_instruction_type::WILDCARD, + // path_instruction_type::SUBSCRIPT, + // path_instruction_type::WILDCARD)) { + // // special handling for the non-structure preserving double wildcard + // // behavior in Hive + // bool dirty = false; + // g.write_start_array(); + // while (p.next_token() != json_token::END_ARRAY) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // dirty |= path_evaluator::evaluate_path( + // p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); + // } + // g.write_end_array(); + // return dirty; + // } + // // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // // case path 6 + // else if (json_token::START_ARRAY == token && + // path_match_elements(path_ptr, + // path_size, + // path_instruction_type::SUBSCRIPT, + // path_instruction_type::WILDCARD) && + // style != write_style::quoted_style) { + // // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array + // write_style next_style = write_style::raw_style; + // switch (style) { + // case write_style::raw_style: next_style = write_style::quoted_style; break; + // case write_style::flatten_style: next_style = write_style::flatten_style; break; + // case write_style::quoted_style: next_style = write_style::quoted_style; // never happen + // } + + // // temporarily buffer child matches, the emitted json will need to be + // // modified slightly if there is only a single element written + + // int dirty = 0; + // // create a child generator with hide outer array tokens mode. + // auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); + + // // Note: child generator does not actually write the outer start array + // // token into buffer it only updates internal nested state + // child_g.write_start_array(); + + // while (p.next_token() != json_token::END_ARRAY) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // // track the number of array elements and only emit an outer array if + // // we've written more than one element, this matches Hive's behavior + // dirty += + // (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 1 + // : + // 0); + // } + + // // Note: child generator does not actually write the outer end array token + // // into buffer it only updates internal nested state + // child_g.write_end_array(); + + // char* child_g_start = child_g.get_output_start_position(); + // size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] + + // if (dirty > 1) { + // // add outer array tokens + // g.write_child_raw_value(child_g_start, child_g_len, true); + // } else if (dirty == 1) { + // // remove outer array tokens + // g.write_child_raw_value(child_g_start, child_g_len, false); + // } // else do not write anything + + // return dirty > 0; + // } + // // case (START_ARRAY, Subscript :: Wildcard :: xs) + // // case path 7 + // else if (json_token::START_ARRAY == token && + // path_match_elements(path_ptr, + // path_size, + // path_instruction_type::SUBSCRIPT, + // path_instruction_type::WILDCARD)) { + // bool dirty = false; + // g.write_start_array(); + // while (p.next_token() != json_token::END_ARRAY) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // // wildcards can have multiple matches, continually update the dirty + // // count + // dirty |= path_evaluator::evaluate_path( + // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); + // } + // g.write_end_array(); + + // return dirty; + // } + // /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + // // case path 8 + // else if (json_token::START_ARRAY == token && + // thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) + // { + // int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, + // path_size)); p.next_token(); + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // int i = idx; + // while (i >= 0) { + // if (p.get_current_token() == json_token::END_ARRAY) { + // // terminate, nothing has been written + // return false; + // } + // if (0 == i) { + // bool dirty = path_evaluator::evaluate_path( + // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); + // while (p.next_token() != json_token::END_ARRAY) { + // // JSON validation check + // if 
(json_token::ERROR == p.get_current_token()) { return false; } + + // // advance the token stream to the end of the array + // if (!p.try_skip_children()) { return false; } + // } + // return dirty; + // } else { + // // i > 0 + // if (!p.try_skip_children()) { return false; } + + // p.next_token(); + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + // } + // --i; + // } + // // path parser guarantees idx >= 0 + // // will never reach to here + // return false; + // } + // // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // // case path 9 + // else if (json_token::START_ARRAY == token && + // thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { + // int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); + // p.next_token(); + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // int i = idx; + // while (i >= 0) { + // if (p.get_current_token() == json_token::END_ARRAY) { + // // terminate, nothing has been written + // return false; + // } + // if (0 == i) { + // bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); + // while (p.next_token() != json_token::END_ARRAY) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // // advance the token stream to the end of the array + // if (!p.try_skip_children()) { return false; } + // } + // return dirty; + // } else { + // // i > 0 + // if (!p.try_skip_children()) { return false; } + + // p.next_token(); + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + // } + // --i; + // } + // // path parser guarantees idx >= 0 + // // will never reach to here + // return false; + // } + // // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name + // // case path 10 + // else if (json_token::FIELD_NAME == token && + // thrust::get<0>(path_match_named(path_ptr, path_size)) && + // p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) { + // if (p.next_token() != json_token::VALUE_NULL) { + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); + // } else { + // return false; + // } + // } + // // case (FIELD_NAME, Wildcard :: xs) + // // case path 11 + // else if (json_token::FIELD_NAME == token && + // path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) { + // p.next_token(); + // // JSON validation check + // if (json_token::ERROR == p.get_current_token()) { return false; } + + // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); + // } + // // case _ => + // // case path 12 + // else { + // if (!p.try_skip_children()) { return false; } + // return false; + // } + // } + + /** + * + * This function is rewritten from above commented recursive function. + * this function is equivalent to the above commented recursive function. + */ + static __device__ bool evaluate_path(json_parser<>& p, + json_generator<>& root_g, + write_style root_style, + path_instruction const* root_path_ptr, + int root_path_size) + { + // manually maintained context stack in lieu of calling evaluate_path recursively. 
+ struct context { + // current token + json_token token; + + // which case path that this task is from + int case_path; + + // used to save current generator + json_generator<> g; + + write_style style; + path_instruction const* path_ptr; + int path_size; + + // is this context task is done + bool task_is_done = false; + + // whether written output + // if dirty > 0, indicates success + int dirty = 0; + + // for some case paths + bool is_first_enter = true; + + // used to save child JSON generator for case path 8 + json_generator<> child_g; + + __device__ context() + : token(json_token::INIT), + case_path(-1), + g(json_generator<>()), + style(write_style::raw_style), + path_ptr(nullptr), + path_size(0) + { + } + + __device__ context(json_token _token, + int _case_path, + json_generator<> _g, + write_style _style, + path_instruction const* _path_ptr, + int _path_size) + : token(_token), + case_path(_case_path), + g(_g), + style(_style), + path_ptr(_path_ptr), + path_size(_path_size) + { + } + + __device__ context& operator=(context const&) = default; + }; + + // path max depth limitation + // There is a same constant in JSONUtil.java, keep them consistent when changing + constexpr int max_path_depth = 32; + + // stack + context stack[max_path_depth]; + int stack_pos = 0; + + // push context function + auto push_context = [&stack, &stack_pos](json_token _token, + int _case_path, + json_generator<> _g, + write_style _style, + path_instruction const* _path_ptr, + int _path_size) { + if (stack_pos == max_path_depth - 1) { return false; } + stack[stack_pos++] = context(_token, _case_path, _g, _style, _path_ptr, _path_size); + return true; + }; + + // push context function + auto push_ctx = [&stack, &stack_pos](context ctx) { + if (stack_pos == max_path_depth - 1) { return false; } + stack[stack_pos++] = ctx; + return true; + }; + + // pop context function + auto pop_context = [&stack, &stack_pos](context& c) { + if (stack_pos > 0) { + c = stack[--stack_pos]; + return true; + } + return false; + }; + + // put the first context task + push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size); + + // current context task + context ctx; + + // parent context task + context p_ctx; + + while (pop_context(ctx)) { + if (!ctx.task_is_done) { + // task is not done. 
+ + // case (VALUE_STRING, Nil) if style == RawStyle + // case path 1 + if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) && + ctx.style == write_style::raw_style) { + // there is no array wildcard or slice parent, emit this string without + // quotes write current string in parser to generator + ctx.g.write_raw(p); + ctx.dirty = 1; + ctx.task_is_done = true; + push_ctx(ctx); + } + // case (START_ARRAY, Nil) if style == FlattenStyle + // case path 2 + else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) && + ctx.style == write_style::flatten_style) { + // flatten this array into the parent + if (json_token::END_ARRAY != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + // push back task + push_ctx(ctx); + // add child task + push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0); + } else { + // END_ARRAY + ctx.task_is_done = true; + push_ctx(ctx); + } + } + // case (_, Nil) + // case path 3 + else if (path_is_empty(ctx.path_size)) { + // general case: just copy the child tree verbatim + if (!(ctx.g.copy_current_structure(p))) { + // JSON validation check + return false; + } + ctx.dirty = 1; + ctx.task_is_done = true; + push_ctx(ctx); + } + // case (START_OBJECT, Key :: xs) + // case path 4 + else if (json_token::START_OBJECT == ctx.token && + path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) { + if (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + if (ctx.dirty > 0) { + // once a match has been found we can skip other fields + if (!p.try_skip_children()) { + // JSON validation check + return false; + } + push_ctx(ctx); + } else { + // need to try more children + push_ctx(ctx); + push_context( + p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + } + } else { + ctx.task_is_done = true; + push_ctx(ctx); + } + } + // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) + // case path 5 + else if (json_token::START_ARRAY == ctx.token && + path_match_elements(ctx.path_ptr, + ctx.path_size, + path_instruction_type::SUBSCRIPT, + path_instruction_type::WILDCARD, + path_instruction_type::SUBSCRIPT, + path_instruction_type::WILDCARD)) { + // special handling for the non-structure preserving double wildcard + // behavior in Hive + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + ctx.g.write_start_array(); + } + + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + push_ctx(ctx); + push_context(p.get_current_token(), + 5, + ctx.g, + write_style::flatten_style, + ctx.path_ptr + 4, + ctx.path_size - 4); + } else { + ctx.g.write_end_array(); + ctx.task_is_done = true; + push_ctx(ctx); + } + } + // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case path 6 + else if (json_token::START_ARRAY == ctx.token && + path_match_elements(ctx.path_ptr, + ctx.path_size, + path_instruction_type::SUBSCRIPT, + path_instruction_type::WILDCARD) && + ctx.style != write_style::quoted_style) { + // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array + write_style next_style = write_style::raw_style; + switch (ctx.style) { + case write_style::raw_style: next_style = write_style::quoted_style; break; + case write_style::flatten_style: next_style = write_style::flatten_style; break; + case write_style::quoted_style: next_style = write_style::quoted_style; // never happen + } + + // temporarily buffer child matches, the emitted json will need to be + // modified slightly if there is only a single element written + + json_generator<> child_g; + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + // create a child generator with hide outer array tokens mode. + child_g = ctx.g.new_child_generator(); + // write first [ without output, without update len, only update internal state + child_g.write_first_start_array_without_output(); + } else { + child_g = ctx.child_g; + } + + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + push_ctx(ctx); + // track the number of array elements and only emit an outer array if + // we've written more than one element, this matches Hive's behavior + push_context( + p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2); + } else { + char* child_g_start = child_g.get_output_start_position(); + size_t child_g_len = child_g.get_output_len(); + + if (ctx.dirty > 1) { + // add outer array tokens + ctx.g.write_child_raw_value( + child_g_start, child_g_len, /* write_outer_array_tokens */ true); + ctx.task_is_done = true; + push_ctx(ctx); + } else if (ctx.dirty == 1) { + // remove outer array tokens + ctx.g.write_child_raw_value( + child_g_start, child_g_len, /* write_outer_array_tokens */ false); + ctx.task_is_done = true; + push_ctx(ctx); + } // else do not write anything + } + } + // case (START_ARRAY, Subscript :: Wildcard :: xs) + // case path 7 + else if (json_token::START_ARRAY == ctx.token && + path_match_elements(ctx.path_ptr, + ctx.path_size, + path_instruction_type::SUBSCRIPT, + path_instruction_type::WILDCARD)) { + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + ctx.g.write_start_array(); + } + + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // wildcards can have multiple matches, continually update the dirty + // count + push_ctx(ctx); + push_context(p.get_current_token(), + 7, + ctx.g, + write_style::quoted_style, + ctx.path_ptr + 2, + ctx.path_size - 2); + } else { + ctx.g.write_end_array(); + ctx.task_is_done = true; + push_ctx(ctx); + } + } + /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + // case path 8 + else if (json_token::START_ARRAY == ctx.token && + thrust::get<0>( + path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) { + int idx = thrust::get<1>( + path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size)); + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + ctx.is_first_enter = false; + + int i = idx; + while (i > 0) { + if (p.get_current_token() == json_token::END_ARRAY) { + // terminate, nothing has been written + return false; + } + + if (!p.try_skip_children()) { return false; } + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + --i; + } + + // i == 0 + push_ctx(ctx); + push_context(p.get_current_token(), + 8, + 
ctx.g, + write_style::quoted_style, + ctx.path_ptr + 2, + ctx.path_size - 2); + } + // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case path 9 + else if (json_token::START_ARRAY == ctx.token && + thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) { + int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size)); + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + int i = idx; + while (i > 0) { + if (p.get_current_token() == json_token::END_ARRAY) { + // terminate, nothing has been written + return false; + } + + if (!p.try_skip_children()) { return false; } + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + --i; + } + + // i == 0 + push_ctx(ctx); + push_context( + p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2); + } + // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name + // case path 10 + else if (json_token::FIELD_NAME == ctx.token && + thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) && + p.match_current_field_name( + thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) { + if (p.next_token() != json_token::VALUE_NULL) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + push_ctx(ctx); + push_context( + p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + } else { + return false; + } + } + // case (FIELD_NAME, Wildcard :: xs) + // case path 11 + else if (json_token::FIELD_NAME == ctx.token && + path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + push_ctx(ctx); + push_context( + p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + } + // case _ => + // case path 12 + else { + if (!p.try_skip_children()) { return false; } + // default case path, return false for this task + ctx.dirty = 0; + ctx.task_is_done = true; + push_ctx(ctx); + } + } else { + // current context is done. 
+ + // pop parent task + // update parent task info according to current task result + if (pop_context(p_ctx)) { + // case (VALUE_STRING, Nil) if style == RawStyle + // case path 1 + if (1 == ctx.case_path) { + // never happen + } + // case (START_ARRAY, Nil) if style == FlattenStyle + // case path 2 + else if (2 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case (_, Nil) + // case path 3 + else if (3 == ctx.case_path) { + // never happen + } + // case (START_OBJECT, Key :: xs) + // case path 4 + else if (4 == ctx.case_path) { + if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; } + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) + // case path 5 + else if (5 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case path 6 + else if (6 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // update child generator for parent task + p_ctx.child_g = ctx.g; + push_ctx(p_ctx); + } + // case (START_ARRAY, Subscript :: Wildcard :: xs) + // case path 7 + else if (7 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + // case path 8 + // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case path 9 + else if (8 == ctx.case_path || 9 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + + // post logic: + while (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + // advance the token stream to the end of the array + if (!p.try_skip_children()) { return false; } + } + // task is done + p_ctx.task_is_done = true; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name + // case path 10 + else if (10 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // task is done + p_ctx.task_is_done = true; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case (FIELD_NAME, Wildcard :: xs) + // case path 11 + else if (11 == ctx.case_path) { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // task is done + p_ctx.task_is_done = true; + // copy generator states to parent task; + p_ctx.g = ctx.g; + push_ctx(p_ctx); + } + // case _ => + // case path 12 + else { + // never happen + } + } else { + // has no parent task, stack is empty, will exit + } + } + } + + // copy output len + root_g.set_output_len(ctx.g.get_output_len()); + return ctx.dirty > 0; + } +}; + rmm::device_uvector construct_path_commands( std::vector> const& instructions, cudf::string_scalar const& all_names_scalar, diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index 0522cabee5..628d8aa749 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -14,7 +14,7 @@ * limitations under the 
License. */ -#include "json_parser.hpp" +#include "json_parser.cuh" #include #include @@ -35,1146 +35,6 @@ namespace spark_rapids_jni { */ enum class path_instruction_type { SUBSCRIPT, WILDCARD, KEY, INDEX, NAMED }; -namespace detail { - -/** - * write JSON style - */ -enum class write_style { raw_style, quoted_style, flatten_style }; - -/** - * path instruction - */ -struct path_instruction { - __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} - - path_instruction_type type; - - // used when type is named type - cudf::string_view name; - - // used when type is index - int index{-1}; -}; - -/** - * JSON generator is used to write out JSON content. - * Because of get_json_object only outputs JSON object as a whole item, - * it's no need to store internal state for JSON object when outputing, - * only need to store internal state for JSON array. - */ -template -class json_generator { - public: - __device__ json_generator(char* _output) : output(_output), output_len(0) {} - __device__ json_generator() : output(nullptr), output_len(0) {} - - __device__ json_generator<>& operator=(json_generator<> const& other) - { - this->output = other.output; - this->output_len = other.output_len; - this->array_depth = other.array_depth; - for (size_t i = 0; i < max_json_nesting_depth; i++) { - this->is_first_item[i] = other.is_first_item[i]; - } - - return *this; - } - - // create a nested child generator based on this parent generator, - // child generator is a view, parent and child share the same byte array - __device__ json_generator new_child_generator() - { - if (nullptr == output) { - return json_generator(); - } else { - return json_generator(output + output_len); - } - } - - // write [ - // add an extra comma if needed, - // e.g.: when JSON content is: [[1,2,3] - // writing a new [ should result: [[1,2,3],[ - __device__ void write_start_array() - { - try_write_comma(); - - // update internal state - if (array_depth > 0) { is_first_item[array_depth - 1] = false; } - - if (output) { *(output + output_len) = '['; } - - output_len++; - is_first_item[array_depth] = true; - array_depth++; - } - - // write ] - __device__ void write_end_array() - { - if (output) { *(output + output_len) = ']'; } - output_len++; - array_depth--; - } - - // write first start array without output, only update internal state - __device__ void write_first_start_array_without_output() - { - // hide the outer start array token - // Note: do not inc output_len - is_first_item[array_depth] = true; - array_depth++; - } - - // return true if it's in a array context and it's not writing the first item. - __device__ bool need_comma() { return (array_depth > 0 && !is_first_item[array_depth - 1]); } - - /** - * write comma accroding to current generator state - */ - __device__ void try_write_comma() - { - if (need_comma()) { - // in array context and writes first item - if (output) { *(output + output_len) = ','; } - output_len++; - } - } - - /** - * copy current structure when parsing. If current token is start - * object/array, then copy to corresponding matched end object/array. 
return - * false if JSON format is invalid return true if JSON format is valid - */ - __device__ bool copy_current_structure(json_parser<>& parser) - { - // first try add comma - try_write_comma(); - - if (array_depth > 0) { is_first_item[array_depth - 1] = false; } - - if (nullptr != output) { - auto copy_to = output + output_len; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } else { - char* copy_to = nullptr; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } - } - - /** - * Get current text from JSON parser and then write the text - * Note: Because JSON strings contains '\' to do escape, - * JSON parser should do unescape to remove '\' and JSON parser - * then can not return a pointer and length pair (char *, len), - * For number token, JSON parser can return a pair (char *, len) - */ - __device__ void write_raw(json_parser<>& parser) - { - if (array_depth > 0) { is_first_item[array_depth - 1] = false; } - - if (nullptr != output) { - auto copied = parser.write_unescaped_text(output + output_len); - output_len += copied; - } else { - auto len = parser.compute_unescaped_len(); - output_len += len; - } - } - - /** - * write child raw value - * e.g.: - * - * write_outer_array_tokens = false - * need_comma = true - * [1,2,3]1,2,3 - * ^ - * | - * child pointer - * ==>> - * [1,2,3],1,2,3 - * - * - * write_outer_array_tokens = true - * need_comma = true - * [12,3,4 - * ^ - * | - * child pointer - * ==>> - * [1,[2,3,4] - * - * For more information about param write_outer_array_tokens, refer to - * `write_first_start_array_without_output` - * @param child_block_begin - * @param child_block_len - * @param write_outer_array_tokens whether write outer array tokens for child block - */ - __device__ void write_child_raw_value(char* child_block_begin, - size_t child_block_len, - bool write_outer_array_tokens) - { - bool insert_comma = need_comma(); - - is_first_item[array_depth - 1] = false; - - if (nullptr != output) { - if (write_outer_array_tokens) { - if (insert_comma) { - *(child_block_begin + child_block_len + 2) = ']'; - move_forward(child_block_begin, child_block_len, 2); - *(child_block_begin + 1) = '['; - *(child_block_begin) = ','; - } else { - *(child_block_begin + child_block_len + 1) = ']'; - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = '['; - } - } else { - if (insert_comma) { - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = ','; - } else { - // do not need comma && do not need write outer array tokens - // do nothing, because child generator buff is directly after the - // parent generator - } - } - } - - // update length - if (insert_comma) { output_len++; } - if (write_outer_array_tokens) { output_len += 2; } - output_len += child_block_len; - } - - // move memory block forward by specified bytes - // e.g.: memory is: 1 2 0 0, begin is 1, len is 2, after moving, - // memory is: 1 2 1 2. - // e.g.: memory is: 1 2 0 0, begin is 1, len is 1, after moving, - // memory is: 1 1 2 0. 
- // Note: should move from end to begin to avoid overwrite buffer - __device__ void move_forward(char* begin, size_t len, int forward) - { - char* pos = begin + len + forward - 1; - char* e = begin + forward - 1; - while (pos > e) { - *pos = *(pos - forward); - pos--; - } - } - - __device__ void reset() { output_len = 0; } - - __device__ inline size_t get_output_len() const { return output_len; } - __device__ inline char* get_output_start_position() const { return output; } - __device__ inline char* get_current_output_position() const { return output + output_len; } - - /** - * generator may contain trash output, e.g.: generator writes some output, - * then JSON format is invalid, the previous output becomes trash. - */ - __device__ inline void set_output_len_zero() { output_len = 0; } - - __device__ inline void set_output_len(size_t len) { output_len = len; } - - private: - char* output; - size_t output_len; - - bool is_first_item[max_json_nesting_depth]; - int array_depth = 0; -}; - -/** - * path evaluator which can run on both CPU and GPU - */ -struct path_evaluator { - static __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } - - static __device__ inline bool path_match_element(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0) - { - if (path_size < 1) { return false; } - return path_ptr[0].type == path_type0; - } - - static __device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1) - { - if (path_size < 2) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; - } - - static __device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1, - path_instruction_type path_type2, - path_instruction_type path_type3) - { - if (path_size < 4) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && - path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; - } - - static __device__ inline thrust::tuple path_match_subscript_index( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_elements( - path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX); - if (match) { - return thrust::make_tuple(true, path_ptr[1].index); - } else { - return thrust::make_tuple(false, 0); - } - } - - static __device__ inline thrust::tuple path_match_named( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED); - if (match) { - return thrust::make_tuple(true, path_ptr[0].name); - } else { - return thrust::make_tuple(false, cudf::string_view()); - } - } - - static __device__ inline thrust::tuple path_match_subscript_index_subscript_wildcard( - path_instruction const* path_ptr, size_t path_size) - { - auto match = path_match_elements(path_ptr, - path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::INDEX, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD); - if (match) { - return thrust::make_tuple(true, path_ptr[1].index); - } else { - return thrust::make_tuple(false, 0); - } - } - - /** - * - * The following commented function is recursive version, - * The next function below is the rewritten version, - * Keep version here is for review purpuse, because rewritten 
version(iterative) - * is not human friendly. - * - */ - // static __device__ bool evaluate_path(json_parser<>& p, - // json_generator<>& g, - // write_style style, - // path_instruction const* path_ptr, - // int path_size) - // { - // auto token = p.get_current_token(); - - // // case (VALUE_STRING, Nil) if style == RawStyle - // // case path 1 - // if (json_token::VALUE_STRING == token && path_is_empty(path_size) && - // style == write_style::raw_style) { - // // there is no array wildcard or slice parent, emit this string without - // // quotes write current string in parser to generator - // g.write_raw(p); - // return true; - // } - // // case (START_ARRAY, Nil) if style == FlattenStyle - // // case path 2 - // else if (json_token::START_ARRAY == token && path_is_empty(path_size) && - // style == write_style::flatten_style) { - // // flatten this array into the parent - // bool dirty = false; - // while (json_token::END_ARRAY != p.next_token()) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0); - // } - // return dirty; - // } - // // case (_, Nil) - // // case path 3 - // else if (path_is_empty(path_size)) { - // // general case: just copy the child tree verbatim - // return g.copy_current_structure(p); - // } - // // case (START_OBJECT, Key :: xs) - // // case path 4 - // else if (json_token::START_OBJECT == token && - // path_match_element(path_ptr, path_size, path_instruction_type::KEY)) { - // bool dirty = false; - // while (json_token::END_OBJECT != p.next_token()) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // if (dirty) { - // // once a match has been found we can skip other fields - // if (!p.try_skip_children()) { - // // JSON validation check - // return false; - // } - // } else { - // dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } - // } - // return dirty; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // // case path 5 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD)) { - // // special handling for the non-structure preserving double wildcard - // // behavior in Hive - // bool dirty = false; - // g.write_start_array(); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // dirty |= path_evaluator::evaluate_path( - // p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); - // } - // g.write_end_array(); - // return dirty; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // // case path 6 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD) && - // style != write_style::quoted_style) { - // // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array - // write_style next_style = write_style::raw_style; - // switch (style) { - // case write_style::raw_style: next_style = write_style::quoted_style; break; - // case write_style::flatten_style: next_style = write_style::flatten_style; break; - // case write_style::quoted_style: next_style = write_style::quoted_style; // never happen - // } - - // // temporarily buffer child matches, the emitted json will need to be - // // modified slightly if there is only a single element written - - // int dirty = 0; - // // create a child generator with hide outer array tokens mode. - // auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); - - // // Note: child generator does not actually write the outer start array - // // token into buffer it only updates internal nested state - // child_g.write_start_array(); - - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // track the number of array elements and only emit an outer array if - // // we've written more than one element, this matches Hive's behavior - // dirty += - // (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 1 - // : - // 0); - // } - - // // Note: child generator does not actually write the outer end array token - // // into buffer it only updates internal nested state - // child_g.write_end_array(); - - // char* child_g_start = child_g.get_output_start_position(); - // size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] - - // if (dirty > 1) { - // // add outer array tokens - // g.write_child_raw_value(child_g_start, child_g_len, true); - // } else if (dirty == 1) { - // // remove outer array tokens - // g.write_child_raw_value(child_g_start, child_g_len, false); - // } // else do not write anything - - // return dirty > 0; - // } - // // case (START_ARRAY, Subscript :: Wildcard :: xs) - // // case path 7 - // else if (json_token::START_ARRAY == token && - // path_match_elements(path_ptr, - // path_size, - // path_instruction_type::SUBSCRIPT, - // path_instruction_type::WILDCARD)) { - // bool dirty = false; - // g.write_start_array(); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // wildcards can have multiple matches, continually update the dirty - // // count - // dirty |= path_evaluator::evaluate_path( - // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); - // } - // g.write_end_array(); - - // return dirty; - // } - // /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // // case path 8 - // else if (json_token::START_ARRAY == token && - // thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) - // { - // int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, - // path_size)); p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // int i = idx; - // while (i >= 0) { - // if (p.get_current_token() == json_token::END_ARRAY) { - // // terminate, nothing has been written - // return false; - // } - // if (0 == i) { - // bool dirty = path_evaluator::evaluate_path( - // p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if 
(json_token::ERROR == p.get_current_token()) { return false; } - - // // advance the token stream to the end of the array - // if (!p.try_skip_children()) { return false; } - // } - // return dirty; - // } else { - // // i > 0 - // if (!p.try_skip_children()) { return false; } - - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - // } - // --i; - // } - // // path parser guarantees idx >= 0 - // // will never reach to here - // return false; - // } - // // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // // case path 9 - // else if (json_token::START_ARRAY == token && - // thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { - // int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // int i = idx; - // while (i >= 0) { - // if (p.get_current_token() == json_token::END_ARRAY) { - // // terminate, nothing has been written - // return false; - // } - // if (0 == i) { - // bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); - // while (p.next_token() != json_token::END_ARRAY) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // // advance the token stream to the end of the array - // if (!p.try_skip_children()) { return false; } - // } - // return dirty; - // } else { - // // i > 0 - // if (!p.try_skip_children()) { return false; } - - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - // } - // --i; - // } - // // path parser guarantees idx >= 0 - // // will never reach to here - // return false; - // } - // // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // // case path 10 - // else if (json_token::FIELD_NAME == token && - // thrust::get<0>(path_match_named(path_ptr, path_size)) && - // p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) { - // if (p.next_token() != json_token::VALUE_NULL) { - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } else { - // return false; - // } - // } - // // case (FIELD_NAME, Wildcard :: xs) - // // case path 11 - // else if (json_token::FIELD_NAME == token && - // path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) { - // p.next_token(); - // // JSON validation check - // if (json_token::ERROR == p.get_current_token()) { return false; } - - // return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); - // } - // // case _ => - // // case path 12 - // else { - // if (!p.try_skip_children()) { return false; } - // return false; - // } - // } - - /** - * - * This function is rewritten from above commented recursive function. - * this function is equivalent to the above commented recursive function. - */ - static __device__ bool evaluate_path(json_parser<>& p, - json_generator<>& root_g, - write_style root_style, - path_instruction const* root_path_ptr, - int root_path_size) - { - // manually maintained context stack in lieu of calling evaluate_path recursively. 
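// The rewrite below relies on a standard pattern: replace the recursive calls
// with a loop over an explicit, fixed-size stack of "frames", presumably to
// avoid device-side recursion. A minimal, self-contained sketch of that
// pattern in isolation is shown here for reference only -- it is not part of
// the original source or of this patch, and the names tree_node, MAX_FRAMES,
// and sum_tree_iterative are hypothetical.

struct tree_node {
  int value;
  tree_node* left;
  tree_node* right;
};

constexpr int MAX_FRAMES = 32;  // analogous to max_path_depth in the code below

int sum_tree_iterative(tree_node* root)
{
  tree_node* stack[MAX_FRAMES];  // manually maintained stack of pending frames
  int pos   = 0;
  int total = 0;
  if (root != nullptr) { stack[pos++] = root; }
  while (pos > 0) {                 // the loop stands in for nested recursive calls
    tree_node* cur = stack[--pos];  // analogous to pop_context below
    total += cur->value;
    // push children instead of recursing; a full stack skips the child, which
    // mirrors push_context reporting failure once the maximum depth is reached
    if (cur->left != nullptr && pos < MAX_FRAMES) { stack[pos++] = cur->left; }
    if (cur->right != nullptr && pos < MAX_FRAMES) { stack[pos++] = cur->right; }
  }
  return total;
}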
- struct context { - // current token - json_token token; - - // which case path that this task is from - int case_path; - - // used to save current generator - json_generator<> g; - - write_style style; - path_instruction const* path_ptr; - int path_size; - - // is this context task is done - bool task_is_done = false; - - // whether written output - // if dirty > 0, indicates success - int dirty = 0; - - // for some case paths - bool is_first_enter = true; - - // used to save child JSON generator for case path 8 - json_generator<> child_g; - - __device__ context() - : token(json_token::INIT), - case_path(-1), - g(json_generator<>()), - style(write_style::raw_style), - path_ptr(nullptr), - path_size(0) - { - } - - __device__ context(json_token _token, - int _case_path, - json_generator<> _g, - write_style _style, - path_instruction const* _path_ptr, - int _path_size) - : token(_token), - case_path(_case_path), - g(_g), - style(_style), - path_ptr(_path_ptr), - path_size(_path_size) - { - } - - __device__ context& operator=(context const&) = default; - }; - - // path max depth limitation - constexpr int max_path_depth = 32; - - // stack - context stack[max_path_depth]; - int stack_pos = 0; - - // push context function - auto push_context = [&stack, &stack_pos](json_token _token, - int _case_path, - json_generator<> _g, - write_style _style, - path_instruction const* _path_ptr, - int _path_size) { - if (stack_pos == max_path_depth - 1) { return false; } - stack[stack_pos++] = context(_token, _case_path, _g, _style, _path_ptr, _path_size); - return true; - }; - - // push context function - auto push_ctx = [&stack, &stack_pos](context ctx) { - if (stack_pos == max_path_depth - 1) { return false; } - stack[stack_pos++] = ctx; - return true; - }; - - // pop context function - auto pop_context = [&stack, &stack_pos](context& c) { - if (stack_pos > 0) { - c = stack[--stack_pos]; - return true; - } - return false; - }; - - // put the first context task - push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size); - - // current context task - context ctx; - - // parent context task - context p_ctx; - - while (pop_context(ctx)) { - if (!ctx.task_is_done) { - // task is not done. 
- - // case (VALUE_STRING, Nil) if style == RawStyle - // case path 1 - if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::raw_style) { - // there is no array wildcard or slice parent, emit this string without - // quotes write current string in parser to generator - ctx.g.write_raw(p); - ctx.dirty = 1; - ctx.task_is_done = true; - push_ctx(ctx); - } - // case (START_ARRAY, Nil) if style == FlattenStyle - // case path 2 - else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::flatten_style) { - // flatten this array into the parent - if (json_token::END_ARRAY != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - // push back task - push_ctx(ctx); - // add child task - push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0); - } else { - // END_ARRAY - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (_, Nil) - // case path 3 - else if (path_is_empty(ctx.path_size)) { - // general case: just copy the child tree verbatim - if (!(ctx.g.copy_current_structure(p))) { - // JSON validation check - return false; - } - ctx.dirty = 1; - ctx.task_is_done = true; - push_ctx(ctx); - } - // case (START_OBJECT, Key :: xs) - // case path 4 - else if (json_token::START_OBJECT == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) { - if (json_token::END_OBJECT != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - if (ctx.dirty > 0) { - // once a match has been found we can skip other fields - if (!p.try_skip_children()) { - // JSON validation check - return false; - } - push_ctx(ctx); - } else { - // need to try more children - push_ctx(ctx); - push_context( - p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } - } else { - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // case path 5 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { - // special handling for the non-structure preserving double wildcard - // behavior in Hive - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - ctx.g.write_start_array(); - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context(p.get_current_token(), - 5, - ctx.g, - write_style::flatten_style, - ctx.path_ptr + 4, - ctx.path_size - 4); - } else { - ctx.g.write_end_array(); - ctx.task_is_done = true; - push_ctx(ctx); - } - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD) && - ctx.style != write_style::quoted_style) { - // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array - write_style next_style = write_style::raw_style; - switch (ctx.style) { - case write_style::raw_style: next_style = write_style::quoted_style; break; - case write_style::flatten_style: next_style = write_style::flatten_style; break; - case write_style::quoted_style: next_style = write_style::quoted_style; // never happen - } - - // temporarily buffer child matches, the emitted json will need to be - // modified slightly if there is only a single element written - - json_generator<> child_g; - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - // create a child generator with hide outer array tokens mode. - child_g = ctx.g.new_child_generator(); - // write first [ without output, without update len, only update internal state - child_g.write_first_start_array_without_output(); - } else { - child_g = ctx.child_g; - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - push_ctx(ctx); - // track the number of array elements and only emit an outer array if - // we've written more than one element, this matches Hive's behavior - push_context( - p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2); - } else { - char* child_g_start = child_g.get_output_start_position(); - size_t child_g_len = child_g.get_output_len(); - - if (ctx.dirty > 1) { - // add outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ true); - ctx.task_is_done = true; - push_ctx(ctx); - } else if (ctx.dirty == 1) { - // remove outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ false); - ctx.task_is_done = true; - push_ctx(ctx); - } // else do not write anything - } - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) - // case path 7 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - ctx.g.write_start_array(); - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - // wildcards can have multiple matches, continually update the dirty - // count - push_ctx(ctx); - push_context(p.get_current_token(), - 7, - ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); - } else { - ctx.g.write_end_array(); - ctx.task_is_done = true; - push_ctx(ctx); - } - } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // case path 8 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - ctx.is_first_enter = false; - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return false; - } - - if (!p.try_skip_children()) { return false; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - --i; - } - - // i == 0 - push_ctx(ctx); - push_context(p.get_current_token(), - 8, - 
ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); - } - // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // case path 9 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return false; - } - - if (!p.try_skip_children()) { return false; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - - --i; - } - - // i == 0 - push_ctx(ctx); - push_context( - p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (json_token::FIELD_NAME == ctx.token && - thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) && - p.match_current_field_name( - thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) { - if (p.next_token() != json_token::VALUE_NULL) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context( - p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } else { - return false; - } - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (json_token::FIELD_NAME == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_ctx(ctx); - push_context( - p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } - // case _ => - // case path 12 - else { - if (!p.try_skip_children()) { return false; } - // default case path, return false for this task - ctx.dirty = 0; - ctx.task_is_done = true; - push_ctx(ctx); - } - } else { - // current context is done. 
- - // pop parent task - // update parent task info according to current task result - if (pop_context(p_ctx)) { - // case (VALUE_STRING, Nil) if style == RawStyle - // case path 1 - if (1 == ctx.case_path) { - // never happen - } - // case (START_ARRAY, Nil) if style == FlattenStyle - // case path 2 - else if (2 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (_, Nil) - // case path 3 - else if (3 == ctx.case_path) { - // never happen - } - // case (START_OBJECT, Key :: xs) - // case path 4 - else if (4 == ctx.case_path) { - if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; } - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // case path 5 - else if (5 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (6 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // update child generator for parent task - p_ctx.child_g = ctx.g; - push_ctx(p_ctx); - } - // case (START_ARRAY, Subscript :: Wildcard :: xs) - // case path 7 - else if (7 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ - // case path 8 - // case (START_ARRAY, Subscript :: Index(idx) :: xs) - // case path 9 - else if (8 == ctx.case_path || 9 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - - // post logic: - while (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - // advance the token stream to the end of the array - if (!p.try_skip_children()) { return false; } - } - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (10 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (11 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - push_ctx(p_ctx); - } - // case _ => - // case path 12 - else { - // never happen - } - } else { - // has no parent task, stack is empty, will exit - } - } - } - - // copy output len - root_g.set_output_len(ctx.g.get_output_len()); - return ctx.dirty > 0; - } -}; - -} // namespace detail - /** * Extracts json object from a json string based on json path specified, and * returns json string of the extracted json object. 
It will return null if the diff --git a/src/main/cpp/src/json_parser.hpp b/src/main/cpp/src/json_parser.cuh similarity index 100% rename from src/main/cpp/src/json_parser.hpp rename to src/main/cpp/src/json_parser.cuh diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index bd034651b7..f3e3a425f8 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -23,6 +23,8 @@ public class JSONUtils { NativeDepsLoader.loadNativeDeps(); } + public static final int MAX_PATH_DEPTH = 32; + public enum PathInstructionType { SUBSCRIPT, WILDCARD, From 8a8be0c6bc4146f6934f443b75c3af9aaca237ef Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 2 Apr 2024 18:42:58 +0000 Subject: [PATCH 014/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 35f818b3e4..aab6137c80 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 35f818b3e4bef8e331f083dadc9a4c45e2987a78 +Subproject commit aab6137c80c50eccc5007120f7140cfe6646b5e0 From ca2b3940ab82df41b998091e3c6977c18972c5ee Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Apr 2024 04:38:16 +0800 Subject: [PATCH 015/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#1920) * Update submodule cudf to aab8a76b532b46713b9784302ffd202586ecb5cc Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 08d86c92b3e3ccd950e4d63033d44675510cbb74 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 5192b608eeed4bda9317c657253c3a5630aa4c5d Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to fbaad8a480d3b2755afe04431c5abe6c098224b4 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to fbaad8a480d3b2755afe04431c5abe6c098224b4 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index aab6137c80..c0f84bf5bb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit aab6137c80c50eccc5007120f7140cfe6646b5e0 +Subproject commit c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index fd9b77a7c7..b58220aa4b 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ 
b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -096ae3c0a6b2c593f8fdb38468be527027bf79d7 +4ab8d745796eb9350527567b6c0584bfa208179e diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index f8e4379cd4..889c4cb961 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,9 +62,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "c98eabbad1f60dfe640d03f03a2df58b23f3e167", + "git_tag" : "f8a732cfa0d271490791ac375a21da405994c5e6", "git_url" : "https://github.com/rapidsai/kvikio.git", - "version" : "24.04" + "version" : "24.06" }, "NVTX3" : { @@ -110,10 +110,18 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "047df5e87d84834f8f4225898476145741acfa80", + "git_tag" : "e38b993f4cb3207745735c51d4f61cdaa735b7ac", "git_url" : "https://github.com/rapidsai/jitify.git", "version" : "2.0.0" }, + "nanoarrow" : + { + "always_download" : true, + "git_shallow" : false, + "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "version" : "0.4.0" + }, "nvcomp" : { "always_download" : true, @@ -131,9 +139,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "0651edf0fce5ebf53528382b475fc29a2f3afa67", + "git_tag" : "8675b2c9ccaa33130cc2e5a291f3fff31f7c903d", "git_url" : "https://github.com/rapidsai/rmm.git", - "version" : "24.04" + "version" : "24.06" }, "spdlog" : { From 5178a9d9591735215c47790ae22305c5610c96db Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 8 Apr 2024 01:27:16 +0000 Subject: [PATCH 016/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 35f818b3e4..c0f84bf5bb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 35f818b3e4bef8e331f083dadc9a4c45e2987a78 +Subproject commit c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 From ffe0f3ad0ad0cfa78b97c71db93c175819b48beb Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Mon, 8 Apr 2024 09:20:44 -0700 Subject: [PATCH 017/124] Remove tests of deleted cudf::hash HASH_MURMUR3 (#1929) - Removes tests for the default `cudf::hash` - Updates cudf deps pins to workaround the issue being fixed by #1928 Testing: ```bash $ cd thirdparty/cudf $ git checkout branch-24.06 $ cd - $ ./build/build-in-docker clean install -DGPU_ARCHS='NATIVE' -DBUILD_TESTS=0N -DskipTests -Dlibcudf.clean.skip=false -Dlibcudf.dependency.mode=latest -Dsubmodule.check.skip $ ./target/cmake-build/gtests/HASH ``` Fixes #1926 Signed-off-by: Gera Shegalov --- pom.xml | 2 +- src/main/cpp/tests/hash.cpp | 282 ----------------------------- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 2 insertions(+), 284 deletions(-) diff --git a/pom.xml b/pom.xml index 745f8127d1..92db0aeefd 100644 --- a/pom.xml +++ b/pom.xml @@ -96,7 +96,7 @@ false true ${project.build.directory}/libcudf-install - pinned + latest ${project.build.directory}/libcudfjni 1.8 1.8 diff --git a/src/main/cpp/tests/hash.cpp b/src/main/cpp/tests/hash.cpp index 265603a9af..9ce57ad018 100644 --- a/src/main/cpp/tests/hash.cpp +++ b/src/main/cpp/tests/hash.cpp @@ -30,41 +30,6 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve class HashTest : public cudf::test::BaseFixture {}; -TEST_F(HashTest, MultiValue) -{ - 
cudf::test::strings_column_wrapper const strings_col( - {"", - "The quick brown fox", - "jumps over the lazy dog.", - "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); - - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max()}); - - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - - using ts = cudf::timestamp_s; - cudf::test::fixed_width_column_wrapper const secs_col( - {ts::duration::zero(), - static_cast(100), - static_cast(-100), - ts::duration::min(), - ts::duration::max()}); - - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1, secs_col}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2, secs_col}); - - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); -} - TEST_F(HashTest, MultiValueNulls) { // Nulls with different values should be equal @@ -115,14 +80,6 @@ TEST_F(HashTest, MultiValueNulls) auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1}); auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2}); - { - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input2); @@ -141,221 +98,6 @@ TEST_F(HashTest, MultiValueNulls) } } -TEST_F(HashTest, BasicList) -{ - using LCW = cudf::test::lists_column_wrapper; - using ICW = cudf::test::fixed_width_column_wrapper; - - auto const col = LCW{{}, {}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; - auto const input = cudf::table_view({col}); - auto const expect = ICW{1607593296, - 1607593296, - -636010097, - -132459357, - -636010097, - -2008850957, - -1023787369, - 761197503, - 761197503, - 1340177511, - -1023787369, - -1023787369}; - - auto const output = cudf::hash(input); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto const expect_seeded = ICW{1607594268u, - 1607594268u, - 1576790066u, - 1203671017u, - 1576790066u, - 2107478077u, - 1756855002u, - 2228938758u, - 2228938758u, - 3491134126u, - 1756855002u, - 1756855002u}; - - auto const seeded_output = cudf::hash(input, cudf::hash_id::HASH_MURMUR3, 15); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, NullableList) -{ - using LCW = cudf::test::lists_column_wrapper; - using ICW = cudf::test::fixed_width_column_wrapper; - - auto const valids = std::vector{1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0}; - auto const col = - LCW{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {}, {2, 2}, {2, 2}, {}}, valids.begin()}; - auto expect = ICW{-2023148619, - -2023148619, - -31671896, - -31671896, - -1205248335, - 1865773848, - 1865773848, - -2023148682, - -1205248335, - -1205248335, - -2023148682}; - - auto const output = cudf::hash(cudf::table_view({col})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto const expect_seeded = ICW{2271820643u, - 2271820643u, - 1038318696u, - 
1038318696u, - 595138041u, - 3027840870u, - 3027840870u, - 2271820578u, - 595138041u, - 595138041u, - 2271820578u}; - - auto const seeded_output = cudf::hash(cudf::table_view({col}), cudf::hash_id::HASH_MURMUR3, 31); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, ListOfStruct) -{ - auto col1 = cudf::test::fixed_width_column_wrapper{ - {-1, -1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 1, 2}, - {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}}; - auto col2 = cudf::test::strings_column_wrapper{ - {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"}, - {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}}; - auto struct_col = cudf::test::structs_column_wrapper{ - {col1, col2}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; - - auto offsets = cudf::test::fixed_width_column_wrapper{ - 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; - - auto list_nullmask = std::vector{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 17, offsets.release(), struct_col.release(), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{83451479, - 83451479, - 83455332, - 83455332, - -759684425, - -959632766, - -959632766, - -959632766, - -959636527, - -656998704, - 613652814, - 1902080426, - 1902080426, - 2061025592, - 2061025592, - -319840811, - -319840811}; - - auto const output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto expect_seeded = cudf::test::fixed_width_column_wrapper{81710442u, - 81710442u, - 81729816u, - 81729816u, - 3532787573u, - 3642097855u, - 3642097855u, - 3642097855u, - 3642110391u, - 3889855760u, - 1494406307u, - 103934081u, - 103934081u, - 3462063680u, - 3462063680u, - 1696730835u, - 1696730835u}; - - auto const seeded_output = - cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_MURMUR3, 619); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, ListOfEmptyStruct) -{ - // [] - // [] - // Null - // Null - // [Null, Null] - // [Null, Null] - // [Null, Null] - // [Null] - // [Null] - // [{}] - // [{}] - // [{}, {}] - // [{}, {}] - - auto struct_validity = std::vector{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); - auto struct_col = cudf::make_structs_column(14, {}, null_count, std::move(null_mask)); - - auto offsets = cudf::test::fixed_width_column_wrapper{ - 0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14}; - auto list_nullmask = std::vector{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 13, offsets.release(), std::move(struct_col), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{2271818677u, - 2271818677u, - 2271818614u, - 2271818614u, - 3954409013u, - 3954409013u, - 3954409013u, - 2295666275u, - 2295666275u, - 2295666276u, - 2295666276u, - 3954409052u, - 3954409052u}; - - auto output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); -} - 
-TEST_F(HashTest, EmptyDeepList) -{ - // List>, where all lists are empty - // [] - // [] - // Null - // Null - - // Internal empty list - auto list1 = cudf::test::lists_column_wrapper{}; - - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; - auto list_nullmask = std::vector{1, 1, 0, 0}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 4, offsets.release(), list1.release(), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{ - 2271818677u, 2271818677u, 2271818614u, 2271818614u}; - - auto output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); -} - template class HashTestTyped : public cudf::test::BaseFixture {}; @@ -367,15 +109,6 @@ TYPED_TEST(HashTestTyped, Equality) auto const input = cudf::table_view({col}); // Hash of same input should be equal - - { - auto const output1 = cudf::hash(input); - auto const output2 = cudf::hash(input); - - EXPECT_EQ(input.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input); @@ -404,14 +137,6 @@ TYPED_TEST(HashTestTyped, EqualityNulls) auto const input1 = cudf::table_view({col1}); auto const input2 = cudf::table_view({col2}); - { - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input2); @@ -454,13 +179,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) auto const table_col_neg_zero = cudf::table_view({col_neg_zero}); auto const table_col_neg_nan = cudf::table_view({col_neg_nan}); - auto const hash_col = cudf::hash(table_col); - auto const hash_col_neg_zero = cudf::hash(table_col_neg_zero); - auto const hash_col_neg_nan = cudf::hash(table_col_neg_nan); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity); - // Spark hash is sensitive to 0 and -0 { auto const spark_col = spark_rapids_jni::murmur_hash3_32(table_col, 0); diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 889c4cb961..4bd96ee645 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "8675b2c9ccaa33130cc2e5a291f3fff31f7c903d", + "git_tag" : "cdf20a665cc3a0bc0da96975de336ce70408dcf6", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 9e8fa88f479bd290c23fea18ac0a699c9cdc239a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Apr 2024 04:33:08 +0800 Subject: [PATCH 018/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#1925) * Update submodule cudf to 4e44d5d3c80852a15ae28d5afa0b13646ca3a4fd Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 0ed224d94a915eee4ce7cdc2d837c1be1c93afcc Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update 
submodule cudf to a00c3c916947d16fbf997095a32a02ca510b78e5 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 6319ab708f2dff9fd7a62a5c77fd3b387bde1bb8 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c5eb3240387222373043ddf881d18fb5d18e0834 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 102d564db21df1d805c2d06571e75a96fa6d822f Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 6c31eacce54d8f1c9d3ad183d8879e09e679b369 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c0f84bf5bb..6c31eacce5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 +Subproject commit 6c31eacce54d8f1c9d3ad183d8879e09e679b369 From d953e2e8734be7575f0165da3801b61b2b8fa12a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Apr 2024 11:15:16 +0800 Subject: [PATCH 019/124] Update submodule cudf to 1862cdc089c3a77ccec70411e5cd6dac292a8029 (#1938) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6c31eacce5..1862cdc089 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6c31eacce54d8f1c9d3ad183d8879e09e679b369 +Subproject commit 1862cdc089c3a77ccec70411e5cd6dac292a8029 From 85ccb789dad9ed0243a782afd6b66b6182fba97d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Apr 2024 22:32:02 +0800 Subject: [PATCH 020/124] Update submodule cudf to f1a3db28e1e5efe9f144f95a7392549ea2c221b1 (#1939) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1862cdc089..f1a3db28e1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1862cdc089c3a77ccec70411e5cd6dac292a8029 +Subproject commit f1a3db28e1e5efe9f144f95a7392549ea2c221b1 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 4bd96ee645..da5294084e 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "f8a732cfa0d271490791ac375a21da405994c5e6", + "git_tag" : "9a00b0a58aaf0e0171d43e77b0add56adc5e9fa5", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, From d389e0a164bc3c454c9919a480bba15703a11ce0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Apr 2024 04:41:17 +0800 Subject: [PATCH 021/124] Update submodule cudf to 3b48f8b0290dc41073538487ad53c8923be2f0f8 (#1941) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/thirdparty/cudf b/thirdparty/cudf index f1a3db28e1..3b48f8b029 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f1a3db28e1e5efe9f144f95a7392549ea2c221b1 +Subproject commit 3b48f8b0290dc41073538487ad53c8923be2f0f8 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index b58220aa4b..8bd51f07a9 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -4ab8d745796eb9350527567b6c0584bfa208179e +8b1a1e0e2302ec5a6cfeed762c4f281268e7adca From 96e31ec34dbd22651fe5f8c0aca369440ad4a0f5 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Apr 2024 23:16:53 +0000 Subject: [PATCH 022/124] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e6cfd4503a..3b48f8b029 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e6cfd4503af063d3bba28954ab7ec67dbbb44e71 +Subproject commit 3b48f8b0290dc41073538487ad53c8923be2f0f8 From 359be157995ec8b9ce9de9b53674a20457a26ac3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:31:02 +0800 Subject: [PATCH 023/124] Update submodule cudf to 15c148dcbba087ed1be32e0cef7188c9b609e7dc (#1942) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 3b48f8b029..15c148dcbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 3b48f8b0290dc41073538487ad53c8923be2f0f8 +Subproject commit 15c148dcbba087ed1be32e0cef7188c9b609e7dc From c52c008a4399273b81907641ff795f5146c5d20b Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 10 Apr 2024 15:51:06 +0800 Subject: [PATCH 024/124] Append new authorized user to blossom-ci whitelist [skip ci] (#1943) Signed-off-by: liurenjie1024 --- .github/workflows/blossom-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 2d09c02be0..f9e4c05e20 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -66,6 +66,7 @@ jobs: yinqingh,\ thirtiseven,\ parthosa,\ + liurenjie1024,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person From d5b550bba1eeee7fded13d81213b2552e6365f72 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:35:26 +0000 Subject: [PATCH 025/124] Auto-merge use branch-24.06 versions Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e6cfd4503a..15c148dcbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e6cfd4503af063d3bba28954ab7ec67dbbb44e71 +Subproject commit 15c148dcbba087ed1be32e0cef7188c9b609e7dc diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index fd9b77a7c7..8bd51f07a9 100644 --- 
a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -096ae3c0a6b2c593f8fdb38468be527027bf79d7 +8b1a1e0e2302ec5a6cfeed762c4f281268e7adca diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index cf97df89ae..ca9dd8bb04 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,9 +62,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "745b1847f56c8f4b0c4e094f93837c2a91e18318", + "git_tag" : "9a00b0a58aaf0e0171d43e77b0add56adc5e9fa5", "git_url" : "https://github.com/rapidsai/kvikio.git", - "version" : "24.04" + "version" : "24.06" }, "NVTX3" : { @@ -114,6 +114,14 @@ "git_url" : "https://github.com/rapidsai/jitify.git", "version" : "2.0.0" }, + "nanoarrow" : + { + "always_download" : true, + "git_shallow" : false, + "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "version" : "0.4.0" + }, "nvcomp" : { "always_download" : true, @@ -131,9 +139,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "0651edf0fce5ebf53528382b475fc29a2f3afa67", + "git_tag" : "cdf20a665cc3a0bc0da96975de336ce70408dcf6", "git_url" : "https://github.com/rapidsai/rmm.git", - "version" : "24.04" + "version" : "24.06" }, "spdlog" : { @@ -152,4 +160,4 @@ "version" : "1.12.0" } } -} \ No newline at end of file +} From a9551825a2cc9439e74b6e7df01e767778f366f3 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Apr 2024 17:42:26 +0000 Subject: [PATCH 026/124] Auto-merge use branch-24.06 versions Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 94726ad056..15c148dcbb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 94726ad056e2473c836f47d310e2584bdf44d1f9 +Subproject commit 15c148dcbba087ed1be32e0cef7188c9b609e7dc diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index aee5915dc4..8bd51f07a9 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -69f5222465ec3c8c54f107fcf8750f040034e156 +8b1a1e0e2302ec5a6cfeed762c4f281268e7adca diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 4261acab1a..ca9dd8bb04 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,9 +62,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "5e69e97c42504c17a333a36e1796dce4b83150a4", + "git_tag" : "9a00b0a58aaf0e0171d43e77b0add56adc5e9fa5", "git_url" : "https://github.com/rapidsai/kvikio.git", - "version" : "24.04" + "version" : "24.06" }, "NVTX3" : { @@ -114,6 +114,14 @@ "git_url" : "https://github.com/rapidsai/jitify.git", "version" : "2.0.0" }, + "nanoarrow" : + { + "always_download" : true, + "git_shallow" : false, + "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "version" : "0.4.0" + }, "nvcomp" : { "always_download" : true, @@ -131,9 +139,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "e14a2291301ce9c8ef76b2b2404eb02336584724", + "git_tag" : "cdf20a665cc3a0bc0da96975de336ce70408dcf6", "git_url" : 
"https://github.com/rapidsai/rmm.git", - "version" : "24.04" + "version" : "24.06" }, "spdlog" : { @@ -152,4 +160,4 @@ "version" : "1.12.0" } } -} \ No newline at end of file +} From 2b6cdf99253df3c1870052a23fddc0a92fe73515 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Apr 2024 04:32:04 +0800 Subject: [PATCH 027/124] Update submodule cudf to 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 (#1951) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 15c148dcbb..460b41edad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 15c148dcbba087ed1be32e0cef7188c9b609e7dc +Subproject commit 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 8bd51f07a9..94d1ae4689 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -8b1a1e0e2302ec5a6cfeed762c4f281268e7adca +fcfe900ff19d5433d843d04e76fe51838ec7169c diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index ca9dd8bb04..9bc11df470 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "9a00b0a58aaf0e0171d43e77b0add56adc5e9fa5", + "git_tag" : "7b0231c8241164c8a0511cd737e7098cb7ccea3e", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "cdf20a665cc3a0bc0da96975de336ce70408dcf6", + "git_tag" : "af756c6e17be2f8b9209e7a8b805ffe0e6a4842e", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, @@ -160,4 +160,4 @@ "version" : "1.12.0" } } -} +} \ No newline at end of file From 5fd18cd26a917925b54dd209cc79ade4cd3e0406 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Apr 2024 11:11:02 +0800 Subject: [PATCH 028/124] Update submodule cudf to 888e9d5c38cb27402313681744b87462846bc405 (#1955) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 460b41edad..888e9d5c38 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 +Subproject commit 888e9d5c38cb27402313681744b87462846bc405 From a3d31e1b6e9e83e042d031c7d16a2cf1194d0d73 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Apr 2024 05:24:09 +0800 Subject: [PATCH 029/124] Update submodule cudf to af33b0aba4dafe82cb5d25811e5e737af6c7faad (#1959) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 888e9d5c38..af33b0aba4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 888e9d5c38cb27402313681744b87462846bc405 +Subproject commit af33b0aba4dafe82cb5d25811e5e737af6c7faad diff --git 
a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 9bc11df470..9171df43f4 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -120,6 +120,14 @@ "git_shallow" : false, "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "patches" : + [ + { + "file" : "${current_json_dir}/nanoarrow_cmake.diff", + "fixed_in" : "0.5.0", + "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]" + } + ], "version" : "0.4.0" }, "nvcomp" : @@ -139,7 +147,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "af756c6e17be2f8b9209e7a8b805ffe0e6a4842e", + "git_tag" : "7d7d65ab115b3e96b0ebc51d45a90020d8037439", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From a3d49685a613a441ed4c284708eb09e058ca7132 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Thu, 11 Apr 2024 16:45:11 -0700 Subject: [PATCH 030/124] Restore pinned dependency mode (#1940) Signed-off-by: Gera Shegalov --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 92db0aeefd..745f8127d1 100644 --- a/pom.xml +++ b/pom.xml @@ -96,7 +96,7 @@ false true ${project.build.directory}/libcudf-install - latest + pinned ${project.build.directory}/libcudfjni 1.8 1.8 From 133d8b486760f3774a554a3f6d227dbe5749164f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Apr 2024 11:11:35 +0800 Subject: [PATCH 031/124] Update submodule cudf to ff22a7ac0d565be2b2221c6080966eb0338676ee (#1961) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index af33b0aba4..ff22a7ac0d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit af33b0aba4dafe82cb5d25811e5e737af6c7faad +Subproject commit ff22a7ac0d565be2b2221c6080966eb0338676ee diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 9171df43f4..4eab39b3e7 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -147,7 +147,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "7d7d65ab115b3e96b0ebc51d45a90020d8037439", + "git_tag" : "9e02f34a03153650b7bf5ce398ee3f374c240476", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 0d2631759898f7f0d4782c0c0dd116727d2e8030 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Apr 2024 16:29:29 +0800 Subject: [PATCH 032/124] Update submodule cudf to f19d4eb9f2ccbe1833aa8112c053e622bc138301 (#1964) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ff22a7ac0d..f19d4eb9f2 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ff22a7ac0d565be2b2221c6080966eb0338676ee +Subproject commit f19d4eb9f2ccbe1833aa8112c053e622bc138301 From 16e4f8c4fdbab221a442b281bea6b4d06db6aae1 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 13 Apr 2024 04:31:29 +0800 Subject: [PATCH 033/124] Update submodule cudf to 2e00cb1ebd7bee4a4085d1e691ad3b626bc10d0e 
(#1966) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f19d4eb9f2..2e00cb1ebd 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f19d4eb9f2ccbe1833aa8112c053e622bc138301 +Subproject commit 2e00cb1ebd7bee4a4085d1e691ad3b626bc10d0e From 5277882343d31f9409d9884e2bf56a9f80b26338 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:12:31 +0800 Subject: [PATCH 034/124] Update submodule cudf to 9cc87f01810d598eca4b80ce95b4c1eb72617a3a (#1967) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2e00cb1ebd..9cc87f0181 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2e00cb1ebd7bee4a4085d1e691ad3b626bc10d0e +Subproject commit 9cc87f01810d598eca4b80ce95b4c1eb72617a3a From 8455335c717c150fa89ba2f9b647602ac63a67b3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sun, 14 Apr 2024 04:47:12 +0800 Subject: [PATCH 035/124] Update submodule cudf to c8cb4953550dc7b0e0f30c9d33ef55e25f935ef4 (#1968) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9cc87f0181..c8cb495355 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9cc87f01810d598eca4b80ce95b4c1eb72617a3a +Subproject commit c8cb4953550dc7b0e0f30c9d33ef55e25f935ef4 From 7df16e48cbab3ed01ad324ccb00d7740ad12410f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 15 Apr 2024 22:29:12 +0800 Subject: [PATCH 036/124] Update submodule cudf to 1403e1b0b378397261d7cfa0025f791bb289f1e8 (#1969) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c8cb495355..1403e1b0b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8cb4953550dc7b0e0f30c9d33ef55e25f935ef4 +Subproject commit 1403e1b0b378397261d7cfa0025f791bb289f1e8 From 71dc61f03e539a11113088734e093b67b614c8c7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 16 Apr 2024 04:30:16 +0800 Subject: [PATCH 037/124] Update submodule cudf to 74b39e213a4e6a6a1cf9f0e8d19a112fc6639214 (#1970) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1403e1b0b3..74b39e213a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1403e1b0b378397261d7cfa0025f791bb289f1e8 +Subproject commit 74b39e213a4e6a6a1cf9f0e8d19a112fc6639214 From 23517f82233eed3ab26f7c59ccc2531cf206fae6 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:31:34 +0800 Subject: [PATCH 038/124] Update submodule cudf to 89196900f5739a39bd9861d3b494b47ff75e7f71 (#1971) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- 
thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 74b39e213a..89196900f5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 74b39e213a4e6a6a1cf9f0e8d19a112fc6639214 +Subproject commit 89196900f5739a39bd9861d3b494b47ff75e7f71 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 4eab39b3e7..df711a6f42 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -147,7 +147,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "9e02f34a03153650b7bf5ce398ee3f374c240476", + "git_tag" : "7ed529fb7b94d5639deb0f04efc08ab7ae7fd045", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From a4520a43ef72682a0b8a27afc145c3a33d69fe92 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 16 Apr 2024 06:59:02 -0700 Subject: [PATCH 039/124] Implement benchmark for `get_json_object` (#1952) * Add benchmark for `get_json_object` Signed-off-by: Nghia Truong * Fix compilation errors Signed-off-by: Nghia Truong * Move headers Signed-off-by: Nghia Truong --------- Signed-off-by: Nghia Truong --- src/main/cpp/benchmarks/CMakeLists.txt | 5 +- .../cpp/benchmarks/common/generate_input.cu | 22 ++- src/main/cpp/benchmarks/get_json_object.cu | 171 ++++++++++++++++++ src/main/cpp/benchmarks/row_conversion.cpp | 2 +- 4 files changed, 188 insertions(+), 12 deletions(-) create mode 100644 src/main/cpp/benchmarks/get_json_object.cu diff --git a/src/main/cpp/benchmarks/CMakeLists.txt b/src/main/cpp/benchmarks/CMakeLists.txt index 23d35b0bea..ca0c43d2fe 100644 --- a/src/main/cpp/benchmarks/CMakeLists.txt +++ b/src/main/cpp/benchmarks/CMakeLists.txt @@ -23,7 +23,7 @@ target_compile_options( ) target_link_libraries( - spark_rapids_jni_datagen PUBLIC cudf::cudf + spark_rapids_jni_datagen PUBLIC cudf::cudf nvtx3-cpp ) target_include_directories( @@ -78,5 +78,8 @@ ConfigureBench(STRING_TO_FLOAT_BENCH ConfigureBench(BLOOM_FILTER_BENCH bloom_filter.cu) +ConfigureBench(GET_JSON_OBJECT_BENCH + get_json_object.cu) + ConfigureBench(PARSE_URI_BENCH parse_uri.cpp) diff --git a/src/main/cpp/benchmarks/common/generate_input.cu b/src/main/cpp/benchmarks/common/generate_input.cu index 75f0a8fca0..d0a61d05a0 100644 --- a/src/main/cpp/benchmarks/common/generate_input.cu +++ b/src/main/cpp/benchmarks/common/generate_input.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -358,12 +359,13 @@ rmm::device_uvector sample_indices_with_run_length(cudf::size_t // This is gather. 
auto avg_repeated_sample_indices_iterator = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [rb = run_lens.begin(), - re = run_lens.end(), - samples_indices = samples_indices.begin()] __device__(cudf::size_type i) { - auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb; - return samples_indices[sample_idx]; - }); + cuda::proclaim_return_type( + [rb = run_lens.begin(), + re = run_lens.end(), + samples_indices = samples_indices.begin()] __device__(cudf::size_type i) { + auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb; + return samples_indices[sample_idx]; + })); rmm::device_uvector repeated_sample_indices(num_rows, cudf::get_default_stream()); thrust::copy(thrust::device, @@ -519,10 +521,10 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, - std::move(offsets), - std::move(chars->release().data.release()[0]), - profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, - null_count); + std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), + chars.release(), + null_count, + profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } /** diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu new file mode 100644 index 0000000000..442bcb6004 --- /dev/null +++ b/src/main/cpp/benchmarks/get_json_object.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +// #define DEBUG_PRINT + +#ifdef DEBUG_PRINT + +#include + +#include + +namespace { + +// Copy from `cudf/cpp/tests/utilities/column_utilities.cu`. 
+struct strings_to_host_fn { + template || + std::is_same_v>* = nullptr> + void operator()(std::vector& host_data, + char const* chars, + cudf::column_view const& offsets, + rmm::cuda_stream_view stream) + { + auto const h_offsets = cudf::detail::make_std_vector_sync( + cudf::device_span(offsets.data(), offsets.size()), stream); + // build std::string vector from chars and offsets + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + host_data.begin(), + [&](auto start, auto end) { return std::string(chars + start, end - start); }); + } + + template && + !std::is_same_v>* = nullptr> + void operator()(std::vector&, + char const*, + cudf::column_view const&, + rmm::cuda_stream_view) + { + CUDF_FAIL("invalid offsets type"); + } +}; + +template +std::vector to_host_strings(CV const& c) +{ + std::vector host_strs(c.size()); + auto stream = cudf::get_default_stream(); + auto const scv = cudf::strings_column_view(c); + auto const h_chars = cudf::detail::make_std_vector_sync( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const offsets = + cudf::slice(scv.offsets(), {scv.offset(), scv.offset() + scv.size() + 1}).front(); + cudf::type_dispatcher( + offsets.type(), strings_to_host_fn{}, host_strs, h_chars.data(), offsets, stream); + return host_strs; +} + +} // namespace +#endif // #ifdef DEBUG_PRINT + +constexpr auto list_depth = 2; +constexpr auto min_width = 10; +constexpr auto max_width = 10; + +auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) +{ + data_profile const table_profile = + data_profile_builder() + .no_validity() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .distribution(cudf::type_id::LIST, distribution_id::NORMAL, min_width, max_width) + .list_depth(list_depth) + .list_type(cudf::type_id::STRING) + .struct_depth(max_depth > list_depth ? max_depth - list_depth : 1) + .struct_types(std::vector{cudf::type_id::LIST}); + + auto const input_table = create_random_table( + std::vector{cudf::type_id::INT32, cudf::type_id::STRING, cudf::type_id::STRUCT}, + table_size_bytes{size_bytes}, + table_profile); + + std::vector buffer; + cudf::io::sink_info sink(&buffer); + cudf::io::table_metadata mt{{{"int32"}, {"string"}, {"struct"}}}; + auto write_opts = + cudf::io::json_writer_options::builder(sink, input_table->view()).lines(true).metadata(mt); + cudf::io::write_json(write_opts); + + // Split one JSON string into separate JSON objects. + auto const json_str = std::string{buffer.begin(), buffer.end()}; + auto const json_col = cudf::test::strings_column_wrapper{{json_str}}; + auto split_strs = + cudf::strings::split_record(cudf::strings_column_view{json_col}, cudf::string_scalar("\n")) + ->release(); + + // Note that split_strs is a list of strings thus we need to extract the strings column. 
+ auto& json_strings = split_strs.children[cudf::lists_column_view::child_column_index]; + +#ifdef DEBUG_PRINT + { + auto const strs = to_host_strings(json_strings->view()); + std::cout << "First input row: \n" << strs.front() << std::endl; + } +#endif // #ifdef DEBUG_PRINT + return std::move(json_strings); +} + +void BM_get_json_object(nvbench::state& state) +{ + auto const size_bytes = static_cast(state.get_int64("size_bytes")); + auto const max_depth = static_cast(state.get_int64("max_depth")); + + auto const json_strings = generate_input(size_bytes, max_depth); + + using path_instruction_type = spark_rapids_jni::path_instruction_type; + std::vector> instructions; + instructions.emplace_back(path_instruction_type::KEY, "", -1); + instructions.emplace_back(path_instruction_type::NAMED, "struct", -1); + for (int i = 0; i < max_depth - list_depth; ++i) { + instructions.emplace_back(path_instruction_type::KEY, "", -1); + instructions.emplace_back(path_instruction_type::NAMED, "0", -1); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // Can also verify at https://jsonpath.com/. + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + cudf::strings_column_view{json_strings->view()}, instructions); + +#ifdef DEBUG_PRINT + { + auto const strs = to_host_strings(output->view()); + std::cout << "First output row: \n" << strs.front() << std::endl << std::endl << std::endl; + } +#endif // #ifdef DEBUG_PRINT + }); + state.add_global_memory_reads(size_bytes); +} + +NVBENCH_BENCH(BM_get_json_object) + .set_name("get_json_object") + .add_int64_axis("size_bytes", {1'000'000, 10'000'000, 100'000'000, 1'000'000'000}) + .add_int64_axis("max_depth", {2, 4, 6, 8}); diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index ff2e11f838..d040715aff 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -113,7 +113,7 @@ static void variable_or_fixed_width(nvbench::state& state) bytes_per_row += cudf::size_of(t); } else if (t.id() == cudf::type_id::STRING) { auto sc = cudf::strings_column_view(table->get_column(i)); - string_bytes += sc.chars_size(); + string_bytes += sc.chars_size(cudf::get_default_stream()); } } From d096f47dfb6e938045a63de22dada28abbdb6955 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 16 Apr 2024 23:12:51 +0800 Subject: [PATCH 040/124] Update submodule cudf to c1dcc31c07e858dfc0f24ff77e5b111551ad8a0e (#1972) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 89196900f5..c1dcc31c07 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 89196900f5739a39bd9861d3b494b47ff75e7f71 +Subproject commit c1dcc31c07e858dfc0f24ff77e5b111551ad8a0e From dc6a8a4ba4fb759ff8bbc0a7f2be454599c46787 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Apr 2024 04:31:23 +0800 Subject: [PATCH 041/124] Update submodule cudf to f0be36bedd9a7d7c03d4b90666136070d650f22c (#1973) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/thirdparty/cudf b/thirdparty/cudf index c1dcc31c07..f0be36bedd 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c1dcc31c07e858dfc0f24ff77e5b111551ad8a0e +Subproject commit f0be36bedd9a7d7c03d4b90666136070d650f22c diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index df711a6f42..ef08a8877d 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -147,7 +147,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "7ed529fb7b94d5639deb0f04efc08ab7ae7fd045", + "git_tag" : "588928f5ff2418aadcec8e6c91fcea4dd8cb9265", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 79514c7757d511d003b1bcf61e43a1023811321a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:30:25 +0800 Subject: [PATCH 042/124] Update submodule cudf to 02f8e2fc882ae58cc74053ea631e27ab27dfbe53 (#1974) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f0be36bedd..02f8e2fc88 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f0be36bedd9a7d7c03d4b90666136070d650f22c +Subproject commit 02f8e2fc882ae58cc74053ea631e27ab27dfbe53 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index ef08a8877d..d0f4817d33 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -118,17 +118,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", + "git_tag" : "11e73a8c85b45e3d49c8c541b4e1497a649fe03c", "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "patches" : - [ - { - "file" : "${current_json_dir}/nanoarrow_cmake.diff", - "fixed_in" : "0.5.0", - "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]" - } - ], - "version" : "0.4.0" + "version" : "0.5.0" }, "nvcomp" : { From ea84037662a843630b125076120f9472dffc9a05 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Apr 2024 22:29:27 +0800 Subject: [PATCH 043/124] Update submodule cudf to 9192d259633c382c6f98f956dc7f43d754ebbf44 (#1975) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 02f8e2fc88..9192d25963 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 02f8e2fc882ae58cc74053ea631e27ab27dfbe53 +Subproject commit 9192d259633c382c6f98f956dc7f43d754ebbf44 From 294d2721f326189ae7a3d9cd88c47033ab43fadc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 18 Apr 2024 04:30:42 +0800 Subject: [PATCH 044/124] Update submodule cudf to e928c4a01bfe528839b812aad8b5135029a0fa78 (#1976) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9192d25963..e928c4a01b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ 
-Subproject commit 9192d259633c382c6f98f956dc7f43d754ebbf44 +Subproject commit e928c4a01bfe528839b812aad8b5135029a0fa78 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 94d1ae4689..b447a646ea 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -fcfe900ff19d5433d843d04e76fe51838ec7169c +71c922863af1f3f0a908f56fc509cf5b56c7acf8 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index d0f4817d33..476398676c 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "588928f5ff2418aadcec8e6c91fcea4dd8cb9265", + "git_tag" : "9e6db746f1a4a6361fb9fadf381f749dc52faaea", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From fdfb73b406c37110ed3ec470d214d10f83509658 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 18 Apr 2024 10:30:21 +0800 Subject: [PATCH 045/124] Update submodule cudf to eaae68d8b099e90a2e3bcc968f98c652d36bb844 (#1977) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e928c4a01b..eaae68d8b0 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e928c4a01bfe528839b812aad8b5135029a0fa78 +Subproject commit eaae68d8b099e90a2e3bcc968f98c652d36bb844 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index b447a646ea..9f807c7aa9 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -71c922863af1f3f0a908f56fc509cf5b56c7acf8 +a512e37da0d27ff5d51b8a32f3b25713ec89ccbb From 5637f77d7c729ac848524bbf0e17ed4d7644b834 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Apr 2024 04:30:35 +0800 Subject: [PATCH 046/124] Update submodule cudf to cb8e434e9f2abec93af5877af062688069e5d164 (#1980) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index eaae68d8b0..cb8e434e9f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit eaae68d8b099e90a2e3bcc968f98c652d36bb844 +Subproject commit cb8e434e9f2abec93af5877af062688069e5d164 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 9f807c7aa9..720d6a15f4 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -a512e37da0d27ff5d51b8a32f3b25713ec89ccbb +3683bbf2ccad929c9b4805037a3a842f23bde4d6 From a36a7ba5c1a2d86a77ca38e505b7ceeedeefae67 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Apr 2024 16:31:09 +0800 Subject: [PATCH 047/124] Update submodule cudf to e0c4280e44d25006dca37d5e2e6c7f77dce3fd56 (#1982) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index cb8e434e9f..e0c4280e44 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ 
-1 +1 @@ -Subproject commit cb8e434e9f2abec93af5877af062688069e5d164 +Subproject commit e0c4280e44d25006dca37d5e2e6c7f77dce3fd56 From 2f6080792da5e1d3eca4de54b9d0cb8edeec6e6a Mon Sep 17 00:00:00 2001 From: Sameer Raheja Date: Fri, 19 Apr 2024 05:52:30 -0700 Subject: [PATCH 048/124] Remove inactive users (#1981) Signed-off-by: Sameer Raheja Co-authored-by: Sameer Raheja --- .github/workflows/blossom-ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 2ed6c9f5b6..d5fd1195cc 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -35,20 +35,16 @@ jobs: # This job only runs for pull request comments if: contains( '\ abellina,\ - andygrove,\ anfeng,\ firestarman,\ GaryShen2008,\ - jbrennan333, \ jlowe,\ - krajendrannv,\ mythrocks,\ nartal1,\ nvdbaranec,\ NvTimLiu,\ razajafri,\ revans2,\ - rongou,\ rwlee,\ sameerz,\ tgravescs,\ From 6048ab7ff3de47975ed330e666cfd58beb17dfbc Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 19 Apr 2024 14:15:15 -0500 Subject: [PATCH 049/124] Explicitly pull in GTest dependency when building C++ tests (#1983) Signed-off-by: Jason Lowe --- src/main/cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 37f7ee7076..4e098adee5 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -110,6 +110,7 @@ endif() # cudf if(BUILD_TESTS) + find_package(GTest REQUIRED) rapids_find_package(cudf REQUIRED COMPONENTS testing) else() rapids_find_package(cudf REQUIRED) From 8665277f8d7ee1d3c897d8961bb239d04fa5ca6e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Apr 2024 04:31:20 +0800 Subject: [PATCH 050/124] Update submodule cudf to 21350fc2ac070315d110fca55cb6781ed7905596 (#1984) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e0c4280e44..21350fc2ac 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e0c4280e44d25006dca37d5e2e6c7f77dce3fd56 +Subproject commit 21350fc2ac070315d110fca55cb6781ed7905596 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 720d6a15f4..783c2dffa2 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -3683bbf2ccad929c9b4805037a3a842f23bde4d6 +81fd585166de32bcd9daae25008f2b333f107b39 From 10d0fcc9c6fc145cf5e5c7acebd607706b3982c2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Apr 2024 10:30:34 +0800 Subject: [PATCH 051/124] Update submodule cudf to 14854b14fe2878f801319eca8d6cd1d5685b9ca6 (#1985) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 21350fc2ac..14854b14fe 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 21350fc2ac070315d110fca55cb6781ed7905596 +Subproject commit 14854b14fe2878f801319eca8d6cd1d5685b9ca6 From ec567da92a2ecfa16d22d62ae6dd5dd8c07da3ab Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Mon, 22 Apr 2024 16:00:18 +0800 Subject: [PATCH 052/124] add my id (#1986) Signed-off-by: 
Hongbin Ma (Mahone) --- .github/workflows/blossom-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index d5fd1195cc..36ba8944a6 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -63,6 +63,7 @@ jobs: thirtiseven,\ parthosa,\ liurenjie1024,\ + binmahone,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person From 9ae7c6b8da45190c32e11e8f809a6befe82e72d7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 22 Apr 2024 22:30:59 +0800 Subject: [PATCH 053/124] Update submodule cudf to a2c81e71fd9a7bbb0a89eee8a456d0066fa3ecbb (#1988) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 14854b14fe..a2c81e71fd 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 14854b14fe2878f801319eca8d6cd1d5685b9ca6 +Subproject commit a2c81e71fd9a7bbb0a89eee8a456d0066fa3ecbb diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 783c2dffa2..82ff6c3e48 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -81fd585166de32bcd9daae25008f2b333f107b39 +ab8091f5a6efb9c2c036933b8f2805e33e85501f From 10dd8c053bd8294ef6165da58ec5584ba380e25e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Apr 2024 04:32:25 +0800 Subject: [PATCH 054/124] Update submodule cudf to 475f5e5fcb5b703ffdf1e491b7f2230c514a41fc (#1989) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a2c81e71fd..475f5e5fcb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a2c81e71fd9a7bbb0a89eee8a456d0066fa3ecbb +Subproject commit 475f5e5fcb5b703ffdf1e491b7f2230c514a41fc From d2601b9683cf60eb659749387332e6b2fbfdeb87 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 23 Apr 2024 10:02:52 +0800 Subject: [PATCH 055/124] Trim C0 control chars in string to number type conversion (#1978) * Trim C0 control chars in string to floats Signed-off-by: Haoyang Li * Add test Signed-off-by: Haoyang Li * Also fix in to int and to decimal Signed-off-by: Haoyang Li * update comment Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_string.cu | 3 +- src/main/cpp/src/cast_string_to_float.cu | 3 +- .../spark/rapids/jni/CastStringsTest.java | 53 ++++++++++++++----- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu index a961981160..d32d153632 100644 --- a/src/main/cpp/src/cast_string.cu +++ b/src/main/cpp/src/cast_string.cu @@ -38,13 +38,14 @@ namespace detail { constexpr auto NUM_THREADS{256}; /** - * @brief Identify if a character is whitespace. + * @brief Identify if a character is whitespace or C0 control code. 
* * @param chr character to test * @return true if character is a whitespace character */ constexpr bool is_whitespace(char const chr) { + if (chr >= 0x0000 && chr <= 0x001F) { return true; } switch (chr) { case ' ': case '\r': diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index 5c3c749f02..99090efbe5 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -36,13 +36,14 @@ namespace detail { __device__ __inline__ bool is_digit(char c) { return c >= '0' && c <= '9'; } /** - * @brief Identify if a character is whitespace. + * @brief Identify if a character is whitespace or C0 control code. * * @param chr character to test * @return true if character is a whitespace character */ constexpr bool is_whitespace(char const chr) { + if (chr >= 0x0000 && chr <= 0x001F) { return true; } switch (chr) { case ' ': case '\r': diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index c39766454a..a9b1cfaa4d 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -33,14 +33,14 @@ public class CastStringsTest { @Test void castToIntegerTest() { Table.TestBuilder tb = new Table.TestBuilder(); - tb.column(3l, 9l, 4l, 2l, 20l, null, null); - tb.column(5, 1, 0, 2, 7, null, null); - tb.column(new Byte[]{2, 3, 4, 5, 9, null, null}); + tb.column(3l, 9l, 4l, 2l, 20l, null, null, 1l); + tb.column(5, 1, 0, 2, 7, null, null, 1); + tb.column(new Byte[]{2, 3, 4, 5, 9, null, null, 1}); try (Table expected = tb.build()) { Table.TestBuilder tb2 = new Table.TestBuilder(); - tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd"); - tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf"); - tb2.column("2", "3", " 4 ", "5", " 9.2 ", null, "7.8.3"); + tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd", "\u0000 \u001f1\u0014"); + tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf", "\u0000 \u001f1\u0014"); + tb2.column("2", "3", " 4 ", "5", " 9.2 ", null, "7.8.3", "\u0000 \u001f1\u0014"); List result = new ArrayList<>(); try (Table origTable = tb2.build()) { @@ -129,20 +129,49 @@ void castToIntegerAnsiTest() { } } + @Test + void castToFloatsTrimTest() { + Table.TestBuilder tb = new Table.TestBuilder(); + tb.column(1.1f, 1.2f, 1.3f, 1.4f, 1.5f, null, null); + tb.column(1.1d, 1.2d, 1.3d, 1.4d, 1.5d, null, null); + try (Table expected = tb.build()) { + Table.TestBuilder tb2 = new Table.TestBuilder(); + tb2.column("1.1\u0000", "1.2\u0014", "1.3\u001f", + "\u0000\u00001.4\u0000", "1.5\u0000\u0020\u0000", "1.6\u009f", "1.7\u0021"); + tb2.column("1.1\u0000", "1.2\u0014", "1.3\u001f", + "\u0000\u00001.4\u0000", "1.5\u0000\u0020\u0000", "1.6\u009f", "1.7\u0021"); + + List result = new ArrayList<>(); + try (Table origTable = tb2.build()) { + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + ColumnVector string_col = origTable.getColumn(i); + result.add(CastStrings.toFloat(string_col, false, + expected.getColumn(i).getType())); + } + try (Table result_tbl = new Table( + result.toArray(new ColumnVector[result.size()]))) { + AssertUtils.assertTablesAreEqual(expected, result_tbl); + } + } finally { + result.forEach(ColumnVector::close); + } + } + } + @Test void castToDecimalTest() { Table.TestBuilder tb = new Table.TestBuilder(); - tb.decimal32Column(0,3, 9, 4, 2, 21, null, null); - tb.decimal64Column(0, 5l, 1l, 0l, 2l, 7l, null, null); - 
tb.decimal32Column(-1, 20, 30, 40, 51, 92, null, null); + tb.decimal32Column(0,3, 9, 4, 2, 21, null, null, 1); + tb.decimal64Column(0, 5l, 1l, 0l, 2l, 7l, null, null, 1l); + tb.decimal32Column(-1, 20, 30, 40, 51, 92, null, null, 10); try (Table expected = tb.build()) { int[] desiredPrecision = new int[]{2, 10, 3}; int[] desiredScale = new int[]{0, 0, -1}; Table.TestBuilder tb2 = new Table.TestBuilder(); - tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd"); - tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf"); - tb2.column("2", "3", " 4 ", "5.07", "9.23", null, "7.8.3"); + tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd", "\u0000 \u001f1\u0014"); + tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf", "\u0000 \u001f1\u0014"); + tb2.column("2", "3", " 4 ", "5.07", "9.23", null, "7.8.3", "\u0000 \u001f1\u0014"); List result = new ArrayList<>(); try (Table origTable = tb2.build()) { From dfd01e99c01f7ce7c974a84b99edade938870388 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:11:25 +0800 Subject: [PATCH 056/124] Update submodule cudf to 818b29d2ee49a7cc6de910951f64c36c55cc6d08 (#1990) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 475f5e5fcb..818b29d2ee 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 475f5e5fcb5b703ffdf1e491b7f2230c514a41fc +Subproject commit 818b29d2ee49a7cc6de910951f64c36c55cc6d08 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 82ff6c3e48..6c801f26ab 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -ab8091f5a6efb9c2c036933b8f2805e33e85501f +75805eabd928d36281f9d0da5babf46dc1ead5a6 From 1c5f48c32d47a5321fd385b0feb9346b967f0642 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Apr 2024 17:14:21 +0800 Subject: [PATCH 057/124] Update submodule cudf to 7804ba7f817b3fccf13b0084e2d7e0ac2257ff5a (#1992) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 818b29d2ee..7804ba7f81 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 818b29d2ee49a7cc6de910951f64c36c55cc6d08 +Subproject commit 7804ba7f817b3fccf13b0084e2d7e0ac2257ff5a From 113b796bb2acdd22f85bba3e487f2da8ad78f489 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 23 Apr 2024 18:23:11 -0500 Subject: [PATCH 058/124] Use RAPIDS cmake to locate GTest (#1994) Signed-off-by: Jason Lowe --- src/main/cpp/CMakeLists.txt | 3 ++- thirdparty/cudf | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 4e098adee5..d30abc0b8f 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -110,7 +110,8 @@ endif() # cudf if(BUILD_TESTS) - find_package(GTest REQUIRED) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) rapids_find_package(cudf REQUIRED COMPONENTS testing) else() rapids_find_package(cudf REQUIRED) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7804ba7f81..b16e5c25eb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ 
-Subproject commit 7804ba7f817b3fccf13b0084e2d7e0ac2257ff5a +Subproject commit b16e5c25eb7c38b26cb0d5b1e96047f0ef968c2b From 8ae80c01d0b035c3c78387ed79c679694dfd5b25 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:36:55 +0800 Subject: [PATCH 059/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#1993) * Update submodule cudf to 7341866495b03bdf3f01f8f4e57953741c77e7aa Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 8db1851106e3a250609294a81502f5abff801f67 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b16e5c25eb..8db1851106 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b16e5c25eb7c38b26cb0d5b1e96047f0ef968c2b +Subproject commit 8db1851106e3a250609294a81502f5abff801f67 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 6c801f26ab..a4956fc622 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -75805eabd928d36281f9d0da5babf46dc1ead5a6 +4f8fe5cc0b481ebb9919954616b8c6ed4aed19eb From ad558563623e2e793245c2c0463de66fe3f2f2f8 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Apr 2024 04:31:45 +0800 Subject: [PATCH 060/124] Update submodule cudf to 117eff6bf1eb8a46c597fd8f9e76a22fa363f03a (#1995) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8db1851106..117eff6bf1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8db1851106e3a250609294a81502f5abff801f67 +Subproject commit 117eff6bf1eb8a46c597fd8f9e76a22fa363f03a diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index a4956fc622..2722e393f7 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -4f8fe5cc0b481ebb9919954616b8c6ed4aed19eb +e1fbcd956fa9184c23860747410f0bda6251b2f5 From c72b9fed7b96fd2046a9eecb3da4548d2d742de0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Apr 2024 10:36:06 +0800 Subject: [PATCH 061/124] Update submodule cudf to 70a5b2bda500fe46cd14860b4e2ca0109893c434 (#1996) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 117eff6bf1..70a5b2bda5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 117eff6bf1eb8a46c597fd8f9e76a22fa363f03a +Subproject commit 70a5b2bda500fe46cd14860b4e2ca0109893c434 From 99feacebbaaeb19395061547a3d143cfb5ecd0bd Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Apr 2024 23:14:32 +0800 Subject: [PATCH 062/124] Update submodule cudf to 4dc9ebbfe5b2a22949c5f24114918e4369d055cd (#1998) Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 70a5b2bda5..4dc9ebbfe5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 70a5b2bda500fe46cd14860b4e2ca0109893c434 +Subproject commit 4dc9ebbfe5b2a22949c5f24114918e4369d055cd From cd85697fb55aee7cbd9009917194ec87783be8b5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Apr 2024 05:21:09 +0800 Subject: [PATCH 063/124] Update submodule cudf to 65c2b53602d70f7f50c7dd7544ca0fd07ac8b455 (#1999) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4dc9ebbfe5..65c2b53602 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4dc9ebbfe5b2a22949c5f24114918e4369d055cd +Subproject commit 65c2b53602d70f7f50c7dd7544ca0fd07ac8b455 From d78cda9d12eaba7c3a7bf59571409b9a5b167863 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:30:01 +0800 Subject: [PATCH 064/124] Update submodule cudf to c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa (#2001) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 65c2b53602..c62c5f69ca 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 65c2b53602d70f7f50c7dd7544ca0fd07ac8b455 +Subproject commit c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa From 2fc70bdd8d658cc4849343dc05edc69c3397cffe Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:59:20 -0400 Subject: [PATCH 065/124] URI Path Parsing (#1997) * First pass at path parsing. Failing java tests on nulls Signed-off-by: Mike Wilson * Fix edge cases Signed-off-by: Paul Mattione * Update copyrights, minor formatting, lone fragment should be invalid Signed-off-by: Paul Mattione --------- Signed-off-by: Mike Wilson Signed-off-by: Paul Mattione Co-authored-by: Mike Wilson --- src/main/cpp/src/ParseURIJni.cpp | 16 ++- src/main/cpp/src/parse_uri.cu | 26 +++-- src/main/cpp/src/parse_uri.hpp | 21 +++- src/main/cpp/tests/parse_uri.cpp | 110 +++++++++++++++++- .../com/nvidia/spark/rapids/jni/ParseURI.java | 14 ++- .../nvidia/spark/rapids/jni/ParseURITest.java | 27 ++++- 6 files changed, 195 insertions(+), 19 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 354d47c424..91b898048b 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -92,4 +92,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWith } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parsePath(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_path(*input).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 0e6ea2690d..398c033c3a 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -582,7 +582,9 @@ uri_parts __device__ validate_uri(const char* str, } // if the first ':' is after the other tokens, this doesn't have a scheme or it is invalid - if (col != -1 && (slash == -1 || col < slash) && (hash == -1 || col < hash)) { + bool const has_scheme = + (col != -1) && ((slash == -1) || (col < slash)) && ((hash == -1) || (col < hash)); + if (has_scheme) { // we have a scheme up to the : ret.scheme = {str, col}; if (!validate_scheme(ret.scheme)) { @@ -600,9 +602,12 @@ uri_parts __device__ validate_uri(const char* str, slash -= skip; } - // no more string to parse is an error + // no more string to parse is generally an error, unless we had no scheme if (len <= 0) { - ret.valid = 0; + // If we had a scheme then this is entirely invalid. + // If no scheme then URI is entirely empty or we only had a fragment + // This is equivalent to having a path that is present but empty, so mark it ok + ret.valid = (static_cast(!has_scheme) << static_cast(URI_chunks::PATH)); return ret; } @@ -655,13 +660,6 @@ uri_parts __device__ validate_uri(const char* str, next_slash == -1 ? question < 0 ? len - 2 : question - 2 : next_slash - 2}; if (next_slash > 0) { ret.path = {str + next_slash, path_len - next_slash}; } - if (next_slash == -1 && ret.authority.size_bytes() == 0 && ret.query.size_bytes() == 0 && - ret.fragment.size_bytes() == 0) { - // invalid! - but spark like to return things as long as you don't have illegal characters - // ret.valid = 0; - return ret; - } - if (ret.authority.size_bytes() > 0) { auto ipv6_address = ret.authority.size_bytes() > 2 && *ret.authority.begin() == '['; if (!validate_authority(ret.authority, ipv6_address)) { @@ -729,6 +727,7 @@ uri_parts __device__ validate_uri(const char* str, // path with no authority ret.path = {str, path_len}; } + if (!validate_path(ret.path)) { ret.valid = 0; return ret; @@ -1003,4 +1002,11 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const return detail::parse_uri(input, detail::URI_chunks::QUERY, query_match, stream, mr); } +std::unique_ptr parse_uri_to_path(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::parse_uri(input, detail::URI_chunks::PATH, std::nullopt, stream, mr); +} } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index 004d800ddb..39add300f7 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ namespace spark_rapids_jni { /** - * @brief Parse protocol and copy from the input string column to the output char buffer. + * @brief Parse protocol and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -40,7 +40,7 @@ std::unique_ptr parse_uri_to_protocol( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Parse host and copy from the input string column to the output char buffer. + * @brief Parse host and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -53,7 +53,7 @@ std::unique_ptr parse_uri_to_host( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Parse query and copy from the input string column to the output char buffer. + * @brief Parse query and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -95,4 +95,17 @@ std::unique_ptr parse_uri_to_query( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Parse path and copy from the input string column to the output string column. + * + * @param input Input string column of URIs to parse + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column + * @return std::unique_ptr String column of paths parsed. + */ +std::unique_ptr parse_uri_to_path( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index a042ff46e9..4c7cf6446a 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -25,6 +25,7 @@ struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; struct ParseURIHostTests : public cudf::test::BaseFixture {}; struct ParseURIQueryTests : public cudf::test::BaseFixture {}; +struct ParseURIPathTests : public cudf::test::BaseFixture {}; enum class test_types { SIMPLE, @@ -79,8 +80,7 @@ cudf::test::strings_column_wrapper get_test_data(test_types t) "http://[fe80::7:8%eth0]", "http://[fe80::7:8%1]", "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", - "www.nvidia.com:8100/servlet/" - "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "www.nvidia.com:8100/servlet/impc.DisplayCredits?primekey_in=2000041100:05:14115240636", "https://nvidia.com/2Ru15Ss ", "http://www.nvidia.com/plugins//##", "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" @@ -407,3 +407,109 @@ TEST_F(ParseURIQueryTests, Queries) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } } + +TEST_F(ParseURIPathTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/s/uri", + "", + "/to/a/cool/file", + "/path/to/file", + "/www.nvidia.com", + "", + "/network/path/to/file", + "nvidia.com", + "www.nvidia.com/s/uri"}, + {1, 1, 1, 1, 1, 0, 1, 1, 1}); + + cudf::test::print(result->view()); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"/https&", + "//www.nvidia.com", + "", + "", + "", + "/absolute/path", + "", + "", + "", + "", + "/absolute/path", + "", + "", + "/q/This%20is%20a%20query", + "", + "", + "", + "", + "", + "/file;param", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "/", + "/", + "/"}, + {1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}); + + cudf::test::print(result->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, IP6) +{ + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"", "", "", "", "", "", "", "/path/to/file", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, IP4) +{ + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/", "/", "/", "/", "/", "/path/to/file"}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, UTF8) +{ + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/%4EV%49%44%49%41", "", "/123", ""}, + {1, 1, 1, 0}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 6de84ea519..6b71416dcb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,9 +85,21 @@ public static ColumnVector parseURIQueryWithColumn(ColumnView uriColumn, ColumnV return new ColumnVector(parseQueryWithColumn(uriColumn.getNativeView(), queryColumn.getNativeView())); } + /** + * Parse path for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @return A string column with the URI path extracted. 
+ */ + public static ColumnVector parseURIPath(ColumnView uriColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parsePath(uriColumn.getNativeView())); + } + private static native long parseProtocol(long inputColumnHandle); private static native long parseHost(long inputColumnHandle); private static native long parseQuery(long inputColumnHandle); private static native long parseQueryWithLiteral(long inputColumnHandle, String query); private static native long parseQueryWithColumn(long inputColumnHandle, long queryColumnHandle); + private static native long parsePath(long inputColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index ffe7e9e946..1ddf588b02 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -159,6 +159,27 @@ void testQuery(String[] testData, String[] params) { } } + void testPath(String[] testData) { + String[] expectedPathStrings = new String[testData.length]; + for (int i=0; i Date: Sat, 27 Apr 2024 04:32:17 +0800 Subject: [PATCH 066/124] Update submodule cudf to 79cd473f8ec18d1f0abed3faa6dd8d61f54bf384 (#2002) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c62c5f69ca..79cd473f8e 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa +Subproject commit 79cd473f8ec18d1f0abed3faa6dd8d61f54bf384 From f3d5933601860c766a15b34968fa79af21ecaa86 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 27 Apr 2024 11:11:09 +0800 Subject: [PATCH 067/124] Update submodule cudf to 064dd7b02166cc67e882b708d66621bc3fafd70b (#2003) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 79cd473f8e..064dd7b021 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 79cd473f8ec18d1f0abed3faa6dd8d61f54bf384 +Subproject commit 064dd7b02166cc67e882b708d66621bc3fafd70b From 2ca29f720377b9bd7e94a6e0aedd8f5db057aeea Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 29 Apr 2024 13:54:19 +0800 Subject: [PATCH 068/124] Remove Path key/subscript to save GPU memory usage to improve perf (#1987) Signed-off-by: Chong Gao Co-authored-by: Chong Gao --- src/main/cpp/src/get_json_object.cu | 508 ++++-------------- .../spark/rapids/jni/GetJsonObjectTest.java | 93 ++-- 2 files changed, 151 insertions(+), 450 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index c7c6c242b8..df1871d6aa 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -322,25 +322,12 @@ __device__ inline bool path_match_elements(path_instruction const* path_ptr, return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; } -__device__ inline bool path_match_elements(path_instruction const* 
path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1, - path_instruction_type path_type2, - path_instruction_type path_type3) +__device__ inline thrust::tuple path_match_index(path_instruction const* path_ptr, + size_t path_size) { - if (path_size < 4) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && - path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; -} - -__device__ inline thrust::tuple path_match_subscript_index( - path_instruction const* path_ptr, size_t path_size) -{ - auto match = path_match_elements( - path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX); + auto match = path_match_element(path_ptr, path_size, path_instruction_type::INDEX); if (match) { - return thrust::make_tuple(true, path_ptr[1].index); + return thrust::make_tuple(true, path_ptr[0].index); } else { return thrust::make_tuple(false, 0); } @@ -357,301 +344,18 @@ __device__ inline thrust::tuple path_match_named( } } -__device__ inline thrust::tuple path_match_subscript_index_subscript_wildcard( +__device__ inline thrust::tuple path_match_index_wildcard( path_instruction const* path_ptr, size_t path_size) { - auto match = path_match_elements(path_ptr, - path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::INDEX, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD); + auto match = path_match_elements( + path_ptr, path_size, path_instruction_type::INDEX, path_instruction_type::WILDCARD); if (match) { - return thrust::make_tuple(true, path_ptr[1].index); + return thrust::make_tuple(true, path_ptr[0].index); } else { return thrust::make_tuple(false, 0); } } -/** - * - * The following commented function is recursive version, - * The next function below is the rewritten version, - * Keep version here is for review purpuse, because rewritten version(iterative) - * is not human friendly. 
- * - */ -// __device__ bool evaluate_path(json_parser& p, -// json_generator& g, -// write_style style, -// path_instruction const* path_ptr, -// int path_size) -// { -// auto token = p.get_current_token(); - -// // case (VALUE_STRING, Nil) if style == RawStyle -// // case path 1 -// if (json_token::VALUE_STRING == token && path_is_empty(path_size) && -// style == write_style::raw_style) { -// // there is no array wildcard or slice parent, emit this string without -// // quotes write current string in parser to generator -// g.write_raw(p); -// return true; -// } -// // case (START_ARRAY, Nil) if style == FlattenStyle -// // case path 2 -// else if (json_token::START_ARRAY == token && path_is_empty(path_size) && -// style == write_style::flatten_style) { -// // flatten this array into the parent -// bool dirty = false; -// while (json_token::END_ARRAY != p.next_token()) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0); -// } -// return dirty; -// } -// // case (_, Nil) -// // case path 3 -// else if (path_is_empty(path_size)) { -// // general case: just copy the child tree verbatim -// return g.copy_current_structure(p); -// } -// // case (START_OBJECT, Key :: xs) -// // case path 4 -// else if (json_token::START_OBJECT == token && -// path_match_element(path_ptr, path_size, path_instruction_type::KEY)) { -// bool dirty = false; -// while (json_token::END_OBJECT != p.next_token()) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// if (dirty) { -// // once a match has been found we can skip other fields -// if (!p.try_skip_children()) { -// // JSON validation check -// return false; -// } -// } else { -// dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } -// } -// return dirty; -// } -// // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) -// // case path 5 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD)) { -// // special handling for the non-structure preserving double wildcard -// // behavior in Hive -// bool dirty = false; -// g.write_start_array(); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// dirty |= path_evaluator::evaluate_path( -// p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); -// } -// g.write_end_array(); -// return dirty; -// } -// // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle -// // case path 6 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD) && -// style != write_style::quoted_style) { -// // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array -// write_style next_style = write_style::raw_style; -// switch (style) { -// case write_style::raw_style: next_style = write_style::quoted_style; break; -// case write_style::flatten_style: next_style = write_style::flatten_style; break; -// case write_style::quoted_style: next_style = write_style::quoted_style; // never happen -// } - -// // temporarily buffer child matches, the emitted json will need to be -// // modified slightly if there is only a single element written - -// int dirty = 0; -// // create a child generator with hide outer array tokens mode. -// auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); - -// // Note: child generator does not actually write the outer start array -// // token into buffer it only updates internal nested state -// child_g.write_start_array(); - -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // track the number of array elements and only emit an outer array if -// // we've written more than one element, this matches Hive's behavior -// dirty += -// (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 1 -// : -// 0); -// } - -// // Note: child generator does not actually write the outer end array token -// // into buffer it only updates internal nested state -// child_g.write_end_array(); - -// char* child_g_start = child_g.get_output_start_position(); -// size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] - -// if (dirty > 1) { -// // add outer array tokens -// g.write_child_raw_value(child_g_start, child_g_len, true); -// } else if (dirty == 1) { -// // remove outer array tokens -// g.write_child_raw_value(child_g_start, child_g_len, false); -// } // else do not write anything - -// return dirty > 0; -// } -// // case (START_ARRAY, Subscript :: Wildcard :: xs) -// // case path 7 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD)) { -// bool dirty = false; -// g.write_start_array(); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // wildcards can have multiple matches, continually update the dirty -// // count -// dirty |= path_evaluator::evaluate_path( -// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); -// } -// g.write_end_array(); - -// return dirty; -// } -// /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ -// // case path 8 -// else if (json_token::START_ARRAY == token && -// thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) -// { -// int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, -// path_size)); p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// int i = idx; -// while (i >= 0) { -// if (p.get_current_token() == json_token::END_ARRAY) { -// // terminate, nothing has been written -// return false; -// } -// if (0 == i) { -// bool dirty = path_evaluator::evaluate_path( -// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // advance the 
token stream to the end of the array -// if (!p.try_skip_children()) { return false; } -// } -// return dirty; -// } else { -// // i > 0 -// if (!p.try_skip_children()) { return false; } - -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } -// } -// --i; -// } -// // path parser guarantees idx >= 0 -// // will never reach to here -// return false; -// } -// // case (START_ARRAY, Subscript :: Index(idx) :: xs) -// // case path 9 -// else if (json_token::START_ARRAY == token && -// thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { -// int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// int i = idx; -// while (i >= 0) { -// if (p.get_current_token() == json_token::END_ARRAY) { -// // terminate, nothing has been written -// return false; -// } -// if (0 == i) { -// bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // advance the token stream to the end of the array -// if (!p.try_skip_children()) { return false; } -// } -// return dirty; -// } else { -// // i > 0 -// if (!p.try_skip_children()) { return false; } - -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } -// } -// --i; -// } -// // path parser guarantees idx >= 0 -// // will never reach to here -// return false; -// } -// // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name -// // case path 10 -// else if (json_token::FIELD_NAME == token && -// thrust::get<0>(path_match_named(path_ptr, path_size)) && -// p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) { -// if (p.next_token() != json_token::VALUE_NULL) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } else { -// return false; -// } -// } -// // case (FIELD_NAME, Wildcard :: xs) -// // case path 11 -// else if (json_token::FIELD_NAME == token && -// path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) { -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } -// // case _ => -// // case path 12 -// else { -// if (!p.try_skip_children()) { return false; } -// return false; -// } -// } - /** * * This function is rewritten from above commented recursive function. 
@@ -696,7 +400,7 @@ __device__ bool evaluate_path(json_parser& p, // There is a same constant in JSONUtil.java, keep them consistent when changing // Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, // or GPU reports cudaErrorIllegalAddress - constexpr int max_path_depth = 16; + constexpr int max_path_depth = 8; // define stack; plus 1 indicates root context task needs an extra memory context stack[max_path_depth + 1]; @@ -770,37 +474,85 @@ __device__ bool evaluate_path(json_parser& p, ctx.dirty = 1; ctx.task_is_done = true; } - // case (START_OBJECT, Key :: xs) + // case (START_OBJECT, Named :: xs) // case path 4 else if (json_token::START_OBJECT == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) { - if (json_token::END_OBJECT != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - + thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size))) { + if (!ctx.is_first_enter) { + // 2st enter + // skip the following children after the expect if (ctx.dirty > 0) { - // once a match has been found we can skip other fields - if (!p.try_skip_children()) { + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // skip FIELD_NAME token + p.next_token(); // JSON validation check - return false; + if (json_token::ERROR == p.get_current_token()) { return false; } + + // skip value of FIELD_NAME + if (!p.try_skip_children()) { + // JSON validation check + return false; + } } + ctx.task_is_done = true; } else { - // need to try more children - push_context( - p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + return false; } } else { - ctx.task_is_done = true; + // below is 1st enter + ctx.is_first_enter = false; + // match first mached children with expected name + bool found_expected_child = false; + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // need to try more children + auto match_named = path_match_named(ctx.path_ptr, ctx.path_size); + auto named = thrust::get<1>(match_named); + // current token is FIELD_NAME + if (p.match_current_field_name(named)) { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // meets null token, it's not expected, return false + if (json_token::VALUE_NULL == p.get_current_token()) { return false; } + // push sub task; sub task will update the result of path 4 + push_context( + p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + found_expected_child = true; + break; + } else { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // current child is not expected, skip current child + if (!p.try_skip_children()) { + // JSON validation check + return false; + } + } + } + if (!found_expected_child) { + // did not find any expected sub child + ctx.task_is_done = true; + ctx.dirty = false; + } } } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) + // case (START_ARRAY, Wildcard :: Wildcard :: xs) // case path 5 else if (json_token::START_ARRAY == ctx.token && path_match_elements(ctx.path_ptr, ctx.path_size, - path_instruction_type::SUBSCRIPT, 
path_instruction_type::WILDCARD, - path_instruction_type::SUBSCRIPT, path_instruction_type::WILDCARD)) { // special handling for the non-structure preserving double wildcard // behavior in Hive @@ -816,20 +568,17 @@ __device__ bool evaluate_path(json_parser& p, 5, ctx.g, write_style::flatten_style, - ctx.path_ptr + 4, - ctx.path_size - 4); + ctx.path_ptr + 2, + ctx.path_size - 2); } else { ctx.g.write_end_array(); ctx.task_is_done = true; } } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle // case path 6 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD) && + path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD) && ctx.style != write_style::quoted_style) { // retain Flatten, otherwise use Quoted... cannot use Raw within an array write_style next_style = write_style::raw_style; @@ -859,11 +608,10 @@ __device__ bool evaluate_path(json_parser& p, // track the number of array elements and only emit an outer array if // we've written more than one element, this matches Hive's behavior push_context( - p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2); + p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 1, ctx.path_size - 1); } else { char* child_g_start = child_g.get_output_start_position(); size_t child_g_len = child_g.get_output_len(); - if (ctx.dirty > 1) { // add outer array tokens ctx.g.write_child_raw_value( @@ -877,18 +625,14 @@ __device__ bool evaluate_path(json_parser& p, } // else do not write anything } } - // case (START_ARRAY, Subscript :: Wildcard :: xs) + // case (START_ARRAY, Wildcard :: xs) // case path 7 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { + path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { if (ctx.is_first_enter) { ctx.is_first_enter = false; ctx.g.write_start_array(); } - if (p.next_token() != json_token::END_ARRAY) { // JSON validation check if (json_token::ERROR == p.get_current_token()) { return false; } @@ -899,20 +643,18 @@ __device__ bool evaluate_path(json_parser& p, 7, ctx.g, write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); + ctx.path_ptr + 1, + ctx.path_size - 1); } else { ctx.g.write_end_array(); ctx.task_is_done = true; } } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ // case path 8 else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size)); + thrust::get<0>(path_match_index_wildcard(ctx.path_ptr, ctx.path_size))) { + int idx = thrust::get<1>(path_match_index_wildcard(ctx.path_ptr, ctx.path_size)); p.next_token(); // JSON validation check @@ -940,14 +682,14 @@ __device__ bool evaluate_path(json_parser& p, 8, ctx.g, write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); + ctx.path_ptr + 1, + ctx.path_size - 1); } - // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case (START_ARRAY, Index(idx) :: xs) // case path 9 else if (json_token::START_ARRAY == ctx.token && - 
thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size)); + thrust::get<0>(path_match_index(ctx.path_ptr, ctx.path_size))) { + int idx = thrust::get<1>(path_match_index(ctx.path_ptr, ctx.path_size)); p.next_token(); // JSON validation check @@ -971,32 +713,7 @@ __device__ bool evaluate_path(json_parser& p, // i == 0 push_context( - p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (json_token::FIELD_NAME == ctx.token && - thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) && - p.match_current_field_name( - thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) { - if (p.next_token() != json_token::VALUE_NULL) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_context( - p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } else { - return false; - } - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (json_token::FIELD_NAME == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_context( - p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); } // case _ => // case path 12 @@ -1024,22 +741,22 @@ __device__ bool evaluate_path(json_parser& p, // never happen } // path 2: case (START_ARRAY, Nil) if style == FlattenStyle - // path 5: case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // path 7: case (START_ARRAY, Subscript :: Wildcard :: xs) + // path 5: case (START_ARRAY, Wildcard :: Wildcard :: xs) + // path 7: case (START_ARRAY, Wildcard :: xs) else if (2 == ctx.case_path || 5 == ctx.case_path || 7 == ctx.case_path) { // collect result from child task p_ctx.dirty += ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (START_OBJECT, Key :: xs) + // case (START_OBJECT, Named :: xs) // case path 4 else if (4 == ctx.case_path) { - if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; } + p_ctx.dirty = ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle // case path 6 else if (6 == ctx.case_path) { // collect result from child task @@ -1047,9 +764,9 @@ __device__ bool evaluate_path(json_parser& p, // update child generator for parent task p_ctx.child_g = ctx.g; } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ // case path 8 - // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case (START_ARRAY, Index(idx) :: xs) // case path 9 else if (8 == ctx.case_path || 9 == ctx.case_path) { // collect result from child task @@ -1067,26 +784,6 @@ __device__ bool evaluate_path(json_parser& p, // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (10 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - 
p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (11 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - } // case path 3: case (_, Nil) // case path 12: case _ => // others @@ -1118,17 +815,12 @@ rmm::device_uvector construct_path_commands( auto const& [type, name, index] = inst; switch (type) { case path_instruction_type::SUBSCRIPT: - path_commands.emplace_back(path_instruction{path_instruction_type::SUBSCRIPT}); + case path_instruction_type::KEY: + // skip SUBSCRIPT and KEY to save stack size in `evaluate_path` break; case path_instruction_type::WILDCARD: path_commands.emplace_back(path_instruction{path_instruction_type::WILDCARD}); break; - case path_instruction_type::KEY: - path_commands.emplace_back(path_instruction{path_instruction_type::KEY}); - path_commands.back().name = - cudf::string_view(all_names_scalar.data() + name_pos, name.size()); - name_pos += name.size(); - break; case path_instruction_type::INDEX: path_commands.emplace_back(path_instruction{path_instruction_type::INDEX}); path_commands.back().index = index; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index ea23c4c9ba..ff2d935cc3 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -28,7 +28,7 @@ public class GetJsonObjectTest { @Test void getJsonObjectTest() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k") }; + namedPath("k") }; try (ColumnVector jsonCv = ColumnVector.fromStrings( "{\"k\": \"v\"}"); ColumnVector expected = ColumnVector.fromStrings( @@ -44,7 +44,7 @@ void getJsonObjectTest() { @Test void getJsonObjectTest2() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), + namedPath("k1_111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111") }; @@ -69,7 +69,7 @@ void getJsonObjectTest2() { @Test void getJsonObjectTest3() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k1"), keyPath(), namedPath("k2") + namedPath("k1"), namedPath("k2") }; String JSON = "{\"k1\":{\"k2\":\"v2\"}}"; String expectedStr = "v2"; @@ -89,14 +89,14 @@ void getJsonObjectTest3() { @Test void getJsonObjectTest4() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k1"), - keyPath(), namedPath("k2"), - keyPath(), namedPath("k3"), - keyPath(), namedPath("k4"), - keyPath(), namedPath("k5"), - keyPath(), namedPath("k6"), - keyPath(), namedPath("k7"), - keyPath(), namedPath("k8") + namedPath("k1"), + namedPath("k2"), + namedPath("k3"), + namedPath("k4"), + namedPath("k5"), + namedPath("k6"), + namedPath("k7"), + namedPath("k8") }; String JSON = "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":\"v8\"}}}}}}}}"; @@ -117,7 +117,7 @@ void getJsonObjectTest4() { @Test void getJsonObjectTest_Baidu_unescape_backslash() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("URdeosurl") + namedPath("URdeosurl") }; String JSON = 
"{\"brand\":\"ssssss\",\"duratRon\":15,\"eqTosuresurl\":\"\",\"RsZxarthrl\":false,\"xonRtorsurl\":\"\",\"xonRtorsurlstOTe\":0,\"TRctures\":[{\"RxaGe\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-196588744s840172444s-773690137.zTG\"}],\"Toster\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-196588744s840172444s-773690137.zTG\",\"reserUed\":{\"bRtLate\":391.79,\"xooUZRke\":26876,\"nahrlIeneratRonNOTe\":0,\"useJublRc\":6,\"URdeoRd\":821284086},\"tRtle\":\"ssssssssssmMsssssssssssssssssss\",\"url\":\"s{storehrl}\",\"usersTortraRt\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-6971178959s-664926866s-6096674871.zTG\",\"URdeosurl\":\"http:\\/\\/nadURdeo2.baRdu.cox\\/5fa3893aed7fc0f8231dab7be23efc75s820s6240.xT3\",\"URdeoRd\":821284086}"; @@ -138,7 +138,7 @@ void getJsonObjectTest_Baidu_unescape_backslash() { @Test void getJsonObjectTest_Baidu_get_unexist_field_name() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("Vgdezsurl") + namedPath("Vgdezsurl") }; String JSON = "{\"brand\":\"ssssss\",\"duratgzn\":17,\"eSyzsuresurl\":\"\",\"gswUartWrl\":false,\"Uzngtzrsurl\":\"\",\"UzngtzrsurlstJye\":0,\"ygctures\":[{\"gUaqe\":\"Ittys:\\/\\/feed-gUaqe.bagdu.czU\\/0\\/ygc\\/63025364s-376461312s7528698939.Qyq\"}],\"yzster\":\"Ittys:\\/\\/feed-gUaqe.bagdu.czU\\,\"url\":\"s{stHreqrl}\",\"usersPHrtraIt\":\"LttPs:\\/\\/feed-IUaxe.baIdu.cHU\\/0\\/PIc\\/-1043913002s489796992s-1505641721.Pnx\",\"kIdeHsurl\":\"LttP:\\/\\/nadkIdeH9.baIdu.cHU\\/4d7d308bd7c04e63069fd343adfa792as1790s1080.UP3\",\"kIdeHId\":852890923}"; @@ -251,7 +251,7 @@ void getJsonObjectTest_Test_leading_zeros() { @Test void getJsonObjectTest_Test_index() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1) + indexPath(1) }; String JSON1 = "[ [0, 1, 2] , [10, [11], [121, 122, 123], 13] , [20, 21, 22]]"; @@ -271,7 +271,7 @@ void getJsonObjectTest_Test_index() { @Test void getJsonObjectTest_Test_index_index() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), indexPath(2) + indexPath(1), indexPath(2) }; String JSON1 = "[ [0, 1, 2] , [10, [11], [121, 122, 123], 13] , [20, 21, 22]]"; @@ -315,7 +315,7 @@ void getJsonObjectTest_Test_case_path1() { @Test void getJsonObjectTest_Test_case_path2() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath() + wildcardPath(), wildcardPath() }; String JSON1 = "[ [11, 12], [21, [221, [2221, [22221, 22222]]]], [31, 32] ]"; @@ -355,7 +355,7 @@ void getJsonObjectTest_Test_case_path3() { @Test void getJsonObjectTest_Test_case_path4() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k") + namedPath("k") }; String JSON1 = "{ 'k' : 'v' }"; @@ -378,8 +378,8 @@ void getJsonObjectTest_Test_case_path4() { @Test void getJsonObjectTest_Test_case_path5() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath(), // $[*][*] - keyPath(), namedPath("k") + wildcardPath(), wildcardPath(), // $[*][*] + namedPath("k") }; // flatten the arrays, then query named path "k" @@ -402,7 +402,7 @@ void getJsonObjectTest_Test_case_path5() { @Test void getJsonObjectTest_Test_case_path6() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath() + wildcardPath() }; String JSON1 = 
"[1, [21, 22], 3]"; String expectedStr1 = "[1,[21,22],3]"; @@ -430,13 +430,13 @@ void getJsonObjectTest_Test_case_path6() { */ @Test void getJsonObjectTest_Test_case_path7() { - // subscriptPath(), wildcardPath() subscriptPath(), wildcardPath() will go to + // wildcardPath() wildcardPath() will go to // path5 - // so insert keyPath(), namedPath("k") + // so insert namedPath("k") JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), // path 6 - keyPath(), namedPath("k"), // path 4, path 10 - subscriptPath(), wildcardPath() // path 7 + wildcardPath(), // path 6 + namedPath("k"), // path 4, path 10 + wildcardPath() // path 7 }; String JSON1 = "[ {'k': [0, 1, 2]}, {'k': [10, 11, 12]}, {'k': [20, 21, 22]} ]"; @@ -459,7 +459,7 @@ void getJsonObjectTest_Test_case_path7() { @Test void getJsonObjectTest_Test_case_path8() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), wildcardPath() + indexPath(1), wildcardPath() }; String JSON1 = "[ [0], [10, 11, 12], [2] ]"; String expectedStr1 = "[10,11,12]"; @@ -479,7 +479,7 @@ void getJsonObjectTest_Test_case_path8() { @Test void getJsonObjectTest_Test_case_path9() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), indexPath(1), subscriptPath(), wildcardPath() + indexPath(1), indexPath(1), wildcardPath() }; String JSON1 = "[[0, 1, 2], [10, [111, 112, 113], 12], [20, 21, 22]]"; String expectedStr1 = "[111,112,113]"; @@ -501,7 +501,7 @@ void getJsonObjectTest_Test_case_path9() { @Test void getJsonObjectTest_Test_case_path10() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k"), subscriptPath(), indexPath(1) + namedPath("k"), indexPath(1) }; String JSON1 = "{'k' : [0,1,2]}"; String expectedStr1 = "1"; @@ -517,26 +517,24 @@ void getJsonObjectTest_Test_case_path10() { /** * Test case paths: - * case path 11: case (FIELD_NAME, Wildcard :: xs) + * case path 11: case (FIELD_NAME, Key :: Wildcard :: xs) * Refer to Spark code: * https://github.com/apache/spark/blob/v3.5.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala#L218 - * path sequence key, wildcard can test path 11, but parser can not produce this - * sequence. - * Note: Here use manually created key, wildcard sequence to test. 
+ * Can not produce this Paths: (key, wildcard) + * e.g.: Spark will produces (wildcard) path for path string $.*, instead of (key, wildcard) path + * Anyway, here is testing $.* */ @Test void getJsonObjectTest_Test_case_path11() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), wildcardPath() + wildcardPath() }; String JSON1 = "{'k' : [0,1,2]}"; - String expectedStr1 = "[0,1,2]"; String JSON2 = "{'k' : null}"; - String expectedStr2 = "null"; try ( ColumnVector jsonCv = ColumnVector.fromStrings(JSON1, JSON2); - ColumnVector expected = ColumnVector.fromStrings(expectedStr1, expectedStr2); + ColumnVector expected = ColumnVector.fromStrings(null, null); ColumnVector actual = JSONUtils.getJsonObject(jsonCv, query)) { assertColumnsAreEqual(expected, actual); } @@ -568,7 +566,7 @@ void getJsonObjectTest_Test_case_path12() { @Test void getJsonObjectTest_Test_insert_comma_insert_outer_array() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath() + wildcardPath(), wildcardPath(), wildcardPath() }; String JSON1 = "[ [11, 12], [21, 22]]"; String expectedStr1 = "[[11,12],[21,22]]"; @@ -582,13 +580,24 @@ void getJsonObjectTest_Test_insert_comma_insert_outer_array() { } } - private JSONUtils.PathInstructionJni keyPath() { - return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.KEY, "", -1); + /** + * Query: $[*][*][*] + */ + @Test + void getJsonObjectTest_15() { + JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { + namedPath("a") + }; + String JSON1 = "{'a':'v1'}"; + String JSON2 = "{'a':\"b\"c\"}"; + try ( + ColumnVector jsonCv = ColumnVector.fromStrings(JSON1, JSON2); + ColumnVector expected = ColumnVector.fromStrings("v1", null); + ColumnVector actual = JSONUtils.getJsonObject(jsonCv, query)) { + assertColumnsAreEqual(expected, actual); + } } - private JSONUtils.PathInstructionJni subscriptPath() { - return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.SUBSCRIPT, "", -1); - } private JSONUtils.PathInstructionJni wildcardPath() { return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.WILDCARD, "", -1); From 043c442e165cd10600222492b3867fcdb9544eaf Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 29 Apr 2024 15:34:21 +0800 Subject: [PATCH 069/124] Switch string to double in getJsonObject back to cudf (#2000) * Switch string to double in getJsonObject back to cudf Signed-off-by: Haoyang Li * cudf Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li --- src/main/cpp/src/json_parser.cuh | 9 +- src/main/cpp/src/string_to_float_cudf.cuh | 141 ---------------------- 2 files changed, 4 insertions(+), 146 deletions(-) delete mode 100644 src/main/cpp/src/string_to_float_cudf.cuh diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index ec0790aa6b..db258a876c 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -16,7 +16,6 @@ #pragma once #include "ftos_converter.cuh" -#include "string_to_float_cudf.cuh" #include #include @@ -1517,8 +1516,8 @@ class json_parser { // 12345678900000000000.0 => 1.23456789E19, 1E308 => 1.0E308 // 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 => 3.0E-4 // 1.0E309 => "Infinity", -1E309 => "-Infinity" - double d_value = spark_rapids_jni::detail::stod( - cudf::string_view(current_token_start_pos, number_token_len)); + double d_value = + 
cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1603,8 +1602,8 @@ class json_parser { return number_token_len; case json_token::VALUE_NUMBER_FLOAT: { // number normalization: - double d_value = spark_rapids_jni::detail::stod( - cudf::string_view(current_token_start_pos, number_token_len)); + double d_value = + cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: diff --git a/src/main/cpp/src/string_to_float_cudf.cuh b/src/main/cpp/src/string_to_float_cudf.cuh deleted file mode 100644 index 5a7824d495..0000000000 --- a/src/main/cpp/src/string_to_float_cudf.cuh +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include - -namespace spark_rapids_jni { -namespace detail { - -/** - * @brief This function converts the given string into a - * floating point double value. - * - * This will also map strings containing "NaN", "Inf", etc. - * to the appropriate float values. - * - * This function will also handle scientific notation format. - * - * This function is a copy of cudf::strings::detail::stod with - * the namespace changed to spark_rapids_jni::detail and fixed - * an overflow bug of `exp_ten`. It is a short-term solution to - * resolve a bug in get_json_object. We should remove this file - * once the bug is fixed in cudf in long term. - * This diff is `if (exp_ten >= 1e8) break;` - */ -__device__ inline double stod(cudf::string_view const& d_str) -{ - char const* in_ptr = d_str.data(); - char const* end = in_ptr + d_str.size_bytes(); - if (end == in_ptr) return 0.0; - double sign{1.0}; - if (*in_ptr == '-' || *in_ptr == '+') { - sign = (*in_ptr == '-' ? 
-1 : 1); - ++in_ptr; - } - - constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; - - // special strings: NaN, Inf - if ((in_ptr < end) && *in_ptr > '9') { - auto const inf_nan = cudf::string_view(in_ptr, static_cast(end - in_ptr)); - if (cudf::strings::detail::is_nan_str(inf_nan)) { return nan(""); } - if (cudf::strings::detail::is_inf_str(inf_nan)) { return sign * infinity; } - } - - // Parse and store the mantissa as much as we can, - // until we are about to exceed the limit of uint64_t - uint64_t digits = 0; - int exp_off = 0; - bool decimal = false; - while (in_ptr < end) { - char ch = *in_ptr; - if (ch == '.') { - decimal = true; - ++in_ptr; - continue; - } - if (ch < '0' || ch > '9') break; - if (digits > max_holding) - exp_off += (int)!decimal; - else { - digits = (digits * 10L) + static_cast(ch - '0'); - if (digits > max_holding) { - digits = digits / 10L; - exp_off += (int)!decimal; - } else - exp_off -= (int)decimal; - } - ++in_ptr; - } - if (digits == 0) { return sign * static_cast(0); } - - // check for exponent char - int exp_ten = 0; - int exp_sign = 1; - if (in_ptr < end) { - char ch = *in_ptr++; - if (ch == 'e' || ch == 'E') { - if (in_ptr < end) { - ch = *in_ptr; - if (ch == '-' || ch == '+') { - exp_sign = (ch == '-' ? -1 : 1); - ++in_ptr; - } - while (in_ptr < end) { - ch = *in_ptr++; - if (ch < '0' || ch > '9') break; - exp_ten = (exp_ten * 10) + (int)(ch - '0'); - if (exp_ten >= 1e8) break; - } - } - } - } - - int const num_digits = static_cast(log10(static_cast(digits))) + 1; - exp_ten *= exp_sign; - exp_ten += exp_off; - exp_ten += num_digits - 1; - if (exp_ten > std::numeric_limits::max_exponent10) { - return sign > 0 ? infinity : -infinity; - } - - double base = sign * static_cast(digits); - - exp_ten += 1 - num_digits; - // If 10^exp_ten would result in a subnormal value, the base and - // exponent should be adjusted so that 10^exp_ten is a normal value - auto const subnormal_shift = std::numeric_limits::min_exponent10 - exp_ten; - if (subnormal_shift > 0) { - // Handle subnormal values. Ensure that both base and exponent are - // normal values before computing their product. - base = base / exp10(static_cast(num_digits - 1 + subnormal_shift)); - exp_ten += num_digits - 1; // adjust exponent - auto const exponent = exp10(static_cast(exp_ten + subnormal_shift)); - return base * exponent; - } - - double const exponent = exp10(static_cast(std::abs(exp_ten))); - return exp_ten < 0 ? 
base / exponent : base * exponent; -} - -} // namespace detail -} // namespace spark_rapids_jni From 07d5acdedd788eaa80259ed1ae71739664fff346 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 30 Apr 2024 11:15:03 +0800 Subject: [PATCH 070/124] Update submodule cudf to ab5e3f3bc8924f3393ec839830865b57a4d309a3 (#2005) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 064dd7b021..ab5e3f3bc8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 064dd7b02166cc67e882b708d66621bc3fafd70b +Subproject commit ab5e3f3bc8924f3393ec839830865b57a4d309a3 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 2722e393f7..094ccb5f04 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -e1fbcd956fa9184c23860747410f0bda6251b2f5 +29c56c7c4f3ef6d74ef077190d9d39f803e2f7e4 From 68d9501c741877b81c4c30063026de3173859d1e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 1 May 2024 04:30:09 +0800 Subject: [PATCH 071/124] Update submodule cudf to f4ec1a49e8f04305c324cc03e5f8fbc275bf5c88 (#2006) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ab5e3f3bc8..f4ec1a49e8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ab5e3f3bc8924f3393ec839830865b57a4d309a3 +Subproject commit f4ec1a49e8f04305c324cc03e5f8fbc275bf5c88 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 476398676c..03cd7eb1dc 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "9e6db746f1a4a6361fb9fadf381f749dc52faaea", + "git_tag" : "bdb7a592fa2fa306209906417f964059b2cb5934", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 95472d49f965db693cfa152928b3e653a955f87d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 1 May 2024 10:32:46 +0800 Subject: [PATCH 072/124] Update submodule cudf to 2eeacb9f5f22a56458b644a93b8cbeacd4844472 (#2007) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f4ec1a49e8..2eeacb9f5f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f4ec1a49e8f04305c324cc03e5f8fbc275bf5c88 +Subproject commit 2eeacb9f5f22a56458b644a93b8cbeacd4844472 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 094ccb5f04..0ce01510d9 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -29c56c7c4f3ef6d74ef077190d9d39f803e2f7e4 +732b7887f9f93afeee68597333182dc0f51797ff diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 03cd7eb1dc..dfeebab56b 
100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -78,7 +78,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "56c53beb6fb0cafd265b7fcc3df78ae487811b22", + "git_tag" : "2101cb31d0210b609cd02c88f9b538e10881d91d", "git_url" : "https://github.com/NVIDIA/cuCollections.git", "version" : "0.0.1" }, @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "bdb7a592fa2fa306209906417f964059b2cb5934", + "git_tag" : "01ccf97d8ec197a1c56ad1447a05d23aa492da05", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 6332cd02d009ded5cf65a1d94cb8b8003d1b2b58 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 2 May 2024 05:15:36 +0800 Subject: [PATCH 073/124] Update submodule cudf to fe4b92cfa61a324b417f12760341f40e5db452eb (#2008) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2eeacb9f5f..fe4b92cfa6 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2eeacb9f5f22a56458b644a93b8cbeacd4844472 +Subproject commit fe4b92cfa61a324b417f12760341f40e5db452eb diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 0ce01510d9..8548c5906a 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -732b7887f9f93afeee68597333182dc0f51797ff +6301b3dbb03c302ad59299fcd58edbdb30b771a1 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index dfeebab56b..bb72ecb6ce 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "7b0231c8241164c8a0511cd737e7098cb7ccea3e", + "git_tag" : "83ae589bc5c4d903603bb90aa870a12c2fff0dbd", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, From 43f8cbceec6bee5c1fb275ab0b5a5cb8b92e7eaf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 2 May 2024 11:16:06 +0800 Subject: [PATCH 074/124] Update submodule cudf to 7458a6ecbf474e10a4a64f10833d71253f42af7b (#2009) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fe4b92cfa6..7458a6ecbf 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fe4b92cfa61a324b417f12760341f40e5db452eb +Subproject commit 7458a6ecbf474e10a4a64f10833d71253f42af7b diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 8548c5906a..c57867de91 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -6301b3dbb03c302ad59299fcd58edbdb30b771a1 +bc1c00605960b11727adf11eeff28d914006055d diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index bb72ecb6ce..6bafec9c53 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : 
"01ccf97d8ec197a1c56ad1447a05d23aa492da05", + "git_tag" : "9d0a29a4e2d76b2ec0437282373df690572c1dfc", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 692744a190e8234341d7108a9c8ab043dd0e74f0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 3 May 2024 04:30:44 +0800 Subject: [PATCH 075/124] Update submodule cudf to 2ee0219a8255beb7b21628648387e3284a0ee0bc (#2012) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7458a6ecbf..2ee0219a82 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7458a6ecbf474e10a4a64f10833d71253f42af7b +Subproject commit 2ee0219a8255beb7b21628648387e3284a0ee0bc diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 6bafec9c53..5f37dab6a0 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "83ae589bc5c4d903603bb90aa870a12c2fff0dbd", + "git_tag" : "5032f8bb74b5414ffa01c515c8056757a5010e48", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "9d0a29a4e2d76b2ec0437282373df690572c1dfc", + "git_tag" : "363e4c627e316c6de48d1a12003fee8ae408a5bc", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 353ed873a1811ffebde9fdeefbc9e51215d8e41d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 3 May 2024 11:16:49 +0800 Subject: [PATCH 076/124] Update submodule cudf to 81f8cdfdfb326afaee8177e4f40a607393b21b99 (#2013) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2ee0219a82..81f8cdfdfb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2ee0219a8255beb7b21628648387e3284a0ee0bc +Subproject commit 81f8cdfdfb326afaee8177e4f40a607393b21b99 From 05455fdd4b406910fb2888ff0fd40dc2b2e163a6 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 3 May 2024 22:32:54 +0800 Subject: [PATCH 077/124] Update submodule cudf to 35d77afab14d4d5a5faec321bdb2d87112c07eb2 (#2014) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 81f8cdfdfb..35d77afab1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 81f8cdfdfb326afaee8177e4f40a607393b21b99 +Subproject commit 35d77afab14d4d5a5faec321bdb2d87112c07eb2 From 6e48d2d968baabdef040a7e850c8af2c15620e74 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 4 May 2024 04:31:09 +0800 Subject: [PATCH 078/124] Update submodule cudf to 09f8ff39728b774f1bb8957d76ed3b47e00c3708 (#2016) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/thirdparty/cudf b/thirdparty/cudf index 35d77afab1..09f8ff3972 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 35d77afab14d4d5a5faec321bdb2d87112c07eb2 +Subproject commit 09f8ff39728b774f1bb8957d76ed3b47e00c3708 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index c57867de91..4d666d92f4 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -bc1c00605960b11727adf11eeff28d914006055d +3771f878c0ad7b3806ab574bca43488991077144 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 5f37dab6a0..9b967e7caa 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "363e4c627e316c6de48d1a12003fee8ae408a5bc", + "git_tag" : "beab71aa5bd980e383e42b816aec321361fad4f4", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 6ae9a7fac66fddf9dd44f631bf20134588c024a5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 4 May 2024 11:16:00 +0800 Subject: [PATCH 079/124] Update submodule cudf to bee2a38b63fb5e4ef90f243a3c51cf23fbf3c984 (#2017) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 09f8ff3972..bee2a38b63 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 09f8ff39728b774f1bb8957d76ed3b47e00c3708 +Subproject commit bee2a38b63fb5e4ef90f243a3c51cf23fbf3c984 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 9b967e7caa..9fde1283bd 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "beab71aa5bd980e383e42b816aec321361fad4f4", + "git_tag" : "fdef5f9663514f6bd625a468a25ec8096fbfea7d", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 1212dac7e698154e1741f0b8d432230cd5f6139d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 4 May 2024 16:29:45 +0800 Subject: [PATCH 080/124] Update submodule cudf to 23bb2ed156d164b59e608e7e791c74db5cb4bce8 (#2018) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index bee2a38b63..23bb2ed156 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit bee2a38b63fb5e4ef90f243a3c51cf23fbf3c984 +Subproject commit 23bb2ed156d164b59e608e7e791c74db5cb4bce8 From bb0795130eb859cd40c205ef3e745ee3ecbdf4e5 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 6 May 2024 11:49:09 +0800 Subject: [PATCH 081/124] Get json object comments address (#1924) * wip, check point before removing template Signed-off-by: Haoyang Li * wip, check point before device_span Signed-off-by: Haoyang Li * wip, check point before device_span in evaluate_path Signed-off-by: Haoyang Li * wip, check point before separate size and writing value Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * checkpoint after merge latest code Signed-off-by: Haoyang Li * checkpoint Signed-off-by: Haoyang Li * checkpoint after 
apply comment address before merge Signed-off-by: Haoyang Li * checkpoint, some size/output seperation Signed-off-by: Haoyang Li * checkpoint: Move 2 variablesstring_token_utf8_bytes, bytes_diff_for_escape_writing. Signed-off-by: Haoyang Li * Update src/main/cpp/src/json_parser.cuh Co-authored-by: Chong Gao * checkpoint for some benchmark Signed-off-by: Haoyang Li * delete json, cudf Signed-off-by: Haoyang Li * delete json, cudf Signed-off-by: Haoyang Li * checkpoint Signed-off-by: Haoyang Li * checkpoint: address comments Signed-off-by: Haoyang Li * minor refactor Signed-off-by: Haoyang Li * Split 'Switch string to double in getJsonObject back to cudf' out Signed-off-by: Haoyang Li * refactor Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * fix test failed Signed-off-by: Haoyang Li * fix try_skip_unicode bug Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Chong Gao --- src/main/cpp/src/ftos_converter.cuh | 44 +- src/main/cpp/src/get_json_object.cu | 165 +++--- src/main/cpp/src/get_json_object.hpp | 2 - src/main/cpp/src/json_parser.cuh | 493 ++++++++---------- .../spark/rapids/jni/GetJsonObjectTest.java | 2 + 5 files changed, 322 insertions(+), 384 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index f2e8ce0006..bffa528e18 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -807,7 +807,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha // Values in the interval [1E-3, 1E7) are special. if (scientificNotation) { // Print in the format x.xxxxxE-yy. - for (int i = 0; i < olength - 1; ++i) { + for (auto i = 0; i < olength - 1; ++i) { int const c = output % 10; output /= 10; result[index + olength - i] = (char)('0' + c); @@ -836,23 +836,23 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha // Decimal dot is before any of the digits. result[index++] = '0'; result[index++] = '.'; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { result[index++] = '0'; } int current = index; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= static_cast(olength)) { // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[index++] = '0'; } result[index++] = '.'; @@ -860,7 +860,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha } else { // Decimal dot is somewhere between the digits. int current = index + 1; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (olength - i - 1 == exp) { result[current + olength - i - 1] = '.'; current--; @@ -926,7 +926,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha if (scientificNotation) { // Print in the format x.xxxxxE-yy. 
- for (int i = 0; i < olength - 1; i++) { + for (auto i = 0; i < olength - 1; i++) { int c = output % 10; output /= 10; result[index + olength - i] = (char)('0' + c); @@ -950,23 +950,23 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha // Decimal dot is before any of the digits. result[index++] = '0'; result[index++] = '.'; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { result[index++] = '0'; } int current = index; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[index++] = '0'; } result[index++] = '.'; @@ -974,7 +974,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha } else { // Decimal dot is somewhere between the digits. int current = index + 1; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (olength - i - 1 == exp) { result[current + olength - i - 1] = '.'; current--; @@ -1284,7 +1284,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const if (digits == 0) { return index; } result[index++] = '.'; int actual_round = digits; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; actual_round--; @@ -1301,14 +1301,14 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const rounded_output -= POW10_TABLE[actual_olength]; } int current = index; - for (int i = 0; i < actual_olength; i++) { + for (auto i = 0; i < actual_olength; i++) { result[current + actual_olength - i - 1] = (char)('0' + rounded_output % 10); rounded_output /= 10; index++; } actual_round -= actual_olength; if (actual_round > 0) { - for (int i = 0; i < actual_round; i++) { + for (auto i = 0; i < actual_round; i++) { result[index++] = '0'; } } @@ -1317,7 +1317,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const int integer_len = index + exp + 1 + exp / 3; int sep_cnt = 0; int rev_index = 0; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[integer_len - (rev_index++) - 1] = '0'; sep_cnt++; if (sep_cnt == 3) { @@ -1325,7 +1325,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const sep_cnt = 0; } } - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (sep_cnt == 3) { result[integer_len - (rev_index++) - 1] = ','; sep_cnt = 0; @@ -1337,7 +1337,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const index = integer_len; if (digits == 0) { return index; } result[index++] = '.'; - for (int i = 0; i < digits; i++) { + for (auto i = 0; i < digits; i++) { result[index++] = '0'; } } else { @@ -1356,7 +1356,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; int32_t sep_cnt = 0; int rev_index = 0; - for (int i = 0; i < integer_len; i++) { + for (auto i = 0; i < integer_len; i++) { if (sep_cnt == 3) { result[formated_integer_len - (rev_index++) - 1] = ','; sep_cnt = 0; 
@@ -1369,11 +1369,11 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const if (digits == 0) { return index; } result[index++] = '.'; int current = index; - for (int i = 0; i < tailing_zero; i++) { + for (auto i = 0; i < tailing_zero; i++) { result[current + digits - i - 1] = '0'; index++; } - for (int i = tailing_zero; i < digits; i++) { + for (auto i = tailing_zero; i < digits; i++) { result[current + digits - i - 1] = (char)('0' + decimal % 10); decimal /= 10; index++; @@ -1430,7 +1430,7 @@ __device__ inline int copy_format_special_str(char* const result, } else { result[sign + 1] = '.'; } - for (int i = 0; i < digits; i++) { + for (auto i = 0; i < digits; i++) { result[sign + 2 + i] = '0'; } return sign + 2 + digits; diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index df1871d6aa..2f5636a205 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -15,18 +15,15 @@ */ #include "get_json_object.hpp" +#include "json_parser.cuh" #include #include #include -#include #include #include #include #include -#include -#include -#include #include #include #include @@ -34,16 +31,12 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include #include namespace spark_rapids_jni { @@ -53,7 +46,7 @@ namespace detail { /** * write JSON style */ -enum class write_style { raw_style, quoted_style, flatten_style }; +enum class write_style { RAW, QUOTED, FLATTEN }; /** * path instruction @@ -305,52 +298,50 @@ class json_generator { */ __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } -__device__ inline bool path_match_element(path_instruction const* path_ptr, - size_t path_size, +__device__ inline bool path_match_element(cudf::device_span path, path_instruction_type path_type0) { - if (path_size < 1) { return false; } - return path_ptr[0].type == path_type0; + if (path.size() < 1) { return false; } + return path.data()[0].type == path_type0; } -__device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, +__device__ inline bool path_match_elements(cudf::device_span path, path_instruction_type path_type0, path_instruction_type path_type1) { - if (path_size < 2) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; + if (path.size() < 2) { return false; } + return path.data()[0].type == path_type0 && path.data()[1].type == path_type1; } -__device__ inline thrust::tuple path_match_index(path_instruction const* path_ptr, - size_t path_size) +__device__ inline thrust::tuple path_match_index( + cudf::device_span path) { - auto match = path_match_element(path_ptr, path_size, path_instruction_type::INDEX); + auto match = path_match_element(path, path_instruction_type::INDEX); if (match) { - return thrust::make_tuple(true, path_ptr[0].index); + return thrust::make_tuple(true, path.data()[0].index); } else { return thrust::make_tuple(false, 0); } } __device__ inline thrust::tuple path_match_named( - path_instruction const* path_ptr, size_t path_size) + cudf::device_span path) { - auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED); + auto match = path_match_element(path, path_instruction_type::NAMED); if (match) { - return thrust::make_tuple(true, path_ptr[0].name); + return thrust::make_tuple(true, path.data()[0].name); } else { return thrust::make_tuple(false, cudf::string_view()); } } __device__ inline thrust::tuple 
path_match_index_wildcard( - path_instruction const* path_ptr, size_t path_size) + cudf::device_span path) { - auto match = path_match_elements( - path_ptr, path_size, path_instruction_type::INDEX, path_instruction_type::WILDCARD); + auto match = + path_match_elements(path, path_instruction_type::INDEX, path_instruction_type::WILDCARD); if (match) { - return thrust::make_tuple(true, path_ptr[0].index); + return thrust::make_tuple(true, path.data()[0].index); } else { return thrust::make_tuple(false, 0); } @@ -364,8 +355,7 @@ __device__ inline thrust::tuple path_match_index_wildcard( __device__ bool evaluate_path(json_parser& p, json_generator& root_g, write_style root_style, - path_instruction const* root_path_ptr, - int root_path_size) + cudf::device_span root_path) { // manually maintained context stack in lieu of calling evaluate_path recursively. struct context { @@ -379,8 +369,8 @@ __device__ bool evaluate_path(json_parser& p, json_generator g; write_style style; - path_instruction const* path_ptr; - int path_size; + + cudf::device_span path; // is this context task is done bool task_is_done; @@ -411,8 +401,7 @@ __device__ bool evaluate_path(json_parser& p, int _case_path, json_generator _g, write_style _style, - path_instruction const* _path_ptr, - int _path_size) { + cudf::device_span _path) { // no need to check stack is full // because Spark-Rapids already checked maximum length of `path_instruction` auto& ctx = stack[stack_pos]; @@ -420,8 +409,7 @@ __device__ bool evaluate_path(json_parser& p, ctx.case_path = _case_path; ctx.g = _g; ctx.style = _style; - ctx.path_ptr = _path_ptr; - ctx.path_size = _path_size; + ctx.path = _path; ctx.task_is_done = false; ctx.dirty = 0; ctx.is_first_enter = true; @@ -430,7 +418,7 @@ __device__ bool evaluate_path(json_parser& p, }; // put the first context task - push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size); + push_context(p.get_current_token(), -1, root_g, root_style, root_path); while (stack_pos > 0) { auto& ctx = stack[stack_pos - 1]; @@ -439,8 +427,8 @@ __device__ bool evaluate_path(json_parser& p, // case (VALUE_STRING, Nil) if style == RawStyle // case path 1 - if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::raw_style) { + if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::RAW) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator ctx.g.write_raw(p); @@ -449,15 +437,15 @@ __device__ bool evaluate_path(json_parser& p, } // case (START_ARRAY, Nil) if style == FlattenStyle // case path 2 - else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::flatten_style) { + else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::FLATTEN) { // flatten this array into the parent if (json_token::END_ARRAY != p.next_token()) { // JSON validation check if (json_token::ERROR == p.get_current_token()) { return false; } // push back task // add child task - push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0); + push_context(p.get_current_token(), 2, ctx.g, ctx.style, {nullptr, 0}); } else { // END_ARRAY ctx.task_is_done = true; @@ -465,7 +453,7 @@ __device__ bool evaluate_path(json_parser& p, } // case (_, Nil) // case path 3 - else if (path_is_empty(ctx.path_size)) { + else if (path_is_empty(ctx.path.size())) { 
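      // A minimal sketch of the stack-driven traversal used by this function
      // (illustrative only; `handle_case_paths` is a hypothetical stand-in for the
      // chain of case-path branches in this loop, not a function in this file):
      //
      //   push_context(p.get_current_token(), -1, root_g, root_style, root_path);
      //   while (stack_pos > 0) {
      //     auto& ctx = stack[stack_pos - 1];
      //     if (!ctx.task_is_done) {
      //       handle_case_paths(ctx);   // may push a child context instead of recursing
      //     } else {
      //       // fold ctx.dirty into the parent context, then pop
      //       --stack_pos;
      //     }
      //   }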
// general case: just copy the child tree verbatim if (!(ctx.g.copy_current_structure(p))) { // JSON validation check @@ -477,7 +465,7 @@ __device__ bool evaluate_path(json_parser& p, // case (START_OBJECT, Named :: xs) // case path 4 else if (json_token::START_OBJECT == ctx.token && - thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size))) { + thrust::get<0>(path_match_named(ctx.path))) { if (!ctx.is_first_enter) { // 2st enter // skip the following children after the expect @@ -511,7 +499,7 @@ __device__ bool evaluate_path(json_parser& p, if (json_token::ERROR == p.get_current_token()) { return false; } // need to try more children - auto match_named = path_match_named(ctx.path_ptr, ctx.path_size); + auto match_named = path_match_named(ctx.path); auto named = thrust::get<1>(match_named); // current token is FIELD_NAME if (p.match_current_field_name(named)) { @@ -523,8 +511,11 @@ __device__ bool evaluate_path(json_parser& p, // meets null token, it's not expected, return false if (json_token::VALUE_NULL == p.get_current_token()) { return false; } // push sub task; sub task will update the result of path 4 - push_context( - p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + push_context(p.get_current_token(), + 4, + ctx.g, + ctx.style, + {ctx.path.data() + 1, ctx.path.size() - 1}); found_expected_child = true; break; } else { @@ -550,10 +541,8 @@ __device__ bool evaluate_path(json_parser& p, // case (START_ARRAY, Wildcard :: Wildcard :: xs) // case path 5 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::WILDCARD, - path_instruction_type::WILDCARD)) { + path_match_elements( + ctx.path, path_instruction_type::WILDCARD, path_instruction_type::WILDCARD)) { // special handling for the non-structure preserving double wildcard // behavior in Hive if (ctx.is_first_enter) { @@ -567,9 +556,8 @@ __device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 5, ctx.g, - write_style::flatten_style, - ctx.path_ptr + 2, - ctx.path_size - 2); + write_style::FLATTEN, + {ctx.path.data() + 2, ctx.path.size() - 2}); } else { ctx.g.write_end_array(); ctx.task_is_done = true; @@ -578,14 +566,14 @@ __device__ bool evaluate_path(json_parser& p, // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle // case path 6 else if (json_token::START_ARRAY == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD) && - ctx.style != write_style::quoted_style) { + path_match_element(ctx.path, path_instruction_type::WILDCARD) && + ctx.style != write_style::QUOTED) { // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array - write_style next_style = write_style::raw_style; + write_style next_style = write_style::RAW; switch (ctx.style) { - case write_style::raw_style: next_style = write_style::quoted_style; break; - case write_style::flatten_style: next_style = write_style::flatten_style; break; - case write_style::quoted_style: next_style = write_style::quoted_style; // never happen + case write_style::RAW: next_style = write_style::QUOTED; break; + case write_style::FLATTEN: next_style = write_style::FLATTEN; break; + case write_style::QUOTED: next_style = write_style::QUOTED; // never happen } // temporarily buffer child matches, the emitted json will need to be @@ -607,8 +595,11 @@ __device__ bool evaluate_path(json_parser& p, if (json_token::ERROR == p.get_current_token()) { return false; } // track the number of array elements and only emit an outer array if // we've written more than one element, this matches Hive's behavior - push_context( - p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 1, ctx.path_size - 1); + push_context(p.get_current_token(), + 6, + child_g, + next_style, + {ctx.path.data() + 1, ctx.path.size() - 1}); } else { char* child_g_start = child_g.get_output_start_position(); size_t child_g_len = child_g.get_output_len(); @@ -628,7 +619,7 @@ __device__ bool evaluate_path(json_parser& p, // case (START_ARRAY, Wildcard :: xs) // case path 7 else if (json_token::START_ARRAY == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { + path_match_element(ctx.path, path_instruction_type::WILDCARD)) { if (ctx.is_first_enter) { ctx.is_first_enter = false; ctx.g.write_start_array(); @@ -642,9 +633,8 @@ __device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 7, ctx.g, - write_style::quoted_style, - ctx.path_ptr + 1, - ctx.path_size - 1); + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); } else { ctx.g.write_end_array(); ctx.task_is_done = true; @@ -653,8 +643,8 @@ __device__ bool evaluate_path(json_parser& p, /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ // case path 8 else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_index_wildcard(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_index_wildcard(ctx.path_ptr, ctx.path_size)); + thrust::get<0>(path_match_index_wildcard(ctx.path))) { + int idx = thrust::get<1>(path_match_index_wildcard(ctx.path)); p.next_token(); // JSON validation check @@ -681,15 +671,13 @@ __device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 8, ctx.g, - write_style::quoted_style, - ctx.path_ptr + 1, - ctx.path_size - 1); + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); } // case (START_ARRAY, Index(idx) :: xs) // case path 9 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_index(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_index(ctx.path_ptr, ctx.path_size)); + else if (json_token::START_ARRAY == ctx.token && thrust::get<0>(path_match_index(ctx.path))) { + int idx = thrust::get<1>(path_match_index(ctx.path)); p.next_token(); // JSON validation check @@ -713,7 +701,7 @@ __device__ bool evaluate_path(json_parser& p, // i == 0 push_context( - p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + p.get_current_token(), 9, ctx.g, ctx.style, {ctx.path.data() + 1, ctx.path.size() - 1}); } // case _ => // case path 12 @@ -854,8 +842,7 @@ rmm::device_uvector 
construct_path_commands( __device__ thrust::pair get_json_object_single( char const* input, cudf::size_type input_len, - path_instruction const* path_commands_ptr, - int path_commands_size, + cudf::device_span path_commands, char* out_buf, size_t out_buf_size) { @@ -868,12 +855,12 @@ __device__ thrust::pair get_json_object_single( // Second pass: writes output. // The generator automatically determines which pass based on `out_buf`. // If `out_buf_size` is zero, pass in `nullptr` to avoid generator writing trash output. - json_generator generator((out_buf == nullptr || out_buf_size == 0) ? nullptr : out_buf); + json_generator generator((out_buf_size == 0) ? nullptr : out_buf); bool const success = evaluate_path( - j_parser, generator, write_style::raw_style, path_commands_ptr, path_commands_size); + j_parser, generator, write_style::RAW, {path_commands.data(), path_commands.size()}); - if (nullptr == out_buf && !success) { + if (!success) { // generator may contain trash output, e.g.: generator writes some output, // then JSON format is invalid, the previous output becomes trash. // set output as zero to tell second step @@ -902,8 +889,7 @@ __device__ thrust::pair get_json_object_single( template __launch_bounds__(block_size) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view col, - path_instruction const* path_commands_ptr, - int path_commands_size, + cudf::device_span path_commands, cudf::size_type* d_sizes, cudf::detail::input_offsetalator output_offsets, char* out_buf, @@ -926,7 +912,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // process one single row auto [result, output_size] = get_json_object_single( - str.data(), str.size_bytes(), path_commands_ptr, path_commands_size, dst, dst_size); + str.data(), str.size_bytes(), {path_commands.data(), path_commands.size()}, dst, dst_size); if (result) { is_valid = true; } // filled in only during the precompute step. 
during the compute step, the @@ -988,14 +974,8 @@ std::unique_ptr get_json_object( auto d_input_ptr = cudf::column_device_view::create(input.parent(), stream); // preprocess sizes (returned in the offsets buffer) get_json_object_kernel - <<>>(*d_input_ptr, - path_commands.data(), - path_commands.size(), - sizes.data(), - d_offsets, - nullptr, - nullptr, - nullptr); + <<>>( + *d_input_ptr, path_commands, sizes.data(), d_offsets, nullptr, nullptr, nullptr); // convert sizes to offsets auto [offsets, output_size] = @@ -1016,8 +996,7 @@ std::unique_ptr get_json_object( get_json_object_kernel <<>>( *d_input_ptr, - path_commands.data(), - path_commands.size(), + path_commands, sizes.data(), d_offsets, chars.data(), diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index cf1f0c3470..2fcdb20697 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -16,8 +16,6 @@ #pragma once -#include "json_parser.cuh" - #include #include diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index db258a876c..f2853fd1f7 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -29,29 +29,15 @@ namespace spark_rapids_jni { /** * write style when writing out JSON string */ -enum class write_style { +enum class escape_style { // e.g.: '\\r' is a string with 2 chars '\' 'r', writes 1 char '\r' - unescaped, + UNESCAPED, - // * e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' + // e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' '"' // '"' - escaped + ESCAPED }; -// allow single quotes to represent strings in JSON -// e.g.: {'k': 'v'} is valid when it's true -constexpr bool allow_single_quotes = true; - -// Whether allow unescaped control characters in JSON Strings. -// Unescaped control characters are ASCII characters with value less than 32, -// including tab and line feed characters. ASCII values range is [0, 32) -// e.g.: ["\n"] is valid, here \n is one char -// If true, JSON is not conventional format. -// e.g., how to represent carriage return and newline characters: -// if true, allow "\n\r" two control characters without escape directly -// if false, "\n\r" are not allowed, should use escape characters: "\\n\\r" -constexpr bool allow_unescaped_control_chars = true; - /** * @brief Maximum JSON nesting depth * JSON with a greater depth is invalid @@ -78,15 +64,6 @@ constexpr int max_string_utf8_bytes = 20000000; */ constexpr int max_num_len = 1000; -/** - * whether allow tailing useless sub-string in JSON. - * - * If true, e.g., the following invalid JSON is allowed, because prefix {'k' : - * 'v'} is valid. 
- * {'k' : 'v'}_extra_tail_sub_string - */ -constexpr bool allow_tailing_sub_string = true; - /** * JSON token enum */ @@ -344,13 +321,7 @@ class json_parser { case '"': parse_double_quoted_string(); break; - case '\'': - if (allow_single_quotes) { - parse_single_quoted_string(); - } else { - curr_token = json_token::ERROR; - } - break; + case '\'': parse_single_quoted_string(); break; case 't': curr_pos++; @@ -378,8 +349,7 @@ class json_parser { */ __device__ inline void parse_single_quoted_string() { - auto [success, end_char_pos] = - try_parse_single_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); + auto [success, end_char_pos] = try_parse_string(curr_pos); if (success) { curr_pos = end_char_pos; curr_token = json_token::VALUE_STRING; @@ -393,8 +363,7 @@ class json_parser { */ __device__ inline void parse_double_quoted_string() { - auto [success, end_char_pos] = - try_parse_double_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); + auto [success, end_char_pos] = try_parse_string(curr_pos); if (success) { curr_pos = end_char_pos; curr_token = json_token::VALUE_STRING; @@ -403,98 +372,6 @@ class json_parser { } } - /* - * try parse ' or " quoted string - * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * @return whether passed successfully and the end position of parsed str - * - */ - __device__ inline std::pair try_parse_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - if (!eof(str_pos)) { - if (allow_single_quotes && *str_pos == '\'') { - return try_parse_single_quoted_string( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style); - } else { - return try_parse_double_quoted_string( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style); - } - } else { - return std::make_pair(false, nullptr); - } - } - - /** - * try parse ' quoted string - * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * - */ - __device__ inline std::pair try_parse_single_quoted_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - return try_parse_quoted_string(str_pos, - '\'', - to_match_str_pos, // match str pos, nullptr means do not match - to_match_str_end, // match str end - copy_destination, // copy destination while parsing, nullptr - // means do not copy - w_style); - } - - /** - * try parse " quoted string. 
- * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * - */ - __device__ inline std::pair try_parse_double_quoted_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - return try_parse_quoted_string(str_pos, - '\"', - to_match_str_pos, // match str pos, nullptr means do not match - to_match_str_end, // match str end - copy_destination, // copy destination while parsing, nullptr - // means do not copy - w_style); - } - /** * transform int value from [0, 15] to hex char */ @@ -566,6 +443,118 @@ class json_parser { } } + __device__ inline std::pair write_string(char const* str_pos, + char* copy_destination, + escape_style w_style) + { + if (eof(str_pos)) return std::make_pair(0, 0); + char const quote_char = *str_pos; + // Records string/field name token utf8 bytes size after unescaped + // e.g.: For JSON 4 chars string "\\n", after unescaped, get 1 char '\n' + // used by checking the max string length + int unescped_string_utf8_bytes = 0; + // Records string/field name token utf8 bytes size after escaped + // e.g.: 4 chars string "\\n", will write out 4 chars: " \ n " + int escped_string_utf8_bytes = 0; + + // write the first " if write style is escaped + if (escape_style::ESCAPED == w_style) { + escped_string_utf8_bytes++; + if (nullptr != copy_destination) { *copy_destination++ = '"'; } + } + + // skip left quote char + if (!try_skip(str_pos, quote_char)) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + + // scan string content + while (!eof(str_pos)) { + char c = *str_pos; + int v = static_cast(c); + if (c == quote_char) { + // path 1: match closing quote char + str_pos++; + + // check max str len + if (!(max_string_utf8_bytes <= 0 || + (max_string_utf8_bytes > 0 && unescped_string_utf8_bytes <= max_string_utf8_bytes))) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + + // write the end " if write style is escaped + if (escape_style::ESCAPED == w_style) { + escped_string_utf8_bytes++; + if (nullptr != copy_destination) { *copy_destination++ = '"'; } + } + + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } else if (v >= 0 && v < 32) { + // path 2: unescaped control char + + // copy if enabled, unescape mode, write 1 char + if (copy_destination != nullptr && escape_style::UNESCAPED == w_style) { + *copy_destination++ = *str_pos; + } + + // copy if enabled, escape mode, write more chars + if (escape_style::ESCAPED == w_style) { + int escape_chars = escape_char(*str_pos, copy_destination); + if (copy_destination != nullptr) copy_destination += escape_chars; + escped_string_utf8_bytes += (escape_chars - 1); + } + + // check match if enabled + const char* match_str_pos = nullptr; + if (!try_match_char(match_str_pos, nullptr, *str_pos)) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + + str_pos++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; + continue; + } else if ('\\' == c) { + // path 3: escape path + str_pos++; + const char* to_match_str_pos = nullptr; + char* copy_dest_nullptr = nullptr; + if 
(!try_skip_escape_part(str_pos, + to_match_str_pos, + copy_dest_nullptr, + copy_destination, + w_style, + escped_string_utf8_bytes, + unescped_string_utf8_bytes)) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + } else { + // path 4: safe code point + + // handle single unescaped " char; happens when string is quoted by char ' + // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") + if ('\"' == c && escape_style::ESCAPED == w_style) { + if (copy_destination != nullptr) { *copy_destination++ = '\\'; } + escped_string_utf8_bytes++; + } + + if (!try_skip_safe_code_point(str_pos, c)) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + if (copy_destination != nullptr) { *copy_destination++ = c; } + // check match if enabled + const char* match_str_pos = nullptr; + if (!try_match_char(match_str_pos, nullptr, c)) { + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; + } + } + + return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + } + /** * utility for parsing string, this function does not update the parser * internal try parse quoted string using passed `quote_char` `quote_char` can @@ -617,23 +606,19 @@ class json_parser { * @param copy_destination copy unescaped str to destination, nullptr means do * not copy */ - __device__ inline std::pair try_parse_quoted_string( + __device__ inline std::pair try_parse_string( char const* str_pos, - char const quote_char, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) + char const* to_match_str_pos = nullptr, + char const* const to_match_str_end = nullptr, + escape_style w_style = escape_style::UNESCAPED) { - // update state - string_token_utf8_bytes = 0; - bytes_diff_for_escape_writing = 0; + if (eof(str_pos)) { return std::make_pair(false, nullptr); } + char const quote_char = *str_pos; + int unescped_string_utf8_bytes = 0; + int escped_string_utf8_bytes = 0; // write the first " if write style is escaped - if (write_style::escaped == w_style) { - bytes_diff_for_escape_writing++; - if (nullptr != copy_destination) { *copy_destination++ = '"'; } - } + if (escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } // skip left quote char if (!try_skip(str_pos, quote_char)) { return std::make_pair(false, nullptr); } @@ -647,34 +632,25 @@ class json_parser { str_pos++; // check max str len - if (!check_string_max_utf8_bytes()) { return std::make_pair(false, nullptr); } + if (!(max_string_utf8_bytes <= 0 || + (max_string_utf8_bytes > 0 && unescped_string_utf8_bytes <= max_string_utf8_bytes))) { + return std::make_pair(false, nullptr); + } // match check, the last char in match_str is quote_char - if (nullptr != to_match_str_pos) { - // match check, the last char in match_str is quote_char - if (to_match_str_pos != to_match_str_end) { return std::make_pair(false, nullptr); } - } + if (to_match_str_pos != to_match_str_end) { return std::make_pair(false, nullptr); } // write the end " if write style is escaped - if (write_style::escaped == w_style) { - bytes_diff_for_escape_writing++; - if (nullptr != copy_destination) { *copy_destination++ = '"'; } - } + if (escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } return std::make_pair(true, str_pos); - } else if (v >= 0 && v < 32 && allow_unescaped_control_chars) { + } else if (v >= 0 && v < 32) { // path 2: unescaped control 
char - // copy if enabled, unescape mode, write 1 char - if (copy_destination != nullptr && write_style::unescaped == w_style) { - *copy_destination++ = *str_pos; - } - // copy if enabled, escape mode, write more chars - if (write_style::escaped == w_style) { - int escape_chars = escape_char(*str_pos, copy_destination); - if (copy_destination != nullptr) copy_destination += escape_chars; - bytes_diff_for_escape_writing += (escape_chars - 1); + if (escape_style::ESCAPED == w_style) { + int escape_chars = escape_char(*str_pos, nullptr); + escped_string_utf8_bytes += (escape_chars - 1); } // check match if enabled @@ -683,13 +659,20 @@ class json_parser { } str_pos++; - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; continue; } else if ('\\' == c) { // path 3: escape path str_pos++; - if (!try_skip_escape_part( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style)) { + char* copy_dest_nullptr = nullptr; + if (!try_skip_escape_part(str_pos, + to_match_str_pos, + to_match_str_end, + copy_dest_nullptr, + w_style, + escped_string_utf8_bytes, + unescped_string_utf8_bytes)) { return std::make_pair(false, nullptr); } } else { @@ -697,18 +680,15 @@ class json_parser { // handle single unescaped " char; happens when string is quoted by char ' // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") - if ('\"' == c && write_style::escaped == w_style) { - if (copy_destination != nullptr) { *copy_destination++ = '\\'; } - bytes_diff_for_escape_writing++; - } + if ('\"' == c && escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } if (!try_skip_safe_code_point(str_pos, c)) { return std::make_pair(false, nullptr); } - if (copy_destination != nullptr) { *copy_destination++ = c; } // check match if enabled if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return std::make_pair(false, nullptr); } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; } } @@ -740,7 +720,9 @@ class json_parser { char const*& to_match_str_pos, char const* const to_match_str_end, char*& copy_dest, - write_style w_style) + escape_style w_style, + int& escped_string_utf8_bytes, + int& unescped_string_utf8_bytes) { // already skipped the first '\' // try skip second part @@ -749,114 +731,118 @@ class json_parser { switch (*str_pos) { // path 1: \", \', \\, \/, \b, \f, \n, \r, \t case '\"': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = c; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = '"'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case '\'': - // only allow escape ' when `allow_single_quotes` - if (allow_single_quotes) { - // for both unescaped/escaped writes a single char ' - if (nullptr != copy_dest) { *copy_dest++ = c; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - - string_token_utf8_bytes++; - str_pos++; - return true; - } else { - return false; - } + // for both unescaped/escaped writes a single char ' + if (nullptr != copy_dest) { *copy_dest++ = c; } + if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } + 
+ unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; + str_pos++; + return true; case '\\': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = c; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = '\\'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case '/': // for both unescaped/escaped writes a single char / if (nullptr != copy_dest) { *copy_dest++ = c; } if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case 'b': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\b'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\b'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'b'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, '\b')) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case 'f': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\f'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\f'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'f'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, '\f')) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case 'n': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\n'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\n'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'n'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, '\n')) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case 'r': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\r'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\r'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'r'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, '\r')) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; case 't': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\t'; } - if (write_style::escaped 
== w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\t'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 't'; } - bytes_diff_for_escape_writing++; + escped_string_utf8_bytes++; } if (!try_match_char(to_match_str_pos, to_match_str_end, '\t')) { return false; } - string_token_utf8_bytes++; + unescped_string_utf8_bytes++; + escped_string_utf8_bytes++; str_pos++; return true; // path 1 done: \", \', \\, \/, \b, \f, \n, \r, \t @@ -866,7 +852,12 @@ class json_parser { // for both unescaped/escaped writes corresponding utf8 bytes, no need // to pass in write style - return try_skip_unicode(str_pos, to_match_str_pos, to_match_str_end, copy_dest); + return try_skip_unicode(str_pos, + to_match_str_pos, + to_match_str_end, + copy_dest, + unescped_string_utf8_bytes, + escped_string_utf8_bytes); default: // path 3: invalid return false; @@ -997,7 +988,9 @@ class json_parser { __device__ bool try_skip_unicode(char const*& str_pos, char const*& to_match_str_pos, char const* const to_match_str_end, - char*& copy_dest) + char*& copy_dest, + int& unescped_string_utf8_bytes, + int& escped_string_utf8_bytes) { // already parsed u bool is_success = try_skip_hex(str_pos) && try_skip_hex(str_pos) && try_skip_hex(str_pos) && @@ -1011,7 +1004,8 @@ class json_parser { // is 4 char buff[4]; cudf::size_type bytes = from_char_utf8(utf_char, buff); - string_token_utf8_bytes += bytes; + unescped_string_utf8_bytes += bytes; + escped_string_utf8_bytes += bytes; if (nullptr != copy_dest) { for (cudf::size_type i = 0; i < bytes; i++) { @@ -1107,18 +1101,6 @@ class json_parser { (max_num_len > 0 && number_digits_length <= max_num_len); } - /** - * verify max string length if enabled - */ - __device__ inline bool check_string_max_utf8_bytes() - { - return - // disabled str len check - max_string_utf8_bytes <= 0 || - // enabled str len check - (max_string_utf8_bytes > 0 && string_token_utf8_bytes <= max_string_utf8_bytes); - } - /** * parse: INT ('.' [0-9]+)? EXP? * and verify leading zeroes @@ -1285,7 +1267,7 @@ class json_parser { __device__ inline void parse_field_name() { auto [success, end_char_pos] = - try_parse_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); + try_parse_string(curr_pos, nullptr, nullptr, escape_style::UNESCAPED); if (success) { curr_pos = end_char_pos; curr_token = json_token::FIELD_NAME; @@ -1314,14 +1296,11 @@ class json_parser { current_token_start_pos = curr_pos; parse_first_token_in_value(); } else { - if (allow_tailing_sub_string) { - // previous token is not INIT, means already get a token; stack is - // empty; Successfully parsed. Note: ignore the tailing sub-string - curr_token = json_token::SUCCESS; - } else { - // not eof, has extra useless tailing characters. - curr_token = json_token::ERROR; - } + // Allow tail useless sub-string in JSON, e.g.: + // The following invalid JSON is allowed: {'k' : 'v'}_extra_tail_sub_string + // previous token is not INIT, means already get a token; stack is + // empty; Successfully parsed. 
Note: ignore the tailing sub-string + curr_token = json_token::SUCCESS; } } else { // stack is non-empty @@ -1495,9 +1474,7 @@ class json_parser { case json_token::VALUE_STRING: // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::unescaped); - return string_token_utf8_bytes; + return write_string(current_token_start_pos, destination, escape_style::UNESCAPED).first; case json_token::VALUE_NUMBER_INT: if (number_token_len == 2 && current_token_start_pos[0] == '-' && current_token_start_pos[1] == '0') { @@ -1548,9 +1525,7 @@ class json_parser { case json_token::FIELD_NAME: // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::unescaped); - return string_token_utf8_bytes; + return write_string(current_token_start_pos, destination, escape_style::UNESCAPED).first; case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } return 1; @@ -1582,12 +1557,11 @@ class json_parser { __device__ cudf::size_type write_escaped_text(char* destination) { switch (curr_token) { - case json_token::VALUE_STRING: + case json_token::VALUE_STRING: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::escaped); - return string_token_utf8_bytes + bytes_diff_for_escape_writing; + return write_string(current_token_start_pos, destination, escape_style::ESCAPED).second; + } case json_token::VALUE_NUMBER_INT: if (number_token_len == 2 && current_token_start_pos[0] == '-' && current_token_start_pos[1] == '0') { @@ -1631,12 +1605,11 @@ class json_parser { *destination++ = 'l'; } return 4; - case json_token::FIELD_NAME: + case json_token::FIELD_NAME: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::escaped); - return string_token_utf8_bytes + bytes_diff_for_escape_writing; + return write_string(current_token_start_pos, destination, escape_style::ESCAPED).second; + } case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } return 1; @@ -1684,11 +1657,8 @@ class json_parser { __device__ bool match_current_field_name(char const* to_match_str_ptr, cudf::size_type len) { if (json_token::FIELD_NAME == curr_token) { - auto [b, end_pos] = try_parse_string(current_token_start_pos, - to_match_str_ptr, - to_match_str_ptr + len, - nullptr, - write_style::unescaped); + auto [b, end_pos] = try_parse_string( + current_token_start_pos, to_match_str_ptr, to_match_str_ptr + len, escape_style::UNESCAPED); return b; } else { return false; @@ -1798,17 +1768,6 @@ class json_parser { char const* current_token_start_pos; // used to store number token length cudf::size_type number_token_len; - - // Records string/field name token utf8 bytes size after unescaped - // e.g.: For JSON 4 chars string "\\n", after unescaped, get 1 char '\n' - // used by checking the max string length - int string_token_utf8_bytes; - - // Records bytes diff between escape writing and unescape writing - // e.g.: 4 chars string "\\n", string_token_utf8_bytes is 1, - // when `write_escaped_text`, will write out 4 chars: " \ n ", - // then this diff will be 4 - 1 = 3 - int 
bytes_diff_for_escape_writing; }; } // namespace spark_rapids_jni diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index ff2d935cc3..f0cb579e43 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -169,12 +169,14 @@ void getJsonObjectTest_Escape() { String JSON4 = "['a','b','\"C\"']"; // \\u4e2d\\u56FD is 中国 String JSON5 = "'\\u4e2d\\u56FD\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b'"; + String JSON6 = "['\\u4e2d\\u56FD\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b']"; String expectedStr1 = "{\"a\":\"A\"}"; String expectedStr2 = "{\"a\":\"A\\\"\"}"; String expectedStr3 = "{\"a\":\"B'\"}"; String expectedStr4 = "[\"a\",\"b\",\"\\\"C\\\"\"]"; String expectedStr5 = "中国\"'\\/\b\f\n\r\t\b"; + String expectedStr6 = "中国\\\"'\\\\/\\b\\f\\n\\r\\t\\b"; try ( ColumnVector jsonCv = ColumnVector.fromStrings( From dcaf8031c7c0283a0862dfa16a62d9db9fd91230 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 7 May 2024 04:32:47 +0800 Subject: [PATCH 082/124] Update submodule cudf to 4dc616227f5c872031603426a3235282f6d23554 (#2019) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 23bb2ed156..4dc616227f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 23bb2ed156d164b59e608e7e791c74db5cb4bce8 +Subproject commit 4dc616227f5c872031603426a3235282f6d23554 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 4d666d92f4..26610e63b3 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -3771f878c0ad7b3806ab574bca43488991077144 +365322aca32fd6ecd7027f5d7ec7be50b7f3cc2a diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 9fde1283bd..333bb644b0 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "5032f8bb74b5414ffa01c515c8056757a5010e48", + "git_tag" : "ff837fffda2bf4afa0dc80bc482a18f5645b4901", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "fdef5f9663514f6bd625a468a25ec8096fbfea7d", + "git_tag" : "e13b253fdfa19ccdf563be5fbf4ffa1cdc6b87a0", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 980633db2d86f68d7c5a4241e5e5ddc59ff9aedb Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 7 May 2024 17:27:55 -0500 Subject: [PATCH 083/124] Adjust the launch bounds to get_json_object to avoid spilling (#2015) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/src/get_json_object.cu | 26 +- src/main/cpp/src/json_parser.cuh | 852 ++++++++---------- .../spark/rapids/jni/GetJsonObjectTest.java | 2 +- 3 files changed, 410 insertions(+), 470 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 2f5636a205..b743d14cdc 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -269,8 +269,6 @@ class json_generator { } } - __device__ void reset() { 
output_len = 0; } - __device__ inline size_t get_output_len() const { return output_len; } __device__ inline char* get_output_start_position() const { return output; } __device__ inline char* get_current_output_position() const { return output + output_len; } @@ -371,7 +369,6 @@ __device__ bool evaluate_path(json_parser& p, write_style style; cudf::device_span path; - // is this context task is done bool task_is_done; @@ -840,13 +837,12 @@ rmm::device_uvector construct_path_commands( * @returns A pair containing the result code and the output buffer. */ __device__ thrust::pair get_json_object_single( - char const* input, - cudf::size_type input_len, + char_range input, cudf::device_span path_commands, char* out_buf, size_t out_buf_size) { - json_parser j_parser(input, input_len); + json_parser j_parser(input); j_parser.next_token(); // JSON validation check if (json_token::ERROR == j_parser.get_current_token()) { return {false, 0}; } @@ -878,16 +874,24 @@ __device__ thrust::pair get_json_object_single( * (chars and validity). * * @param col Device view of the incoming string - * @param commands JSONPath command buffer + * @param path_commands JSONPath command buffer + * @param d_sizes a buffer used to write the output sizes in the first pass, + * and is read back in on the second pass to compute offsets. * @param output_offsets Buffer used to store the string offsets for the results * of the query * @param out_buf Buffer used to store the results of the query * @param out_validity Output validity buffer * @param out_valid_count Output count of # of valid bits - * @param options Options controlling behavior */ template -__launch_bounds__(block_size) CUDF_KERNEL +// We have 1 for the minBlocksPerMultiprocessor in the launch bounds to avoid spilling from +// the kernel itself. By default NVCC uses a heuristic to find a balance between the +// maximum number of registers used by a kernel and the parallelism of the kernel. +// If lots of registers are used the parallelism may suffer. But in our case +// NVCC gets this wrong and we want to avoid spilling all the time or else +// the performance is really bad. This essentially tells NVCC to prefer using lots +// of registers over spilling. +__launch_bounds__(block_size, 1) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view col, cudf::device_span path_commands, cudf::size_type* d_sizes, @@ -911,8 +915,8 @@ __launch_bounds__(block_size) CUDF_KERNEL out_buf != nullptr ? output_offsets[tid + 1] - output_offsets[tid] : 0; // process one single row - auto [result, output_size] = get_json_object_single( - str.data(), str.size_bytes(), {path_commands.data(), path_commands.size()}, dst, dst_size); + auto [result, output_size] = + get_json_object_single(str, {path_commands.data(), path_commands.size()}, dst, dst_size); if (result) { is_valid = true; } // filled in only during the precompute step. 
during the compute step, the diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index f2853fd1f7..217ec0047b 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -33,7 +33,7 @@ enum class escape_style { // e.g.: '\\r' is a string with 2 chars '\' 'r', writes 1 char '\r' UNESCAPED, - // e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' '"' + // e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' // '"' ESCAPED }; @@ -45,12 +45,6 @@ enum class escape_style { */ constexpr int max_json_nesting_depth = 64; -// Define the maximum JSON String length, counts utf8 bytes. -// By default, maximum JSON String length is negative one, means no -// limitation. e.g.: The length of String "\\n" is 1, JSON parser does not -// count escape characters. -constexpr int max_string_utf8_bytes = 20000000; - // /** * Define the maximum JSON number length. Negative or zero means no @@ -112,6 +106,84 @@ enum class json_token { }; +/** + * This is similar to cudf::string_view, but cudf::string_view enforces + * UTF-8 encoding, which adds overhead that is not needed for this process. + */ +class char_range { + public: + __device__ inline char_range(char const* const start, cudf::size_type const len) + : _data(start), _len(len) + { + } + + __device__ inline char_range(cudf::string_view const& input) + : _data(input.data()), _len(input.size_bytes()) + { + } + + // Warning it looks like there is some kind of a bug in CUDA where you don't want to initialize + // a member variable with a static method like this. + __device__ inline static char_range null() { return char_range(nullptr, 0); } + + __device__ inline char_range(char_range const&) = default; + __device__ inline char_range(char_range&&) = default; + __device__ inline char_range& operator=(char_range const&) = default; + __device__ inline char_range& operator=(char_range&&) = default; + __device__ inline ~char_range() = default; + + __device__ inline cudf::size_type size() const { return _len; } + __device__ inline char const* data() const { return _data; } + __device__ inline char const* start() const { return _data; } + __device__ inline char const* end() const { return _data + _len; } + + __device__ inline bool eof(cudf::size_type pos) const { return pos >= _len; } + __device__ inline bool is_null() const { return _data == nullptr; } + __device__ inline bool is_empty() const { return _len == 0; } + + __device__ inline char operator[](cudf::size_type pos) const { return _data[pos]; } + + __device__ inline cudf::string_view slice_sv(cudf::size_type pos, cudf::size_type len) const + { + return cudf::string_view(_data + pos, len); + } + + __device__ inline char_range slice(cudf::size_type pos, cudf::size_type len) const + { + return char_range(_data + pos, len); + } + + private: + char const* _data; + cudf::size_type _len; +}; + +/** + * A char_range that keeps track of where in the data it currently is. 
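 *
 * A minimal usage sketch (illustrative only; `data`, `len` and `start_pos` are
 * hypothetical inputs, the types and methods are the ones defined in this file):
 *
 *   char_range_reader reader(char_range(data, len), start_pos);
 *   while (!reader.eof()) {
 *     char const c = reader.current_char();
 *     // ... consume c ...
 *     reader.next();
 *   }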
+ */ +class char_range_reader { + public: + __device__ inline explicit char_range_reader(char_range range) : _range(range), _pos(0) {} + + __device__ inline char_range_reader(char_range range, cudf::size_type start) + : _range(range), _pos(start) + { + } + + __device__ inline bool eof() const { return _range.eof(_pos); } + __device__ inline bool is_null() const { return _range.is_null(); } + + __device__ inline void next() { _pos++; } + + __device__ inline char current_char() const { return _range[_pos]; } + + __device__ inline cudf::size_type pos() const { return _pos; } + + private: + char_range _range; + cudf::size_type _pos; +}; + /** * JSON parser, provides token by token parsing. * Follow Jackson JSON format by default. @@ -120,8 +192,9 @@ enum class json_token { * For JSON format: * Refer to https://www.json.org/json-en.html. * - * Note: when setting `allow_single_quotes` or `allow_unescaped_control_chars`, - * then JSON format is not conventional. + * Note: This is not conventional as it allows + * single quotes and unescaped control characters + * to match what SPARK does for get_json_object * * White space can only be 4 chars: ' ', '\n', '\r', '\t', * Jackson does not allow other control chars as white spaces. @@ -136,31 +209,18 @@ enum class json_token { * infinity, +infinity, -infinity * 1e, 1e+, 1e-, -1., 1. * - * When `allow_single_quotes` is true: - * Valid string examples: + * Valid string examples: * "\'" , "\"" , '\'' , '\"' , '"' , "'" * - * When `allow_single_quotes` is false: - * Invalid string examples: - * "\'" - * - * When `allow_unescaped_control_chars` is true: - * Valid string: "asscii_control_chars" - * here `asscii_control_chars` represents control chars which in Ascii code - * range: [0, 32) - * - * When `allow_unescaped_control_chars` is false: - * Invalid string: "asscii_control_chars" - * here `asscii_control_chars` represents control chars which in Ascii code + * Valid string: "ascii_control_chars" + * here `ascii_control_chars` represents control chars which in Ascii code * range: [0, 32) * */ class json_parser { public: - __device__ inline json_parser(char const* const _json_start_pos, cudf::size_type const _json_len) - : json_start_pos(_json_start_pos), - json_end_pos(_json_start_pos + _json_len), - curr_pos(_json_start_pos) + __device__ inline explicit json_parser(char_range _chars) + : chars(_chars), curr_pos(0), current_token(json_token::INIT) { } @@ -199,12 +259,13 @@ class json_parser { /** * is current position EOF */ - __device__ inline bool eof(char const* pos) { return pos >= json_end_pos; } + __device__ inline bool eof(cudf::size_type pos) const { return pos >= chars.size(); } + __device__ inline bool eof() const { return curr_pos >= chars.size(); } /** * is hex digits: 0-9, A-F, a-f */ - __device__ inline bool is_hex_digit(char c) + __device__ inline bool is_hex_digit(char c) const { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } @@ -212,12 +273,12 @@ class json_parser { /** * is 0 to 9 digit */ - __device__ inline bool is_digit(char c) { return (c >= '0' && c <= '9'); } + __device__ inline bool is_digit(char c) const { return (c >= '0' && c <= '9'); } /** * is white spaces: ' ', '\t', '\n' '\r' */ - __device__ inline bool is_whitespace(char c) + __device__ inline bool is_whitespace(char c) const { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } @@ -225,19 +286,28 @@ class json_parser { /** * skips 4 characters: ' ', '\t', '\n' '\r' */ - __device__ inline void skip_whitespaces(char const*& pos) 
+ __device__ inline void skip_whitespaces() { - while (!eof(pos) && is_whitespace(*pos)) { - pos++; + while (!eof() && is_whitespace(chars[curr_pos])) { + curr_pos++; } } /** * check current char, if it's expected, then plus the position */ - __device__ inline bool try_skip(char const*& pos, char expected) + __device__ inline bool try_skip(char_range_reader& reader, char expected) + { + if (!reader.eof() && reader.current_char() == expected) { + reader.next(); + return true; + } + return false; + } + + __device__ inline bool try_skip(cudf::size_type& pos, char expected) { - if (!eof(pos) && *pos == expected) { + if (!eof(pos) && chars[pos] == expected) { pos++; return true; } @@ -288,6 +358,8 @@ class json_parser { */ __device__ inline bool is_context_stack_empty() { return stack_size == 0; } + __device__ inline void set_current_error() { current_token = json_token::ERROR; } + /** * parse the first value token from current position * e.g., after finished this function: @@ -296,79 +368,63 @@ class json_parser { * current token is string/num/true/false/null if current value is terminal * current token is ERROR if parse failed */ - __device__ inline void parse_first_token_in_value() + __device__ inline void parse_first_token_in_value_and_set_current() { + current_token_start_pos = curr_pos; // already checked eof - char c = *curr_pos; + char c = chars[curr_pos]; switch (c) { case '{': if (!try_push_context(json_token::START_OBJECT)) { - curr_token = json_token::ERROR; - return; + set_current_error(); + } else { + curr_pos++; + current_token = json_token::START_OBJECT; } - curr_pos++; - curr_token = json_token::START_OBJECT; break; - case '[': if (!try_push_context(json_token::START_ARRAY)) { - curr_token = json_token::ERROR; - return; + set_current_error(); + } else { + curr_pos++; + current_token = json_token::START_ARRAY; } - curr_pos++; - curr_token = json_token::START_ARRAY; break; - - case '"': parse_double_quoted_string(); break; - - case '\'': parse_single_quoted_string(); break; - + case '"': + // fall through + case '\'': parse_string_and_set_current(); break; case 't': curr_pos++; - parse_true(); + parse_true_and_set_current(); break; - case 'f': curr_pos++; - parse_false(); + parse_false_and_set_current(); break; - case 'n': curr_pos++; - parse_null(); + parse_null_and_set_current(); break; - - default: parse_number(); + default: parse_number_and_set_current(); break; } } // =========== Parse string begin =========== /** - * parse ' quoted string + * parse quoted string and set current token */ - __device__ inline void parse_single_quoted_string() + __device__ inline void parse_string_and_set_current() { - auto [success, end_char_pos] = try_parse_string(curr_pos); + // TODO eventually chars should be a reader so we can just pass it in... + char_range_reader reader(chars, curr_pos); + auto [success, end_char_pos] = try_parse_string(reader); if (success) { - curr_pos = end_char_pos; - curr_token = json_token::VALUE_STRING; + // TODO remove end_char_pos, and just get it from the reader... 
+ curr_pos = end_char_pos; + current_token = json_token::VALUE_STRING; } else { - curr_token = json_token::ERROR; - } - } - - /** - * parse " quoted string - */ - __device__ inline void parse_double_quoted_string() - { - auto [success, end_char_pos] = try_parse_string(curr_pos); - if (success) { - curr_pos = end_char_pos; - curr_token = json_token::VALUE_STRING; - } else { - curr_token = json_token::ERROR; + set_current_error(); } } @@ -443,90 +499,60 @@ class json_parser { } } - __device__ inline std::pair write_string(char const* str_pos, - char* copy_destination, - escape_style w_style) + __device__ inline int write_string(char_range_reader& str, + char* copy_destination, + escape_style w_style) { - if (eof(str_pos)) return std::make_pair(0, 0); - char const quote_char = *str_pos; - // Records string/field name token utf8 bytes size after unescaped - // e.g.: For JSON 4 chars string "\\n", after unescaped, get 1 char '\n' - // used by checking the max string length - int unescped_string_utf8_bytes = 0; - // Records string/field name token utf8 bytes size after escaped - // e.g.: 4 chars string "\\n", will write out 4 chars: " \ n " - int escped_string_utf8_bytes = 0; + if (str.eof()) { return 0; } + char const quote_char = str.current_char(); + int output_size_bytes = 0; // write the first " if write style is escaped if (escape_style::ESCAPED == w_style) { - escped_string_utf8_bytes++; + output_size_bytes++; if (nullptr != copy_destination) { *copy_destination++ = '"'; } } // skip left quote char - if (!try_skip(str_pos, quote_char)) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); - } + // No need to check because we just read it in. + str.next(); // scan string content - while (!eof(str_pos)) { - char c = *str_pos; - int v = static_cast(c); + while (!str.eof()) { + char const c = str.current_char(); + int const v = static_cast(c); if (c == quote_char) { // path 1: match closing quote char - str_pos++; - - // check max str len - if (!(max_string_utf8_bytes <= 0 || - (max_string_utf8_bytes > 0 && unescped_string_utf8_bytes <= max_string_utf8_bytes))) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); - } + str.next(); // write the end " if write style is escaped if (escape_style::ESCAPED == w_style) { - escped_string_utf8_bytes++; + output_size_bytes++; if (nullptr != copy_destination) { *copy_destination++ = '"'; } } - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + return output_size_bytes; } else if (v >= 0 && v < 32) { // path 2: unescaped control char // copy if enabled, unescape mode, write 1 char - if (copy_destination != nullptr && escape_style::UNESCAPED == w_style) { - *copy_destination++ = *str_pos; - } - - // copy if enabled, escape mode, write more chars - if (escape_style::ESCAPED == w_style) { - int escape_chars = escape_char(*str_pos, copy_destination); - if (copy_destination != nullptr) copy_destination += escape_chars; - escped_string_utf8_bytes += (escape_chars - 1); - } - - // check match if enabled - const char* match_str_pos = nullptr; - if (!try_match_char(match_str_pos, nullptr, *str_pos)) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + if (escape_style::UNESCAPED == w_style) { + output_size_bytes++; + if (copy_destination != nullptr) { *copy_destination++ = str.current_char(); } + } else { + // escape_style::ESCAPED + int const escape_chars = escape_char(str.current_char(), copy_destination); + if (copy_destination != nullptr) { 
copy_destination += escape_chars; } + output_size_bytes += escape_chars; } - str_pos++; - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - continue; + str.next(); } else if ('\\' == c) { // path 3: escape path - str_pos++; - const char* to_match_str_pos = nullptr; - char* copy_dest_nullptr = nullptr; - if (!try_skip_escape_part(str_pos, - to_match_str_pos, - copy_dest_nullptr, - copy_destination, - w_style, - escped_string_utf8_bytes, - unescped_string_utf8_bytes)) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + str.next(); + char_range_reader to_match(char_range::null()); + if (!try_skip_escape_part(str, to_match, copy_destination, w_style, output_size_bytes)) { + return output_size_bytes; } } else { // path 4: safe code point @@ -535,24 +561,17 @@ class json_parser { // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") if ('\"' == c && escape_style::ESCAPED == w_style) { if (copy_destination != nullptr) { *copy_destination++ = '\\'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_skip_safe_code_point(str_pos, c)) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); - } if (copy_destination != nullptr) { *copy_destination++ = c; } - // check match if enabled - const char* match_str_pos = nullptr; - if (!try_match_char(match_str_pos, nullptr, c)) { - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); - } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; + str.next(); + output_size_bytes++; } } - return std::make_pair(unescped_string_utf8_bytes, escped_string_utf8_bytes); + // technically this is an error state, but we will do our best from here... + return output_size_bytes; } /** @@ -568,7 +587,7 @@ class json_parser { * } , :), string quote char(" ') and Escape char \ are all Ascii(The leading * bit is 0), so it's safe that do not convert byte array to UTF-8 char. * - * When quote is " and allow_unescaped_control_chars is false, grammar is: + * When quote is " grammar is: * * STRING * : '"' (ESC | SAFECODEPOINT)* '"' @@ -587,121 +606,94 @@ class json_parser { * ; * * fragment SAFECODEPOINT - * // 1 not " or ' depending to allow_single_quotes + * // 1 not " or ' * // 2 not \ * // 3 non control character: Ascii value not in [0, 32) * : ~ ["\\\u0000-\u001F] * ; * - * When allow_unescaped_control_chars is true: - * Allow [0-32) control Ascii chars directly without escape - * When allow_single_quotes is true: - * These strings are allowed: '\'' , '\"' , '"' , "\"" , "\'" , "'" - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param quote_char expected quote char - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy + * @param str string to parse + * @param to_match expected match str + * @param w_style the escape style for writing. + * @return a pair of success and length, where success is true if the string + * is valid and length is the number of bytes needed to encode the string + * in the given style. 
*/ - __device__ inline std::pair try_parse_string( - char const* str_pos, - char const* to_match_str_pos = nullptr, - char const* const to_match_str_end = nullptr, - escape_style w_style = escape_style::UNESCAPED) + __device__ inline std::pair try_parse_string( + char_range_reader& str, + char_range_reader to_match = char_range_reader(char_range::null()), + escape_style w_style = escape_style::UNESCAPED) { - if (eof(str_pos)) { return std::make_pair(false, nullptr); } - char const quote_char = *str_pos; - int unescped_string_utf8_bytes = 0; - int escped_string_utf8_bytes = 0; + if (str.eof()) { return std::make_pair(false, 0); } + char const quote_char = str.current_char(); + int output_size_bytes = 0; // write the first " if write style is escaped - if (escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } + if (escape_style::ESCAPED == w_style) { output_size_bytes++; } // skip left quote char - if (!try_skip(str_pos, quote_char)) { return std::make_pair(false, nullptr); } + // We don't need to actually verify what it is, because we just read it. + str.next(); // scan string content - while (!eof(str_pos)) { - char c = *str_pos; + while (!str.eof()) { + char c = str.current_char(); int v = static_cast(c); if (c == quote_char) { // path 1: match closing quote char - str_pos++; - - // check max str len - if (!(max_string_utf8_bytes <= 0 || - (max_string_utf8_bytes > 0 && unescped_string_utf8_bytes <= max_string_utf8_bytes))) { - return std::make_pair(false, nullptr); - } + str.next(); // match check, the last char in match_str is quote_char - if (to_match_str_pos != to_match_str_end) { return std::make_pair(false, nullptr); } + if (!to_match.is_null() && !to_match.eof()) { return std::make_pair(false, 0); } // write the end " if write style is escaped - if (escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } + if (escape_style::ESCAPED == w_style) { output_size_bytes++; } - return std::make_pair(true, str_pos); + return std::make_pair(true, str.pos()); } else if (v >= 0 && v < 32) { // path 2: unescaped control char // copy if enabled, escape mode, write more chars if (escape_style::ESCAPED == w_style) { - int escape_chars = escape_char(*str_pos, nullptr); - escped_string_utf8_bytes += (escape_chars - 1); + int escape_chars = escape_char(str.current_char(), nullptr); + output_size_bytes += (escape_chars - 1); } // check match if enabled - if (!try_match_char(to_match_str_pos, to_match_str_end, *str_pos)) { - return std::make_pair(false, nullptr); - } + if (!try_match_char(to_match, str.current_char())) { return std::make_pair(false, 0); } - str_pos++; - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; + str.next(); + output_size_bytes++; continue; } else if ('\\' == c) { // path 3: escape path - str_pos++; + str.next(); char* copy_dest_nullptr = nullptr; - if (!try_skip_escape_part(str_pos, - to_match_str_pos, - to_match_str_end, - copy_dest_nullptr, - w_style, - escped_string_utf8_bytes, - unescped_string_utf8_bytes)) { - return std::make_pair(false, nullptr); + if (!try_skip_escape_part(str, to_match, copy_dest_nullptr, w_style, output_size_bytes)) { + return std::make_pair(false, 0); } } else { // path 4: safe code point // handle single unescaped " char; happens when string is quoted by char ' // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") - if ('\"' == c && escape_style::ESCAPED == w_style) { escped_string_utf8_bytes++; } + if ('\"' == c && escape_style::ESCAPED == w_style) { output_size_bytes++; } - if 
(!try_skip_safe_code_point(str_pos, c)) { return std::make_pair(false, nullptr); } + if (!try_skip_safe_code_point(str, c)) { return std::make_pair(false, 0); } // check match if enabled - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { - return std::make_pair(false, nullptr); - } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; + if (!try_match_char(to_match, c)) { return std::make_pair(false, 0); } + output_size_bytes++; } } - return std::make_pair(false, nullptr); + return std::make_pair(false, 0); } - __device__ inline bool try_match_char(char const*& char_pos, - char const* const char_end_pos, - char c) + __device__ inline bool try_match_char(char_range_reader& reader, char c) { - if (nullptr != char_pos) { - if (char_pos < char_end_pos && *char_pos == c) { - char_pos++; + if (!reader.is_null()) { + if (!reader.eof() && reader.current_char() == c) { + reader.next(); return true; } else { return false; @@ -716,19 +708,17 @@ class json_parser { * skip the HEX chars in \u HEX HEX HEX HEX. * @return positive escaped ASCII value if success, -1 otherwise */ - __device__ inline bool try_skip_escape_part(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, + __device__ inline bool try_skip_escape_part(char_range_reader& str, + char_range_reader& to_match, char*& copy_dest, escape_style w_style, - int& escped_string_utf8_bytes, - int& unescped_string_utf8_bytes) + int& output_size_bytes) { // already skipped the first '\' // try skip second part - if (!eof(str_pos)) { - char c = *str_pos; - switch (*str_pos) { + if (!str.eof()) { + char const c = str.current_char(); + switch (c) { // path 1: \", \', \\, \/, \b, \f, \n, \r, \t case '\"': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } @@ -737,21 +727,19 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = '"'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case '\'': // for both unescaped/escaped writes a single char ' if (nullptr != copy_dest) { *copy_dest++ = c; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } + if (!try_match_char(to_match, c)) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + output_size_bytes++; + str.next(); return true; case '\\': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } @@ -760,20 +748,18 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = '\\'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case '/': // for both unescaped/escaped writes a single char / if (nullptr != copy_dest) { *copy_dest++ = c; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case 'b': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\b'; } @@ 
-782,12 +768,11 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = 'b'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\b')) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\b')) { return false; } + output_size_bytes++; + str.next(); return true; case 'f': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\f'; } @@ -796,12 +781,11 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = 'f'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\f')) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\f')) { return false; } + output_size_bytes++; + str.next(); return true; case 'n': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\n'; } @@ -810,12 +794,11 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = 'n'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\n')) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\n')) { return false; } + output_size_bytes++; + str.next(); return true; case 'r': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\r'; } @@ -824,12 +807,11 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = 'r'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\r')) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\r')) { return false; } + output_size_bytes++; + str.next(); return true; case 't': if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\t'; } @@ -838,26 +820,20 @@ class json_parser { *copy_dest++ = '\\'; *copy_dest++ = 't'; } - escped_string_utf8_bytes++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\t')) { return false; } - unescped_string_utf8_bytes++; - escped_string_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\t')) { return false; } + output_size_bytes++; + str.next(); return true; // path 1 done: \", \', \\, \/, \b, \f, \n, \r, \t case 'u': // path 2: \u HEX HEX HEX HEX - str_pos++; + str.next(); // for both unescaped/escaped writes corresponding utf8 bytes, no need // to pass in write style - return try_skip_unicode(str_pos, - to_match_str_pos, - to_match_str_end, - copy_dest, - unescped_string_utf8_bytes, - escped_string_utf8_bytes); + return try_skip_unicode(str, to_match, copy_dest, output_size_bytes); default: // path 3: invalid return false; @@ -871,13 +847,13 @@ class json_parser { /** * parse: * fragment SAFECODEPOINT - * // 1 not " or ' depending to allow_single_quotes + * // 1 not " or ' * // 2 not \ * // 3 non control character: Ascii value not in [0, 32) * : ~ ["\\\u0000-\u001F] * ; */ - __device__ inline bool try_skip_safe_code_point(char const*& str_pos, char c) + __device__ inline bool try_skip_safe_code_point(char_range_reader& str, char c) { // 1 the char is not quoted(' or ") char, here satisfy, do not need to check // again @@ -887,7 +863,7 @@ class json_parser { // 3. 
chars not in [0, 32) int v = static_cast(c); if (!(v >= 0 && v < 32)) { - str_pos++; + str.next(); return true; } else { return false; @@ -905,18 +881,6 @@ class json_parser { return 0; } - /** - * parse four HEX chars to unsigned int - */ - __device__ inline cudf::char_utf8 parse_code_point(char const* p) - { - cudf::char_utf8 v = 0; - for (size_t i = 0; i < 4; i++) { - v = v * 16 + hex_value(p[i]); - } - return v; - } - /** * @brief Returns the number of bytes in the specified character. * @@ -985,59 +949,45 @@ class json_parser { * try skip 4 HEX chars * in pattern: '\\' 'u' HEX HEX HEX HEX, it's a code point of unicode */ - __device__ bool try_skip_unicode(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, + __device__ bool try_skip_unicode(char_range_reader& str, + char_range_reader& to_match, char*& copy_dest, - int& unescped_string_utf8_bytes, - int& escped_string_utf8_bytes) + int& output_size_bytes) { - // already parsed u - bool is_success = try_skip_hex(str_pos) && try_skip_hex(str_pos) && try_skip_hex(str_pos) && - try_skip_hex(str_pos); - if (is_success) { - // parse 4 HEX chars to uint32_t value - auto code_point = parse_code_point(str_pos - 4); - auto utf_char = codepoint_to_utf8(code_point); - // write utf8 bytes. - // In UTF-8, the maximum number of bytes used to encode a single character - // is 4 - char buff[4]; - cudf::size_type bytes = from_char_utf8(utf_char, buff); - unescped_string_utf8_bytes += bytes; - escped_string_utf8_bytes += bytes; - - if (nullptr != copy_dest) { - for (cudf::size_type i = 0; i < bytes; i++) { - *copy_dest++ = buff[i]; - } + // already parsed \u + // now we expect 4 hex chars. + cudf::char_utf8 code_point = 0; + for (size_t i = 0; i < 4; i++) { + if (str.eof()) { return false; } + char const c = str.current_char(); + str.next(); + if (!is_hex_digit(c)) { return false; } + code_point = (code_point * 16) + hex_value(c); + } + auto utf_char = codepoint_to_utf8(code_point); + // write utf8 bytes. + // In UTF-8, the maximum number of bytes used to encode a single character + // is 4 + char buff[4]; + cudf::size_type const bytes = from_char_utf8(utf_char, buff); + output_size_bytes += bytes; + + // TODO I think if we do an escape sequence for \n/etc it will return + // the wrong thing.... 
+ if (nullptr != copy_dest) { + for (cudf::size_type i = 0; i < bytes; i++) { + *copy_dest++ = buff[i]; } + } - if (nullptr != to_match_str_pos) { - for (cudf::size_type i = 0; i < bytes; i++) { - if (!(to_match_str_pos < to_match_str_end && *to_match_str_pos == buff[i])) { - return false; - } - to_match_str_pos++; - } + if (!to_match.is_null()) { + for (cudf::size_type i = 0; i < bytes; i++) { + if (!(to_match.eof() && to_match.current_char() == buff[i])) { return false; } + to_match.next(); } - - return true; - } else { - return false; } - } - /** - * try skip HEX - */ - __device__ inline bool try_skip_hex(char const*& str_pos) - { - if (!eof(str_pos) && is_hex_digit(*str_pos)) { - str_pos++; - return true; - } - return false; + return true; } // =========== Parse string end =========== @@ -1065,7 +1015,7 @@ class json_parser { * * Note: Leading zeroes are not allowed, keep consistent with Spark, e.g.: 00, -01 are invalid */ - __device__ inline void parse_number() + __device__ inline void parse_number_and_set_current() { // parse sign try_skip(curr_pos, '-'); @@ -1077,14 +1027,14 @@ class json_parser { int number_digits_length = 0; if (try_unsigned_number(is_float, number_digits_length)) { if (check_max_num_len(number_digits_length)) { - curr_token = (is_float ? json_token::VALUE_NUMBER_FLOAT : json_token::VALUE_NUMBER_INT); + current_token = (is_float ? json_token::VALUE_NUMBER_FLOAT : json_token::VALUE_NUMBER_INT); // success parsed a number, update the token length number_token_len = curr_pos - current_token_start_pos; } else { - curr_token = json_token::ERROR; + set_current_error(); } } else { - curr_token = json_token::ERROR; + set_current_error(); } } @@ -1109,8 +1059,8 @@ class json_parser { */ __device__ inline bool try_unsigned_number(bool& is_float, int& number_digits_length) { - if (!eof(curr_pos)) { - char c = *curr_pos; + if (!eof()) { + char const c = chars[curr_pos]; if (c >= '1' && c <= '9') { curr_pos++; number_digits_length++; @@ -1123,8 +1073,8 @@ class json_parser { number_digits_length++; // check leading zeros - if (!eof(curr_pos)) { - char next_char_after_zero = *curr_pos; + if (!eof()) { + char const next_char_after_zero = chars[curr_pos]; if (next_char_after_zero >= '0' && next_char_after_zero <= '9') { // e.g.: 01 is invalid return false; @@ -1159,7 +1109,7 @@ class json_parser { } // parse exp - if (!eof(curr_pos) && (*curr_pos == 'e' || *curr_pos == 'E')) { + if (!eof() && (chars[curr_pos] == 'e' || chars[curr_pos] == 'E')) { curr_pos++; is_float = true; return try_parse_exp(number_digits_length); @@ -1175,8 +1125,8 @@ class json_parser { __device__ inline int skip_zero_or_more_digits() { int digits = 0; - while (!eof(curr_pos)) { - if (is_digit(*curr_pos)) { + while (!eof()) { + if (is_digit(chars[curr_pos])) { digits++; curr_pos++; } else { @@ -1194,7 +1144,7 @@ class json_parser { */ __device__ inline bool try_skip_one_or_more_digits(int& number_digits_length) { - if (!eof(curr_pos) && is_digit(*curr_pos)) { + if (!eof() && is_digit(chars[curr_pos])) { curr_pos++; number_digits_length++; number_digits_length += skip_zero_or_more_digits(); @@ -1213,7 +1163,7 @@ class json_parser { // already parsed [eE] // parse [+-]? 
- if (!eof(curr_pos) && (*curr_pos == '+' || *curr_pos == '-')) { curr_pos++; } + if (!eof() && (chars[curr_pos] == '+' || chars[curr_pos] == '-')) { curr_pos++; } // parse [0-9]+ return try_skip_one_or_more_digits(number_digits_length); @@ -1224,55 +1174,58 @@ class json_parser { /** * parse true */ - __device__ inline void parse_true() + __device__ inline void parse_true_and_set_current() { // already parsed 't' if (try_skip(curr_pos, 'r') && try_skip(curr_pos, 'u') && try_skip(curr_pos, 'e')) { - curr_token = json_token::VALUE_TRUE; + current_token = json_token::VALUE_TRUE; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse false */ - __device__ inline void parse_false() + __device__ inline void parse_false_and_set_current() { // already parsed 'f' if (try_skip(curr_pos, 'a') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 's') && try_skip(curr_pos, 'e')) { - curr_token = json_token::VALUE_FALSE; + current_token = json_token::VALUE_FALSE; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse null */ - __device__ inline void parse_null() + __device__ inline void parse_null_and_set_current() { // already parsed 'n' if (try_skip(curr_pos, 'u') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 'l')) { - curr_token = json_token::VALUE_NULL; + current_token = json_token::VALUE_NULL; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse the key string in key:value pair */ - __device__ inline void parse_field_name() + __device__ inline void parse_field_name_and_set_current() { - auto [success, end_char_pos] = - try_parse_string(curr_pos, nullptr, nullptr, escape_style::UNESCAPED); + // TODO eventually chars should be a reader so we can just pass it in... + char_range_reader reader(chars, curr_pos); + current_token_start_pos = curr_pos; + auto [success, end_char_pos] = try_parse_string(reader); if (success) { - curr_pos = end_char_pos; - curr_token = json_token::FIELD_NAME; + // TODO remove end_char_pos, and just get it from the reader... + curr_pos = end_char_pos; + current_token = json_token::FIELD_NAME; } else { - curr_token = json_token::ERROR; + set_current_error(); } } @@ -1282,55 +1235,50 @@ class json_parser { * @param[out] has_comma_before_token has comma before next token * @param[out] has_colon_before_token has colon before next token */ - __device__ inline json_token parse_next_token(bool& has_comma_before_token, - bool& has_colon_before_token) + __device__ inline void parse_next_token_and_set_current(bool& has_comma_before_token, + bool& has_colon_before_token) { - skip_whitespaces(curr_pos); - if (!eof(curr_pos)) { - char c = *curr_pos; + skip_whitespaces(); + if (!eof()) { + char const c = chars[curr_pos]; if (is_context_stack_empty()) { // stack is empty - if (curr_token == json_token::INIT) { + if (current_token == json_token::INIT) { // main root entry point - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + parse_first_token_in_value_and_set_current(); } else { - // Allow tail useless sub-string in JSON, e.g.: - // The following invalid JSON is allowed: {'k' : 'v'}_extra_tail_sub_string // previous token is not INIT, means already get a token; stack is // empty; Successfully parsed. 
Note: ignore the tailing sub-string - curr_token = json_token::SUCCESS; + current_token = json_token::SUCCESS; } } else { // stack is non-empty if (is_object_context()) { // in JSON object context - if (curr_token == json_token::START_OBJECT) { + if (current_token == json_token::START_OBJECT) { // previous token is '{' if (c == '}') { // empty object // close curr object context current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_OBJECT; pop_curr_context(); + current_token = json_token::END_OBJECT; } else { // parse key in key:value pair - current_token_start_pos = curr_pos; - parse_field_name(); + parse_field_name_and_set_current(); } - } else if (curr_token == json_token::FIELD_NAME) { + } else if (current_token == json_token::FIELD_NAME) { if (c == ':') { has_colon_before_token = true; // skip ':' and parse value in key:value pair curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + skip_whitespaces(); + parse_first_token_in_value_and_set_current(); } else { - curr_token = json_token::ERROR; + set_current_error(); } } else { // expect next key:value pair or '}' @@ -1338,67 +1286,63 @@ class json_parser { // end of object current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_OBJECT; pop_curr_context(); + current_token = json_token::END_OBJECT; } else if (c == ',') { has_comma_before_token = true; // parse next key:value pair curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_field_name(); + skip_whitespaces(); + parse_field_name_and_set_current(); } else { - curr_token = json_token::ERROR; + set_current_error(); } } } else { // in Json array context - if (curr_token == json_token::START_ARRAY) { + if (current_token == json_token::START_ARRAY) { // previous token is '[' if (c == ']') { // curr: ']', empty array current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_ARRAY; pop_curr_context(); + current_token = json_token::END_ARRAY; } else { // non-empty array, parse the first value in the array - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + parse_first_token_in_value_and_set_current(); } } else { if (c == ',') { has_comma_before_token = true; // skip ',' and parse the next value curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + skip_whitespaces(); + parse_first_token_in_value_and_set_current(); } else if (c == ']') { // end of array current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_ARRAY; pop_curr_context(); + current_token = json_token::END_ARRAY; } else { - curr_token = json_token::ERROR; + set_current_error(); } } } } } else { // eof - if (is_context_stack_empty() && curr_token != json_token::INIT) { + if (is_context_stack_empty() && current_token != json_token::INIT) { // reach eof; stack is empty; previous token is not INIT - curr_token = json_token::SUCCESS; + current_token = json_token::SUCCESS; } else { // eof, and meet the following cases: // - has unclosed JSON array/object; // - the whole JSON is empty - curr_token = json_token::ERROR; + set_current_error(); } } - return curr_token; } public: @@ -1411,23 +1355,19 @@ class json_parser { // parse next token bool has_comma_before_token; // no-initialization because of do not care here bool has_colon_before_token; // no-initialization because of do not care here - return parse_next_token(has_comma_before_token, has_colon_before_token); + 
parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token); + return current_token; } /** * get current token */ - __device__ json_token get_current_token() { return curr_token; } + __device__ json_token get_current_token() { return current_token; } - /** - * is valid JSON by parsing through all tokens - */ - __device__ bool is_valid() + // TODO make this go away!!!! + __device__ inline char_range current_range() { - while (curr_token != json_token::ERROR && curr_token != json_token::SUCCESS) { - next_token(); - } - return curr_token == json_token::SUCCESS; + return chars.slice(current_token_start_pos, curr_pos - current_token_start_pos); } /** @@ -1437,12 +1377,12 @@ class json_parser { */ __device__ bool try_skip_children() { - if (curr_token == json_token::ERROR || curr_token == json_token::INIT || - curr_token == json_token::SUCCESS) { + if (current_token == json_token::ERROR || current_token == json_token::INIT || + current_token == json_token::SUCCESS) { return false; } - if (curr_token != json_token::START_OBJECT && curr_token != json_token::START_ARRAY) { + if (current_token != json_token::START_OBJECT && current_token != json_token::START_ARRAY) { return true; } @@ -1470,20 +1410,22 @@ class json_parser { */ __device__ cudf::size_type write_unescaped_text(char* destination) { - switch (curr_token) { - case json_token::VALUE_STRING: + switch (current_token) { + case json_token::VALUE_STRING: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - return write_string(current_token_start_pos, destination, escape_style::UNESCAPED).first; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::UNESCAPED); + } case json_token::VALUE_NUMBER_INT: - if (number_token_len == 2 && current_token_start_pos[0] == '-' && - current_token_start_pos[1] == '0') { + if (number_token_len == 2 && chars[current_token_start_pos] == '-' && + chars[current_token_start_pos + 1] == '0') { if (nullptr != destination) *destination++ = '0'; return 1; } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { - *destination++ = *(current_token_start_pos + i); + *destination++ = chars[current_token_start_pos + i]; } } return number_token_len; @@ -1494,7 +1436,7 @@ class json_parser { // 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 => 3.0E-4 // 1.0E309 => "Infinity", -1E309 => "-Infinity" double d_value = - cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1522,10 +1464,12 @@ class json_parser { *destination++ = 'l'; } return 4; - case json_token::FIELD_NAME: + case json_token::FIELD_NAME: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - return write_string(current_token_start_pos, destination, escape_style::UNESCAPED).first; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::UNESCAPED); + } case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } return 1; @@ -1556,28 +1500,29 @@ class json_parser { */ __device__ cudf::size_type write_escaped_text(char* destination) { - switch (curr_token) { + switch (current_token) { case json_token::VALUE_STRING: { // can not copy from JSON directly due to 
escaped chars - // rewind the pos; parse again with copy - return write_string(current_token_start_pos, destination, escape_style::ESCAPED).second; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::ESCAPED); } - case json_token::VALUE_NUMBER_INT: - if (number_token_len == 2 && current_token_start_pos[0] == '-' && - current_token_start_pos[1] == '0') { + case json_token::VALUE_NUMBER_INT: { + if (number_token_len == 2 && chars[current_token_start_pos] == '-' && + chars[current_token_start_pos + 1] == '0') { if (nullptr != destination) *destination++ = '0'; return 1; } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { - *destination++ = *(current_token_start_pos + i); + *destination++ = chars[current_token_start_pos + i]; } } return number_token_len; + } case json_token::VALUE_NUMBER_FLOAT: { // number normalization: double d_value = - cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1607,8 +1552,8 @@ class json_parser { return 4; case json_token::FIELD_NAME: { // can not copy from JSON directly due to escaped chars - // rewind the pos; parse again with copy - return write_string(current_token_start_pos, destination, escape_style::ESCAPED).second; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::ESCAPED); } case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } @@ -1630,35 +1575,25 @@ class json_parser { return 0; } - /** - * reset the parser - */ - __device__ void reset() - { - curr_pos = json_start_pos; - curr_token = json_token::INIT; - stack_size = 0; - } - /** * match field name string when current token is FIELD_NAME, * return true if current token is FIELD_NAME and match successfully. 
* return false otherwise, - * Note: to_match_str_ptr should not be nullptr */ __device__ bool match_current_field_name(cudf::string_view name) { - return match_current_field_name(name.data(), name.size_bytes()); + return match_current_field_name(char_range(name)); } /** * match current field name */ - __device__ bool match_current_field_name(char const* to_match_str_ptr, cudf::size_type len) + __device__ bool match_current_field_name(char_range name) { - if (json_token::FIELD_NAME == curr_token) { - auto [b, end_pos] = try_parse_string( - current_token_start_pos, to_match_str_ptr, to_match_str_ptr + len, escape_style::UNESCAPED); + if (json_token::FIELD_NAME == current_token) { + char_range_reader reader(current_range()); + char_range_reader to_match(name); + auto [b, end_pos] = try_parse_string(reader, to_match, escape_style::UNESCAPED); return b; } else { return false; @@ -1673,7 +1608,7 @@ class json_parser { */ __device__ thrust::pair copy_current_structure(char* copy_to) { - switch (curr_token) { + switch (current_token) { case json_token::INIT: case json_token::ERROR: case json_token::SUCCESS: @@ -1715,10 +1650,10 @@ class json_parser { bool has_colon_before_token = false; // parse and get has_comma_before_token, has_colon_before_token - parse_next_token(has_comma_before_token, has_colon_before_token); + parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token); // check the JSON format - if (curr_token == json_token::ERROR) { return thrust::make_pair(false, 0); } + if (current_token == json_token::ERROR) { return thrust::make_pair(false, 0); } // write out the token if (nullptr != copy_to) { @@ -1752,10 +1687,9 @@ class json_parser { } private: - char const* const json_start_pos; - char const* const json_end_pos; - char const* curr_pos; - json_token curr_token{json_token::INIT}; + char_range const chars; + cudf::size_type curr_pos; + json_token current_token; // 64 bits long saves the nested object/array contexts // true(bit value 1) is JSON object context @@ -1764,8 +1698,10 @@ class json_parser { int64_t context_stack; int stack_size = 0; + // TODO remove if possible // save current token start pos, used by coping current token text - char const* current_token_start_pos; + cudf::size_type current_token_start_pos; + // TODO remove if possible // used to store number token length cudf::size_type number_token_len; }; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index f0cb579e43..bba6650d0f 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -311,7 +311,7 @@ void getJsonObjectTest_Test_case_path1() { * case path 5: case (START_ARRAY, Subscript :: Wildcard :: Subscript :: * Wildcard :: xs), set flatten style * case path 2: case (START_ARRAY, Nil) if style == FlattenStyle - * + * * First use path5 [*][*] to enable flatten style. 
*/ @Test From ea1ecc366b1db0161b4b6907a6b1bbeba11401a3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 8 May 2024 10:30:16 +0800 Subject: [PATCH 084/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#2020) * Update submodule cudf to d5ad366e9787999f00450ec858b5d18b813b3106 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to d5ad366e9787999f00450ec858b5d18b813b3106 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to d5ad366e9787999f00450ec858b5d18b813b3106 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to e87a78d422e25474dd23b031ef98eeb8a293d718 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 46ae8cbc5cad97d45500901b1b15ed7c2f3eb0fc Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4dc616227f..46ae8cbc5c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4dc616227f5c872031603426a3235282f6d23554 +Subproject commit 46ae8cbc5cad97d45500901b1b15ed7c2f3eb0fc diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 26610e63b3..22055be71d 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -365322aca32fd6ecd7027f5d7ec7be50b7f3cc2a +b465a12cfcdbf5add65807118e32ed27b6d8ce08 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 333bb644b0..3a89c5b906 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -62,7 +62,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "ff837fffda2bf4afa0dc80bc482a18f5645b4901", + "git_tag" : "6ed7bccb42d82eb1ac60a98dfd460a7785881949", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, From e2225c44168c1c34930ee40255eb3df8a83c08df Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 8 May 2024 17:15:11 +0800 Subject: [PATCH 085/124] Update submodule cudf to 5f1f0dd503ac55facfb91ae0c528b88b306831df (#2023) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 46ae8cbc5c..5f1f0dd503 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 46ae8cbc5cad97d45500901b1b15ed7c2f3eb0fc +Subproject commit 5f1f0dd503ac55facfb91ae0c528b88b306831df diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 3a89c5b906..021ea5ae7f 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -139,7 +139,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "e13b253fdfa19ccdf563be5fbf4ffa1cdc6b87a0", + "git_tag" : "4f01c559ff61f2512ae14bae0d86f4a11a1bef93", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 
b3442a2c5fad2a41d4f6cd594e0389c18d4905dc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 9 May 2024 04:32:10 +0800 Subject: [PATCH 086/124] Update submodule cudf to eaf555616ff83a75b3c3b11ce18e1c393604ccf4 (#2025) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5f1f0dd503..eaf555616f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5f1f0dd503ac55facfb91ae0c528b88b306831df +Subproject commit eaf555616ff83a75b3c3b11ce18e1c393604ccf4 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 021ea5ae7f..c3465ef57d 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -32,6 +32,11 @@ "fixed_in" : "", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue." }, + { + "file" : "cccl/kernel_pointer_hiding.diff", + "fixed_in" : "2.4", + "issue" : "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]" + }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "fixed_in" : "", From 9b368dc89064e3bd7373b35b83c1df285fcb6a59 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 9 May 2024 11:14:18 +0800 Subject: [PATCH 087/124] Update submodule cudf to c576e97a6a7afef225e9c7746885ac436f224ee3 (#2026) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index eaf555616f..c576e97a6a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit eaf555616ff83a75b3c3b11ce18e1c393604ccf4 +Subproject commit c576e97a6a7afef225e9c7746885ac436f224ee3 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 22055be71d..34d122be22 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -b465a12cfcdbf5add65807118e32ed27b6d8ce08 +6f917c953c05bc95d0c0cf755d38d3cac916d9ad From 324f89aa168faa9151d58d54e2f65ca4a0521de3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 9 May 2024 23:12:09 +0800 Subject: [PATCH 088/124] Update submodule cudf to a4cd1d877631e4554c53b57202564398b758324c (#2027) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c576e97a6a..a4cd1d8776 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c576e97a6a7afef225e9c7746885ac436f224ee3 +Subproject commit a4cd1d877631e4554c53b57202564398b758324c diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index c3465ef57d..da6e010236 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -144,7 +144,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "4f01c559ff61f2512ae14bae0d86f4a11a1bef93", + "git_tag" : "f11c8ca44ec4ab282157e7a1d7be5a3abafe8c57", "git_url" : 
"https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 19d8d48df886981a5f4a067cc952827dac1de0ce Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 May 2024 12:40:25 -0700 Subject: [PATCH 089/124] Fix string functors to adapt to the new `make_strings_children` from cudf (#2034) * Fix string functors Signed-off-by: Nghia Truong * Fix style Signed-off-by: Nghia Truong * Include changes from cudf Signed-off-by: Nghia Truong --------- Signed-off-by: Nghia Truong --- src/main/cpp/src/cast_decimal_to_string.cu | 9 +++++---- src/main/cpp/src/cast_float_to_string.cu | 11 ++++++----- src/main/cpp/src/format_float.cu | 9 +++++---- src/main/cpp/src/map_utils.cu | 19 +++++++------------ thirdparty/cudf | 2 +- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu index 9d0e27ed59..91b155dae4 100644 --- a/src/main/cpp/src/cast_decimal_to_string.cu +++ b/src/main/cpp/src/cast_decimal_to_string.cu @@ -52,8 +52,9 @@ namespace { template struct decimal_to_non_ansi_string_fn { column_device_view d_decimals; - size_type* d_offsets{}; - char* d_chars{}; + cudf::size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Calculates the size of the string required to convert the element, in base-10 format. @@ -162,13 +163,13 @@ struct decimal_to_non_ansi_string_fn { __device__ void operator()(size_type idx) { if (d_decimals.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { decimal_to_non_ansi_string(idx); } else { - d_offsets[idx] = compute_output_size(d_decimals.element(idx)); + d_sizes[idx] = compute_output_size(d_decimals.element(idx)); } } }; diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 78cedbbf64..b294ca6f1b 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,8 +34,9 @@ namespace { template struct float_to_string_fn { cudf::column_device_view d_floats; - cudf::size_type* d_offsets; + cudf::size_type* d_sizes; char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ cudf::size_type compute_output_size(cudf::size_type idx) const { @@ -56,13 +57,13 @@ struct float_to_string_fn { __device__ void operator()(cudf::size_type idx) const { if (d_floats.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { float_to_string(idx); } else { - d_offsets[idx] = compute_output_size(idx); + d_sizes[idx] = compute_output_size(idx); } } }; @@ -124,4 +125,4 @@ std::unique_ptr float_to_string(cudf::column_view const& floats, return detail::float_to_string(floats, stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 1d537595d7..bc3c85bbcc 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -35,8 +35,9 @@ template struct format_float_fn { cudf::column_device_view d_floats; int digits; - cudf::size_type* d_offsets; + cudf::size_type* d_sizes; char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ cudf::size_type compute_output_size(FloatType const value) const { @@ -56,13 +57,13 @@ struct format_float_fn { __device__ void operator()(cudf::size_type const idx) const { if (d_floats.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { format_float(idx); } else { - d_offsets[idx] = compute_output_size(d_floats.element(idx)); + d_sizes[idx] = compute_output_size(d_floats.element(idx)); } } }; @@ -128,4 +129,4 @@ std::unique_ptr format_float(cudf::column_view const& floats, return detail::format_float(floats, digits, stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index 6ae54f4fe9..529ed1d90c 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -16,10 +16,6 @@ #include "map_utils_debug.cuh" -// -#include - -// #include #include #include @@ -31,11 +27,11 @@ #include #include -// #include #include -// +#include +#include #include #include #include @@ -51,9 +47,7 @@ #include #include -// -#include -#include +#include namespace spark_rapids_jni { @@ -520,8 +514,9 @@ struct substring_fn { cudf::device_span const d_string; cudf::device_span const> const d_ranges; - cudf::size_type* d_offsets{}; - char* d_chars{}; + cudf::size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(cudf::size_type const idx) { @@ -530,7 +525,7 @@ struct substring_fn { if (d_chars) { memcpy(d_chars + d_offsets[idx], d_string.data() + range.first, size); } else { - d_offsets[idx] = size; + d_sizes[idx] = size; } } }; diff --git a/thirdparty/cudf b/thirdparty/cudf index a4cd1d8776..13f028f01a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a4cd1d877631e4554c53b57202564398b758324c +Subproject commit 13f028f01ad043b0d24f3e4a28f4267c02806390 From 73ca6cfefd3d43708d0750b660f4dd2978b199ec Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 May 2024 14:30:54 -0700 Subject: [PATCH 090/124] Fix null handling in 
`percentile` evaluation when the input histogram is empty (#2030) * Fix percentile null handling when the input is empty Signed-off-by: Nghia Truong * Add Java test Signed-off-by: Nghia Truong --------- Signed-off-by: Nghia Truong --- src/main/cpp/src/histogram.cu | 29 +++++------ .../spark/rapids/jni/HistogramTest.java | 48 +++++++++++++++++++ 2 files changed, 63 insertions(+), 14 deletions(-) create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java diff --git a/src/main/cpp/src/histogram.cu b/src/main/cpp/src/histogram.cu index 3d606e9f0a..d39ceaa5be 100644 --- a/src/main/cpp/src/histogram.cu +++ b/src/main/cpp/src/histogram.cu @@ -16,7 +16,6 @@ #include "histogram.hpp" -// #include #include #include @@ -33,7 +32,6 @@ #include #include -// #include #include #include @@ -42,7 +40,6 @@ #include #include -// #include namespace spark_rapids_jni { @@ -69,7 +66,7 @@ struct fill_percentile_fn { auto const has_all_nulls = start >= end; auto const percentage_idx = idx % percentages.size(); - if (out_validity && percentage_idx == 0) { + if (percentage_idx == 0) { // If the histogram only contains null elements, the output percentile will be null. out_validity[histogram_idx] = has_all_nulls ? 0 : 1; } @@ -191,7 +188,13 @@ struct percentile_dispatcher { stream, mr); - auto const fill_percentile = [&](auto const sorted_validity_it, auto const out_validity) { + // We may always have nulls in the output due to either: + // - Having nulls in the input, and/or, + // - Having empty histograms. + auto out_validities = + rmm::device_uvector(num_histograms, stream, rmm::mr::get_current_device_resource()); + + auto const fill_percentile = [&](auto const sorted_validity_it) { auto const sorted_input_it = thrust::make_permutation_iterator(data.begin(), ordered_indices); thrust::for_each_n(rmm::exec_policy(stream), @@ -203,23 +206,21 @@ struct percentile_dispatcher { accumulated_counts, percentages, percentiles->mutable_view().begin(), - out_validity}); + out_validities.begin()}); }; if (!has_null) { - fill_percentile(thrust::make_constant_iterator(true), nullptr); + fill_percentile(thrust::make_constant_iterator(true)); } else { auto const sorted_validity_it = thrust::make_permutation_iterator( cudf::detail::make_validity_iterator(data), ordered_indices); - auto out_validities = - rmm::device_uvector(num_histograms, stream, rmm::mr::get_current_device_resource()); - fill_percentile(sorted_validity_it, out_validities.begin()); - - auto [null_mask, null_count] = cudf::detail::valid_if( - out_validities.begin(), out_validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { return {std::move(percentiles), std::move(null_mask), null_count}; } + fill_percentile(sorted_validity_it); } + auto [null_mask, null_count] = cudf::detail::valid_if( + out_validities.begin(), out_validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { return {std::move(percentiles), std::move(null_mask), null_count}; } + return {std::move(percentiles), rmm::device_buffer{}, 0}; } }; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java new file mode 100644 index 0000000000..9a1812f660 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.AssertUtils; +import ai.rapids.cudf.ColumnVector; + +import org.junit.jupiter.api.Test; + +public class HistogramTest { + @Test + void testZeroFrequency() { + try (ColumnVector values = ColumnVector.fromInts(5, 10, 30); + ColumnVector freqs = ColumnVector.fromLongs(1, 0, 1); + ColumnVector histogram = Histogram.createHistogramIfValid(values, freqs, true); + ColumnVector percentiles = Histogram.percentileFromHistogram(histogram, new double[]{1}, + false); + ColumnVector expected = ColumnVector.fromBoxedDoubles(5.0, null, 30.0)) { + AssertUtils.assertColumnsAreEqual(percentiles, expected); + } + } + + @Test + void testAllNulls() { + try (ColumnVector values = ColumnVector.fromBoxedInts(null, null, null); + ColumnVector freqs = ColumnVector.fromLongs(1, 2, 3); + ColumnVector histogram = Histogram.createHistogramIfValid(values, freqs, true); + ColumnVector percentiles = Histogram.percentileFromHistogram(histogram, new double[]{0.5}, + false); + ColumnVector expected = ColumnVector.fromBoxedDoubles(null, null, null)) { + AssertUtils.assertColumnsAreEqual(percentiles, expected); + } + } +} From f1e5dbb93e6bf368793c78a2b673c450f83ea053 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Tue, 14 May 2024 17:18:16 +0800 Subject: [PATCH 091/124] Drop Centos7 support (#2010) * Drop Centos7 support To fix: https://github.com/NVIDIA/spark-rapids-jni/issues/1991 Drop Centos7 support, switch to build in a Rocky 8 Docker image Update the script to support both amd64 and arm64 CPUs Signed-off-by: Tim Liu * Update for code review Signed-off-by: Tim Liu * Dockerfile.multi to Dockerfile Signed-off-by: Tim Liu * Change '--platform' param to be compatitable with lower Docker versions Signed-off-by: Tim Liu * Update for code review Signed-off-by: Tim Liu * Make cuda version consistent Signed-off-by: Tim Liu * Update according to the review comments Signed-off-by: Tim Liu * Update build/run-in-docker Co-authored-by: Jason Lowe * Move CMAKE_GENERATOR back into build-in-docker Signed-off-by: Tim Liu --------- Signed-off-by: Tim Liu Co-authored-by: Jason Lowe --- build/build-in-docker | 25 ++++++-------- build/run-in-docker | 11 ++++-- ci/Dockerfile | 34 +++++++++--------- ci/Dockerfile.multi | 76 ----------------------------------------- ci/Jenkinsfile.premerge | 6 ++-- ci/submodule-sync.sh | 2 +- pom.xml | 5 +++ 7 files changed, 44 insertions(+), 115 deletions(-) mode change 100755 => 100644 ci/Dockerfile delete mode 100644 ci/Dockerfile.multi diff --git a/build/build-in-docker b/build/build-in-docker index 421cc1a855..49032185ba 100755 --- a/build/build-in-docker +++ b/build/build-in-docker @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -24,30 +24,27 @@ set -e SCRIPTDIR=$(cd $(dirname $0); pwd) LOCAL_MAVEN_REPO=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} -CUDF_USE_PER_THREAD_DEFAULT_STREAM=${CUDF_USE_PER_THREAD_DEFAULT_STREAM:-ON} USE_GDS=${USE_GDS:-ON} export CMAKE_GENERATOR=${CMAKE_GENERATOR:-"Ninja"} +# Make CUDA_VERSION consistent with the file run-in-docker +export CUDA_VERSION=${CUDA_VERSION:-11.8.0} +CUDA_CLASSIFIER=cuda${CUDA_VERSION%%.*} +BUILD_FAULTINJ=${BUILD_FAULTINJ:-ON} if (( $# == 0 )); then echo "Usage: $0 " exit 1 fi -_CUDF_CLEAN_SKIP="" -# if ccache is enabled and libcudf.clean.skip not provided -# by the user remove the cpp build directory -# -if [[ "$CCACHE_DISABLE" != "1" ]]; then - if [[ ! "$*" =~ " -Dlibcudf.clean.skip=" ]]; then - # Don't skip clean if ccache is enabled - # unless the user overrides - _CUDF_CLEAN_SKIP="-Dlibcudf.clean.skip=false" - fi +# Set env for arm64 build, The possible values of 'uname -m' : [x86_64/i386/aarch64/mips/...] +if [ "$(uname -m)" == "aarch64" ]; then + USE_GDS="OFF" # The GDS cuFiles RDMA libraries are not included in the arm64 CUDA toolkit. + BUILD_FAULTINJ="OFF" # libcupti_static.a linked by cufaultinj, does not exist in the arm64 CUDA toolkit. fi $SCRIPTDIR/run-in-docker mvn \ -Dmaven.repo.local=$LOCAL_MAVEN_REPO \ - -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$CUDF_USE_PER_THREAD_DEFAULT_STREAM \ -DUSE_GDS=$USE_GDS \ - $_CUDF_CLEAN_SKIP \ + -DBUILD_FAULTINJ=${BUILD_FAULTINJ} \ + -Dcuda.version=$CUDA_CLASSIFIER \ "$@" diff --git a/build/run-in-docker b/build/run-in-docker index 62d40aac48..81152a1d9d 100755 --- a/build/run-in-docker +++ b/build/run-in-docker @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,11 +27,16 @@ REPODIR=$SCRIPTDIR/.. CUDA_VERSION=${CUDA_VERSION:-11.8.0} DOCKER_CMD=${DOCKER_CMD:-docker} DOCKER_BUILD_EXTRA_ARGS=${DOCKER_BUILD_EXTRA_ARGS:-""} +if [ "$(uname -m)" == "aarch64" ]; then + DOCKER_BUILD_EXTRA_ARGS="--build-arg TARGETPLATFORM=linux/arm64 --build-arg CMAKE_ARCH=aarch64 $DOCKER_BUILD_EXTRA_ARGS" +else + DOCKER_BUILD_EXTRA_ARGS="--build-arg TARGETPLATFORM=linux/amd64 --build-arg CMAKE_ARCH=x86_64 $DOCKER_BUILD_EXTRA_ARGS" +fi DOCKER_RUN_EXTRA_ARGS=${DOCKER_RUN_EXTRA_ARGS:-""} LOCAL_CCACHE_DIR=${LOCAL_CCACHE_DIR:-"$HOME/.ccache"} LOCAL_MAVEN_REPO=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} -SPARK_IMAGE_NAME="spark-rapids-jni-build:${CUDA_VERSION}-devel-centos7" +SPARK_IMAGE_NAME="spark-rapids-jni-build:${CUDA_VERSION}-devel-rockylinux8" # ensure directories exist mkdir -p "$LOCAL_CCACHE_DIR" "$LOCAL_MAVEN_REPO" @@ -74,4 +79,4 @@ $DOCKER_CMD run $DOCKER_GPU_OPTS $DOCKER_RUN_EXTRA_ARGS -u $(id -u):$(id -g) --r -e VERBOSE \ $DOCKER_OPTS \ $SPARK_IMAGE_NAME \ - scl enable devtoolset-11 "$RUN_CMD" + scl enable gcc-toolset-11 "$RUN_CMD" diff --git a/ci/Dockerfile b/ci/Dockerfile old mode 100755 new mode 100644 index e3b703a11e..f36ede2233 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -17,31 +17,29 @@ ### # Build the image for spark-rapids-jni development environment. 
# -# Arguments: CUDA_VERSION=11.8.0 +# Arguments: CUDA_VERSION=[11.X.Y, 12.X.Y], OS_RELEASE=[8, 9], TARGETPLATFORM=[linux/amd64, linux/arm64] # ### ARG CUDA_VERSION=11.8.0 -FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 -ARG DEVTOOLSET_VERSION=11 +ARG OS_RELEASE=8 +ARG TARGETPLATFORM=linux/amd64 +# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 on either amd64 or arm64 host +# check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH) +FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE +ARG TOOLSET_VERSION=11 ### Install basic requirements -RUN yum install -y centos-release-scl -RUN yum install -y devtoolset-${DEVTOOLSET_VERSION} rh-python38 epel-release -RUN yum install -y zlib-devel maven tar wget patch ninja-build -# require git 2.18+ to keep consistent submodule operations -RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo.x86_64.rpm && yum install -y git -# pin urllib3<2.0 for https://github.com/psf/requests/issues/6432 -RUN scl enable rh-python38 "pip install requests 'urllib3<2.0'" - +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} python39 zlib-devel maven tar wget patch ninja-build git ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins -RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids +RUN mkdir -m 777 /usr/local/rapids /rapids # 3.22.3: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache ARG CMAKE_VERSION=3.26.4 - -RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz -ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH +# default x86_64 from x86 build, aarch64 cmake for arm build +ARG CMAKE_ARCH=x86_64 +RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz +ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH # ccache for interactive builds ARG CCACHE_VERSION=4.6 @@ -51,7 +49,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v cd ccache-${CCACHE_VERSION} && \ mkdir build && \ cd build && \ - scl enable devtoolset-${DEVTOOLSET_VERSION} \ + scl enable gcc-toolset-${TOOLSET_VERSION} \ "cmake .. \ -DCMAKE_BUILD_TYPE=Release \ -DZSTD_FROM_INTERNET=ON \ diff --git a/ci/Dockerfile.multi b/ci/Dockerfile.multi deleted file mode 100644 index d3b198530b..0000000000 --- a/ci/Dockerfile.multi +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -### -# JNI CI image for multi-platform build -# -# Arguments: CUDA_VERSION=11.8.0 -# -### -ARG CUDA_VERSION=11.8.0 -ARG OS_RELEASE=8 -# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 on either amd64 or arm64 host -# check available offcial arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH) -FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE -ARG TOOLSET_VERSION=11 -### Install basic requirements -RUN dnf install -y scl-utils -RUN dnf install -y gcc-toolset-${TOOLSET_VERSION} python39 -RUN dnf --enablerepo=powertools install -y zlib-devel maven tar wget patch ninja-build -# require git 2.18+ to keep consistent submodule operations -RUN dnf install -y git -## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins -RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids - -# 3.22.3+: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache -ARG CMAKE_VERSION=3.26.4 -# default as arm64 release -ARG CMAKE_ARCH=aarch64 -# aarch64 cmake for arm build -RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ - tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ - rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz -ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH - -# ccache for interactive builds -ARG CCACHE_VERSION=4.6 -RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \ - tar zxf ccache-${CCACHE_VERSION}.tar.gz && \ - rm ccache-${CCACHE_VERSION}.tar.gz && \ - cd ccache-${CCACHE_VERSION} && \ - mkdir build && \ - cd build && \ - scl enable gcc-toolset-${TOOLSET_VERSION} \ - "cmake .. \ - -DCMAKE_BUILD_TYPE=Release \ - -DZSTD_FROM_INTERNET=ON \ - -DREDIS_STORAGE_BACKEND=OFF && \ - cmake --build . --parallel 4 --target install" && \ - cd ../.. 
&& \ - rm -rf ccache-${CCACHE_VERSION} - -## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ - tar -xzf boost_1_79_0.tar.gz && \ - rm boost_1_79_0.tar.gz && \ - cd boost_1_79_0 && \ - ./bootstrap.sh --prefix=/usr/local && \ - ./b2 install --prefix=/usr/local --with-filesystem --with-system && \ - cd /usr/local && \ - rm -rf boost_1_79_0 - -# disable cuda container constraints to allow running w/ elder drivers on data-center GPUs -ENV NVIDIA_DISABLE_REQUIRE="true" diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index a59db1af9a..0a00eb6f1b 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -30,7 +30,7 @@ import ipp.blossom.* def githubHelper // blossom github helper def TEMP_IMAGE_BUILD = true -def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:centos7-cuda11.8.0-blossom" +def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:rockylinux8-cuda11.8.0-blossom" def cpuImage = pod.getCPUYAML(IMAGE_PREMERGE) def PREMERGE_DOCKERFILE = 'ci/Dockerfile' def PREMERGE_TAG @@ -150,7 +150,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" } if (TEMP_IMAGE_BUILD) { - PREMERGE_TAG = "centos7-cuda11.8.0-blossom-dev-${BUILD_TAG}" + PREMERGE_TAG = "rockylinux8-cuda11.8.0-blossom-dev-${BUILD_TAG}" IMAGE_PREMERGE = "${ARTIFACTORY_NAME}/sw-spark-docker-local/plugin-jni:${PREMERGE_TAG}" docker.build(IMAGE_PREMERGE, "--network=host -f ${PREMERGE_DOCKERFILE} -t $IMAGE_PREMERGE .") uploadDocker(IMAGE_PREMERGE) @@ -212,7 +212,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" container('gpu') { timeout(time: 3, unit: 'HOURS') { // step only timeout for test run common.resolveIncompatibleDriverIssue(this) - sh 'scl enable devtoolset-11 "ci/premerge-build.sh"' + sh 'scl enable gcc-toolset-11 "ci/premerge-build.sh"' sh 'bash ci/fuzz-test.sh' } } diff --git a/ci/submodule-sync.sh b/ci/submodule-sync.sh index 18119dc45d..f591f73a23 100755 --- a/ci/submodule-sync.sh +++ b/ci/submodule-sync.sh @@ -18,7 +18,7 @@ # NOTE: # this script is for jenkins only, and should not be used for local development # run with ci/Dockerfile in jenkins: -# scl enable devtoolset-11 rh-python38 "ci/submodule-sync.sh" +# scl enable gcc-toolset-11 rh-python38 "ci/submodule-sync.sh" set -ex diff --git a/pom.xml b/pom.xml index 745f8127d1..24daa4635e 100644 --- a/pom.xml +++ b/pom.xml @@ -338,6 +338,11 @@ arm64 + + + aarch64 + + ${cuda.version}-arm64 From d80cb2462903ec196c7dc781f9b01f1e3a3111ac Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 14 May 2024 21:27:20 -0500 Subject: [PATCH 092/124] Provide default python in Dockerfile (#2040) Signed-off-by: Jason Lowe --- ci/Dockerfile | 3 ++- ci/submodule-sync.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index f36ede2233..e8df715e95 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -28,7 +28,8 @@ ARG TARGETPLATFORM=linux/amd64 FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE ARG TOOLSET_VERSION=11 ### Install basic requirements -RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} python39 zlib-devel maven tar wget patch ninja-build git +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} python39 zlib-devel maven tar wget patch ninja-build git && \ + alternatives --set 
python /usr/bin/python3 ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir -m 777 /usr/local/rapids /rapids diff --git a/ci/submodule-sync.sh b/ci/submodule-sync.sh index f591f73a23..1888696ba5 100755 --- a/ci/submodule-sync.sh +++ b/ci/submodule-sync.sh @@ -18,7 +18,7 @@ # NOTE: # this script is for jenkins only, and should not be used for local development # run with ci/Dockerfile in jenkins: -# scl enable gcc-toolset-11 rh-python38 "ci/submodule-sync.sh" +# scl enable gcc-toolset-11 ci/submodule-sync.sh set -ex From 834e8a1b49c6a135ec5842fd28ed3c5a323e9b12 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 15 May 2024 14:53:02 -0500 Subject: [PATCH 093/124] Add Python requests module to Dockerfile (#2042) Signed-off-by: Jason Lowe --- ci/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index e8df715e95..b3f4239dc6 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -28,8 +28,10 @@ ARG TARGETPLATFORM=linux/amd64 FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE ARG TOOLSET_VERSION=11 ### Install basic requirements +# pin urllib3<2.0 for https://github.com/psf/requests/issues/6432 RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} python39 zlib-devel maven tar wget patch ninja-build git && \ - alternatives --set python /usr/bin/python3 + alternatives --set python /usr/bin/python3 && \ + python -m pip install requests 'urllib3<2.0' ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir -m 777 /usr/local/rapids /rapids From 80573d6766f06ddc79a48cc906d91fb800b55e18 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 16 May 2024 07:37:41 +0800 Subject: [PATCH 094/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#2038) * Update submodule cudf to bd93e203b0bdfaa2b736a385ea7595c904bd30d8 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 65a51ffa364b8a54fadab041cb5c563873303643 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to b810113d6255dbe123aafbc80018bf6165a0842f Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to b5a9c4b5114390fb45e27d9aab5eaa995de3fa37 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to ce1933fc07d5f8d1da3ad36217ea0b39d7a926fa Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 425a5dac64b7c74c061b588dc8725c5390517cf9 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to b4bdea295331862949afe408feb47522a4ff8f2a Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c42c4189d3273205a75d7b3c3ab33446eefb7631 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 915c6bea2069f75b5637ff39befd877ba37a1922 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 13f028f01ad043b0d24f3e4a28f4267c02806390 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 0f6ce63431cff85a278eafc555e74ee0e101f6da Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 4a6d13f232aba099b47ea3c95fa429209fcf863b Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to fa9d028073f73218fe0dd4e49671c39fa11fc42c Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 08115239ad1f5155108430e0d0ac2f747f4bbd59 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to b5f6aa59cd9d2ebb238f9f249b305d1883169332 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to b5f6aa59cd9d2ebb238f9f249b305d1883169332 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 13f028f01a..b5f6aa59cd 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 13f028f01ad043b0d24f3e4a28f4267c02806390 +Subproject commit b5f6aa59cd9d2ebb238f9f249b305d1883169332 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 34d122be22..df1ef9ff01 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -6f917c953c05bc95d0c0cf755d38d3cac916d9ad +78da6e709bb893fe31587f419a2c80fa72cd66f0 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index da6e010236..0cdce4483a 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -5,9 +5,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "740889f413af9b1ae1d81eb1e5a4a9fb4ce9cf97", + "git_tag" : "6a28035c2b49b432dc63f5ee7524d76b4ed2d762", "git_url" : "https://github.com/apache/arrow.git", - "version" : "14.0.2" + "version" : "16.0.0" }, "CCCL" : { @@ -67,7 +67,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "6ed7bccb42d82eb1ac60a98dfd460a7785881949", + "git_tag" : "4355c292cb4ddd54efe2d5220d004816c29f3dd1", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, @@ -95,6 +95,14 @@ "git_url" : "https://github.com/dmlc/dlpack.git", "version" : "0.8" }, + "flatbuffers" : + { + "always_download" : true, + "git_shallow" : false, + "git_tag" : "595bf0007ab1929570c7671f091313c8fc20644e", + "git_url" : "https://github.com/google/flatbuffers.git", + "version" : "24.3.25" + }, "fmt" : { "always_download" : true, @@ -144,7 +152,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "f11c8ca44ec4ab282157e7a1d7be5a3abafe8c57", + "git_tag" : "32cd537a55b81726940bb698013a0d684e338c86", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From d2b881854e9f7363a7d232198a3c668f0b4e7d26 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 16 May 2024 16:33:35 +0800 Subject: [PATCH 095/124] [submodule-sync] bot-submodule-sync-branch-24.06 to branch-24.06 [skip ci] [bot] (#2043) * Update submodule cudf to 516d0f9033e73d10a473e2ca3fcc891e980450bc Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 4e87069bd43ee969797265eaed00f82eda255dd4 Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b5f6aa59cd..4e87069bd4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b5f6aa59cd9d2ebb238f9f249b305d1883169332 +Subproject commit 4e87069bd43ee969797265eaed00f82eda255dd4 From d874f5e6aa0ef85a7a2edea73b49a0b55de8309f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 16 May 2024 20:32:09 +0800 Subject: [PATCH 096/124] Update submodule cudf to bdd48f1ce16982f31e01108280d91b5d2a1f8847 (#2044) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4e87069bd4..bdd48f1ce1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4e87069bd43ee969797265eaed00f82eda255dd4 +Subproject commit bdd48f1ce16982f31e01108280d91b5d2a1f8847 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 0cdce4483a..7b62dd9666 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -152,7 +152,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "32cd537a55b81726940bb698013a0d684e338c86", + "git_tag" : "8ee39ad591fb4f76be625f9c0cd2963172e62e32", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 824963d16f958a6d4579524c9266888b89125daf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 16 May 2024 22:34:43 +0800 Subject: [PATCH 097/124] Update submodule cudf to 1e92f3f962cb27175e804889fd6d8c9be18b98c9 (#2045) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index bdd48f1ce1..1e92f3f962 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit bdd48f1ce16982f31e01108280d91b5d2a1f8847 +Subproject commit 1e92f3f962cb27175e804889fd6d8c9be18b98c9 From 27c7ca8ae70aa1af2dd2134ec660a4e6ddde5ea5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 17 May 2024 04:29:54 +0800 Subject: [PATCH 098/124] Update submodule cudf to 49af2615ca81e65c991954ed905c4a6151fc88fd (#2046) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1e92f3f962..49af2615ca 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1e92f3f962cb27175e804889fd6d8c9be18b98c9 +Subproject commit 49af2615ca81e65c991954ed905c4a6151fc88fd diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 7b62dd9666..2c91bf3bd2 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -67,7 +67,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "4355c292cb4ddd54efe2d5220d004816c29f3dd1", + "git_tag" : "261445be7993df57f624a3f4ee9fd15e7d26bb5e", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" 
}, @@ -152,7 +152,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "8ee39ad591fb4f76be625f9c0cd2963172e62e32", + "git_tag" : "cab7e06cc40eaf9dc98f5563a8cc399fbc876388", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 39b1ba76a600db48e0ff6e21f89798c61965a4cf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 17 May 2024 10:34:03 +0800 Subject: [PATCH 099/124] Update submodule cudf to 6d5f9653debe57c7eb52f42fb980d38451a9a460 (#2048) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 49af2615ca..6d5f9653de 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 49af2615ca81e65c991954ed905c4a6151fc88fd +Subproject commit 6d5f9653debe57c7eb52f42fb980d38451a9a460 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 2c91bf3bd2..defc0e75e6 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -152,7 +152,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "cab7e06cc40eaf9dc98f5563a8cc399fbc876388", + "git_tag" : "91d529f4a2dd7bffed779350d928e89ab23ed85a", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 71fe77fa3c1596c8a8dbf60a9088ff91debfce34 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 18 May 2024 04:34:28 +0800 Subject: [PATCH 100/124] Update submodule cudf to d10b8e4c9b437377cb6d231873e8f0fe9f8dc817 (#2049) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6d5f9653de..d10b8e4c9b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6d5f9653debe57c7eb52f42fb980d38451a9a460 +Subproject commit d10b8e4c9b437377cb6d231873e8f0fe9f8dc817 From 3bf63078376cf47d9ee12ba05ae317fe7469e55d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 18 May 2024 16:32:21 +0800 Subject: [PATCH 101/124] Update submodule cudf to e6e67615c248d4992d0bf2ce5a47b09534cd4c82 (#2050) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d10b8e4c9b..e6e67615c2 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d10b8e4c9b437377cb6d231873e8f0fe9f8dc817 +Subproject commit e6e67615c248d4992d0bf2ce5a47b09534cd4c82 From a89aaf16b41a01275f7c57414314b335cbf0270f Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Mon, 20 May 2024 15:42:23 -0400 Subject: [PATCH 102/124] Add Paul to build permissions list (#2052) Signed-off-by: Paul Mattione --- .github/workflows/blossom-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 36ba8944a6..5053ae4dae 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -64,6 +64,7 @@ jobs: parthosa,\ liurenjie1024,\ binmahone,\ + pmattione-nvidia,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' 
steps: - name: Check if comment is issued by authorized person From c52f9e1b3e26917f31fbb27cecb070901195b3be Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 May 2024 04:31:05 +0800 Subject: [PATCH 103/124] Update submodule cudf to 16e8625fde773d00732134ea985a42156bd8619b (#2053) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e6e67615c2..16e8625fde 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e6e67615c248d4992d0bf2ce5a47b09534cd4c82 +Subproject commit 16e8625fde773d00732134ea985a42156bd8619b From 1e05106b5fdbae68e80d03c52630f8e18f2d10c5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 May 2024 10:29:40 +0800 Subject: [PATCH 104/124] Update submodule cudf to 1dd19102d0df7b8523e29a921c62654463278b43 (#2054) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 16e8625fde..1dd19102d0 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 16e8625fde773d00732134ea985a42156bd8619b +Subproject commit 1dd19102d0df7b8523e29a921c62654463278b43 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index defc0e75e6..2324f94586 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -152,7 +152,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "91d529f4a2dd7bffed779350d928e89ab23ed85a", + "git_tag" : "46e153c18d17b07526d6ff2e04859fcbbd706879", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From 2913a9b84f4d5df2db1d7553f3bcd4f460f7de9c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 May 2024 17:13:06 +0800 Subject: [PATCH 105/124] Update submodule cudf to 8b7245548c63d1ce84031a0bd187cbfb8e072f8c (#2055) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1dd19102d0..8b7245548c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1dd19102d0df7b8523e29a921c62654463278b43 +Subproject commit 8b7245548c63d1ce84031a0bd187cbfb8e072f8c From 04eaa45dc4da5374d00429b7dbb5671c06df265e Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 21 May 2024 07:24:29 -0500 Subject: [PATCH 106/124] Make the max jsonpath depth 16 to match other values in the code (#2047) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/benchmarks/get_json_object.cu | 2 -- src/main/cpp/src/get_json_object.cu | 6 +----- src/main/cpp/src/get_json_object.hpp | 2 +- src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java | 2 -- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu index 442bcb6004..51f9299dba 100644 --- a/src/main/cpp/benchmarks/get_json_object.cu +++ b/src/main/cpp/benchmarks/get_json_object.cu @@ -142,10 +142,8 @@ void BM_get_json_object(nvbench::state& state) using path_instruction_type = spark_rapids_jni::path_instruction_type; 
std::vector> instructions; - instructions.emplace_back(path_instruction_type::KEY, "", -1); instructions.emplace_back(path_instruction_type::NAMED, "struct", -1); for (int i = 0; i < max_depth - list_depth; ++i) { - instructions.emplace_back(path_instruction_type::KEY, "", -1); instructions.emplace_back(path_instruction_type::NAMED, "0", -1); } diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index b743d14cdc..e87c47ffe5 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -387,7 +387,7 @@ __device__ bool evaluate_path(json_parser& p, // There is a same constant in JSONUtil.java, keep them consistent when changing // Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, // or GPU reports cudaErrorIllegalAddress - constexpr int max_path_depth = 8; + constexpr int max_path_depth = 16; // define stack; plus 1 indicates root context task needs an extra memory context stack[max_path_depth + 1]; @@ -799,10 +799,6 @@ rmm::device_uvector construct_path_commands( for (auto const& inst : instructions) { auto const& [type, name, index] = inst; switch (type) { - case path_instruction_type::SUBSCRIPT: - case path_instruction_type::KEY: - // skip SUBSCRIPT and KEY to save stack size in `evaluate_path` - break; case path_instruction_type::WILDCARD: path_commands.emplace_back(path_instruction{path_instruction_type::WILDCARD}); break; diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index 2fcdb20697..b48ef17a66 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -33,7 +33,7 @@ namespace spark_rapids_jni { /** * path instruction type */ -enum class path_instruction_type { SUBSCRIPT, WILDCARD, KEY, INDEX, NAMED }; +enum class path_instruction_type { WILDCARD, INDEX, NAMED }; /** * Extracts json object from a json string based on json path specified, and diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 4ff9c91a3f..bee6f1df74 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -27,9 +27,7 @@ public class JSONUtils { public static final int MAX_PATH_DEPTH = 16; public enum PathInstructionType { - SUBSCRIPT, WILDCARD, - KEY, INDEX, NAMED } From fa9ec68ec0faf419f62d381dadd6a28fed5e14bf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 May 2024 22:57:30 +0800 Subject: [PATCH 107/124] Update submodule cudf to b4daa16f1d67d505abbdd816d4123d4b3a418369 (#2056) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8b7245548c..b4daa16f1d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8b7245548c63d1ce84031a0bd187cbfb8e072f8c +Subproject commit b4daa16f1d67d505abbdd816d4123d4b3a418369 From 956fb535f5311f00505af17bd4ec6762d8537e10 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 22 May 2024 01:01:52 +0800 Subject: [PATCH 108/124] Add regex rewrite kernel to find `literal[a,b]{x,y}` in a string (#2041) * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * support range filter Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * wip 
Signed-off-by: Haoyang Li * change some names Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * format Signed-off-by: Haoyang Li * fix build Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/CMakeLists.txt | 2 + src/main/cpp/src/RegexRewriteUtilsJni.cpp | 41 ++++++ src/main/cpp/src/regex_rewrite_utils.cu | 133 ++++++++++++++++++ src/main/cpp/src/regex_rewrite_utils.hpp | 45 ++++++ .../spark/rapids/jni/RegexRewriteUtils.java | 44 ++++++ .../rapids/jni/RegexRewriteUtilsTest.java | 51 +++++++ 6 files changed, 316 insertions(+) create mode 100644 src/main/cpp/src/RegexRewriteUtilsJni.cpp create mode 100644 src/main/cpp/src/regex_rewrite_utils.cu create mode 100644 src/main/cpp/src/regex_rewrite_utils.hpp create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/RegexRewriteUtilsTest.java diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index d30abc0b8f..169067bfdd 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -162,6 +162,7 @@ add_library( src/MapUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp + src/RegexRewriteUtilsJni.cpp src/RowConversionJni.cpp src/SparkResourceAdaptorJni.cpp src/ZOrderJni.cpp @@ -178,6 +179,7 @@ add_library( src/map_utils.cu src/murmur_hash.cu src/parse_uri.cu + src/regex_rewrite_utils.cu src/row_conversion.cu src/timezones.cu src/utilities.cu diff --git a/src/main/cpp/src/RegexRewriteUtilsJni.cpp b/src/main/cpp/src/RegexRewriteUtilsJni.cpp new file mode 100644 index 0000000000..28f346582c --- /dev/null +++ b/src/main/cpp/src/RegexRewriteUtilsJni.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf_jni_apis.hpp" +#include "dtype_utils.hpp" +#include "jni_utils.hpp" +#include "regex_rewrite_utils.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_RegexRewriteUtils_literalRangePattern( + JNIEnv* env, jclass, jlong input, jlong target, jint d, jint start, jint end) +{ + JNI_NULL_CHECK(env, input, "input column is null", 0); + JNI_NULL_CHECK(env, target, "target is null", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::column_view* cv = reinterpret_cast(input); + cudf::strings_column_view scv(*cv); + cudf::string_scalar* ss_scalar = reinterpret_cast(target); + return cudf::jni::release_as_jlong( + spark_rapids_jni::literal_range_pattern(scv, *ss_scalar, d, start, end)); + } + CATCH_STD(env, 0); +} +} diff --git a/src/main/cpp/src/regex_rewrite_utils.cu b/src/main/cpp/src/regex_rewrite_utils.cu new file mode 100644 index 0000000000..2735b134f9 --- /dev/null +++ b/src/main/cpp/src/regex_rewrite_utils.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +namespace { + +struct literal_range_pattern_fn { + __device__ bool operator()( + cudf::string_view d_string, cudf::string_view d_prefix, int range_len, int start, int end) const + { + int const n = d_string.length(), m = d_prefix.length(); + for (int i = 0; i <= n - m - range_len; i++) { + bool match = true; + for (int j = 0; j < m; j++) { + if (d_string[i + j] != d_prefix[j]) { + match = false; + break; + } + } + if (match) { + for (int j = 0; j < range_len; j++) { + auto code_point = cudf::strings::detail::utf8_to_codepoint(d_string[i + m + j]); + if (code_point < start || code_point > end) { + match = false; + break; + } + } + if (match) { return true; } + } + } + return false; + } +}; + +std::unique_ptr find_literal_range_pattern(cudf::strings_column_view const& strings, + cudf::string_scalar const& prefix, + int const range_len, + int const start, + int const end, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const strings_count = strings.size(); + if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::BOOL8); } + + CUDF_EXPECTS(prefix.is_valid(stream), "Parameter prefix must be valid."); + + auto const d_prefix = cudf::string_view(prefix.data(), prefix.size()); + auto const strings_column = cudf::column_device_view::create(strings.parent(), stream); + auto const d_strings = *strings_column; + + auto results = make_numeric_column(cudf::data_type{cudf::type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto const d_results = results->mutable_view().data(); + // set the bool values by evaluating the passed function + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), 
+ thrust::make_counting_iterator(strings_count), + d_results, + [d_strings, d_prefix, range_len, start, end, check_fn = literal_range_pattern_fn{}] __device__( + cudf::size_type idx) { + if (!d_strings.is_null(idx)) { + return check_fn(d_strings.element(idx), d_prefix, range_len, start, end); + } + return false; + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace + +/** + * @brief Check if input string contains regex pattern `literal[start-end]{len,}`, which means + * a literal string followed by a range of characters in the range of start to end, with at least + * len characters. + * + * @param strings Column of strings to check for literal. + * @param literal UTF-8 encoded string to check in strings column. + * @param len Minimum number of characters to check after the literal. + * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr literal_range_pattern(cudf::strings_column_view const& input, + cudf::string_scalar const& prefix, + int const range_len, + int const start, + int const end, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return find_literal_range_pattern(input, prefix, range_len, start, end, stream, mr); +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/regex_rewrite_utils.hpp b/src/main/cpp/src/regex_rewrite_utils.hpp new file mode 100644 index 0000000000..e5e500b180 --- /dev/null +++ b/src/main/cpp/src/regex_rewrite_utils.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace spark_rapids_jni { +/** + * @brief Check if input string contains regex pattern `literal[start-end]{len,}`, which means + * a literal string followed by a range of characters in the range of start to end, with at least + * len characters. + * + * @param strings Column of strings to check for literal. + * @param literal UTF-8 encoded string to check in strings column. + * @param len Minimum number of characters to check after the literal. + * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr literal_range_pattern( + cudf::strings_column_view const& input, + cudf::string_scalar const& literal, + int const len, + int const start, + int const end, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +} // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java new file mode 100644 index 0000000000..9277c3e0f9 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.*; + +public class RegexRewriteUtils { + static { + NativeDepsLoader.loadNativeDeps(); + } + +/** + * @brief Check if input string contains regex pattern `literal[start-end]{len,}`, which means + * a literal string followed by a range of characters in the range of start to end, with at least + * len characters. + * + * @param strings Column of strings to check for literal. + * @param literal UTF-8 encoded string to check in strings column. + * @param len Minimum number of characters to check after the literal. + * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @return ColumnVector of booleans where true indicates the string contains the pattern. + */ + public static ColumnVector literalRangePattern(ColumnVector input, Scalar literal, int len, int start, int end) { + assert(input.getType().equals(DType.STRING)) : "column must be a String"; + return new ColumnVector(literalRangePattern(input.getNativeView(), CudfAccessor.getScalarHandle(literal), len, start, end)); + } + + private static native long literalRangePattern(long input, long literal, int len, int start, int end); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RegexRewriteUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RegexRewriteUtilsTest.java new file mode 100644 index 0000000000..243967055a --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/RegexRewriteUtilsTest.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.Scalar; +import org.junit.jupiter.api.Test; + +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; + +public class RegexRewriteUtilsTest { + + @Test + void testLiteralRangePattern() { + int d = 3; + try (ColumnVector inputCv = ColumnVector.fromStrings( + "abc123", "aabc123", "aabc12", "abc1232", "aabc1232"); + Scalar pattern = Scalar.fromString("abc"); + ColumnVector expected = ColumnVector.fromBooleans(true, true, false, true, true); + ColumnVector actual = RegexRewriteUtils.literalRangePattern(inputCv, pattern, d, 48, 57)) { + assertColumnsAreEqual(expected, actual); + } + } + + @Test + void testLiteralRangePatternChinese() { + int d = 2; + try (ColumnVector inputCv = ColumnVector.fromStrings( + "数据砖块", "火花-急流英伟达", "英伟达Nvidia", "火花-急流"); + Scalar pattern = Scalar.fromString("英"); + ColumnVector expected = ColumnVector.fromBooleans(false, true, true, false); + ColumnVector actual = RegexRewriteUtils.literalRangePattern(inputCv, pattern, d, 19968, 40869)) { + assertColumnsAreEqual(expected, actual); + } + } + +} From ddb8bb7cb70745bd42d24dd5d4bf6abc301bd1da Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 21 May 2024 12:02:10 -0500 Subject: [PATCH 109/124] Fail get_json_object calls that are too long (#2057) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/src/get_json_object.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index e87c47ffe5..98e4f2c408 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -43,6 +43,12 @@ namespace spark_rapids_jni { namespace detail { +// path max depth limitation +// There is a same constant in JSONUtil.java, keep them consistent when changing +// Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, +// or GPU reports cudaErrorIllegalAddress +constexpr int max_path_depth = 16; + /** * write JSON style */ @@ -383,12 +389,6 @@ __device__ bool evaluate_path(json_parser& p, json_generator child_g; }; - // path max depth limitation - // There is a same constant in JSONUtil.java, keep them consistent when changing - // Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, - // or GPU reports cudaErrorIllegalAddress - constexpr int max_path_depth = 16; - // define stack; plus 1 indicates root context task needs an extra memory context stack[max_path_depth + 1]; int stack_pos = 0; @@ -954,6 +954,8 @@ std::unique_ptr get_json_object( { if (input.is_empty()) return cudf::make_empty_column(cudf::type_id::STRING); + if (instructions.size() > max_path_depth) { CUDF_FAIL("JSONPath query exceeds maximum depth"); } + // get a string buffer to store all the names and convert to device std::string all_names; for (auto const& inst : instructions) { From 56596c438c4de1a7f65453b8cfbbcd77338cc7d8 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 May 2024 04:29:29 +0800 Subject: [PATCH 110/124] Update submodule cudf to d78d565b15bd9a2e3200176af4656ee2098b209b (#2058) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b4daa16f1d..d78d565b15 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 
b4daa16f1d67d505abbdd816d4123d4b3a418369 +Subproject commit d78d565b15bd9a2e3200176af4656ee2098b209b From fb4d5f644bfc823ca2218f19d807404c032a27f5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 May 2024 11:14:01 +0800 Subject: [PATCH 111/124] Update submodule cudf to 9a0612b3add9c76ea8cb45cc230b75b2474d91f7 (#2059) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d78d565b15..9a0612b3ad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d78d565b15bd9a2e3200176af4656ee2098b209b +Subproject commit 9a0612b3add9c76ea8cb45cc230b75b2474d91f7 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 2324f94586..a0384f40f2 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -5,9 +5,9 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "6a28035c2b49b432dc63f5ee7524d76b4ed2d762", + "git_tag" : "7dd1d34074af176d9e861a360e135ae57b21cf96", "git_url" : "https://github.com/apache/arrow.git", - "version" : "16.0.0" + "version" : "16.1.0" }, "CCCL" : { From cac7f14cb356a09bdd28e317d68c53a0500b97ba Mon Sep 17 00:00:00 2001 From: Feng Jiang <106386742+Feng-Jiang28@users.noreply.github.com> Date: Wed, 22 May 2024 16:17:16 +0800 Subject: [PATCH 112/124] add my id (#2061) Signed-off-by: fejiang --- .github/workflows/blossom-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 5053ae4dae..33ccf50ea8 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -65,6 +65,7 @@ jobs: liurenjie1024,\ binmahone,\ pmattione-nvidia,\ + Feng-Jiang28,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person From 79253a9d5a99932e8fdd5ae4f72f903d214626e6 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 22 May 2024 08:59:34 -0400 Subject: [PATCH 113/124] Switch rmm device_memory_resource to device_async_resource_ref (#2011) * Switch rmm device_memory_resource to device_async_resource_ref Signed-off-by: Paul Mattione * Revert resource adaptor changes, as this functionality is not ready yet. 
Signed-off-by: Paul Mattione * Remove unnecessary includes Signed-off-by: Paul Mattione * Fix cudf submodule hash Signed-off-by: Paul Mattione --------- Signed-off-by: Paul Mattione --- src/main/cpp/src/bloom_filter.cu | 8 +++---- src/main/cpp/src/bloom_filter.hpp | 17 ++++++++------- src/main/cpp/src/cast_decimal_to_string.cu | 8 +++---- src/main/cpp/src/cast_float_to_string.cu | 8 +++---- src/main/cpp/src/cast_string.cu | 12 +++++------ src/main/cpp/src/cast_string.hpp | 16 ++++++++------ src/main/cpp/src/cast_string_to_float.cu | 4 +++- src/main/cpp/src/datetime_rebase.cu | 9 ++++---- src/main/cpp/src/format_float.cu | 8 +++---- src/main/cpp/src/get_json_object.cu | 7 +++--- src/main/cpp/src/get_json_object.hpp | 6 ++++-- src/main/cpp/src/hash.cuh | 15 +++++++------ src/main/cpp/src/histogram.cu | 10 ++++----- src/main/cpp/src/histogram.hpp | 11 +++++----- src/main/cpp/src/map_utils.cu | 6 +++--- src/main/cpp/src/map_utils.hpp | 5 +++-- src/main/cpp/src/murmur_hash.cu | 2 +- src/main/cpp/src/parse_uri.cu | 14 ++++++------ src/main/cpp/src/parse_uri.hpp | 25 +++++++++++----------- src/main/cpp/src/row_conversion.cu | 16 +++++++------- src/main/cpp/src/row_conversion.hpp | 17 ++++++++------- src/main/cpp/src/timezones.cu | 8 +++---- src/main/cpp/src/timezones.hpp | 11 +++++----- src/main/cpp/src/utilities.cu | 5 ++--- src/main/cpp/src/utilities.hpp | 7 +++--- src/main/cpp/src/xxhash64.cu | 2 +- src/main/cpp/src/zorder.cu | 4 ++-- src/main/cpp/src/zorder.hpp | 9 ++++---- 28 files changed, 143 insertions(+), 127 deletions(-) diff --git a/src/main/cpp/src/bloom_filter.cu b/src/main/cpp/src/bloom_filter.cu index d5f868c476..5dfdd582ef 100644 --- a/src/main/cpp/src/bloom_filter.cu +++ b/src/main/cpp/src/bloom_filter.cu @@ -225,7 +225,7 @@ std::pair get_bloom_filter_stride(int bloom_filter_longs) std::unique_ptr bloom_filter_create(int num_hashes, int bloom_filter_longs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto [bloom_filter_size, buf_size] = get_bloom_filter_stride(bloom_filter_longs); @@ -276,7 +276,7 @@ void bloom_filter_put(cudf::list_scalar& bloom_filter, std::unique_ptr bloom_filter_merge(cudf::column_view const& bloom_filters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // unpack the bloom filter cudf::lists_column_view lcv(bloom_filters); @@ -339,7 +339,7 @@ std::unique_ptr bloom_filter_merge(cudf::column_view const& b std::unique_ptr bloom_filter_probe(cudf::column_view const& input, cudf::device_span bloom_filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // unpack the bloom filter auto [header, buffer, bloom_filter_bits] = unpack_bloom_filter(bloom_filter, stream); @@ -368,7 +368,7 @@ std::unique_ptr bloom_filter_probe(cudf::column_view const& input, std::unique_ptr bloom_filter_probe(cudf::column_view const& input, cudf::list_scalar& bloom_filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return bloom_filter_probe(input, bloom_filter.view(), stream, mr); } diff --git a/src/main/cpp/src/bloom_filter.hpp b/src/main/cpp/src/bloom_filter.hpp index e54d26f630..9bb83e0b8b 100644 --- a/src/main/cpp/src/bloom_filter.hpp +++ b/src/main/cpp/src/bloom_filter.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -46,8 +47,8 @@ constexpr int bloom_filter_header_size = sizeof(bloom_filter_header); 
std::unique_ptr bloom_filter_create( int num_hashes, int bloom_filter_longs, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Inserts input values into a bloom filter. @@ -77,8 +78,8 @@ void bloom_filter_put(cudf::list_scalar& bloom_filter, std::unique_ptr bloom_filter_probe( cudf::column_view const& input, cudf::device_span bloom_filter, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Probe a bloom filter with an input column of int64_t values. @@ -94,8 +95,8 @@ std::unique_ptr bloom_filter_probe( std::unique_ptr bloom_filter_probe( cudf::column_view const& input, cudf::list_scalar& bloom_filter, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Merge multiple bloom filters into a single output. @@ -112,7 +113,7 @@ std::unique_ptr bloom_filter_probe( */ std::unique_ptr bloom_filter_merge( cudf::column_view const& bloom_filters, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu index 91b155dae4..099ac59a04 100644 --- a/src/main/cpp/src/cast_decimal_to_string.cu +++ b/src/main/cpp/src/cast_decimal_to_string.cu @@ -181,7 +181,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { using DecimalType = device_storage_type_t; // underlying value type @@ -200,7 +200,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL("Values for decimal_to_non_ansi_string function must be a decimal type."); } @@ -210,7 +210,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { std::unique_ptr decimal_to_non_ansi_string(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); return type_dispatcher(input.type(), dispatch_decimal_to_non_ansi_string_fn{}, input, stream, mr); @@ -222,7 +222,7 @@ std::unique_ptr decimal_to_non_ansi_string(column_view const& input, std::unique_ptr decimal_to_non_ansi_string(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::decimal_to_non_ansi_string(input, stream, mr); 
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index b294ca6f1b..5af0d8c5ce 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -77,7 +77,7 @@ struct dispatch_float_to_string_fn { template )> std::unique_ptr operator()(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const strings_count = floats.size(); if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -98,7 +98,7 @@ struct dispatch_float_to_string_fn { template )> std::unique_ptr operator()(cudf::column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Values for float_to_string function must be a float type."); } @@ -109,7 +109,7 @@ struct dispatch_float_to_string_fn { // This will convert all float column types into a strings column. std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); } @@ -119,7 +119,7 @@ std::unique_ptr float_to_string(cudf::column_view const& floats, // external API std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::float_to_string(floats, stream, mr); diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu index d32d153632..bfbbc3777d 100644 --- a/src/main/cpp/src/cast_string.cu +++ b/src/main/cpp/src/cast_string.cu @@ -650,7 +650,7 @@ struct string_to_integer_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (string_col.size() == 0) { return std::make_unique( @@ -695,7 +695,7 @@ struct string_to_integer_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FAIL("Invalid integer column type"); } @@ -722,7 +722,7 @@ struct string_to_decimal_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using Type = device_storage_type_t; @@ -764,7 +764,7 @@ struct string_to_decimal_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FAIL("Invalid decimal column type"); } @@ -789,7 +789,7 @@ std::unique_ptr string_to_integer(data_type dtype, bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher( dtype, detail::string_to_integer_impl{}, string_col, ansi_mode, strip, stream, mr); @@ -814,7 +814,7 @@ std::unique_ptr string_to_decimal(int32_t precision, bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { data_type dtype = [precision, scale]() { if (precision <= cuda::std::numeric_limits::digits10) diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index 43ec36e576..2850fbfae5 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include namespace spark_rapids_jni { @@ -73,7 +75,7 @@ std::unique_ptr string_to_integer( bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert a string column into an decimal column. @@ -95,7 +97,7 @@ std::unique_ptr string_to_decimal( bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert a string column into an float column. @@ -113,22 +115,22 @@ std::unique_ptr string_to_float( cudf::strings_column_view const& string_col, bool ansi_mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr format_float( cudf::column_view const& input, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr float_to_string( cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr decimal_to_non_ansi_string( cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index 99090efbe5..e843d645ce 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -25,6 +25,8 @@ #include #include +#include + #include using namespace cudf; @@ -655,7 +657,7 @@ std::unique_ptr string_to_float(data_type dtype, strings_column_view const& string_col, bool ansi_mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(dtype == data_type{type_id::FLOAT32} || dtype == data_type{type_id::FLOAT64}, "invalid float data type"); diff --git a/src/main/cpp/src/datetime_rebase.cu b/src/main/cpp/src/datetime_rebase.cu index 9e8e791490..976c9b1530 100644 --- a/src/main/cpp/src/datetime_rebase.cu +++ b/src/main/cpp/src/datetime_rebase.cu @@ -25,6 +25,7 @@ // #include +#include // #include @@ -56,7 +57,7 @@ __device__ __inline__ auto days_from_julian(cuda::std::chrono::year_month_day co // This is to match with Apache Spark's `localRebaseGregorianToJulianDays` function. 
std::unique_ptr gregorian_to_julian_days(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_DAYS, "The input column type must be microsecond timestamp.", @@ -127,7 +128,7 @@ __device__ __inline__ cuda::std::chrono::year_month_day julian_from_days(int32_t // `localRebaseJulianToGregorianDays` function. std::unique_ptr julian_to_gregorian_days(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_DAYS, "The input column type must be microsecond timestamp.", @@ -227,7 +228,7 @@ __device__ __inline__ time_components get_time_components(int64_t micros) // fixed to UTC. std::unique_ptr gregorian_to_julian_micros(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_MICROSECONDS, "The input column type must be microsecond timestamp.", @@ -290,7 +291,7 @@ std::unique_ptr gregorian_to_julian_micros(cudf::column_view const // fixed to UTC. std::unique_ptr julian_to_gregorian_micros(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_MICROSECONDS, "The input column type must be microsecond timestamp.", diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index bc3c85bbcc..8d316d6cbf 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -78,7 +78,7 @@ struct dispatch_format_float_fn { std::unique_ptr operator()(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto const strings_count = floats.size(); if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -100,7 +100,7 @@ struct dispatch_format_float_fn { std::unique_ptr operator()(cudf::column_view const&, int const, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL("Values for format_float function must be a float type."); } @@ -112,7 +112,7 @@ struct dispatch_format_float_fn { std::unique_ptr format_float(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); } @@ -123,7 +123,7 @@ std::unique_ptr format_float(cudf::column_view const& floats, std::unique_ptr format_float(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::format_float(floats, digits, stream, mr); diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 98e4f2c408..887b9887de 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -790,7 +791,7 @@ rmm::device_uvector construct_path_commands( std::vector> const& instructions, cudf::string_scalar const& 
all_names_scalar, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int name_pos = 0; @@ -950,7 +951,7 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) return cudf::make_empty_column(cudf::type_id::STRING); @@ -1018,7 +1019,7 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::get_json_object(input, instructions, stream, mr); } diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index b48ef17a66..bb3294b424 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -43,7 +45,7 @@ enum class path_instruction_type { WILDCARD, INDEX, NAMED }; std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hash.cuh b/src/main/cpp/src/hash.cuh index 1c6333523c..8cf489a7e7 100644 --- a/src/main/cpp/src/hash.cuh +++ b/src/main/cpp/src/hash.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -113,9 +114,9 @@ __device__ __inline__ std::pair<__int128_t, cudf::size_type> to_java_bigdecimal( */ std::unique_ptr murmur_hash3_32( cudf::table_view const& input, - uint32_t seed = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + uint32_t seed = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the xxhash64 hash value of each row in the input set of columns. @@ -129,8 +130,8 @@ std::unique_ptr murmur_hash3_32( */ std::unique_ptr xxhash64( cudf::table_view const& input, - int64_t seed = DEFAULT_XXHASH64_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + int64_t seed = DEFAULT_XXHASH64_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/histogram.cu b/src/main/cpp/src/histogram.cu index d39ceaa5be..b78c5ae1e0 100644 --- a/src/main/cpp/src/histogram.cu +++ b/src/main/cpp/src/histogram.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -167,7 +167,7 @@ struct percentile_dispatcher { bool has_null, cudf::size_type num_histograms, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { // Returns all nulls for totally empty input. if (data.size() == 0 || percentages.size() == 0) { @@ -257,7 +257,7 @@ std::unique_ptr wrap_in_list(std::unique_ptr&& input cudf::size_type num_histograms, cudf::size_type num_percentages, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->size() == 0) { return cudf::lists::detail::make_empty_lists_column(input->type(), stream, mr); @@ -284,7 +284,7 @@ std::unique_ptr create_histogram_if_valid(cudf::column_view const& cudf::column_view const& frequencies, bool output_as_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS( !frequencies.has_nulls(), "The input frequencies must not have nulls.", std::invalid_argument); @@ -430,7 +430,7 @@ std::unique_ptr percentile_from_histogram(cudf::column_view const& std::vector const& percentages, bool output_as_list, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { check_input(input, percentages); diff --git a/src/main/cpp/src/histogram.hpp b/src/main/cpp/src/histogram.hpp index 43058d9522..23318bdfac 100644 --- a/src/main/cpp/src/histogram.hpp +++ b/src/main/cpp/src/histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ // #include +#include namespace spark_rapids_jni { @@ -50,8 +51,8 @@ std::unique_ptr create_histogram_if_valid( cudf::column_view const& values, cudf::column_view const& frequencies, bool output_as_lists, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute percentiles from the given histograms and percentage values. 
@@ -70,7 +71,7 @@ std::unique_ptr percentile_from_histogram( cudf::column_view const& input, std::vector const& percentage, bool output_as_lists, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index 529ed1d90c..ebb12eee93 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -538,7 +538,7 @@ std::unique_ptr extract_keys_or_values( rmm::device_uvector const& key_or_value, rmm::device_uvector const& unified_json_buff, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const is_key = cuda::proclaim_return_type( [key_or_value = key_or_value.begin()] __device__(auto const node_id) { @@ -579,7 +579,7 @@ rmm::device_uvector compute_list_offsets( rmm::device_uvector const& parent_node_ids, rmm::device_uvector const& key_or_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Count the number of children nodes for the json object nodes. // These object nodes are given as one row of the input json strings column. @@ -643,7 +643,7 @@ rmm::device_uvector compute_list_offsets( std::unique_ptr from_json(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/map_utils.hpp index c620b6fb95..96ba6f7e9b 100644 --- a/src/main/cpp/src/map_utils.hpp +++ b/src/main/cpp/src/map_utils.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -27,7 +28,7 @@ namespace spark_rapids_jni { std::unique_ptr from_json( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/murmur_hash.cu b/src/main/cpp/src/murmur_hash.cu index d94ca2d5bc..91e8fb97e0 100644 --- a/src/main/cpp/src/murmur_hash.cu +++ b/src/main/cpp/src/murmur_hash.cu @@ -187,7 +187,7 @@ void check_hash_compatibility(cudf::table_view const& input) std::unique_ptr murmur_hash3_32(cudf::table_view const& input, uint32_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output = cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 398c033c3a..0e57366358 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -877,7 +877,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, URI_chunks chunk, std::optional query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type strings_count = input.size(); if (strings_count == 0) { return make_empty_column(type_id::STRING); } @@ -955,7 +955,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, std::unique_ptr 
parse_uri_to_protocol(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, std::nullopt, stream, mr); @@ -963,7 +963,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, std::unique_ptr parse_uri_to_host(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::HOST, std::nullopt, stream, mr); @@ -971,7 +971,7 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, std::unique_ptr parse_uri_to_query(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::QUERY, std::nullopt, stream, mr); @@ -980,7 +980,7 @@ std::unique_ptr parse_uri_to_query(strings_column_view const& input, std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, std::string const& query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -994,7 +994,7 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, cudf::strings_column_view const& query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.size() == query_match.size(), "Query column must be the same size as input!"); @@ -1004,7 +1004,7 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const std::unique_ptr parse_uri_to_path(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::PATH, std::nullopt, stream, mr); diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index 39add300f7..2afc879cba 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -21,6 +21,7 @@ #include #include +#include #include @@ -36,8 +37,8 @@ namespace spark_rapids_jni { */ std::unique_ptr parse_uri_to_protocol( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse host and copy from the input string column to the output string column. @@ -49,8 +50,8 @@ std::unique_ptr parse_uri_to_protocol( */ std::unique_ptr parse_uri_to_host( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse query and copy from the input string column to the output string column. 
@@ -62,8 +63,8 @@ std::unique_ptr parse_uri_to_host( */ std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse query and copy from the input string column to the output string column. @@ -77,8 +78,8 @@ std::unique_ptr parse_uri_to_query( std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, std::string const& query_match, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse query and copy from the input string column to the output string column. @@ -92,8 +93,8 @@ std::unique_ptr parse_uri_to_query( std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, cudf::strings_column_view const& query_match, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse path and copy from the input string column to the output string column. @@ -105,7 +106,7 @@ std::unique_ptr parse_uri_to_query( */ std::unique_ptr parse_uri_to_path( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu index 3d6e767042..4ad3927a41 100644 --- a/src/main/cpp/src/row_conversion.cu +++ b/src/main/cpp/src/row_conversion.cu @@ -1213,7 +1213,7 @@ static std::unique_ptr fixed_width_convert_to_rows( const scalar& zero, const scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -1459,7 +1459,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( @@ -1758,7 +1758,7 @@ std::vector> convert_to_rows( column_info_s const& column_info, std::optional> variable_width_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); @@ -1989,7 +1989,7 @@ std::vector> convert_to_rows( */ std::vector> convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_columns = tbl.num_columns(); auto const num_rows = tbl.num_rows(); @@ -2051,7 +2051,7 @@ std::vector> 
convert_to_rows(table_view const& tbl, } std::vector> convert_to_rows_fixed_width_optimized( - table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + table_view const& tbl, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const num_columns = tbl.num_columns(); @@ -2145,7 +2145,7 @@ void fixup_null_counts(std::vector>& output_columns, std::unique_ptr convert_from_rows(lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // verify that the types are what we expect column_view child = input.child(); @@ -2208,7 +2208,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, size_type num_rows, bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::device_async_resource_ref mr) { auto column = make_fixed_width_column(type, num_rows, @@ -2444,7 +2444,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, std::unique_ptr
convert_from_rows_fixed_width_optimized(lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // verify that the types are what we expect column_view child = input.child(); diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp index a5abd5b1bd..0aa7593516 100644 --- a/src/main/cpp/src/row_conversion.hpp +++ b/src/main/cpp/src/row_conversion.hpp @@ -21,6 +21,7 @@ #include #include +#include #include @@ -29,25 +30,25 @@ namespace spark_rapids_jni { std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows( cudf::table_view const& tbl, // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( cudf::lists_column_view const& input, std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/timezones.cu b/src/main/cpp/src/timezones.cu index 30f19d9df0..12278d181d 100644 --- a/src/main/cpp/src/timezones.cu +++ b/src/main/cpp/src/timezones.cu @@ -95,7 +95,7 @@ auto convert_timestamp_tz(column_view const& input, size_type tz_index, bool to_utc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // get the fixed transitions auto const ft_cdv_ptr = column_device_view::create(transitions.column(0), stream); @@ -127,7 +127,7 @@ std::unique_ptr convert_timestamp(column_view const& input, size_type tz_index, bool to_utc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const type = input.type().id(); @@ -149,7 +149,7 @@ std::unique_ptr convert_timestamp_to_utc(column_view const& input, table_view const& transitions, size_type tz_index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return convert_timestamp(input, transitions, tz_index, true, stream, mr); } @@ -158,7 +158,7 @@ std::unique_ptr convert_utc_timestamp_to_timezone(column_view const& inp table_view const& transitions, size_type tz_index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return 
convert_timestamp(input, transitions, tz_index, false, stream, mr); } diff --git a/src/main/cpp/src/timezones.hpp b/src/main/cpp/src/timezones.hpp index c7ab3c0cc8..00173075b6 100644 --- a/src/main/cpp/src/timezones.hpp +++ b/src/main/cpp/src/timezones.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -42,8 +43,8 @@ std::unique_ptr convert_timestamp_to_utc( cudf::column_view const& input, cudf::table_view const& transitions, cudf::size_type tz_index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert input column timestamps in UTC to specified timezone @@ -63,7 +64,7 @@ std::unique_ptr convert_utc_timestamp_to_timezone( cudf::column_view const& input, cudf::table_view const& transitions, cudf::size_type tz_index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/utilities.cu b/src/main/cpp/src/utilities.cu index 7c202a1bec..0b44a2a994 100644 --- a/src/main/cpp/src/utilities.cu +++ b/src/main/cpp/src/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -32,7 +31,7 @@ namespace spark_rapids_jni { std::unique_ptr bitmask_bitwise_or( std::vector> const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.size() > 0, "Empty input"); auto const mask_size = (*input.begin()).size(); diff --git a/src/main/cpp/src/utilities.hpp b/src/main/cpp/src/utilities.hpp index 261e75befc..ad0eae7dc6 100644 --- a/src/main/cpp/src/utilities.hpp +++ b/src/main/cpp/src/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -35,7 +36,7 @@ namespace spark_rapids_jni { */ std::unique_ptr bitmask_bitwise_or( std::vector> const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu index 78cc4651bd..daed7590c3 100644 --- a/src/main/cpp/src/xxhash64.cu +++ b/src/main/cpp/src/xxhash64.cu @@ -330,7 +330,7 @@ class device_row_hasher { std::unique_ptr xxhash64(cudf::table_view const& input, int64_t _seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { hash_value_type seed = static_cast(_seed); diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu index 405b046528..37089c7736 100644 --- a/src/main/cpp/src/zorder.cu +++ b/src/main/cpp/src/zorder.cu @@ -137,7 +137,7 @@ namespace spark_rapids_jni { std::unique_ptr interleave_bits(cudf::table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto num_columns = tbl.num_columns(); CUDF_EXPECTS(num_columns > 0, "The input table must have at least one column."); @@ -224,7 +224,7 @@ std::unique_ptr interleave_bits(cudf::table_view const& tbl, std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, cudf::table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_rows = tbl.num_rows(); auto const num_columns = tbl.num_columns(); diff --git a/src/main/cpp/src/zorder.hpp b/src/main/cpp/src/zorder.hpp index c3fffc9b48..1e084a09de 100644 --- a/src/main/cpp/src/zorder.hpp +++ b/src/main/cpp/src/zorder.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -27,13 +28,13 @@ namespace spark_rapids_jni { std::unique_ptr interleave_bits( cudf::table_view const& tbl, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr hilbert_index( int32_t const num_bits, cudf::table_view const& tbl, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni From f0d01b42f014f50ba249ee2e8430b0bca045f7c5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 23 May 2024 04:44:55 +0800 Subject: [PATCH 114/124] Update submodule cudf to 45dc595945301f4076e66ec54a6e4de0b539cfb0 (#2065) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9a0612b3ad..45dc595945 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9a0612b3add9c76ea8cb45cc230b75b2474d91f7 +Subproject commit 
45dc595945301f4076e66ec54a6e4de0b539cfb0 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index df1ef9ff01..245472ded1 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -78da6e709bb893fe31587f419a2c80fa72cd66f0 +2f2a3c37af91d11347a8f64bc6e90197fc53eea2 From 6d682161683d4d0c9d72c19eab09713d2268445b Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 23 May 2024 14:54:35 +0800 Subject: [PATCH 115/124] Auto merge PRs to branch-24.08 from branch-24.06 [skip ci] (#2068) Signed-off-by: Tim Liu --- .github/workflows/auto-merge.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index c6de34764b..b1c3c2b32b 100755 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,12 +18,12 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.04 + - branch-24.06 types: [closed] env: - HEAD: branch-24.04 - BASE: branch-24.06 + HEAD: branch-24.06 + BASE: branch-24.08 jobs: auto-merge: From a77db76c1bde9452d346f47322f2a17f373d419c Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 24 May 2024 12:46:14 -0500 Subject: [PATCH 116/124] Fix NVTX3 dependency (#2072) * Fix NVTX3 dependency Signed-off-by: Jason Lowe * Update pinned version of rapids-cmake --------- Signed-off-by: Jason Lowe --- src/main/cpp/CMakeLists.txt | 2 +- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 18 +++++++++--------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 169067bfdd..fd9d671d6f 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -233,7 +233,7 @@ target_link_libraries( -Wl,--whole-archive ${CUDFJNI_LIB} cudf::cudf - nvtx3-cpp + nvtx3::nvtx3-cpp -Wl,--no-whole-archive ${PARQUET_LIB} ${THRIFT_LIB} diff --git a/thirdparty/cudf b/thirdparty/cudf index 45dc595945..8b5ff188e7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 45dc595945301f4076e66ec54a6e4de0b539cfb0 +Subproject commit 8b5ff188e79bb79ca0c2d581e94d3a91654a2d31 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 245472ded1..20bbb44986 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -2f2a3c37af91d11347a8f64bc6e90197fc53eea2 +41dc9623dbb8e5bdd2bccc22815efb9db6a49280 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index a0384f40f2..662cbb8dc9 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -71,14 +71,6 @@ "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, - "NVTX3" : - { - "always_download" : true, - "git_shallow" : false, - "git_tag" : "e170594ac7cf1dac584da473d4ca9301087090c1", - "git_url" : "https://github.com/NVIDIA/NVTX.git", - "version" : "3.1.0" - }, "cuco" : { "always_download" : true, @@ -148,11 +140,19 @@ }, "version" : "3.0.6" }, + "nvtx3" : + { + "always_download" : true, + "git_shallow" : false, + "git_tag" : "e170594ac7cf1dac584da473d4ca9301087090c1", + "git_url" : "https://github.com/NVIDIA/NVTX.git", + "version" : "3.1.0" + }, "rmm" : { "always_download" : true, "git_shallow" : false, - "git_tag" : "46e153c18d17b07526d6ff2e04859fcbbd706879", + "git_tag" : "dc1e17a03ed2dbc9329ccecc27922e414250f45a", "git_url" : 
"https://github.com/rapidsai/rmm.git", "version" : "24.06" }, From a7334da9bc9b822859d6f6871b8c769316a11c46 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 25 May 2024 02:53:38 +0800 Subject: [PATCH 117/124] Update submodule cudf to 8a405674a5ba1554a0ced5d1f39f89fb424a768d (#2075) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8b5ff188e7..8a405674a5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8b5ff188e79bb79ca0c2d581e94d3a91654a2d31 +Subproject commit 8a405674a5ba1554a0ced5d1f39f89fb424a768d From 092fdb8d1a07636448cea5cd06e649f7903e84ed Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 28 May 2024 09:35:05 -0500 Subject: [PATCH 118/124] Profiler class and native code to support self-profiling (#2066) * Profiler class and native code to support self-profiling Signed-off-by: Jason Lowe * Add comments, provide more info for unrecognized types * Refactor enable/disable to reduce duplication * Avoid aborting when unable to determine current time * Ensure atomic updates to Last_flush_time_msec * clang format * Fix copy-n-paste bug * Update to new nvtx3 dependency name --------- Signed-off-by: Jason Lowe --- NOTICE | 22 +- pom.xml | 10 + src/main/cpp/CMakeLists.txt | 28 +- src/main/cpp/cmake/get_flatbuffers.cmake | 33 + src/main/cpp/profiler/CMakeLists.txt | 98 +++ src/main/cpp/profiler/ProfilerJni.cpp | 527 ++++++++++++ src/main/cpp/profiler/profiler_debug.cpp | 194 +++++ src/main/cpp/profiler/profiler_debug.hpp | 30 + src/main/cpp/profiler/profiler_schema.cpp.in | 19 + src/main/cpp/profiler/profiler_serializer.cpp | 559 +++++++++++++ src/main/cpp/profiler/profiler_serializer.hpp | 66 ++ .../spark_rapids_profile_converter.cpp | 754 ++++++++++++++++++ .../cpp/src/spark_rapids_jni_version.cpp.in | 23 + src/main/cpp/src/spark_rapids_jni_version.h | 23 + src/main/fbs/profiler.fbs | 287 +++++++ .../com/nvidia/spark/rapids/jni/Profiler.java | 125 +++ 16 files changed, 2796 insertions(+), 2 deletions(-) create mode 100644 src/main/cpp/cmake/get_flatbuffers.cmake create mode 100644 src/main/cpp/profiler/CMakeLists.txt create mode 100644 src/main/cpp/profiler/ProfilerJni.cpp create mode 100644 src/main/cpp/profiler/profiler_debug.cpp create mode 100644 src/main/cpp/profiler/profiler_debug.hpp create mode 100644 src/main/cpp/profiler/profiler_schema.cpp.in create mode 100644 src/main/cpp/profiler/profiler_serializer.cpp create mode 100644 src/main/cpp/profiler/profiler_serializer.hpp create mode 100644 src/main/cpp/profiler/spark_rapids_profile_converter.cpp create mode 100644 src/main/cpp/src/spark_rapids_jni_version.cpp.in create mode 100644 src/main/cpp/src/spark_rapids_jni_version.h create mode 100644 src/main/fbs/profiler.fbs create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/Profiler.java diff --git a/NOTICE b/NOTICE index 5e01c7e14c..4c06d1da90 100644 --- a/NOTICE +++ b/NOTICE @@ -17,4 +17,24 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file +limitations under the License. 
+ +-------------------------------------------------------------------------------- + +This project includes code from flatbuffers (https://github.com/google/flatbuffers). + +Copyright 2021 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 24daa4635e..c13cdd2030 100644 --- a/pom.xml +++ b/pom.xml @@ -84,6 +84,7 @@ OFF OFF ON + ON false false cuda11 @@ -345,6 +346,8 @@ ${cuda.version}-arm64 + + OFF @@ -457,6 +460,7 @@ + libcufilejni.so + + ${native.build.path}/profiler + + libprofilerjni.so + + ${libcudfjni.build.path} diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index fd9d671d6f..88d48e1587 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -44,6 +44,7 @@ option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) option(BUILD_TESTS "Configure CMake to build tests" OFF) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) option(BUILD_FAULTINJ "Configure CMake to build fault injection" ON) +option(BUILD_PROFILER "Configure CMake to build profiler" ON) message( VERBOSE "SPARK_RAPIDS_JNI: Build with per-thread default stream: @@ -60,6 +61,12 @@ set(SPARK_RAPIDS_JNI_CUDA_DEFINITIONS "") set(SPARK_RAPIDS_JNI_BUILD_TESTS ${BUILD_TESTS}) set(SPARK_RAPIDS_JNI_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) set(SPARK_RAPIDS_JNI_BUILD_FAULTINJ ${BUILD_FAULTINJ}) +if(NOT SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR) + set(SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR ${SPARK_RAPIDS_JNI_BINARY_DIR}/generated/include) +endif() +if(NOT SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR) + set(SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR ${SPARK_RAPIDS_JNI_BINARY_DIR}/generated/src) +endif() # Set RMM logging level set(RMM_LOGGING_LEVEL @@ -94,6 +101,21 @@ include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- +# version header +find_package(Git REQUIRED) +execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --abbrev=40 --always --dirty --long + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE SPARK_RAPIDS_JNI_COMMIT_DETAILS + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file( + src/spark_rapids_jni_version.cpp.in + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + @ONLY +) + # find NVTX include(${CUDF_DIR}/cpp/cmake/thirdparty/get_nvtx.cmake) @@ -256,7 +278,7 @@ add_dependencies(cudfjnistub spark_rapids_jni) if(USE_GDS) include(${CUDF_DIR}/cpp/cmake/Modules/FindcuFile.cmake) find_library(CUFILEJNI_LIB "libcufilejni.a" REQUIRED NO_DEFAULT_PATH - HINTS "${PROJECT_BINARY_DIR}/../libcudfjni" + HINTS "${SPARK_RAPIDS_JNI_BINARY_DIR}/../libcudfjni" ) add_library(cufilejni SHARED src/emptyfile.cpp) set_target_properties( @@ -300,3 +322,7 @@ endif() 
if(SPARK_RAPIDS_JNI_BUILD_FAULTINJ) add_subdirectory(faultinj) endif() + +if(BUILD_PROFILER) + add_subdirectory(profiler) +endif() diff --git a/src/main/cpp/cmake/get_flatbuffers.cmake b/src/main/cpp/cmake/get_flatbuffers.cmake new file mode 100644 index 0000000000..c7e0dfb549 --- /dev/null +++ b/src/main/cpp/cmake/get_flatbuffers.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone flatbuffers +function(find_and_configure_flatbuffers VERSION) + + rapids_cpm_find( + flatbuffers ${VERSION} + GLOBAL_TARGETS flatbuffers + CPM_ARGS + GIT_REPOSITORY https://github.com/google/flatbuffers.git + GIT_TAG v${VERSION} + GIT_SHALLOW TRUE + ) + + rapids_export_find_package_root( + BUILD flatbuffers "${flatbuffers_BINARY_DIR}" EXPORT_SET profilerjni-exports + ) + +endfunction() + +find_and_configure_flatbuffers(24.3.25) diff --git a/src/main/cpp/profiler/CMakeLists.txt b/src/main/cpp/profiler/CMakeLists.txt new file mode 100644 index 0000000000..03a552b3ea --- /dev/null +++ b/src/main/cpp/profiler/CMakeLists.txt @@ -0,0 +1,98 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +include(../cmake/get_flatbuffers.cmake) + +# ################################################################################################## +# * flatbuffer generation--------------------------------------------------------------------------- + +set(SPARK_RAPIDS_JNI_FBS_DIR "${SPARK_RAPIDS_JNI_SOURCE_DIR}/../fbs") +add_custom_command( + OUTPUT ${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h + DEPENDS "${SPARK_RAPIDS_JNI_FBS_DIR}/profiler.fbs" + WORKING_DIRECTORY "${SPARK_RAPIDS_JNI_FBS_DIR}" + VERBATIM + COMMAND ${CMAKE_COMMAND} -E make_directory "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" + COMMAND + $ --cpp -o "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" profiler.fbs + COMMENT "Generating profiler flatbuffer code" +) + +# ################################################################################################## +# * profiler JNI ----------------------------------------------------------------------------------- + +add_library(profilerjni SHARED + ProfilerJni.cpp + profiler_debug.cpp + profiler_serializer.cpp + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h" +) + +set_target_properties( + profilerjni + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET "hidden" + VISIBILITY_INLINES_HIDDEN TRUE +) + +target_include_directories( + profilerjni + PRIVATE "${JNI_INCLUDE_DIRS}" + "${CUDAToolkit_INCLUDE_DIRS}" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src" +) + +find_library(CUPTI_LIBRARY_PATH cupti_static PATHS + "/usr/local/cuda/lib64" + "/usr/local/cuda/extras/CUPTI/lib64" +) + +target_link_libraries(profilerjni + PRIVATE ${CUPTI_LIBRARY_PATH} nvtx3::nvtx3-cpp flatbuffers::flatbuffers +) + +file(READ "${SPARK_RAPIDS_JNI_FBS_DIR}/profiler.fbs" SPARK_RAPIDS_JNI_PROFILER_SCHEMA) +configure_file( + profiler_schema.cpp.in + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp" + @ONLY +) + +add_executable(spark_rapids_profile_converter + spark_rapids_profile_converter.cpp + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h" +) + +target_include_directories( + spark_rapids_profile_converter + PRIVATE + "${CUDAToolkit_INCLUDE_DIRS}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" +) + +target_link_libraries(spark_rapids_profile_converter + "${CUPTI_LIBRARY_PATH}" + flatbuffers::flatbuffers + dl + pthread + rt) diff --git a/src/main/cpp/profiler/ProfilerJni.cpp b/src/main/cpp/profiler/ProfilerJni.cpp new file mode 100644 index 0000000000..1271b89d7b --- /dev/null +++ b/src/main/cpp/profiler/ProfilerJni.cpp @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_generated.h" +#include "profiler_serializer.hpp" +#include "spark_rapids_jni_version.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// Set this to true to have each CUPTI buffer dumped to stderr as it arrives. +#define PROFILER_DEBUG_LOG_BUFFER 0 + +#define JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val) \ + { \ + if (env->ExceptionOccurred()) { return ret_val; } \ + } + +#define JNI_THROW_NEW(env, class_name, message, ret_val) \ + { \ + jclass ex_class = env->FindClass(class_name); \ + if (ex_class == NULL) { return ret_val; } \ + env->ThrowNew(ex_class, message); \ + return ret_val; \ + } + +#define CATCH_STD_CLASS(env, class_name, ret_val) \ + catch (const std::exception& e) { JNI_THROW_NEW(env, class_name, e.what(), ret_val) } + +#define CATCH_STD(env, ret_val) CATCH_STD_CLASS(env, "java/lang/RuntimeException", ret_val) + +namespace spark_rapids_jni::profiler { + +namespace { + +// Encapsulates a buffer of profile data +struct profile_buffer { + explicit profile_buffer(size_t size) : size_(size), valid_size_(0) + { + auto err = posix_memalign(reinterpret_cast(&data_), ALIGN_BYTES, size_); + if (err != 0) { + std::cerr << "PROFILER: Failed to allocate CUPTI buffer: " << strerror(err) << std::endl; + data_ = nullptr; + size_ = 0; + } + } + + profile_buffer(uint8_t* data, size_t size, size_t valid_size) + : data_(data), size_(size), valid_size_(valid_size) + { + } + + // Disconnects the underlying buffer of memory from the instance. + // The caller is responsible for freeing the resulting buffer. + void release(uint8_t** data_ptr_ptr, size_t* size_ptr) + { + *data_ptr_ptr = data_; + *size_ptr = size_; + data_ = nullptr; + size_ = 0; + } + + ~profile_buffer() + { + free(data_); + data_ = nullptr; + size_ = 0; + } + + uint8_t const* data() const { return data_; } + uint8_t* data() { return data_; } + size_t size() const { return size_; } + size_t valid_size() const { return valid_size_; } + void set_valid_size(size_t size) { valid_size_ = size; } + + private: + static constexpr size_t ALIGN_BYTES = 8; + uint8_t* data_; + size_t size_; + size_t valid_size_; +}; + +// Queue of profile buffers that have been filled with profile data. +struct completed_buffer_queue { + // Gets the next available buffer of profile data, blocking until a buffer is available + // or the queue is shutdown. If the queue is shutdown, a nullptr is returned. + std::unique_ptr get() + { + std::unique_lock lock(lock_); + cv_.wait(lock, [this] { return shutdown_ || buffers_.size() > 0; }); + if (buffers_.size() > 0) { + auto result = std::move(buffers_.front()); + buffers_.pop(); + return result; + } + return std::unique_ptr(nullptr); + } + + void put(std::unique_ptr&& buffer) + { + std::unique_lock lock(lock_); + if (!shutdown_) { + buffers_.push(std::move(buffer)); + lock.unlock(); + cv_.notify_one(); + } + } + + void shutdown() + { + std::unique_lock lock(lock_); + shutdown_ = true; + lock.unlock(); + cv_.notify_one(); + } + + private: + std::mutex lock_; + std::condition_variable cv_; + std::queue> buffers_; + bool shutdown_ = false; +}; + +// Stack of profile buffers that are ready to be filled with profile data. +struct free_buffer_tracker { + explicit free_buffer_tracker(size_t size) : buffer_size_(size) {} + + // Returns the next available profile buffer or creates one if none are available. 
+ std::unique_ptr get() + { + { + std::lock_guard lock(lock_); + if (buffers_.size() > 0) { + auto result = std::move(buffers_.top()); + buffers_.pop(); + return result; + } + } + return std::make_unique(buffer_size_); + } + + void put(std::unique_ptr&& buffer) + { + buffer->set_valid_size(0); + std::lock_guard lock(lock_); + if (buffers_.size() < NUM_CACHED_BUFFERS) { + buffers_.push(std::move(buffer)); + } else { + buffer.reset(nullptr); + } + } + + private: + static constexpr size_t NUM_CACHED_BUFFERS = 2; + std::mutex lock_; + std::stack> buffers_; + size_t buffer_size_; +}; + +void writer_thread_process(JavaVM* vm, + jobject j_writer, + size_t buffer_size, + size_t flush_threshold); + +struct subscriber_state { + CUpti_SubscriberHandle subscriber_handle; + jobject j_writer; + std::thread writer_thread; + free_buffer_tracker free_buffers; + completed_buffer_queue completed_buffers; + bool has_cupti_callback_errored; + bool is_shutdown; + + subscriber_state(jobject writer, size_t buffer_size) + : j_writer(writer), + free_buffers(buffer_size), + has_cupti_callback_errored(false), + is_shutdown(false) + { + } +}; + +// Global variables +subscriber_state* State = nullptr; +uint64_t Flush_period_msec; +std::atomic_uint64_t Last_flush_time_msec; + +JavaVM* get_jvm(JNIEnv* env) +{ + JavaVM* vm; + if (env->GetJavaVM(&vm) != 0) { throw std::runtime_error("Unable to get JavaVM"); } + return vm; +} + +JNIEnv* attach_to_jvm(JavaVM* vm) +{ + JavaVMAttachArgs args; + args.version = JNI_VERSION_1_6; + args.name = const_cast("profiler writer"); + args.group = nullptr; + JNIEnv* env; + if (vm->AttachCurrentThread(reinterpret_cast(&env), &args) != JNI_OK) { + char const* msg = "PROFILER: unable to attach to JVM"; + std::cerr << msg << std::endl; + throw std::runtime_error(msg); + } + return env; +} + +char const* get_cupti_error(CUptiResult rc) +{ + char const* err; + if (cuptiGetResultString(rc, &err) != CUPTI_SUCCESS) { err = "UNKNOWN"; } + return err; +} + +void check_cupti(CUptiResult rc, std::string msg) +{ + if (rc != CUPTI_SUCCESS) { throw std::runtime_error(msg + ": " + get_cupti_error(rc)); } +} + +uint64_t timestamp_now() +{ + timespec info; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &info) != 0) { + static bool have_logged_error = false; + if (!have_logged_error) { + std::cerr << "PROFILER: Unable to determine current time!" << std::endl; + have_logged_error = true; + } + // No idea what time it is, so return the last flush time which will effectively + // disable periodic flushing but avoid pathologic flushing on every kernel launch. 
+ return Last_flush_time_msec; + } + return info.tv_sec * 1e3 + info.tv_nsec / 1e6; +} + +void on_driver_launch_exit() +{ + auto now = timestamp_now(); + if (now - Last_flush_time_msec >= Flush_period_msec) { + auto rc = cuptiActivityFlushAll(0); + if (rc != CUPTI_SUCCESS) { + std::cerr << "PROFILER: Error interval flushing records: " << get_cupti_error(rc) + << std::endl; + } + Last_flush_time_msec = now; + } +} + +void domain_driver_callback(CUpti_CallbackId callback_id, CUpti_CallbackData const* cb_data) +{ + if (cb_data->callbackSite == CUPTI_API_ENTER) { return; } + + switch (callback_id) { + case CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch: + case CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz: + case CUPTI_DRIVER_TRACE_CBID_cuLaunch: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz: on_driver_launch_exit(); break; + default: + std::cerr << "PROFILER: Unexpected driver API callback for " << callback_id << std::endl; + break; + } +} + +void domain_runtime_callback(CUpti_CallbackId callback_id, CUpti_CallbackData const* data_ptr) +{ + switch (callback_id) { + case CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020: + if (data_ptr->callbackSite == CUPTI_API_ENTER) { + auto rc = cuptiActivityFlushAll(0); + if (rc != CUPTI_SUCCESS) { + std::cerr << "PROFILER: Error flushing CUPTI activity on device reset: " + << get_cupti_error(rc) << std::endl; + } + } + break; + default: break; + } +} + +// Invoked by CUPTI when something occurs for which we previously requested a callback. +void CUPTIAPI callback_handler(void*, + CUpti_CallbackDomain domain, + CUpti_CallbackId callback_id, + const void* callback_data_ptr) +{ + auto rc = cuptiGetLastError(); + if (rc != CUPTI_SUCCESS && !State->has_cupti_callback_errored) { + // State->has_cupti_callback_errored = true; + std::cerr << "PROFILER: Error handling callback: " << get_cupti_error(rc) << std::endl; + return; + } + + auto cb_data = static_cast(callback_data_ptr); + switch (domain) { + case CUPTI_CB_DOMAIN_DRIVER_API: domain_driver_callback(callback_id, cb_data); break; + case CUPTI_CB_DOMAIN_RUNTIME_API: domain_runtime_callback(callback_id, cb_data); break; + default: break; + } +} + +// Invoked by CUPTI when a new buffer is needed to record CUPTI activity events. +void CUPTIAPI buffer_requested_callback(uint8_t** buffer_ptr_ptr, + size_t* size_ptr, + size_t* max_num_records_ptr) +{ + *max_num_records_ptr = 0; + if (!State->is_shutdown) { + auto buffer = State->free_buffers.get(); + buffer->release(buffer_ptr_ptr, size_ptr); + } else { + *buffer_ptr_ptr = nullptr; + *size_ptr = 0; + } +} + +// Invoked by CUPTI when an activity event buffer has completed. +void CUPTIAPI buffer_completed_callback( + CUcontext, uint32_t, uint8_t* buffer, size_t buffer_size, size_t valid_size) +{ + auto pb = std::make_unique(buffer, buffer_size, valid_size); + if (!State->is_shutdown) { State->completed_buffers.put(std::move(pb)); } +} + +// Setup the environment variables for NVTX library injection so we can capture NVTX events. 
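+// Note (assumed NVTX injection behavior, not verified here): the NVTX v3 runtime loads the
+// library named by NVTX_INJECTION64_PATH when the first NVTX call is made in the process,
+// so this setup must happen before any NVTX usage for events to be captured.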
+void setup_nvtx_env(JNIEnv* env, jstring j_lib_path) +{ + auto lib_path = env->GetStringUTFChars(j_lib_path, 0); + if (lib_path == NULL) { throw std::runtime_error("Error getting library path"); } + setenv("NVTX_INJECTION64_PATH", lib_path, 1); + env->ReleaseStringUTFChars(j_lib_path, lib_path); +} + +// Main processing loop for the background writer thread +void writer_thread_process(JavaVM* vm, jobject j_writer, size_t buffer_size, size_t flush_threshold) +{ + try { + JNIEnv* env = attach_to_jvm(vm); + profiler_serializer serializer(env, j_writer, buffer_size, flush_threshold); + auto buffer = State->completed_buffers.get(); + while (buffer) { + serializer.process_cupti_buffer(buffer->data(), buffer->valid_size()); + State->free_buffers.put(std::move(buffer)); + buffer = State->completed_buffers.get(); + } + serializer.flush(); + } catch (std::exception const& e) { + std::cerr << "PROFILER: WRITER THREAD ERROR: " << e.what() << std::endl; + // no-op process buffers + auto buffer = State->completed_buffers.get(); + while (buffer) { + State->free_buffers.put(std::move(buffer)); + buffer = State->completed_buffers.get(); + } + } + vm->DetachCurrentThread(); +} + +// Enable/disable capture of CUPTI activity events +void update_activity_enable(bool enable) +{ + CUpti_ActivityKind const activity_ids[] = {CUPTI_ACTIVITY_KIND_DEVICE, + CUPTI_ACTIVITY_KIND_DRIVER, + CUPTI_ACTIVITY_KIND_RUNTIME, + CUPTI_ACTIVITY_KIND_MEMCPY, + CUPTI_ACTIVITY_KIND_MEMSET, + CUPTI_ACTIVITY_KIND_NAME, + CUPTI_ACTIVITY_KIND_MARKER, + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, + CUPTI_ACTIVITY_KIND_OVERHEAD}; + if (enable) { + for (CUpti_ActivityKind const id : activity_ids) { + check_cupti(cuptiActivityEnable(id), "Error enabling device activity"); + } + } else { + for (CUpti_ActivityKind const id : activity_ids) { + check_cupti(cuptiActivityDisable(id), "Error disabling device activity"); + } + check_cupti(cuptiActivityFlushAll(0), "Error flushing activity records"); + } +} + +} // anonymous namespace + +} // namespace spark_rapids_jni::profiler + +extern "C" { + +using namespace spark_rapids_jni::profiler; + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeInit(JNIEnv* env, + jclass, + jstring j_lib_path, + jobject j_writer, + jlong write_buffer_size, + jint flush_period_msec) +{ + try { + setup_nvtx_env(env, j_lib_path); + // grab a global reference to the writer instance so it isn't garbage collected + auto writer = static_cast(env->NewGlobalRef(j_writer)); + if (!writer) { throw std::runtime_error("Unable to create a global reference to writer"); } + State = new subscriber_state(writer, write_buffer_size); + State->writer_thread = std::thread( + writer_thread_process, get_jvm(env), writer, write_buffer_size, write_buffer_size); + auto rc = cuptiSubscribe(&State->subscriber_handle, callback_handler, nullptr); + check_cupti(rc, "Error initializing CUPTI"); + rc = cuptiEnableCallback(1, + State->subscriber_handle, + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020); + if (flush_period_msec > 0) { + std::cerr << "PROFILER: Flushing activity records every " << flush_period_msec + << " milliseconds" << std::endl; + Flush_period_msec = static_cast(flush_period_msec); + Last_flush_time_msec = timestamp_now(); + // CUPTI's periodic flush does not appear to work in this environment. As a workaround, + // register a callback for all the various ways a GPU kernel gets launched. 
The callback + // checks if the flush period has elapsed since we last flushed, and if so, forces a flush. + CUpti_CallbackId const driver_launch_callback_ids[] = { + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz, + CUPTI_DRIVER_TRACE_CBID_cuLaunch, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice, + CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid, + CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync, + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz}; + for (CUpti_CallbackId const id : driver_launch_callback_ids) { + rc = cuptiEnableCallback(1, State->subscriber_handle, CUPTI_CB_DOMAIN_DRIVER_API, id); + check_cupti(rc, "Error registering driver launch callbacks"); + } + } + check_cupti(rc, "Error enabling device reset callback"); + rc = cuptiActivityRegisterCallbacks(buffer_requested_callback, buffer_completed_callback); + check_cupti(rc, "Error registering activity buffer callbacks"); + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeStart(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { update_activity_enable(true); } + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeStop(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { update_activity_enable(false); } + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeShutdown(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { + auto unsub_rc = cuptiUnsubscribe(State->subscriber_handle); + auto flush_rc = cuptiActivityFlushAll(1); + State->completed_buffers.shutdown(); + State->writer_thread.join(); + State->is_shutdown = true; + env->DeleteGlobalRef(State->j_writer); + // There can be late arrivals of CUPTI activity events and other callbacks, so it's safer + // and simpler to _not_ delete the State object on shutdown. + check_cupti(unsub_rc, "Error unsubscribing from CUPTI"); + check_cupti(flush_rc, "Error flushing CUPTI records"); + } + } + CATCH_STD(env, ); +} + +} // extern "C" + +// Extern the CUPTI NVTX initialization APIs. The APIs are thread-safe. +extern "C" CUptiResult CUPTIAPI cuptiNvtxInitialize(void* pfnGetExportTable); +extern "C" CUptiResult CUPTIAPI cuptiNvtxInitialize2(void* pfnGetExportTable); + +// Interface that may be called by NVTX to capture NVTX events +extern "C" JNIEXPORT int InitializeInjectionNvtx(void* p) +{ + CUptiResult res = cuptiNvtxInitialize(p); + return (res == CUPTI_SUCCESS) ? 1 : 0; +} + +// Interface that may be called by NVTX to capture NVTX events +extern "C" JNIEXPORT int InitializeInjectionNvtx2(void* p) +{ + CUptiResult res = cuptiNvtxInitialize2(p); + return (res == CUPTI_SUCCESS) ? 1 : 0; +} diff --git a/src/main/cpp/profiler/profiler_debug.cpp b/src/main/cpp/profiler/profiler_debug.cpp new file mode 100644 index 0000000000..3759b11e0d --- /dev/null +++ b/src/main/cpp/profiler/profiler_debug.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_debug.hpp" + +#include +#include + +namespace spark_rapids_jni::profiler { + +namespace { + +std::string marker_flags_to_string(CUpti_ActivityFlag flags) +{ + std::string s(""); + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS) { s += "INSTANTANEOUS "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_START) { s += "START "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_END) { s += "END "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE) { s += "SYNCACQUIRE "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS) { s += "SYNCACQUIRESUCCESS "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED) { s += "SYNCACQUIREFAILED "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE) { s += "SYNCRELEASE "; } + return s; +} + +std::string activity_object_kind_to_string(CUpti_ActivityObjectKind kind) +{ + switch (kind) { + case CUPTI_ACTIVITY_OBJECT_PROCESS: return "PROCESS"; + case CUPTI_ACTIVITY_OBJECT_THREAD: return "THREAD"; + case CUPTI_ACTIVITY_OBJECT_DEVICE: return "DEVICE"; + case CUPTI_ACTIVITY_OBJECT_CONTEXT: return "CONTEXT"; + case CUPTI_ACTIVITY_OBJECT_STREAM: return "STREAM"; + case CUPTI_ACTIVITY_OBJECT_UNKNOWN: return "UNKNOWN"; + default: { + std::ostringstream oss; + oss << "UNRECOGNIZED(" << kind << ")"; + return oss.str(); + } + } +} + +} // anonymous namespace + +std::string activity_kind_to_string(CUpti_ActivityKind kind) +{ + switch (kind) { + case CUPTI_ACTIVITY_KIND_MEMCPY: return "CUPTI_ACTIVITY_KIND_MEMCPY"; + case CUPTI_ACTIVITY_KIND_MEMSET: return "CUPTI_ACTIVITY_KIND_MEMSET"; + case CUPTI_ACTIVITY_KIND_KERNEL: return "CUPTI_ACTIVITY_KIND_KERNEL"; + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: return "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"; + case CUPTI_ACTIVITY_KIND_DRIVER: return "CPUTI_ACTIVITY_KIND_DRIVER"; + case CUPTI_ACTIVITY_KIND_RUNTIME: return "CUPTI_ACTIVITY_KIND_RUNTIME"; + case CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API: return "CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API"; + case CUPTI_ACTIVITY_KIND_EVENT: return "CUPTI_ACTIVITY_KIND_EVENT"; + case CUPTI_ACTIVITY_KIND_METRIC: return "CUPTI_ACTIVITY_KIND_METRIC"; + case CUPTI_ACTIVITY_KIND_DEVICE: return "CUPTI_ACTIVITY_KIND_DEVICE"; + case CUPTI_ACTIVITY_KIND_CONTEXT: return "CUPTI_ACTIVITY_KIND_CONTEXT"; + case CUPTI_ACTIVITY_KIND_NAME: return "CUPTI_ACTIVITY_KIND_NAME"; + case CUPTI_ACTIVITY_KIND_MARKER: return "CUPTI_ACTIVITY_KIND_MARKER"; + case CUPTI_ACTIVITY_KIND_MARKER_DATA: return "CUPTI_ACTIVITY_KIND_MARKER_DATA"; + case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: return "CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR"; + case CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS: return "CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS"; + case CUPTI_ACTIVITY_KIND_BRANCH: return "CUPTI_ACTIVITY_KIND_BRANCH"; + case CUPTI_ACTIVITY_KIND_OVERHEAD: return "CUPTI_ACTIVITY_KIND_OVERHEAD"; + case CUPTI_ACTIVITY_KIND_CDP_KERNEL: return "CUPTI_ACTIVITY_KIND_CDP_KERNEL"; + case CUPTI_ACTIVITY_KIND_PREEMPTION: return "CUPTI_ACTIVITY_KIND_PREEMPTION"; + case CUPTI_ACTIVITY_KIND_ENVIRONMENT: return "CUPTI_ACTIVITY_KIND_ENVIRONMENT"; + case CUPTI_ACTIVITY_KIND_EVENT_INSTANCE: return 
"CUPTI_ACTIVITY_KIND_EVENT_INSTANCE"; + case CUPTI_ACTIVITY_KIND_MEMCPY2: return "CUPTI_ACTIVITY_KIND_MEMCPY2"; + case CUPTI_ACTIVITY_KIND_METRIC_INSTANCE: return "CUPTI_ACTIVITY_KIND_METRIC_INSTANCE"; + case CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION: + return "CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION"; + case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: + return "CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER"; + case CUPTI_ACTIVITY_KIND_FUNCTION: return "CUPTI_ACTIVITY_KIND_FUNCTION"; + case CUPTI_ACTIVITY_KIND_MODULE: return "CUPTI_ACTIVITY_KIND_MODULE"; + case CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE: return "CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE"; + case CUPTI_ACTIVITY_KIND_SHARED_ACCESS: return "CUPTI_ACTIVITY_KIND_SHARED_ACCESS"; + case CUPTI_ACTIVITY_KIND_PC_SAMPLING: return "CUPTI_ACTIVITY_KIND_PC_SAMPLING"; + case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: + return "CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO"; + case CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION: + return "CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION"; + case CUPTI_ACTIVITY_KIND_OPENACC_DATA: return "CUPTI_ACTIVITY_KIND_OPENACC_DATA"; + case CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH: return "CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH"; + case CUPTI_ACTIVITY_KIND_OPENACC_OTHER: return "CUPTI_ACTIVITY_KIND_OPENACC_OTHER"; + case CUPTI_ACTIVITY_KIND_CUDA_EVENT: return "CUPTI_ACTIVITY_KIND_CUDA_EVENT"; + case CUPTI_ACTIVITY_KIND_STREAM: return "CUPTI_ACTIVITY_KIND_STREAM"; + case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION: return "CUPTI_ACTIVITY_KIND_SYNCHRONIZATION"; + case CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION: + return "CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION"; + case CUPTI_ACTIVITY_KIND_NVLINK: return "CUPTI_ACTIVITY_KIND_NVLINK"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT: return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE"; + case CUPTI_ACTIVITY_KIND_MEMORY: return "CUPTI_ACTIVITY_KIND_MEMORY"; + case CUPTI_ACTIVITY_KIND_PCIE: return "CUPTI_ACTIVITY_KIND_PCIE"; + case CUPTI_ACTIVITY_KIND_OPENMP: return "CUPTI_ACTIVITY_KIND_OPENMP"; + case CUPTI_ACTIVITY_KIND_MEMORY2: return "CUPTI_ACTIVITY_KIND_MEMORY2"; + case CUPTI_ACTIVITY_KIND_MEMORY_POOL: return "CUPTI_ACTIVITY_KIND_MEMORY_POOL"; + case CUPTI_ACTIVITY_KIND_GRAPH_TRACE: return "CUPTI_ACTIVITY_KIND_GRAPH_TRACE"; + case CUPTI_ACTIVITY_KIND_JIT: return "CUPTI_ACTIVITY_KIND_JIT"; + default: { + std::ostringstream oss; + oss << "UNRECOGNIZED(" << kind << ")"; + return oss.str(); + } + } +} + +void print_cupti_buffer(uint8_t* buffer, size_t valid_size) +{ + if (valid_size > 0) { + std::cerr << "PROFILER: CUPTI buffer size: " << valid_size << std::endl; + CUpti_Activity* record_ptr = nullptr; + auto rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + while (rc == CUPTI_SUCCESS) { + std::cerr << "RECORD: " << activity_kind_to_string(record_ptr->kind) << std::endl; + switch (record_ptr->kind) { + case CUPTI_ACTIVITY_KIND_DRIVER: { + auto api_record = reinterpret_cast(record_ptr); + char const* name = nullptr; + cuptiGetCallbackName(CUPTI_CB_DOMAIN_DRIVER_API, api_record->cbid, &name); + name = name ? 
name : "NULL"; + std::cerr << " NAME: " << name << " THREAD: " << api_record->threadId << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_DEVICE: { + auto device_record = reinterpret_cast(record_ptr); + char const* name = device_record->name != nullptr ? device_record->name : "NULL"; + std::cerr << " " << activity_kind_to_string(device_record->kind) << " " << name + << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_RUNTIME: { + auto api_record = reinterpret_cast(record_ptr); + char const* name = nullptr; + cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, api_record->cbid, &name); + name = name ? name : "NULL"; + std::cerr << " NAME: " << name << " THREAD: " << api_record->threadId << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_MARKER: { + auto marker_record = reinterpret_cast(record_ptr); + std::cerr << " FLAGS: " << marker_flags_to_string(marker_record->flags) + << " ID: " << marker_record->id + << " OBJECTKIND: " << activity_object_kind_to_string(marker_record->objectKind) + << " NAME: " << std::string(marker_record->name ? marker_record->name : "NULL") + << " DOMAIN: " + << std::string(marker_record->domain ? marker_record->domain : "NULL") + << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_MARKER_DATA: { + auto marker_record = reinterpret_cast(record_ptr); + std::cerr << " FLAGS: " << marker_flags_to_string(marker_record->flags) + << " ID: " << marker_record->id << " COLOR: " << marker_record->color + << " COLOR FLAG: " << marker_record->flags + << " CATEGORY: " << marker_record->category + << " DATA KIND: " << marker_record->payloadKind + << " DATA: " << marker_record->payload.metricValueUint64 << "/" + << marker_record->payload.metricValueDouble << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto kernel_record = reinterpret_cast(record_ptr); + std::cerr << " NAME: " << kernel_record->name << std::endl; + } + default: break; + } + rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + } + } +} + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_debug.hpp b/src/main/cpp/profiler/profiler_debug.hpp new file mode 100644 index 0000000000..e44fdb87ff --- /dev/null +++ b/src/main/cpp/profiler/profiler_debug.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace spark_rapids_jni::profiler { + +std::string activity_kind_to_string(CUpti_ActivityKind kind); + +void print_cupti_buffer(uint8_t* buffer, size_t valid_size); + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_schema.cpp.in b/src/main/cpp/profiler/profiler_schema.cpp.in new file mode 100644 index 0000000000..f2940a91bf --- /dev/null +++ b/src/main/cpp/profiler/profiler_schema.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace spark_rapids_jni::profiler { +char const* Profiler_Schema = R"raw(@SPARK_RAPIDS_JNI_PROFILER_SCHEMA@)raw"; +} diff --git a/src/main/cpp/profiler/profiler_serializer.cpp b/src/main/cpp/profiler/profiler_serializer.cpp new file mode 100644 index 0000000000..b47ff234ad --- /dev/null +++ b/src/main/cpp/profiler/profiler_serializer.cpp @@ -0,0 +1,559 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_serializer.hpp" + +#include "profiler_debug.hpp" +#include "profiler_generated.h" +#include "spark_rapids_jni_version.h" + +#include + +#include + +namespace spark_rapids_jni::profiler { + +namespace { + +constexpr uint32_t PROFILE_VERSION = 1; + +flatbuffers::Offset add_object_id(flatbuffers::FlatBufferBuilder& fbb, + CUpti_ActivityObjectKind kind, + CUpti_ActivityObjectKindId const& object_id) +{ + switch (kind) { + case CUPTI_ACTIVITY_OBJECT_PROCESS: + case CUPTI_ACTIVITY_OBJECT_THREAD: { + ActivityObjectIdBuilder aoib(fbb); + aoib.add_process_id(object_id.pt.processId); + if (kind == CUPTI_ACTIVITY_OBJECT_THREAD) { aoib.add_thread_id(object_id.pt.threadId); } + return aoib.Finish(); + } + case CUPTI_ACTIVITY_OBJECT_DEVICE: + case CUPTI_ACTIVITY_OBJECT_CONTEXT: + case CUPTI_ACTIVITY_OBJECT_STREAM: { + ActivityObjectIdBuilder aoib(fbb); + aoib.add_device_id(object_id.dcs.deviceId); + if (kind == CUPTI_ACTIVITY_OBJECT_CONTEXT || kind == CUPTI_ACTIVITY_OBJECT_STREAM) { + aoib.add_context_id(object_id.dcs.contextId); + if (kind == CUPTI_ACTIVITY_OBJECT_STREAM) { aoib.add_stream_id(object_id.dcs.streamId); } + } + return aoib.Finish(); + } + default: + std::cerr << "PROFILER: Unrecognized object kind: " << kind << std::endl; + return flatbuffers::Offset(); + } +} + +MarkerFlags marker_flags_to_fb(CUpti_ActivityFlag flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS) { result |= MarkerFlags_Instantaneous; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_START) { result |= MarkerFlags_Start; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_END) { result |= MarkerFlags_End; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE) { result |= MarkerFlags_SyncAcquire; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS) { + result |= MarkerFlags_SyncAcquireSuccess; + } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED) { + result |= MarkerFlags_SyncAcquireFailed; + } + if (flags & 
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE) { result |= MarkerFlags_SyncRelease; } + return static_cast(result); +} + +ChannelType to_channel_type(CUpti_ChannelType t) +{ + switch (t) { + case CUPTI_CHANNEL_TYPE_INVALID: return ChannelType_Invalid; + case CUPTI_CHANNEL_TYPE_COMPUTE: return ChannelType_Compute; + case CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY: return ChannelType_AsyncMemcpy; + default: + std::cerr << "PROFILER: Unrecognized channel type: " << t << std::endl; + return ChannelType_Invalid; + } +} + +LaunchType to_launch_type(uint8_t t) +{ + switch (t) { + case CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR: return LaunchType_Regular; + case CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE: + return LaunchType_CooperativeSingleDevice; + case CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE: + return LaunchType_CooperativeMultiDevice; + default: + std::cerr << "PROFILER: Unrecognized launch type: " << t << std::endl; + return LaunchType_Regular; + } +} + +MemcpyFlags to_memcpy_flags(uint32_t flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC) { result |= MemcpyFlags_Async; } + return static_cast(result); +} + +MemcpyKind to_memcpy_kind(uint8_t k) +{ + switch (k) { + case CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN: return MemcpyKind_Unknown; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: return MemcpyKind_HtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: return MemcpyKind_DtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: return MemcpyKind_HtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: return MemcpyKind_AtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: return MemcpyKind_AtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: return MemcpyKind_AtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: return MemcpyKind_DtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: return MemcpyKind_DtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: return MemcpyKind_HtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: return MemcpyKind_PtoP; + default: + std::cerr << "PROFILER: Unrecognized memcpy kind: " << k << std::endl; + return MemcpyKind_Unknown; + } +} + +MemoryKind to_memory_kind(uint8_t k) +{ + switch (k) { + case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: return MemoryKind_Unknown; + case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: return MemoryKind_Pageable; + case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: return MemoryKind_Pinned; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: return MemoryKind_Device; + case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY: return MemoryKind_Array; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED: return MemoryKind_Managed; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC: return MemoryKind_DeviceStatic; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC: return MemoryKind_ManagedStatic; + default: + std::cerr << "PROFILER: Unrecognized memory kind: " << k << std::endl; + return MemoryKind_Unknown; + } +} + +MemsetFlags to_memset_flags(uint32_t flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC) { result |= MemsetFlags_Async; } + return static_cast(result); +} + +OverheadKind to_overhead_kind(CUpti_ActivityOverheadKind k) +{ + switch (k) { + case CUPTI_ACTIVITY_OVERHEAD_UNKNOWN: return OverheadKind_Unknown; + case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: return OverheadKind_DriverCompiler; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: return OverheadKind_CUptiBufferFlush; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: return OverheadKind_CUptiInstrumentation; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: return OverheadKind_CUptiResource; + default: + std::cerr << "PROFILER: Unrecognized overhead kind: " << k << 
std::endl; + return OverheadKind_Unknown; + } +} + +PartitionedGlobalCacheConfig to_partitioned_global_cache_config( + CUpti_ActivityPartitionedGlobalCacheConfig c) +{ + switch (c) { + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN: + return PartitionedGlobalCacheConfig_Unknown; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED: + return PartitionedGlobalCacheConfig_NotSupported; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF: + return PartitionedGlobalCacheConfig_Off; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON: return PartitionedGlobalCacheConfig_On; + default: + std::cerr << "PROFILER: Unrecognized partitioned global cache config: " << c << std::endl; + return PartitionedGlobalCacheConfig_Unknown; + } +} + +ShmemLimitConfig to_shmem_limit_config(CUpti_FuncShmemLimitConfig c) +{ + switch (c) { + case CUPTI_FUNC_SHMEM_LIMIT_DEFAULT: return ShmemLimitConfig_Default; + case CUPTI_FUNC_SHMEM_LIMIT_OPTIN: return ShmemLimitConfig_Optin; + default: + std::cerr << "PROFILER: Unrecognized shmem limit config: " << c << std::endl; + return ShmemLimitConfig_Default; + } +} + +} // anonymous namespace + +profiler_serializer::profiler_serializer(JNIEnv* env, + jobject writer, + size_t buffer_size, + size_t flush_threshold) + : env_(env), j_writer_(writer), flush_threshold_(flush_threshold), fbb_(buffer_size) +{ + auto writer_class = env->GetObjectClass(writer); + if (!writer_class) { throw std::runtime_error("Failed to locate class of data writer"); } + j_write_method_ = env->GetMethodID(writer_class, "write", "(Ljava/nio/ByteBuffer;)V"); + if (!j_write_method_) { throw std::runtime_error("Failed to locate data writer write method"); } + write_profile_header(); +} + +void profiler_serializer::write_profile_header() +{ + auto writer_version = fbb_.CreateString(spark_rapids_jni::Version); + auto magic = fbb_.CreateString("spark-rapids profile"); + auto header = CreateProfileHeader(fbb_, magic, PROFILE_VERSION, writer_version); + fbb_.FinishSizePrefixed(header); + write_current_fb(); +} + +void profiler_serializer::process_cupti_buffer(uint8_t* buffer, size_t valid_size) +{ + report_num_dropped_records(); + if (valid_size > 0) { + CUpti_Activity* record_ptr = nullptr; + auto rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + while (rc == CUPTI_SUCCESS) { + switch (record_ptr->kind) { + case CUPTI_ACTIVITY_KIND_DEVICE: { + auto device_record = reinterpret_cast(record_ptr); + process_device_activity(device_record); + break; + } + case CUPTI_ACTIVITY_KIND_DRIVER: + case CUPTI_ACTIVITY_KIND_RUNTIME: { + auto api_record = reinterpret_cast(record_ptr); + process_api_activity(api_record); + break; + } + case CUPTI_ACTIVITY_KIND_MARKER: { + auto marker = reinterpret_cast(record_ptr); + process_marker_activity(marker); + break; + } + case CUPTI_ACTIVITY_KIND_MARKER_DATA: { + auto marker = reinterpret_cast(record_ptr); + process_marker_data(marker); + break; + } + case CUPTI_ACTIVITY_KIND_MEMCPY: { + auto r = reinterpret_cast(record_ptr); + process_memcpy(r); + break; + } + case CUPTI_ACTIVITY_KIND_MEMSET: { + auto r = reinterpret_cast(record_ptr); + process_memset(r); + break; + } + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto r = reinterpret_cast(record_ptr); + process_kernel(r); + break; + } + case CUPTI_ACTIVITY_KIND_OVERHEAD: { + auto r = reinterpret_cast(record_ptr); + process_overhead(r); + break; + } + default: + std::cerr << "PROFILER: Ignoring activity record " + << activity_kind_to_string(record_ptr->kind) << std::endl; + 
break; + } + if (fbb_.GetSize() >= flush_threshold_) { flush(); } + rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + } + } +} + +void profiler_serializer::flush() +{ + if (fbb_.GetSize() > 0) { + using flatbuffers::Offset; + using flatbuffers::Vector; + Offset>> api_vec; + Offset>> device_vec; + Offset>> dropped_vec; + Offset>> kernel_vec; + Offset>> marker_vec; + Offset>> marker_data_vec; + Offset>> memcpy_vec; + Offset>> memset_vec; + Offset>> overhead_vec; + if (api_offsets_.size() > 0) { api_vec = fbb_.CreateVector(api_offsets_); } + if (device_offsets_.size() > 0) { device_vec = fbb_.CreateVector(device_offsets_); } + if (dropped_offsets_.size() > 0) { dropped_vec = fbb_.CreateVector(dropped_offsets_); } + if (kernel_offsets_.size() > 0) { kernel_vec = fbb_.CreateVector(kernel_offsets_); } + if (marker_offsets_.size() > 0) { marker_vec = fbb_.CreateVector(marker_offsets_); } + if (marker_data_offsets_.size() > 0) { + marker_data_vec = fbb_.CreateVector(marker_data_offsets_); + } + if (memcpy_offsets_.size() > 0) { memcpy_vec = fbb_.CreateVector(memcpy_offsets_); } + if (memset_offsets_.size() > 0) { memset_vec = fbb_.CreateVector(memset_offsets_); } + if (overhead_offsets_.size() > 0) { overhead_vec = fbb_.CreateVector(overhead_offsets_); } + ActivityRecordsBuilder arb(fbb_); + arb.add_api(api_vec); + arb.add_device(device_vec); + arb.add_dropped(dropped_vec); + arb.add_kernel(kernel_vec); + arb.add_marker(marker_vec); + arb.add_marker_data(marker_data_vec); + arb.add_memcpy(memcpy_vec); + arb.add_memset(memset_vec); + arb.add_overhead(overhead_vec); + auto r = arb.Finish(); + fbb_.FinishSizePrefixed(r); + write_current_fb(); + } +} + +void profiler_serializer::process_api_activity(CUpti_ActivityAPI const* r) +{ + auto api_kind = ApiKind_Runtime; + if (r->kind == CUPTI_ACTIVITY_KIND_DRIVER) { + api_kind = ApiKind_Driver; + } else if (r->kind == CUPTI_ACTIVITY_KIND_RUNTIME) { + // skip some very common and uninteresting APIs to reduce the profile size + switch (r->cbid) { + case CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000: return; + default: break; + } + } else { + std::cerr << "PROFILER: Ignoring API activity record kind: " << activity_kind_to_string(r->kind) + << std::endl; + return; + } + ApiActivityBuilder aab(fbb_); + aab.add_kind(api_kind); + aab.add_cbid(r->cbid); + aab.add_start(r->start); + aab.add_end(r->end); + aab.add_process_id(r->processId); + aab.add_thread_id(r->threadId); + aab.add_correlation_id(r->correlationId); + aab.add_return_value(r->returnValue); + api_offsets_.push_back(aab.Finish()); +} + +void profiler_serializer::process_device_activity(CUpti_ActivityDevice4 const* r) +{ + auto name = fbb_.CreateSharedString(r->name); + DeviceActivityBuilder dab(fbb_); + dab.add_global_memory_bandwidth(r->globalMemoryBandwidth); + dab.add_global_memory_size(r->globalMemorySize); + dab.add_constant_memory_size(r->constantMemorySize); + dab.add_l2_cache_size(r->l2CacheSize); + dab.add_num_threads_per_warp(r->numThreadsPerWarp); + dab.add_core_clock_rate(r->coreClockRate); + dab.add_num_memcpy_engines(r->numMemcpyEngines); + dab.add_num_multiprocessors(r->numMultiprocessors); + dab.add_max_ipc(r->maxIPC); + dab.add_max_warps_per_multiprocessor(r->maxWarpsPerMultiprocessor); + dab.add_max_blocks_per_multiprocessor(r->maxBlocksPerMultiprocessor); + 
dab.add_max_shared_memory_per_multiprocessor(r->maxSharedMemoryPerMultiprocessor);
+  dab.add_max_registers_per_multiprocessor(r->maxRegistersPerMultiprocessor);
+  dab.add_max_registers_per_block(r->maxRegistersPerBlock);
+  dab.add_max_shared_memory_per_block(r->maxSharedMemoryPerBlock);
+  dab.add_max_threads_per_block(r->maxThreadsPerBlock);
+  dab.add_max_block_dim_x(r->maxBlockDimX);
+  dab.add_max_block_dim_y(r->maxBlockDimY);
+  dab.add_max_block_dim_z(r->maxBlockDimZ);
+  dab.add_max_grid_dim_x(r->maxGridDimX);
+  dab.add_max_grid_dim_y(r->maxGridDimY);
+  dab.add_max_grid_dim_z(r->maxGridDimZ);
+  dab.add_compute_capability_major(r->computeCapabilityMajor);
+  dab.add_compute_capability_minor(r->computeCapabilityMinor);
+  dab.add_id(r->id);
+  dab.add_ecc_enabled(r->eccEnabled);
+  dab.add_name(name);
+  device_offsets_.push_back(dab.Finish());
+}
+
+void profiler_serializer::process_dropped_records(size_t num_dropped)
+{
+  auto dropped = CreateDroppedRecords(fbb_, num_dropped);
+  dropped_offsets_.push_back(dropped);
+}
+
+void profiler_serializer::process_kernel(CUpti_ActivityKernel8 const* r)
+{
+  auto name = fbb_.CreateSharedString(r->name);
+  KernelActivityBuilder kab(fbb_);
+  kab.add_requested(r->cacheConfig.config.requested);
+  kab.add_executed(r->cacheConfig.config.executed);
+  kab.add_shared_memory_config(r->sharedMemoryConfig);
+  kab.add_registers_per_thread(r->registersPerThread);
+  kab.add_partitioned_global_cache_requested(
+    to_partitioned_global_cache_config(r->partitionedGlobalCacheRequested));
+  kab.add_partitioned_global_cache_executed(
+    to_partitioned_global_cache_config(r->partitionedGlobalCacheExecuted));
+  kab.add_start(r->start);
+  kab.add_end(r->end);
+  kab.add_completed(r->completed);
+  kab.add_device_id(r->deviceId);
+  kab.add_context_id(r->contextId);
+  kab.add_stream_id(r->streamId);
+  kab.add_grid_x(r->gridX);
+  kab.add_grid_y(r->gridY);
+  kab.add_grid_z(r->gridZ);
+  kab.add_block_x(r->blockX);
+  kab.add_block_y(r->blockY);
+  kab.add_block_z(r->blockZ);
+  kab.add_static_shared_memory(r->staticSharedMemory);
+  kab.add_dynamic_shared_memory(r->dynamicSharedMemory);
+  kab.add_local_memory_per_thread(r->localMemoryPerThread);
+  kab.add_local_memory_total(r->localMemoryTotal);
+  kab.add_correlation_id(r->correlationId);
+  kab.add_grid_id(r->gridId);
+  kab.add_name(name);
+  kab.add_queued(r->queued);
+  kab.add_submitted(r->submitted);
+  kab.add_launch_type(to_launch_type(r->launchType));
+  kab.add_is_shared_memory_carveout_requested(r->isSharedMemoryCarveoutRequested);
+  kab.add_shared_memory_carveout_requested(r->sharedMemoryCarveoutRequested);
+  kab.add_shared_memory_executed(r->sharedMemoryExecuted);
+  kab.add_graph_node_id(r->graphNodeId);
+  kab.add_shmem_limit_config(to_shmem_limit_config(r->shmemLimitConfig));
+  kab.add_graph_id(r->graphId);
+  kab.add_channel_id(r->channelID);
+  kab.add_channel_type(to_channel_type(r->channelType));
+  kab.add_cluster_x(r->clusterX);
+  kab.add_cluster_y(r->clusterY);
+  kab.add_cluster_z(r->clusterZ);
+  kab.add_cluster_scheduling_policy(r->clusterSchedulingPolicy);
+  kab.add_local_memory_total_v2(r->localMemoryTotal_v2);
+  kernel_offsets_.push_back(kab.Finish());
+}
+
+void profiler_serializer::process_marker_activity(CUpti_ActivityMarker2 const* r)
+{
+  auto object_id = add_object_id(fbb_, r->objectKind, r->objectId);
+  auto has_name = r->name != nullptr;
+  auto has_domain = r->domain != nullptr;
+  flatbuffers::Offset<flatbuffers::String> name;
+  flatbuffers::Offset<flatbuffers::String> domain;
+  if (has_name) { name = fbb_.CreateSharedString(r->name); }
+  if (has_domain) {
domain = fbb_.CreateSharedString(r->domain); } + MarkerActivityBuilder mab(fbb_); + mab.add_flags(marker_flags_to_fb(r->flags)); + mab.add_timestamp(r->timestamp); + mab.add_id(r->id); + mab.add_object_id(object_id); + mab.add_name(name); + mab.add_domain(domain); + marker_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_marker_data(CUpti_ActivityMarkerData const* r) +{ + MarkerDataBuilder mdb(fbb_); + mdb.add_flags(marker_flags_to_fb(r->flags)); + mdb.add_id(r->id); + mdb.add_color(r->color); + mdb.add_category(r->category); + marker_data_offsets_.push_back(mdb.Finish()); +} + +void profiler_serializer::process_memcpy(CUpti_ActivityMemcpy5 const* r) +{ + MemcpyActivityBuilder mab(fbb_); + mab.add_copy_kind(to_memcpy_kind(r->copyKind)); + mab.add_src_kind(to_memory_kind(r->srcKind)); + mab.add_dst_kind(to_memory_kind(r->dstKind)); + mab.add_flags(to_memcpy_flags(r->flags)); + mab.add_bytes(r->bytes); + mab.add_start(r->start); + mab.add_end(r->end); + mab.add_device_id(r->deviceId); + mab.add_context_id(r->contextId); + mab.add_stream_id(r->streamId); + mab.add_correlation_id(r->correlationId); + mab.add_runtime_correlation_id(r->runtimeCorrelationId); + mab.add_graph_node_id(r->graphNodeId); + mab.add_graph_id(r->graphId); + mab.add_channel_id(r->channelID); + mab.add_channel_type(to_channel_type(r->channelType)); + memcpy_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_memset(CUpti_ActivityMemset4 const* r) +{ + MemsetActivityBuilder mab(fbb_); + mab.add_value(r->value); + mab.add_bytes(r->bytes); + mab.add_start(r->start); + mab.add_end(r->end); + mab.add_device_id(r->deviceId); + mab.add_context_id(r->contextId); + mab.add_stream_id(r->streamId); + mab.add_correlation_id(r->correlationId); + mab.add_flags(to_memset_flags(r->flags)); + mab.add_memory_kind(to_memory_kind(r->memoryKind)); + mab.add_graph_node_id(r->graphNodeId); + mab.add_graph_id(r->graphId); + mab.add_channel_id(r->channelID); + mab.add_channel_type(to_channel_type(r->channelType)); + memset_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_overhead(CUpti_ActivityOverhead const* r) +{ + auto object_id = add_object_id(fbb_, r->objectKind, r->objectId); + OverheadActivityBuilder oab(fbb_); + oab.add_overhead_kind(to_overhead_kind(r->overheadKind)); + oab.add_object_id(object_id); + oab.add_start(r->start); + oab.add_end(r->end); + overhead_offsets_.push_back(oab.Finish()); +} + +// Query CUPTI for dropped records, and if any, record in the current activity record +void profiler_serializer::report_num_dropped_records() +{ + size_t num_dropped = 0; + auto rc = cuptiActivityGetNumDroppedRecords(NULL, 0, &num_dropped); + if (rc == CUPTI_SUCCESS && num_dropped > 0) { process_dropped_records(num_dropped); } +} + +// Write out the current flatbuffer and reset state for the next flatbuffer. 
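+// Note: the ByteBuffer handed to the Java writer wraps the flatbuffer builder's memory
+// directly (no copy is made), so the writer is expected to fully consume or copy the bytes
+// before the call returns and the builder storage is cleared and reused for the next
+// flatbuffer.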
+void profiler_serializer::write_current_fb() +{ + auto fb_size = fbb_.GetSize(); + if (fb_size > 0) { + auto fb = fbb_.GetBufferPointer(); + auto bytebuf_obj = env_->NewDirectByteBuffer(fb, fb_size); + if (bytebuf_obj != nullptr) { + env_->CallVoidMethod(j_writer_, j_write_method_, bytebuf_obj); + } else { + std::cerr << "PROFILER: Unable to create ByteBuffer for writer" << std::endl; + } + } + fbb_.Clear(); + api_offsets_.clear(); + device_offsets_.clear(); + dropped_offsets_.clear(); + kernel_offsets_.clear(); + marker_offsets_.clear(); + marker_data_offsets_.clear(); + memcpy_offsets_.clear(); + memset_offsets_.clear(); + overhead_offsets_.clear(); +} + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_serializer.hpp b/src/main/cpp/profiler/profiler_serializer.hpp new file mode 100644 index 0000000000..1feebf1b96 --- /dev/null +++ b/src/main/cpp/profiler/profiler_serializer.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "profiler_generated.h" + +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni::profiler { + +// Serializes profile data as flatbuffers +struct profiler_serializer { + profiler_serializer(JNIEnv* env, jobject writer, size_t buffer_size, size_t flush_threshold); + void process_cupti_buffer(uint8_t* buffer, size_t valid_size); + void flush(); + + private: + void write_profile_header(); + void process_api_activity(CUpti_ActivityAPI const*); + void process_device_activity(CUpti_ActivityDevice4 const*); + void process_dropped_records(size_t num_dropped); + void process_marker_activity(CUpti_ActivityMarker2 const*); + void process_marker_data(CUpti_ActivityMarkerData const*); + void process_memcpy(CUpti_ActivityMemcpy5 const*); + void process_memset(CUpti_ActivityMemset4 const*); + void process_kernel(CUpti_ActivityKernel8 const*); + void process_overhead(CUpti_ActivityOverhead const*); + void report_num_dropped_records(); + void write_current_fb(); + + JNIEnv* env_; + jmethodID j_write_method_; + jobject j_writer_; + size_t flush_threshold_; + flatbuffers::FlatBufferBuilder fbb_; + std::vector> api_offsets_; + std::vector> device_offsets_; + std::vector> dropped_offsets_; + std::vector> kernel_offsets_; + std::vector> marker_offsets_; + std::vector> marker_data_offsets_; + std::vector> memcpy_offsets_; + std::vector> memset_offsets_; + std::vector> overhead_offsets_; +}; + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp new file mode 100644 index 0000000000..b916020392 --- /dev/null +++ b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* A tool that converts a spark-rapids profile binary into other forms. */ + +#if 0 +#include +#define FLATBUFFERS_ASSERT(x) \ + do { \ + if (!(x)) { throw std::runtime_error("flatbuffers assert"); } \ + } while (0) +#define FLATBUFFERS_DEBUG_VERIFICATION_FAILURE +#endif + +#include "profiler_generated.h" +#include "spark_rapids_jni_version.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace spark_rapids_jni::profiler { +extern char const* Profiler_Schema; +} + +struct program_options { + std::optional output_path; + bool help = false; + bool json = false; + bool nvtxt = false; + int json_indent = 2; + bool version = false; +}; + +struct event { + enum struct type_id { API, DEVICE, KERNEL, MARKER, MARKER_DATA, MEMCPY, MEMSET, OVERHEAD }; + type_id id; + void const* fb_data; +}; + +struct thread_id { + uint32_t pid; + uint32_t tid; + + bool operator==(thread_id const& o) const { return pid == o.pid && tid == o.tid; } +}; + +template <> +struct std::hash { + std::size_t operator()(thread_id const& t) const + { + return std::hash{}(t.pid) ^ (std::hash{}(t.tid) << 1); + } +}; + +struct stream_id { + uint32_t device; + uint32_t context; + uint32_t stream; + + bool operator==(stream_id const& s) const + { + return device == s.device && context == s.context && stream == s.stream; + } +}; + +template <> +struct std::hash { + std::size_t operator()(stream_id const& s) const + { + return std::hash{}(s.device) ^ (std::hash{}(s.context) << 1) ^ + (std::hash{}(s.stream) << 2); + } +}; + +struct event_streams { + std::unordered_map> cpu; + std::unordered_map> gpu; +}; + +void print_usage() +{ + std::cout << "spark_rapids_profile_converter [OPTION]... profilebin" << std::endl; + std::cout << R"( +Converts the spark-rapids profile in profile.bin into other forms. + + -h, --help show this usage message + -j, --json convert to JSON, default output is stdout + -i, --json-indent=INDENT indentation to use for JSON. 0 is no indent, less than 0 also removes newlines + -o, --output=PATH use PATH as the output filename + -t. 
--nvtxt convert to NVTXT, default output is stdout + -V, --version print the version number + )" << std::endl; +} + +void print_version() +{ + std::cout << "spark_rapids_profile_converter " << spark_rapids_jni::Version << std::endl; +} + +std::pair> parse_options( + std::vector args) +{ + program_options opts{}; + std::string_view long_output("--output="); + std::string_view long_json_indent("--json-indent="); + bool seen_output = false; + bool seen_json_indent = false; + auto argp = args.begin(); + while (argp != args.end()) { + if (*argp == "-o" || *argp == "--output") { + if (seen_output) { throw std::runtime_error("output path cannot be specified twice"); } + seen_output = true; + if (++argp != args.end()) { + opts.output_path = std::make_optional(*argp++); + } else { + throw std::runtime_error("missing argument for output path"); + } + } else if (argp->substr(0, long_output.size()) == long_output) { + if (seen_output) { throw std::runtime_error("output path cannot be specified twice"); } + seen_output = true; + argp->remove_prefix(long_output.size()); + if (argp->empty()) { + throw std::runtime_error("missing argument for output path"); + } else { + opts.output_path = std::make_optional(*argp++); + } + } else if (*argp == "-h" || *argp == "--help") { + opts.help = true; + ++argp; + } else if (*argp == "-i" || *argp == "--json-indent") { + if (seen_json_indent) { throw std::runtime_error("JSON indent cannot be specified twice"); } + seen_json_indent = true; + if (++argp != args.end()) { + auto [ptr, err] = std::from_chars(argp->data(), argp->end(), opts.json_indent); + if (err != std::errc() || ptr != argp->end()) { + throw std::runtime_error("invalid JSON indent value"); + } + ++argp; + } else { + throw std::runtime_error("missing argument for JSON indent"); + } + } else if (argp->substr(0, long_json_indent.size()) == long_json_indent) { + if (seen_json_indent) { throw std::runtime_error("JSON indent cannot be specified twice"); } + seen_json_indent = true; + argp->remove_prefix(long_json_indent.size()); + if (argp->empty()) { + throw std::runtime_error("missing argument for JSON indent"); + } else { + auto [ptr, err] = std::from_chars(argp->data(), argp->end(), opts.json_indent); + if (err != std::errc() || ptr != argp->end()) { + throw std::runtime_error("invalid JSON indent value"); + } + ++argp; + } + } else if (*argp == "-j" || *argp == "--json") { + if (opts.nvtxt) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + opts.json = true; + ++argp; + } else if (*argp == "-t" || *argp == "--nvtxt") { + if (opts.json) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + opts.nvtxt = true; + ++argp; + } else if (*argp == "-V" || *argp == "--version") { + opts.version = true; + ++argp; + } else if (argp->empty()) { + throw std::runtime_error("empty argument"); + } else if (argp->at(0) == '-') { + throw std::runtime_error(std::string("unrecognized option: ") + std::string(*argp)); + } else { + break; + } + } + return std::make_pair(opts, std::vector(argp, args.end())); +} + +void checked_read(std::ifstream& in, char* buffer, size_t size) +{ + in.read(buffer, size); + if (in.fail()) { + if (in.eof()) { + throw std::runtime_error("Unexpected EOF"); + } else { + throw std::runtime_error(std::strerror(errno)); + } + } +} + +flatbuffers::uoffset_t read_flatbuffer_size(std::ifstream& in) +{ + flatbuffers::uoffset_t fb_size; + checked_read(in, reinterpret_cast(&fb_size), sizeof(fb_size)); + return flatbuffers::EndianScalar(fb_size); +} + 
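+// The profile stream is expected to be a sequence of size-prefixed flatbuffers, as produced
+// by profiler_serializer via FinishSizePrefixed: a little-endian uoffset_t length followed
+// by that many bytes of flatbuffer data. The first message is the ProfileHeader and every
+// subsequent message is an ActivityRecords table, so each call below reads one such message.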
+std::unique_ptr> read_flatbuffer(std::ifstream& in) +{ + auto size = read_flatbuffer_size(in); + // Allocate a buffer that can hold the flatbuffer along with the prefixed size. + // SizePrefixed APIs require size to be at the front of the buffer and alignment + // of fields is planned out with that size. + auto buffer = std::make_unique>(size + sizeof(flatbuffers::uoffset_t)); + auto size_ptr = reinterpret_cast(buffer->data()); + *size_ptr = size; + checked_read(in, buffer->data() + sizeof(flatbuffers::uoffset_t), size); + return buffer; +} + +std::ofstream open_output(std::filesystem::path const& path, + std::ios::openmode mode = std::ios::out) +{ + if (std::filesystem::exists(path)) { + throw std::runtime_error(path.string() + " already exists"); + } + std::ofstream out(path, mode); + out.exceptions(std::ios::badbit); + return out; +} + +template +T const* validate_fb(std::vector const& fb, std::string_view const& name) +{ + flatbuffers::Verifier::Options verifier_opts; + verifier_opts.assert = true; + flatbuffers::Verifier verifier( + reinterpret_cast(fb.data()), fb.size(), verifier_opts); + if (not verifier.VerifySizePrefixedBuffer(nullptr)) { + throw std::runtime_error(std::string("malformed ") + std::string(name) + " record"); + } + return flatbuffers::GetSizePrefixedRoot(fb.data()); +} + +void verify_profile_header(std::ifstream& in) +{ + auto fb_ptr = read_flatbuffer(in); + auto header = validate_fb(*fb_ptr, "profile header"); + auto magic = header->magic(); + if (magic == nullptr) { + throw std::runtime_error("does not appear to be a spark-rapids profile"); + } + if (magic->str() != "spark-rapids profile") { + std::ostringstream oss; + oss << "bad profile magic, expected 'spark-rapids profile' found '" << magic->str() << "'"; + throw std::runtime_error(oss.str()); + } + auto version = header->version(); + if (version != 1) { + std::ostringstream oss; + oss << "unsupported profile version: " << version; + throw std::runtime_error(oss.str()); + } +} + +void convert_to_nsys_rep(std::ifstream& in, + std::string_view const& in_filename, + program_options const& opts) +{ + event_streams events; + size_t num_dropped_records = 0; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = + validate_fb(*fb_ptr, "ActivityRecords"); + auto api = records->api(); + if (api != nullptr) { + for (int i = 0; i < api->size(); ++i) { + auto a = api->Get(i); + thread_id tid{a->process_id(), a->thread_id()}; + event e{event::type_id::API, a}; + auto it = events.cpu.find(tid); + if (it == events.cpu.end()) { + events.cpu.emplace(tid, std::initializer_list{e}); + } else { + it->second.push_back(e); + } + } + } + auto device = records->device(); + if (device != nullptr) { std::cerr << "NUM DEVICES=" << device->size() << std::endl; } + auto dropped = records->dropped(); + if (dropped != nullptr) { + for (int i = 0; i < dropped->size(); ++i) { + auto d = dropped->Get(i); + num_dropped_records += d->num_dropped(); + } + } + auto kernel = records->kernel(); + if (kernel != nullptr) { std::cerr << "NUM KERNEL=" << kernel->size() << std::endl; } + auto marker = records->marker(); + if (marker != nullptr) { std::cerr << "NUM MARKERS=" << marker->size() << std::endl; } + auto marker_data = records->marker_data(); + if (marker_data != nullptr) { + std::cerr << "NUM MARKER DATA=" << marker_data->size() << std::endl; + for (int i = 0; i < marker_data->size(); ++i) { + std::cerr << "MARKER DATA " << i << std::endl; + auto md = marker_data->Get(i); + std::cerr << " FLAGS: " << md->flags(); + 
std::cerr << " ID: " << md->id(); + std::cerr << " COLOR: " << md->color(); + std::cerr << " CATEGORY: " << md->category() << std::endl; + } + } + auto memcpy = records->memcpy(); + if (memcpy != nullptr) { std::cerr << "NUM MEMCPY=" << memcpy->size() << std::endl; } + auto memset = records->memset(); + if (device != nullptr) { std::cerr << "NUM MEMSET=" << memset->size() << std::endl; } + auto overhead = records->overhead(); + if (overhead != nullptr) { std::cerr << "NUM OVERHEADS=" << overhead->size() << std::endl; } + + in.peek(); + } + if (not in.eof()) { throw std::runtime_error(std::strerror(errno)); } + if (num_dropped_records) { + std::cerr << "Warning: " << num_dropped_records + << " records were noted as dropped in the profile" << std::endl; + } +} + +void convert_to_json(std::ifstream& in, std::ostream& out, program_options const& opts) +{ + flatbuffers::Parser parser; + if (parser.Parse(spark_rapids_jni::profiler::Profiler_Schema) != 0) { + std::runtime_error("Internal error: Unable to parse profiler schema"); + } + parser.opts.strict_json = true; + parser.opts.indent_step = opts.json_indent; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = + validate_fb(*fb_ptr, "ActivityRecords"); + std::string json; + char const* err = + flatbuffers::GenText(parser, fb_ptr->data() + sizeof(flatbuffers::uoffset_t), &json); + if (err != nullptr) { throw std::runtime_error(std::string("Error generating JSON: ") + err); } + out << json; + + in.peek(); + } + if (not in.eof()) { throw std::runtime_error(std::strerror(errno)); } +} + +char const* get_api_name(spark_rapids_jni::profiler::ApiActivity const* a) +{ + char const* name = nullptr; + switch (a->kind()) { + case spark_rapids_jni::profiler::ApiKind_Driver: + cuptiGetCallbackName(CUPTI_CB_DOMAIN_DRIVER_API, a->cbid(), &name); + break; + case spark_rapids_jni::profiler::ApiKind_Runtime: + cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, a->cbid(), &name); + break; + default: { + std::ostringstream oss; + oss << "unsupported API kind: " << a->kind(); + throw std::runtime_error(oss.str()); + } + } + return name; +} + +std::string demangle(char const* s) +{ + int status = 0; + char* demangled = abi::__cxa_demangle(s, nullptr, nullptr, &status); + if (status == 0) { + std::string result(demangled); + free(demangled); + return result; + } else { + return s; + } +} + +std::string memcpy_to_string(spark_rapids_jni::profiler::MemcpyActivity const* m) +{ + char const* kind_str; + char const* pinned = ""; + switch (m->copy_kind()) { + case spark_rapids_jni::profiler::MemcpyKind_HtoD: + kind_str = "HtoD"; + if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; } + break; + case spark_rapids_jni::profiler::MemcpyKind_DtoH: + kind_str = "DtoH"; + if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; } + break; + case spark_rapids_jni::profiler::MemcpyKind_HtoA: + kind_str = "HtoA"; + if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; } + break; + case spark_rapids_jni::profiler::MemcpyKind_AtoH: + kind_str = "AtoH"; + if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; } + break; + case spark_rapids_jni::profiler::MemcpyKind_AtoA: kind_str = "AtoA"; break; + case spark_rapids_jni::profiler::MemcpyKind_AtoD: kind_str = "AtoD"; break; + case spark_rapids_jni::profiler::MemcpyKind_DtoA: kind_str = "DtoA"; break; + case spark_rapids_jni::profiler::MemcpyKind_DtoD: kind_str = "DtoD"; 
break; + case spark_rapids_jni::profiler::MemcpyKind_HtoH: + kind_str = "HtoH"; + if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned && + m->dst_kind() == m->src_kind()) { + pinned = " Pinned"; + } + break; + case spark_rapids_jni::profiler::MemcpyKind_PtoP: kind_str = "PtoP"; break; + case spark_rapids_jni::profiler::MemcpyKind_Unknown: kind_str = "Unknown"; break; + default: kind_str = "Unknown"; break; + } + std::ostringstream oss; + oss << kind_str << pinned; + oss << " " << m->bytes() << " bytes"; + if (m->flags() == spark_rapids_jni::profiler::MemcpyFlags_Async) { oss << " async"; } + return oss.str(); +} + +const char* memcpy_to_color(spark_rapids_jni::profiler::MemcpyActivity const* m) +{ + switch (m->copy_kind()) { + case spark_rapids_jni::profiler::MemcpyKind_HtoD: + if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { return "MediumPurple"; } + return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_DtoH: + if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { return "MediumPurple"; } + return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_HtoA: + case spark_rapids_jni::profiler::MemcpyKind_AtoH: + case spark_rapids_jni::profiler::MemcpyKind_AtoA: + case spark_rapids_jni::profiler::MemcpyKind_AtoD: + case spark_rapids_jni::profiler::MemcpyKind_DtoA: return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_DtoD: return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_HtoH: return "Ivory"; + case spark_rapids_jni::profiler::MemcpyKind_PtoP: return "LightSalmon"; + case spark_rapids_jni::profiler::MemcpyKind_Unknown: + default: return "DarkRed"; + } +} + +std::string memset_to_string(spark_rapids_jni::profiler::MemsetActivity const* m) +{ + std::ostringstream oss; + oss << "Memset " << m->bytes() << " bytes to " << m->value(); + if (m->flags() == spark_rapids_jni::profiler::MemsetFlags_Async) { oss << " async"; } + return oss.str(); +} + +char const* overhead_kind_to_string(spark_rapids_jni::profiler::OverheadKind k) +{ + switch (k) { + case spark_rapids_jni::profiler::OverheadKind_Unknown: return "Unknown"; + case spark_rapids_jni::profiler::OverheadKind_DriverCompiler: return "Driver compiler"; + case spark_rapids_jni::profiler::OverheadKind_CUptiBufferFlush: return "Buffer flush"; + case spark_rapids_jni::profiler::OverheadKind_CUptiInstrumentation: return "Instrumentation"; + case spark_rapids_jni::profiler::OverheadKind_CUptiResource: return "Resource"; + default: return "Unknown"; + } +} + +// Convert a CUPTI thread ID to an NVTXT thread ID. +uint32_t to_nvtxt_tid(uint32_t tid) +{ + // NVTXT thread IDs are limited to 24-bit. + // Take the upper 24 bits which empirically are the most unique bits returned by CUPTI. 
+ return tid >> 8; +} + +void convert_to_nvtxt(std::ifstream& in, std::ostream& out, program_options const& opts) +{ + struct marker_start { + uint64_t timestamp; + uint32_t process_id; + uint32_t thread_id; + uint32_t color; + uint32_t category; + std::string name; + }; + std::unordered_set streams_seen; + std::unordered_map marker_data_map; + std::unordered_map marker_start_map; + size_t num_dropped_records = 0; + out << "@NameProcess,ProcessId,Name" << std::endl; + out << "NameProcess,0,\"GPU\"" << std::endl; + out << "@NameOsThread,ProcessId,ThreadId,Name" << std::endl; + out << "@RangePush,Time,ProcessId,ThreadId,CategoryId,Color,Message" << std::endl; + out << "@RangePop,Time,ProcessId,ThreadId" << std::endl; + out << "TimeBase=Relative" << std::endl; + out << "Payload=0" << std::endl; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = + validate_fb(*fb_ptr, "ActivityRecords"); + auto dropped = records->dropped(); + if (dropped != nullptr) { + for (int i = 0; i < dropped->size(); ++i) { + auto d = dropped->Get(i); + num_dropped_records += d->num_dropped(); + } + } + auto api = records->api(); + if (api != nullptr) { + for (int i = 0; i < api->size(); ++i) { + auto a = api->Get(i); + out << "RangePush," << a->start() << "," << a->process_id() << "," + << to_nvtxt_tid(a->thread_id()) << ",0,PaleGreen" + << "," + << "\"" << get_api_name(a) << "\"" << std::endl; + out << "RangePop," << a->end() << "," << a->process_id() << "," + << to_nvtxt_tid(a->thread_id()) << std::endl; + } + } + auto marker_data = records->marker_data(); + if (marker_data != nullptr) { + for (int i = 0; i < marker_data->size(); ++i) { + auto m = marker_data->Get(i); + auto [it, inserted] = marker_data_map.insert({m->id(), m}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker data for " << m->id(); + throw std::runtime_error(oss.str()); + } + } + } + auto marker = records->marker(); + if (marker != nullptr) { + for (int i = 0; i < marker->size(); ++i) { + auto m = marker->Get(i); + auto object_id = m->object_id(); + if (object_id != nullptr) { + uint32_t process_id = object_id->process_id(); + uint32_t thread_id = to_nvtxt_tid(object_id->thread_id()); + if (process_id == 0) { + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + thread_id = object_id->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { out << "NameOsThread,0,\"Stream " << thread_id << "\"" << std::endl; } + } + if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_Start) { + auto it = marker_data_map.find(m->id()); + uint32_t color = 0x444444; + uint32_t category = 0; + if (it != marker_data_map.end()) { + color = it->second->color(); + category = it->second->category(); + } + marker_start ms{ + m->timestamp(), process_id, thread_id, color, category, m->name()->str()}; + auto [ignored, inserted] = marker_start_map.insert({m->id(), ms}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker start for ID " << m->id(); + throw std::runtime_error(oss.str()); + } + } else if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_End) { + auto it = marker_start_map.find(m->id()); + if (it != marker_start_map.end()) { + auto const& ms = it->second; + out << "RangePush," << ms.timestamp << "," << ms.process_id << "," << ms.thread_id + << "," << ms.category << "," << ms.color << "," + << "\"" << ms.name << "\"" << std::endl; + out << "RangePop," << 
m->timestamp() << "," << ms.process_id << "," << ms.thread_id + << std::endl; + marker_start_map.erase(it); + } else { + std::cerr << "Ignoring marker end without start for ID " << m->id() << std::endl; + } + } else { + std::cerr << "Ignoring marker with unsupported flags: " << m->flags() << std::endl; + } + } else { + std::cerr << "Marker " << m->id() << " has no object ID" << std::endl; + } + } + } + marker_data_map.clear(); + auto kernel = records->kernel(); + if (kernel != nullptr) { + for (int i = 0; i < kernel->size(); ++i) { + auto k = kernel->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = k->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << k->start() << "," << process_id << "," << thread_id << ",0,Blue" + << "," + << "\"" << demangle(k->name()->c_str()) << "\"" << std::endl; + out << "RangePop," << k->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto memcpy = records->memcpy(); + if (memcpy != nullptr) { + for (int i = 0; i < memcpy->size(); ++i) { + auto m = memcpy->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = m->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << m->start() << "," << process_id << "," << thread_id << ",0," + << memcpy_to_color(m) << "," + << "\"" << memcpy_to_string(m) << "\"" << std::endl; + out << "RangePop," << m->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto memset = records->memset(); + if (memset != nullptr) { + for (int i = 0; i < memset->size(); ++i) { + auto m = memset->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = m->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << m->start() << "," << process_id << "," << thread_id << ",0,Olive" + << "," + << "\"" << memset_to_string(m) << "\"" << std::endl; + out << "RangePop," << m->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto overhead = records->overhead(); + if (overhead != nullptr) { + for (int i = 0; i < overhead->size(); ++i) { + auto o = overhead->Get(i); + auto object_id = o->object_id(); + if (object_id != nullptr) { + uint32_t process_id = object_id->process_id(); + uint32_t thread_id = to_nvtxt_tid(object_id->thread_id()); + if (process_id == 0) { + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + thread_id = object_id->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { out << "NameOsThread,0,\"Stream " << thread_id << "\"" << std::endl; } + } + out << "RangePush," << o->start() << "," << process_id << "," << thread_id + << ",0,OrangeRed" + << 
"," + << "\"" << overhead_kind_to_string(o->overhead_kind()) << "\"" << std::endl; + out << "RangePop," << o->end() << "," << process_id << "," << thread_id << std::endl; + } else { + std::cerr << "Overhead activity has no object ID" << std::endl; + } + } + } + + in.peek(); + } + if (num_dropped_records) { + std::cerr << "Warning: " << num_dropped_records + << " records were noted as dropped in the profile" << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + constexpr int RESULT_SUCCESS = 0; + constexpr int RESULT_FAILURE = 1; + constexpr int RESULT_USAGE = 2; + program_options opts; + std::vector files; + if (argc < 2) { + print_usage(); + return RESULT_USAGE; + } + std::vector args(argv + 1, argv + argc); + try { + auto [options, inputs] = parse_options(args); + opts = options; + files = inputs; + } catch (std::exception const& e) { + std::cerr << "spark_rapids_profile_converter: " << e.what() << std::endl; + print_usage(); + return RESULT_USAGE; + } + if (opts.help) { + print_usage(); + return RESULT_USAGE; + } + if (opts.version) { + print_version(); + return RESULT_SUCCESS; + } + if (files.size() != 1) { + std::cerr << "Missing input file." << std::endl; + print_usage(); + return RESULT_USAGE; + } + auto input_file = files.front(); + try { + std::ifstream in(std::string(input_file), std::ios::binary | std::ios::in); + in.exceptions(std::istream::badbit); + verify_profile_header(in); + if (opts.json) { + if (opts.output_path) { + std::ofstream out = open_output(opts.output_path.value()); + convert_to_json(in, out, opts); + } else { + convert_to_json(in, std::cout, opts); + } + } else if (opts.nvtxt) { + if (opts.output_path) { + std::ofstream out = open_output(opts.output_path.value()); + convert_to_nvtxt(in, out, opts); + } else { + convert_to_nvtxt(in, std::cout, opts); + } + } else { + convert_to_nsys_rep(in, input_file, opts); + } + } catch (std::system_error const& e) { + std::cerr << "Error converting " << input_file << ": " << e.code().message() << std::endl; + return RESULT_FAILURE; + } catch (std::exception const& e) { + std::cerr << "Error converting " << input_file << ": " << e.what() << std::endl; + return RESULT_FAILURE; + } + return RESULT_SUCCESS; +} diff --git a/src/main/cpp/src/spark_rapids_jni_version.cpp.in b/src/main/cpp/src/spark_rapids_jni_version.cpp.in new file mode 100644 index 0000000000..fdc2aa3007 --- /dev/null +++ b/src/main/cpp/src/spark_rapids_jni_version.cpp.in @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "spark_rapids_jni_version.h" + +namespace spark_rapids_jni { + +char const Version[] = "@CMAKE_PROJECT_VERSION@ @SPARK_RAPIDS_JNI_COMMIT_DETAILS@"; + +} diff --git a/src/main/cpp/src/spark_rapids_jni_version.h b/src/main/cpp/src/spark_rapids_jni_version.h new file mode 100644 index 0000000000..c77a8ec5a9 --- /dev/null +++ b/src/main/cpp/src/spark_rapids_jni_version.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace spark_rapids_jni { + +extern char const Version[]; + +} diff --git a/src/main/fbs/profiler.fbs b/src/main/fbs/profiler.fbs new file mode 100644 index 0000000000..0770be33cf --- /dev/null +++ b/src/main/fbs/profiler.fbs @@ -0,0 +1,287 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Flatbuffer schema for the profiler +// NOTE: The schema needs to be in a single file because the build embeds it +// into the converter tool to be able to emit profile records as JSON. + +// Profiling data is written as a series of size-prefixed flatbuffers. +// The first flatbuffer is always ProfileHeader followed by zero or more ActivityRecords. 
+ +namespace spark_rapids_jni.profiler; + +table ActivityObjectId { + process_id:uint32; // present if object kind is Process or Thread + thread_id:uint32; // present if object kind is Thread + device_id:uint32; // present if object kind is Device or Context or Stream + context_id:uint32; // present if object kind is Context or Stream + stream_id:uint32; // present if object kind is Stream +} + +enum ApiKind:byte { + Driver = 0, + Runtime = 1 +} + +enum ChannelType:uint8 { + Invalid = 0, + Compute = 1, + AsyncMemcpy = 2 +} + +table CommandBufferFullData { + command_buffer_length:uint32; + channel_id:uint32; + channel_type:uint32; +} + +enum LaunchType:uint8 { + Regular = 0, + CooperativeSingleDevice = 1, + CooperativeMultiDevice = 2 +} + +enum MarkerFlags:uint8 (bit_flags) { + Instantaneous = 0, + Start = 1, + End = 2, + SyncAcquire = 3, + SyncAcquireSuccess = 4, + SyncAcquireFailed = 5, + SyncRelease = 6 +} + +enum MemcpyFlags:uint8 (bit_flags) { + Async = 0 +} + +enum MemcpyKind:uint8 { + Unknown = 0, + HtoD = 1, + DtoH = 2, + HtoA = 3, + AtoH = 4, + AtoA = 5, + AtoD = 6, + DtoA = 7, + DtoD = 8, + HtoH = 9, + PtoP = 10 +} + +enum MemoryKind:uint8 { + Unknown = 0, + Pageable = 1, + Pinned = 2, + Device = 3, + Array = 4, + Managed = 5, + DeviceStatic = 6, + ManagedStatic = 7 +} + +enum MemsetFlags:uint8 (bit_flags) { + Async = 0 +} + +enum OverheadKind:uint8 { + Unknown = 0, + DriverCompiler = 1, + CUptiBufferFlush = 2, + CUptiInstrumentation = 3, + CUptiResource = 4 +} + +enum PartitionedGlobalCacheConfig:uint8 { + Unknown = 0, + NotSupported = 1, + Off = 2, + On = 3 +} + +enum ShmemLimitConfig:uint8 { + Default = 0, + Optin = 1 +} + +table ProfileHeader { + magic:string; + version:uint32; + writer_version:string; +} + +table ActivityRecords { + api:[ApiActivity]; + device:[DeviceActivity]; + dropped:[DroppedRecords]; + kernel:[KernelActivity]; + marker:[MarkerActivity]; + marker_data:[MarkerData]; + memcpy:[MemcpyActivity]; + memset:[MemsetActivity]; + overhead:[OverheadActivity]; +} + +table ApiActivity { + kind:ApiKind = Runtime; + cbid:uint32; + start:uint64; + end:uint64; + process_id:uint32; + thread_id:uint32; + correlation_id:uint32; + return_value:uint32 = 0; +} + +table DeviceActivity { + global_memory_bandwidth:uint64; + global_memory_size:uint64; + constant_memory_size:uint32; + l2_cache_size:uint32; + num_threads_per_warp:uint32; + core_clock_rate:uint32; + num_memcpy_engines:uint32; + num_multiprocessors:uint32; + max_ipc:uint32; + max_warps_per_multiprocessor:uint32; + max_blocks_per_multiprocessor:uint32; + max_shared_memory_per_multiprocessor:uint32; + max_registers_per_multiprocessor:uint32; + max_registers_per_block:uint32; + max_shared_memory_per_block:uint32; + max_threads_per_block:uint32; + max_block_dim_x:uint32; + max_block_dim_y:uint32; + max_block_dim_z:uint32; + max_grid_dim_x:uint32; + max_grid_dim_y:uint32; + max_grid_dim_z:uint32; + compute_capability_major:uint32; + compute_capability_minor:uint32; + id:uint32; + ecc_enabled:uint32; + name:string; +} + +table DroppedRecords { + num_dropped:uint64; +} + +table KernelActivity { + requested:uint8; + executed:uint8; + shared_memory_config:uint8; + registers_per_thread:uint16; + partitioned_global_cache_requested:PartitionedGlobalCacheConfig; + partitioned_global_cache_executed:PartitionedGlobalCacheConfig; + start:uint64; + end:uint64; + completed:uint64 = 0; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + grid_x:int32; + grid_y:int32; + grid_z:int32; + block_x:int32; + block_y:int32; + 
block_z:int32; + static_shared_memory:int32; + dynamic_shared_memory:int32; + local_memory_per_thread:uint32; + local_memory_total:uint32; + correlation_id:uint32; + grid_id:int64; + name:string; + queued:uint64 = 0; + submitted:uint64 = 0; + launch_type:LaunchType = Regular; + is_shared_memory_carveout_requested:uint8; + shared_memory_carveout_requested:uint8; + shared_memory_executed:uint32; + graph_node_id:uint64 = 0; + shmem_limit_config:ShmemLimitConfig = Default; + graph_id:uint32 = 0; + //access_policy_window:???; + channel_id:uint32; + channel_type:ChannelType; + cluster_x:uint32; + cluster_y:uint32; + cluster_z:uint32; + cluster_scheduling_policy:uint32; + local_memory_total_v2:uint64; +} + +table MarkerActivity { + flags:MarkerFlags = Start; + timestamp:uint64; + id:int32; + object_id:ActivityObjectId; + name:string; + domain:string; +} + +table MarkerData { + flags:MarkerFlags = Start; + id:int32; + //payload_kind:MetricValueKind; + //payload:MetricValue; + color:uint32; + category:uint32; +} + +table MemcpyActivity { + copy_kind:MemcpyKind; + src_kind:MemoryKind; + dst_kind:MemoryKind; + flags:MemcpyFlags; + bytes:uint64; + start:uint64; + end:uint64; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + correlation_id:uint32; + runtime_correlation_id:uint32; + graph_node_id:uint64 = 0; + graph_id:uint32 = 0; + channel_id:uint32; + channel_type:ChannelType; +} + +table MemsetActivity { + value:uint32; + bytes:uint64; + start:uint64; + end:uint64; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + correlation_id:uint32; + flags:MemsetFlags; + memory_kind:MemoryKind; + graph_node_id:uint64 = 0; + graph_id:uint32 = 0; + channel_id:uint32; + channel_type:ChannelType; +} + +table OverheadActivity { + overhead_kind:OverheadKind; + object_id:ActivityObjectId; + start:uint64; + end:uint64; +} + +root_type ActivityRecords; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java b/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java new file mode 100644 index 0000000000..86d5b0edde --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.NativeDepsLoader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** Profiler that collects CUDA and NVTX events for the current process. */ +public class Profiler { + private static final long DEFAULT_WRITE_BUFFER_SIZE = 1024 * 1024; + private static final int DEFAULT_FLUSH_PERIOD_MILLIS = 0; + private static DataWriter writer = null; + + /** + * Initialize the profiler in a standby state. The start method must be called after this + * to start collecting profiling data. 
+ * @param w data writer for writing profiling data + */ + public static void init(DataWriter w) { + init(w, DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_FLUSH_PERIOD_MILLIS); + } + + /** + * Initialize the profiler in a standby state. The start method must be called after this + * to start collecting profiling data. + * @param w data writer for writing profiling data + * @param writeBufferSize size of host memory buffer to use for collecting profiling data. + * Recommended to be between 1-8 MB in size to balance callback + * overhead with latency. + * @param flushPeriodMillis time period in milliseconds to explicitly flush collected + * profiling data to the writer. A value <= 0 will disable explicit + * flushing. + */ + public static void init(DataWriter w, long writeBufferSize, int flushPeriodMillis) { + if (writer == null) { + File libPath; + try { + libPath = NativeDepsLoader.loadNativeDep("profilerjni", true); + } catch (IOException e) { + throw new RuntimeException("Error loading profiler library", e); + } + nativeInit(libPath.getAbsolutePath(), w, writeBufferSize, flushPeriodMillis); + writer = w; + } else { + throw new IllegalStateException("Already initialized"); + } + } + + /** + * Shutdown the profiling session. Flushes collected profiling data to the writer and + * closes the writer. + */ + public static void shutdown() { + if (writer != null) { + nativeShutdown(); + try { + writer.close(); + } catch (Exception e) { + throw new RuntimeException("Error closing writer", e); + } finally { + writer = null; + } + } + } + + /** + * Start collecting profiling data. Safe to call if profiling data is already being collected. + */ + public static void start() { + if (writer != null) { + nativeStart(); + } else { + throw new IllegalStateException("Profiler not initialized"); + } + } + + /** + * Stop collecting profiling data. Safe to call if the profiler is initialized but not + * actively collecting data. + */ + public static void stop() { + if (writer != null) { + nativeStop(); + } else { + throw new IllegalStateException("Profiler not initialized"); + } + } + + private static native void nativeInit(String libPath, DataWriter writer, + long writeBufferSize, int flushPeriodMillis); + + private static native void nativeStart(); + + private static native void nativeStop(); + + private static native void nativeShutdown(); + + /** Interface for profiler data writers */ + public interface DataWriter extends AutoCloseable { + /** + * Called by the profiler to write a block of profiling data. Profiling data is written + * in a size-prefixed flatbuffer format. See profiler.fbs for the schema. 
+ * @param data profiling data to be written + */ + void write(ByteBuffer data); + } +} From 85e22b972ad7676a360454deb529fefed8986cb2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 29 May 2024 01:00:56 +0800 Subject: [PATCH 119/124] Update submodule cudf to 29429f7e4c871758c0de930026347e6e3b0a5a9a (#2081) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a405674a5..29429f7e4c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a405674a5ba1554a0ced5d1f39f89fb424a768d +Subproject commit 29429f7e4c871758c0de930026347e6e3b0a5a9a diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 662cbb8dc9..50d4c4c2f4 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -32,6 +32,11 @@ "fixed_in" : "", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue." }, + { + "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", + "fixed_in" : "", + "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue." + }, { "file" : "cccl/kernel_pointer_hiding.diff", "fixed_in" : "2.4", @@ -42,15 +47,30 @@ "fixed_in" : "", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]" }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", + "fixed_in" : "", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]" + }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "fixed_in" : "", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]" }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", + "fixed_in" : "", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]" + }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "fixed_in" : "", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", + "fixed_in" : "", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]" } ], "version" : "2.2.0" @@ -67,7 +87,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "261445be7993df57f624a3f4ee9fd15e7d26bb5e", + "git_tag" : "1d87e5f7a19993e12726b610a9491d58748201b7", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, From 40d4ac1060cc17562339a526de5ed5538185df8e Mon Sep 17 00:00:00 2001 From: bolic2346 Date: Wed, 29 May 2024 15:06:10 -0400 Subject: [PATCH 120/124] Typo in contributing (#2089) Signed-off-by: Aveline Scosus Co-authored-by: Aveline Scosus --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9f5c5be5c0..1ada0b474b 
100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -355,7 +355,7 @@ conda install -c conda-forge pre-commit pip install pre-commit ``` -Then, run pre-commit hooks before committing your code. This wil reformat the stagged files: +Then, run pre-commit hooks before committing your code. This will reformat the staged files: ``` pre-commit run ``` From a9daa19d224f0a3c3219fec4e5e6895b6e8fb1ba Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 30 May 2024 06:00:41 +0800 Subject: [PATCH 121/124] Update submodule cudf to 2b031e06a7fe18eec462db445eea1c596b93a9f1 (#2092) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 29429f7e4c..2b031e06a7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 29429f7e4c871758c0de930026347e6e3b0a5a9a +Subproject commit 2b031e06a7fe18eec462db445eea1c596b93a9f1 From eb407f066c865355dae0909dffdd24ef6652a68a Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Thu, 30 May 2024 16:09:43 -0700 Subject: [PATCH 122/124] Fix nvtx3 linking issue in benchmark (#2098) * Fix link issue Signed-off-by: Nghia Truong * Change copyright year Signed-off-by: Nghia Truong --------- Signed-off-by: Nghia Truong --- src/main/cpp/benchmarks/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/benchmarks/CMakeLists.txt b/src/main/cpp/benchmarks/CMakeLists.txt index ca0c43d2fe..732777ef10 100644 --- a/src/main/cpp/benchmarks/CMakeLists.txt +++ b/src/main/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ target_compile_options( ) target_link_libraries( - spark_rapids_jni_datagen PUBLIC cudf::cudf nvtx3-cpp + spark_rapids_jni_datagen PUBLIC cudf::cudf nvtx3::nvtx3-cpp ) target_include_directories( From a43e6dc1dda2b6d8a4a76d1544b8d56e0ab05269 Mon Sep 17 00:00:00 2001 From: jenkins Date: Wed, 5 Jun 2024 10:26:45 +0000 Subject: [PATCH 123/124] Change version to 24.06.0 Signed-off-by: jenkins --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c13cdd2030..566c06b934 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 24.06.0-SNAPSHOT + 24.06.0 jar RAPIDS Accelerator JNI for Apache Spark From baed9255a78d413d78046c8955582f618426dc28 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 6 Jun 2024 10:39:38 +0800 Subject: [PATCH 124/124] Update submodule cudf to 7c706cc4004d5feaae92544b3b29a00c64f7ed86 (#2117) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2b031e06a7..7c706cc400 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2b031e06a7fe18eec462db445eea1c596b93a9f1 +Subproject commit 7c706cc4004d5feaae92544b3b29a00c64f7ed86 diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index 20bbb44986..356898bd1d 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -41dc9623dbb8e5bdd2bccc22815efb9db6a49280 +3c754156fbe1b8ba0b6848d3e2cc1359d0d8918d diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 50d4c4c2f4..49c41c7c53 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -87,7 +87,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "1d87e5f7a19993e12726b610a9491d58748201b7", + "git_tag" : "5d4f4a8565c5cdff94f77832a775d73ec9e7513e", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.06" }, @@ -172,7 +172,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "dc1e17a03ed2dbc9329ccecc27922e414250f45a", + "git_tag" : "f47ce3f0d46848cd9d5844d499bf150dd14d823a", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.06" },