Skip to content

Commit c6f20ff

Browse files
PHILO-HEglutenperfbot
authored andcommitted
[11691] feat: Add get_json_object Spark function
1 parent 44c324f commit c6f20ff

File tree

5 files changed

+372
-1
lines changed

5 files changed

+372
-1
lines changed

velox/docs/functions/spark/json.rst

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,43 @@
22
JSON Functions
33
==============
44

5+
JSON Format
6+
-----------
7+
8+
JSON is a language-independent data format that represents data as
9+
human-readable text. A JSON text can represent a number, a boolean, a
10+
string, an array, an object, or a null, with slightly different grammar.
11+
For instance, a JSON text representing a string must escape all special characters
12+
and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON
13+
text representing a number does not need to, such as ``123``. A JSON text
14+
representing an array must enclose the array elements in square brackets,
15+
such as ``[1,2,3]``. More detailed grammar can be found in
16+
`this JSON introduction`_.
17+
18+
.. _this JSON introduction: https://www.json.org
19+
20+
JSON Functions
21+
--------------
22+
23+
.. spark:function:: get_json_object(jsonString, path) -> varchar
24+
25+
Returns a json object, represented by VARCHAR, from ``jsonString`` by searching ``path``.
26+
Valid ``path`` should start with '$' and then contain "[index]", "['field']" or ".field"
27+
to define a JSON path. Here are some examples: "$.a", "$.a.b", "$[0]['a'].b". Returns
28+
``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed.
29+
Also returns NULL if ``path`` doesn't exist. ::
30+
31+
SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b'
32+
SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}'
33+
SELECT get_json_object('{"a":3}', '$.b'); -- NULL (nonexistent field)
34+
SELECT get_json_object('{"a"-3}', '$.a'); -- NULL (malformed JSON string)
35+
SELECT get_json_object('{"a":3}', '.a'); -- NULL (malformed JSON path)
36+
537
.. spark:function:: json_object_keys(jsonString) -> array(string)
638
7-
Returns all the keys of the outermost JSON object as an array if a valid JSON object is given. If it is any other valid JSON string, an invalid JSON string or an empty string, the function returns null. ::
39+
Returns all the keys of the outermost JSON object as an array if a valid JSON object is given.
40+
If it is any other valid JSON string, an invalid JSON string or an empty string, the function
41+
returns null. ::
842

943
SELECT json_object_keys('{}'); -- []
1044
SELECT json_object_keys('{"name": "Alice", "age": 5, "id": "001"}'); -- ['name', 'age', 'id']
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "velox/functions/Macros.h"
18+
#include "velox/functions/prestosql/json/SIMDJsonUtil.h"
19+
20+
namespace facebook::velox::functions::sparksql {
21+
22+
template <typename T>
23+
struct GetJsonObjectFunction {
24+
VELOX_DEFINE_FUNCTION_TYPES(T);
25+
26+
// ASCII input always produces ASCII result.
27+
static constexpr bool is_default_ascii_behavior = true;
28+
29+
FOLLY_ALWAYS_INLINE void initialize(
30+
const std::vector<TypePtr>& /*inputTypes*/,
31+
const core::QueryConfig& config,
32+
const arg_type<Varchar>* /*json*/,
33+
const arg_type<Varchar>* jsonPath) {
34+
if (jsonPath != nullptr) {
35+
if (checkJsonPath(*jsonPath)) {
36+
jsonPath_ = removeSingleQuotes(*jsonPath);
37+
}
38+
}
39+
}
40+
41+
FOLLY_ALWAYS_INLINE bool call(
42+
out_type<Varchar>& result,
43+
const arg_type<Varchar>& json,
44+
const arg_type<Varchar>& jsonPath) {
45+
// Spark requires the first char in jsonPath is '$'.
46+
if (!checkJsonPath(jsonPath)) {
47+
return false;
48+
}
49+
// jsonPath is "$".
50+
if (jsonPath.size() == 1) {
51+
result.append(json);
52+
return true;
53+
}
54+
simdjson::ondemand::document jsonDoc;
55+
simdjson::padded_string paddedJson(json.data(), json.size());
56+
if (simdjsonParse(paddedJson).get(jsonDoc)) {
57+
return false;
58+
}
59+
try {
60+
auto rawResult = jsonPath_.has_value()
61+
? jsonDoc.at_path(jsonPath_.value().data())
62+
: jsonDoc.at_path(removeSingleQuotes(jsonPath));
63+
if (rawResult.error()) {
64+
return false;
65+
}
66+
67+
if (!extractStringResult(rawResult, result)) {
68+
return false;
69+
}
70+
} catch (simdjson::simdjson_error& e) {
71+
return false;
72+
}
73+
74+
const char* currentPos;
75+
jsonDoc.current_location().get(currentPos);
76+
return isValidEndingCharacter(currentPos);
77+
}
78+
79+
private:
80+
FOLLY_ALWAYS_INLINE bool checkJsonPath(StringView jsonPath) {
81+
// Spark requires the first char in jsonPath is '$'.
82+
if (jsonPath.size() < 1 || jsonPath.data()[0] != '$') {
83+
return false;
84+
}
85+
return true;
86+
}
87+
88+
// Spark's json path requires field name surrounded by single quotes if it is
89+
// specified in "[]". But simdjson lib requires not. This method just removes
90+
// such single quotes, e.g., converts "['a']['b']" to "[a][b]".
91+
std::string removeSingleQuotes(StringView jsonPath) {
92+
// Skip the initial "$".
93+
std::string result(jsonPath.data() + 1, jsonPath.size() - 1);
94+
size_t pairEnd = 0;
95+
while (true) {
96+
auto pairBegin = result.find("['", pairEnd);
97+
if (pairBegin == std::string::npos) {
98+
break;
99+
}
100+
pairEnd = result.find("]", pairBegin);
101+
if (pairEnd == std::string::npos || result[pairEnd - 1] != '\'') {
102+
return "-1";
103+
}
104+
result.erase(pairEnd - 1, 1);
105+
result.erase(pairBegin + 1, 1);
106+
pairEnd -= 2;
107+
}
108+
return result;
109+
}
110+
111+
// Returns true if no error.
112+
bool extractStringResult(
113+
simdjson::simdjson_result<simdjson::ondemand::value> rawResult,
114+
out_type<Varchar>& result) {
115+
std::stringstream ss;
116+
switch (rawResult.type()) {
117+
// For number and bool types, we need to explicitly get the value
118+
// for specific types instead of using `ss << rawResult`. Thus, we
119+
// can make simdjson's internal parsing position moved and then we
120+
// can check the validity of ending character.
121+
case simdjson::ondemand::json_type::number: {
122+
switch (rawResult.get_number_type()) {
123+
case simdjson::ondemand::number_type::unsigned_integer: {
124+
uint64_t numberResult;
125+
if (!rawResult.get_uint64().get(numberResult)) {
126+
ss << numberResult;
127+
result.append(ss.str());
128+
return true;
129+
}
130+
return false;
131+
}
132+
case simdjson::ondemand::number_type::signed_integer: {
133+
int64_t numberResult;
134+
if (!rawResult.get_int64().get(numberResult)) {
135+
ss << numberResult;
136+
result.append(ss.str());
137+
return true;
138+
}
139+
return false;
140+
}
141+
case simdjson::ondemand::number_type::floating_point_number: {
142+
double numberResult;
143+
if (!rawResult.get_double().get(numberResult)) {
144+
ss << rawResult;
145+
result.append(ss.str());
146+
return true;
147+
}
148+
return false;
149+
}
150+
default:
151+
VELOX_UNREACHABLE();
152+
}
153+
}
154+
case simdjson::ondemand::json_type::boolean: {
155+
bool boolResult;
156+
if (!rawResult.get_bool().get(boolResult)) {
157+
result.append(boolResult ? "true" : "false");
158+
return true;
159+
}
160+
return false;
161+
}
162+
case simdjson::ondemand::json_type::string: {
163+
std::string_view stringResult;
164+
if (!rawResult.get_string().get(stringResult)) {
165+
result.append(stringResult);
166+
return true;
167+
}
168+
return false;
169+
}
170+
case simdjson::ondemand::json_type::object: {
171+
// For nested case, e.g., for "{"my": {"hello": 10}}", "$.my" will
172+
// return an object type.
173+
ss << rawResult;
174+
result.append(ss.str());
175+
return true;
176+
}
177+
case simdjson::ondemand::json_type::array: {
178+
ss << rawResult;
179+
result.append(ss.str());
180+
return true;
181+
}
182+
default: {
183+
return false;
184+
}
185+
}
186+
}
187+
188+
// This is a simple validation by checking whether the obtained result is
189+
// followed by valid char. Because ondemand parsing we are using ignores json
190+
// format validation for characters following the current parsing position.
191+
// As json doc is padded with NULL characters, it's safe to do recursively
192+
// check.
193+
bool isValidEndingCharacter(const char* currentPos) {
194+
char endingChar = *currentPos;
195+
if (endingChar == ',' || endingChar == '}' || endingChar == ']') {
196+
return true;
197+
}
198+
// These chars can be prior to a valid ending char.
199+
if (endingChar == ' ' || endingChar == '\r' || endingChar == '\n' ||
200+
endingChar == '\t') {
201+
return isValidEndingCharacter(++currentPos);
202+
}
203+
return false;
204+
}
205+
206+
std::optional<std::string> jsonPath_;
207+
};
208+
209+
} // namespace facebook::velox::functions::sparksql

velox/functions/sparksql/registration/RegisterJson.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@
1414
* limitations under the License.
1515
*/
1616
#include "velox/functions/lib/RegistrationHelpers.h"
17+
#include "velox/functions/sparksql/JsonFunctions.h"
1718
#include "velox/functions/sparksql/JsonObjectKeys.h"
1819

1920
namespace facebook::velox::functions::sparksql {
2021

2122
void registerJsonFunctions(const std::string& prefix) {
23+
registerFunction<GetJsonObjectFunction, Varchar, Varchar, Varchar>(
24+
{prefix + "get_json_object"});
2225
registerFunction<JsonObjectKeysFunction, Array<Varchar>, Varchar>(
2326
{prefix + "json_object_keys"});
2427
}

velox/functions/sparksql/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ add_executable(
3434
ElementAtTest.cpp
3535
HashTest.cpp
3636
InTest.cpp
37+
JsonFunctionsTest.cpp
3738
JsonObjectKeysTest.cpp
3839
LeastGreatestTest.cpp
3940
MakeDecimalTest.cpp

0 commit comments

Comments
 (0)