Skip to content

Commit 1cea1e9

Browse files
feat: avro schema add sanitize field name (#190)
1 parent 341f8d5 commit 1cea1e9

File tree

3 files changed

+227
-4
lines changed

3 files changed

+227
-4
lines changed

src/iceberg/avro/avro_schema_util.cc

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <charconv>
2121
#include <format>
2222
#include <mutex>
23+
#include <sstream>
2324
#include <string_view>
2425

2526
#include <arrow/type.h>
@@ -56,8 +57,60 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
5657
return attributes;
5758
}
5859

60+
void SanitizeChar(char c, std::ostringstream& os) {
61+
if (std::isdigit(c)) {
62+
os << '_' << c;
63+
} else {
64+
os << "_x" << std::uppercase << std::hex << static_cast<int>(c);
65+
}
66+
}
67+
5968
} // namespace
6069

70+
bool ValidAvroName(std::string_view name) {
71+
if (name.empty()) {
72+
return false;
73+
}
74+
75+
char first = name[0];
76+
if (!(std::isalpha(first) || first == '_')) {
77+
return false;
78+
}
79+
80+
for (size_t i = 1; i < name.length(); i++) {
81+
char character = name[i];
82+
if (!(std::isalnum(character) || character == '_')) {
83+
return false;
84+
}
85+
}
86+
return true;
87+
}
88+
89+
std::string SanitizeFieldName(std::string_view field_name) {
90+
if (field_name.empty()) {
91+
return "";
92+
}
93+
94+
std::ostringstream result;
95+
96+
if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
97+
SanitizeChar(field_name[0], result);
98+
} else {
99+
result << field_name[0];
100+
}
101+
102+
for (size_t i = 1; i < field_name.size(); ++i) {
103+
char c = field_name[i];
104+
if (std::isalnum(c) || c == '_') {
105+
result << c;
106+
} else {
107+
SanitizeChar(c, result);
108+
}
109+
}
110+
111+
return result.str();
112+
}
113+
61114
std::string ToString(const ::avro::NodePtr& node) {
62115
std::stringstream ss;
63116
ss << *node;
@@ -181,10 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
181234
::avro::NodePtr field_node;
182235
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
183236

184-
// TODO(gangwu): sanitize field name
185-
(*node)->addName(std::string(sub_field.name()));
237+
bool is_valid_field_name = ValidAvroName(sub_field.name());
238+
std::string field_name = is_valid_field_name ? std::string(sub_field.name())
239+
: SanitizeFieldName(sub_field.name());
240+
241+
(*node)->addName(field_name);
186242
(*node)->addLeaf(field_node);
187-
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
243+
244+
::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id());
245+
if (!is_valid_field_name) {
246+
attributes.addAttribute(std::string(kIcebergFieldNameProp),
247+
std::string(sub_field.name()),
248+
/*addQuotes=*/true);
249+
}
250+
(*node)->addCustomAttributesForField(attributes);
188251
}
189252
return {};
190253
}

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,11 +149,44 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type);
149149
/// \return True if the node has a map logical type, false otherwise.
150150
bool HasMapLogicalType(const ::avro::NodePtr& node);
151151

152+
/// \brief Check if a string is a valid Avro name.
153+
///
154+
/// Valid Avro names must:
155+
/// 1. Start with a letter or underscore
156+
/// 2. Contain only letters, digits, or underscores
157+
///
158+
/// \param name The name to check.
159+
/// \return True if the name is valid, false otherwise.
160+
bool ValidAvroName(std::string_view name);
161+
152162
/// \brief Create a new Avro node with field IDs from name mapping.
153163
/// \param original_node The original Avro node to copy.
154164
/// \param mapping The name mapping to apply field IDs from.
155165
/// \return A new Avro node with field IDs applied, or an error.
156166
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
157167
const NameMapping& mapping);
158168

169+
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
170+
///
171+
/// Converts names that are not valid Avro names to valid Avro names.
172+
/// Conversion rules:
173+
/// 1. If the first character is not a letter or underscore, it is specially handled:
174+
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
175+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
176+
/// representation of the character (e.g., '$' -> '_x24')
177+
/// 2. For characters other than the first:
178+
/// - If it's a letter, digit, or underscore, it remains unchanged
179+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
180+
/// representation
181+
///
182+
/// Examples:
183+
/// - "123field" -> "_123field"
184+
/// - "user-name" -> "user_x2Dname"
185+
/// - "$price" -> "_x24price"
186+
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
187+
///
188+
/// \param field_name The original field name to sanitize.
189+
/// \return A sanitized field name that follows Avro naming conventions.
190+
std::string SanitizeFieldName(std::string_view field_name);
191+
159192
} // namespace iceberg::avro

test/avro_schema_test.cc

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
4747
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
4848
}
4949

50+
// Helper function to check if a custom attribute exists for a field name preservation
51+
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
52+
const std::string& original_name) {
53+
ASSERT_LT(index, node->customAttributes());
54+
const auto& attrs = node->customAttributesAt(index);
55+
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
56+
}
57+
5058
} // namespace
5159

60+
TEST(ValidAvroNameTest, ValidNames) {
61+
// Valid field names should return true
62+
EXPECT_TRUE(ValidAvroName("valid_field"));
63+
EXPECT_TRUE(ValidAvroName("field123"));
64+
EXPECT_TRUE(ValidAvroName("_private"));
65+
EXPECT_TRUE(ValidAvroName("CamelCase"));
66+
EXPECT_TRUE(ValidAvroName("field_with_underscores"));
67+
}
68+
69+
TEST(ValidAvroNameTest, InvalidNames) {
70+
// Names starting with numbers should return false
71+
EXPECT_FALSE(ValidAvroName("123field"));
72+
EXPECT_FALSE(ValidAvroName("0value"));
73+
74+
// Names with special characters should return false
75+
EXPECT_FALSE(ValidAvroName("field-name"));
76+
EXPECT_FALSE(ValidAvroName("field.name"));
77+
EXPECT_FALSE(ValidAvroName("field name"));
78+
EXPECT_FALSE(ValidAvroName("field@name"));
79+
EXPECT_FALSE(ValidAvroName("field#name"));
80+
}
81+
82+
TEST(ValidAvroNameTest, EmptyName) {
83+
// Empty name should return false
84+
EXPECT_FALSE(ValidAvroName(""));
85+
}
86+
87+
TEST(SanitizeFieldNameTest, ValidFieldNames) {
88+
// Valid field names should remain unchanged
89+
EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
90+
EXPECT_EQ(SanitizeFieldName("field123"), "field123");
91+
EXPECT_EQ(SanitizeFieldName("_private"), "_private");
92+
EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
93+
EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores");
94+
}
95+
96+
TEST(SanitizeFieldNameTest, InvalidFieldNames) {
97+
// Field names starting with numbers should be prefixed with underscore
98+
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
99+
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");
100+
101+
// Field names with special characters should be encoded with hex values
102+
EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
103+
EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
104+
EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
105+
EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
106+
EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");
107+
108+
// Complex field names with multiple issues
109+
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
110+
"_1field_x2Dwith_x2Especial_x40chars");
111+
EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
112+
}
113+
114+
TEST(SanitizeFieldNameTest, EdgeCases) {
115+
// Empty field name
116+
EXPECT_EQ(SanitizeFieldName(""), "");
117+
118+
// Field name with only special characters
119+
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");
120+
121+
// Field name starting with special character
122+
EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
123+
EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
124+
}
125+
52126
TEST(ToAvroNodeVisitorTest, BooleanType) {
53127
::avro::NodePtr node;
54128
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
@@ -181,6 +255,60 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181255
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
182256
}
183257

258+
TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) {
259+
StructType struct_type{
260+
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
261+
/*optional=*/false},
262+
SchemaField{/*field_id=*/2, "valid_field", iceberg::string(),
263+
/*optional=*/false},
264+
SchemaField{/*field_id=*/3, "email.address", iceberg::string(),
265+
/*optional=*/true},
266+
SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(),
267+
/*optional=*/true},
268+
SchemaField{/*field_id=*/5, "123field", iceberg::int32(),
269+
/*optional=*/false},
270+
SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(),
271+
/*optional=*/true}}};
272+
::avro::NodePtr node;
273+
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
274+
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
275+
276+
ASSERT_EQ(node->names(), 6);
277+
278+
EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
279+
EXPECT_EQ(node->nameAt(2),
280+
"email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
281+
EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field"
282+
EXPECT_EQ(
283+
node->nameAt(5),
284+
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"
285+
286+
EXPECT_EQ(node->nameAt(1), "valid_field");
287+
EXPECT_EQ(node->nameAt(3), "AnotherField");
288+
289+
ASSERT_EQ(node->customAttributes(), 6);
290+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
291+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
292+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
293+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
294+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5));
295+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6));
296+
297+
const auto& attrs1 = node->customAttributesAt(1); // valid_field
298+
const auto& attrs3 = node->customAttributesAt(3); // AnotherField
299+
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
300+
EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value());
301+
302+
ASSERT_NO_FATAL_FAILURE(
303+
CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name"));
304+
ASSERT_NO_FATAL_FAILURE(
305+
CheckIcebergFieldName(node, /*index=*/2, /*original_name=*/"email.address"));
306+
ASSERT_NO_FATAL_FAILURE(
307+
CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field"));
308+
ASSERT_NO_FATAL_FAILURE(
309+
CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with spaces"));
310+
}
311+
184312
TEST(ToAvroNodeVisitorTest, ListType) {
185313
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
186314
/*optional=*/true}};
@@ -1436,5 +1564,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
14361564
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
14371565
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
14381566
}
1439-
14401567
} // namespace iceberg::avro

0 commit comments

Comments
 (0)