Skip to content

Commit 0ba6cb8

Browse files
committed
Add core algorithms for columnar serialization
This adds the core of the columnar serialization code paths. Even though we internally scan in a columnar fashion in the tablet server, sending those columns across the wire isn't straightforward. We have two bits of necessary processing: 1) the selection vector needs to be taken into account so we only send back selected rows. This means we need to copy out the selected cells and also copy out the selected bits from the null bitmap where relevant. Doing the null bitmap portion efficiently with wide platform support makes up a lot of this patch. 2) for the case of null values, we want to make sure we don't send uninitialized memory (which might include secrets!) to the client. So we need to zero out any cells where the corresponding non-null bitmap bit is unset. To keep the review manageable, this just adds some unit tests and all the new code is initially "dead". Later commits will add the parts that construct the full block of columns to be sent on the wire, hook this into the tserver, etc. Change-Id: I16f2993081aac54609aab4d8219ef0bf6c7708c2 Reviewed-on: http://gerrit.cloudera.org:8080/15556 Tested-by: Kudu Jenkins Reviewed-by: Andrew Wong <[email protected]> Reviewed-by: Alexey Serbin <[email protected]>
1 parent 0a46332 commit 0ba6cb8

File tree

7 files changed

+842
-1
lines changed

7 files changed

+842
-1
lines changed

LICENSE.txt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,31 @@ src/kudu/util/array_view.h: 3-clause BSD license with patent grant
350350
for this implementation of the WebRTC code package shall terminate as
351351
of the date such litigation is filed.
352352

353+
--------------------------------------------------------------------------------
354+
355+
src/kudu/common/zp7.cc: MIT license
356+
357+
ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
358+
359+
Copyright (c) 2020 Zach Wegner
360+
361+
Permission is hereby granted, free of charge, to any person obtaining a copy
362+
of this software and associated documentation files (the "Software"), to deal
363+
in the Software without restriction, including without limitation the rights
364+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
365+
copies of the Software, and to permit persons to whom the Software is
366+
furnished to do so, subject to the following conditions:
367+
368+
The above copyright notice and this permission notice shall be included in
369+
all copies or substantial portions of the Software.
370+
371+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
372+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
373+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
374+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
375+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
376+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
377+
SOFTWARE.
353378

354379
--------------------------------------------------------------------------------
355380

src/kudu/common/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ ADD_EXPORTABLE_LIBRARY(wire_protocol_proto
4242
set(COMMON_SRCS
4343
columnblock.cc
4444
column_predicate.cc
45+
columnar_serialization.cc
4546
encoded_key.cc
4647
generic_iterators.cc
4748
id_mapping.cc
@@ -60,7 +61,8 @@ set(COMMON_SRCS
6061
table_util.cc
6162
timestamp.cc
6263
types.cc
63-
wire_protocol.cc)
64+
wire_protocol.cc
65+
zp7.cc)
6466

6567
# Workaround for clang bug https://llvm.org/bugs/show_bug.cgi?id=23757
6668
# in which it incorrectly optimizes key_util.cc and causes incorrect results.
@@ -80,6 +82,7 @@ ADD_EXPORTABLE_LIBRARY(kudu_common
8082
DEPS ${COMMON_LIBS})
8183

8284
SET_KUDU_TEST_LINK_LIBS(kudu_common)
85+
ADD_KUDU_TEST(columnar_serialization-test)
8386
ADD_KUDU_TEST(columnblock-test)
8487
ADD_KUDU_TEST(column_predicate-test NUM_SHARDS 4)
8588
ADD_KUDU_TEST(encoded_key-test)
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "kudu/common/columnar_serialization.h"
19+
20+
#include <cstddef>
21+
#include <cstdint>
22+
#include <ostream>
23+
#include <string>
24+
#include <utility>
25+
#include <vector>
26+
27+
#include <glog/logging.h>
28+
#include <gtest/gtest.h>
29+
30+
#include "kudu/util/bitmap.h"
31+
#include "kudu/util/faststring.h"
32+
#include "kudu/util/random.h"
33+
#include "kudu/util/scoped_cleanup.h"
34+
#include "kudu/util/test_util.h"
35+
36+
using std::vector;
37+
38+
namespace kudu {
39+
40+
class ColumnarSerializationTest : public KuduTest {
41+
protected:
42+
ColumnarSerializationTest() : rng_(SeedRandom()) {
43+
}
44+
45+
// TODO(todd): templatize this test for other types once we have specialized
46+
// implementations.
47+
using DataType = uint32_t;
48+
static constexpr int kTypeSize = sizeof(DataType);
49+
50+
struct RandomCellsAndNulls {
51+
vector<DataType> vals;
52+
faststring non_nulls;
53+
54+
void VerifyNullsAreZeroed() {
55+
for (int i = 0; i < vals.size(); i++) {
56+
SCOPED_TRACE(i);
57+
if (BitmapTest(non_nulls.data(), i)) {
58+
EXPECT_EQ(0xdeadbeef, vals[i]);
59+
} else {
60+
EXPECT_EQ(0, vals[i]);
61+
}
62+
}
63+
}
64+
};
65+
66+
// Generate a random bitmap with the given number of bits.
67+
faststring RandomBitmap(int n_bits) {
68+
faststring bm;
69+
bm.resize(BitmapSize(n_bits));
70+
71+
for (int i = 0; i < n_bits; i++) {
72+
BitmapChange(bm.data(), i, rng_.OneIn(3));
73+
}
74+
return bm;
75+
}
76+
77+
// Create an array of 0xdeadbeef values and a corresponding
78+
// null bitmap with random entries set to null.
79+
RandomCellsAndNulls CreateDeadBeefsWithRandomNulls() {
80+
auto num_rows = rng_.Uniform(1000) + 1;
81+
vector<uint32_t> vals(num_rows, 0xdeadbeef);
82+
faststring non_nulls = RandomBitmap(num_rows);
83+
return { std::move(vals), std::move(non_nulls) };
84+
}
85+
86+
Random rng_;
87+
};
88+
89+
90+
// Simple test of ZeroNullValues for a whole array.
91+
TEST_F(ColumnarSerializationTest, TestZeroNullValues) {
  // Start from an array in which every cell is 0xdeadbeef, paired with a
  // random non-null bitmap.
  auto cells = CreateDeadBeefsWithRandomNulls();
  auto* raw_cells = reinterpret_cast<uint8_t*>(cells.vals.data());
  auto* non_null_bits = cells.non_nulls.data();

  // Process the whole array in a single call, starting at index 0.
  internal::ZeroNullValues(kTypeSize, /* dst_idx= */0, cells.vals.size(),
                           raw_cells, non_null_bits);

  ASSERT_NO_FATAL_FAILURE(cells.VerifyNullsAreZeroed());
}
102+
103+
// More complex test of ZeroNullValues which runs on sub-ranges
104+
// of an array.
105+
TEST_F(ColumnarSerializationTest, TestZeroNullValuesWithOffset) {
  auto data = CreateDeadBeefsWithRandomNulls();
  const size_t num_vals = data.vals.size();

  // Walk the array in randomly sized, back-to-back chunks so that
  // ZeroNullValues gets invoked with a variety of starting offsets.
  int dst_idx = 0;
  // Explicit cast: avoids an implicit signed/unsigned comparison.
  while (static_cast<size_t>(dst_idx) < num_vals) {
    auto rem = num_vals - dst_idx;
    // Chunk length is at least 1 so the loop always makes progress.
    // NOTE(review): presumably Uniform(rem) returns a value in [0, rem), so
    // that n never exceeds the remaining rows -- confirm against Random.
    auto n = rng_.Uniform(rem) + 1;
    internal::ZeroNullValues(
        kTypeSize, dst_idx, n,
        reinterpret_cast<uint8_t*>(data.vals.data()),
        data.non_nulls.data());
    dst_idx += n;
  }
  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
}
119+
120+
// Verifies that CopyNonNullBitmap, given a non-null bitmap and a selection
// bitmap, emits the non-null bits of exactly the selected rows, in order.
TEST_F(ColumnarSerializationTest, TestCopyNonNullBitmap) {
  // Restore the globally configured method once the test is done, since the
  // loop below overrides it.
  auto save_method = internal::g_pext_method;
  SCOPED_CLEANUP({ internal::g_pext_method = save_method; });
  // Test using all available methods. Depending on the machine where
  // the test is running we might miss some, but we typically run this
  // test on relatively recent hardware that would support BMI2 (Haswell
  // or later).
  auto available_methods = internal::GetAvailablePextMethods();
  for (auto m : available_methods) {
    SCOPED_TRACE(static_cast<int>(m));
    internal::g_pext_method = m;
    auto n_rows = 1 + rng_.Uniform(200);
    faststring non_null_bitmap = RandomBitmap(n_rows);
    faststring sel_bitmap = RandomBitmap(n_rows);
    // NOTE(review): dst_bitmap is resized but not explicitly cleared here;
    // confirm that CopyNonNullBitmap writes every bit that is checked below.
    faststring dst_bitmap;
    dst_bitmap.resize(BitmapSize(n_rows));

    internal::CopyNonNullBitmap(
        non_null_bitmap.data(), sel_bitmap.data(),
        /*dst_idx=*/0, n_rows,
        dst_bitmap.data());

    // Build the expected output: for each selected row, in order, the value
    // of that row's non-null bit.
    vector<bool> expected;
    ForEachSetBit(sel_bitmap.data(), n_rows,
                  [&](size_t bit) {
                    expected.push_back(BitmapTest(non_null_bitmap.data(), bit));
                  });
    LOG(INFO) << "non-null: " << BitmapToString(non_null_bitmap.data(), n_rows);
    LOG(INFO) << "selection: " << BitmapToString(sel_bitmap.data(), n_rows);
    LOG(INFO) << "result: " << BitmapToString(dst_bitmap.data(), expected.size());
    // Only the first expected.size() bits of the destination are meaningful.
    // Index with size_t to match expected.size(), and trace the bit position
    // so a mismatch identifies which output bit is wrong.
    for (size_t i = 0; i < expected.size(); i++) {
      SCOPED_TRACE(i);
      EXPECT_EQ(expected[i], BitmapTest(dst_bitmap.data(), i));
    }
  }
}
155+
156+
// Verifies that CopySelectedRows copies exactly the cells named by the
// selection index list, preserving their order.
TEST_F(ColumnarSerializationTest, TestCopySelectedRows) {
  // Hold the row count as size_t so the loops below compare like-signed
  // values.
  const size_t num_rows = rng_.Uniform(1000) + 1;

  // Source column filled with random values.
  vector<DataType> vals;
  vals.reserve(num_rows);
  for (size_t i = 0; i < num_rows; i++) {
    vals.push_back(rng_.Next());
  }

  // Choose a random subset of rows, recording both the chosen index and the
  // value expected to appear in the output for it.
  vector<DataType> expected;
  vector<uint16_t> sel_indexes;
  for (size_t i = 0; i < num_rows; i++) {
    if (rng_.OneIn(3)) {
      // The narrowing to the 16-bit index type is made explicit.
      // NOTE(review): assumes Uniform(1000) + 1 never exceeds the uint16_t
      // range -- confirm against Random::Uniform.
      sel_indexes.push_back(static_cast<uint16_t>(i));
      expected.push_back(vals[i]);
    }
  }

  // Destination sized to hold exactly one cell per selected row.
  vector<DataType> ret(expected.size());
  internal::CopySelectedRows(sel_indexes, kTypeSize,
                             reinterpret_cast<const uint8_t*>(vals.data()),
                             reinterpret_cast<uint8_t*>(ret.data()));
  ASSERT_EQ(expected, ret);
}
178+
179+
} // namespace kudu

0 commit comments

Comments
 (0)