This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 7d19ee843 KUDU-1261 use bits not bytes for validity in array1d.fbs
7d19ee843 is described below
commit 7d19ee84339208f5e982fabf6b9c98188875918e
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Oct 22 00:01:02 2025 -0700
KUDU-1261 use bits not bytes for validity in array1d.fbs
This changelist switches to using one-validity-bit-per-array-element
approach in encoding one-dimensional arrays. It takes care both of
the C++ and the Java side, so now they both speak the same language
w.r.t. the array element validity.
I added a few new sub-scenarios to cover some of the edge cases with
the new approach, but the existing tests also provide good enough
coverage for this update.
Change-Id: I09d281f7e76f6ead3fac8cb853eea503b8fb0bfe
Reviewed-on: http://gerrit.cloudera.org:8080/23571
Reviewed-by: Abhishek Chennaka <[email protected]>
Tested-by: Alexey Serbin <[email protected]>
---
.../java/org/apache/kudu/client/Array1dSerdes.java | 44 +++++++----
.../java/org/apache/kudu/client/ArrayCellView.java | 68 ++++++++---------
.../apache/kudu/client/ArrayCellViewHelper.java | 85 ++++++++--------------
.../org/apache/kudu/client/TestPartialRow.java | 28 +++++++
src/kudu/common/array_cell_view.h | 38 ++++------
src/kudu/common/array_type_serdes-test.cc | 2 +
src/kudu/common/array_type_serdes.h | 42 ++++++++---
src/kudu/common/serdes/array1d.fbs | 12 +--
src/kudu/util/bitmap-test.cc | 28 ++++++-
src/kudu/util/bitmap.cc | 13 ++++
src/kudu/util/bitmap.h | 3 +-
11 files changed, 217 insertions(+), 146 deletions(-)
diff --git
a/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
b/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
index ee32dc82d..bd3b0b830 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
@@ -23,6 +23,8 @@ import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Objects;
+import com.google.common.base.Preconditions;
+import com.google.flatbuffers.ByteVector;
import com.google.flatbuffers.FlatBufferBuilder;
// FlatBuffers generated classes
@@ -84,21 +86,31 @@ public class Array1dSerdes {
if (validity == null || validity.length == 0) {
return 0;
}
-
- // If all entries are true, omit encoding
- boolean allTrue = true;
- for (boolean v : validity) {
- if (!v) {
- allTrue = false;
+ // If all elements are valid, the validity field isn't populated,
+ // similar to null and empty validity vectors.
+ boolean allValid = true;
+ for (boolean elem : validity) {
+ if (!elem) {
+ allValid = false;
break;
}
}
- if (allTrue) {
+ if (allValid) {
return 0;
}
- // Only encode if at least one false
- return Content.createValidityVector(b, validity);
+ // NOTE: java.util.BitSet is unusable here since it automatically
+ // truncates all trailing zero bits after the last set bit
+ final int validityBitNum = validity.length;
+ final int validityByteNum = (validityBitNum + 7) / 8;
+ byte[] validityBytes = new byte[validityByteNum];
+ Arrays.fill(validityBytes, (byte)0);
+ for (int idx = 0; idx < validityBitNum; ++idx) {
+ if (validity[idx]) {
+ validityBytes[idx >> 3] |= 1 << (idx & 7);
+ }
+ }
+ return Content.createValidityVector(b, validityBytes);
}
private static int finishContent(FlatBufferBuilder b, int discriminator,
@@ -125,12 +137,18 @@ public class Array1dSerdes {
Arrays.fill(validity, true);
return validity;
}
- if (n != m) {
+
+ final int expected_validity_len = (m + 7) / 8;
+ if (n != expected_validity_len) {
throw new IllegalArgumentException(
- String.format("Invalid validity length %d (expected %d)", n, m));
+ String.format("invalid validity length %d: expected %d for %d
elements in array",
+ n, expected_validity_len, m));
}
- for (int i = 0; i < m; i++) {
- validity[i] = c.validity(i);
+ final ByteVector vv = c.validityVector();
+ Preconditions.checkState(vv.length() == n,
+ "unexpected length of validityVector: %d (expected %d)", vv.length(),
n);
+ for (int idx = 0; idx < m; ++idx) {
+ validity[idx] = ((vv.get(idx >> 3) & (1 << (idx & 7))) != 0);
}
return validity;
}
diff --git
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
index 62a0266e6..f1f120ba1 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
@@ -20,10 +20,10 @@ package org.apache.kudu.client;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
+import java.util.BitSet;
import java.util.function.IntFunction;
import org.apache.kudu.ColumnSchema;
-import org.apache.kudu.Type;
import org.apache.kudu.serdes.BinaryArray;
import org.apache.kudu.serdes.Content;
import org.apache.kudu.serdes.DoubleArray;
@@ -47,6 +47,8 @@ class ArrayCellView {
private final byte[] rawBytes;
private final Content content;
private final byte typeTag;
+ private final BitSet validityBitSet;
+ private final int elemNum;
private static final String NULL_ELEMENT_MSG = "Element %d is NULL";
@@ -62,6 +64,24 @@ class ArrayCellView {
ByteBuffer bb = ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN);
this.content = Content.getRootAsContent(bb);
this.typeTag = content.dataType();
+ this.elemNum = getElemNum();
+
+ // Build validity/not-null BitSet.
+ final int validityElemNum = content.validityLength(); // number of bytes
in the array
+ if (validityElemNum == 0) {
+ // Setting the validity bitset explicitly to null: all array elements
are valid.
+ this.validityBitSet = null;
+ } else {
+ // One validity bit per element of the 'data' array.
+ final int dataElemNum = length();
+ final int expectedValidityElemNum = (dataElemNum + 7) / 8;
+ if (validityElemNum != expectedValidityElemNum) {
+ throw new IllegalArgumentException(
+ String.format("invalid validity length %d: expected %d for %d
elements in array)",
+ validityElemNum, expectedValidityElemNum, dataElemNum));
+ }
+ this.validityBitSet = BitSet.valueOf(content.validityAsByteBuffer());
+ }
}
/** Return the underlying FlatBuffer bytes exactly as passed in. */
@@ -70,8 +90,13 @@ class ArrayCellView {
return rawBytes;
}
- /** Number of logical elements (driven by the data vector). */
+ /** Number of logical elements in the array, including null/non-valid
elements. */
int length() {
+ return elemNum;
+ }
+
+ /** Get the number of array elements in the underlying flatbuffers contents.
*/
+ private int getElemNum() {
switch (typeTag) {
case ScalarArray.Int8Array: {
Int8Array arr = new Int8Array();
@@ -141,49 +166,16 @@ class ArrayCellView {
/** Returns true iff element i is valid (non-null). */
boolean isValid(int i) {
- int n = length();
+ final int n = length();
if (i < 0 || i >= n) {
throw new IndexOutOfBoundsException(
String.format("Index %d out of bounds for array length %d", i, n));
}
-
- int vlen = content.validityLength();
- if (vlen == 0) {
+ if (validityBitSet == null) {
// No validity vector present => all elements are valid
return true;
}
-
- if (i >= vlen) {
- throw new IllegalStateException(
- String.format("Validity vector shorter than array: i=%d, vlen=%d",
i, vlen));
- }
-
- return content.validity(i);
- }
-
- /**
- * Returns a boolean array of element validity flags.
- *
- * @param n expected number of elements in the values vector
- * @throws IllegalStateException if validity length and value length mismatch
- */
-
- boolean[] validityOrAllTrue(int n) {
- int len = content.validityLength();
- if (len == 0) {
- boolean[] allTrue = new boolean[n];
- java.util.Arrays.fill(allTrue, true);
- return allTrue;
- }
- if (len != n) {
- throw new IllegalStateException(
- String.format("Validity length %d does not match values length %d",
len, n));
- }
- boolean[] v = new boolean[n];
- for (int i = 0; i < n; i++) {
- v[i] = content.validity(i);
- }
- return v;
+ return validityBitSet.get(i);
}
// ----------------------------------------------------------------------
diff --git
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
index b753cf9c2..937b26779 100644
---
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
+++
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
@@ -50,14 +50,6 @@ class ArrayCellViewHelper {
private ArrayCellViewHelper() {
}
- // ----------------------------------------------------------------------
- // Common helpers
- // ----------------------------------------------------------------------
-
- private static boolean[] validity(ArrayCellView view, int n) {
- return view.validityOrAllTrue(n);
- }
-
// ----------------------------------------------------------------------
// Boxed numeric arrays with null (validity) handling
// ----------------------------------------------------------------------
@@ -70,14 +62,13 @@ class ArrayCellViewHelper {
// which avoids position drift and unnecessary buffer slicing while
remaining allocation-free.
static Boolean[] toBoolArray(ArrayCellView view) {
UInt8Array arr = view.asUInt8FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
+ final int n = arr.valuesLength();
ByteBuffer bb = arr.valuesAsByteBuffer();
bb.order(ByteOrder.LITTLE_ENDIAN);
int base = bb.position();
Boolean[] out = new Boolean[n];
for (int i = 0; i < n; i++) {
- if (!v[i]) {
+ if (!view.isValid(i)) {
out[i] = null;
continue;
}
@@ -96,84 +87,78 @@ class ArrayCellViewHelper {
// Null elements are filled with Java nulls according to the validity bitmap.
static Byte[] toInt8Array(ArrayCellView view) {
Int8Array arr = view.asInt8FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
ByteBuffer bb = arr.valuesAsByteBuffer();
bb.order(ByteOrder.LITTLE_ENDIAN);
+ final int n = arr.valuesLength();
int base = bb.position();
Byte[] out = new Byte[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? bb.get(base + i) : null;
+ out[i] = view.isValid(i) ? bb.get(base + i) : null;
}
return out;
}
static Short[] toInt16Array(ArrayCellView view) {
Int16Array arr = view.asInt16FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- ShortBuffer sb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asShortBuffer();
+ final int n = arr.valuesLength();
short[] tmp = new short[n];
+ ShortBuffer sb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asShortBuffer();
sb.get(tmp);
Short[] out = new Short[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? tmp[i] : null;
+ out[i] = view.isValid(i) ? tmp[i] : null;
}
return out;
}
static Integer[] toInt32Array(ArrayCellView view) {
Int32Array arr = view.asInt32FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- IntBuffer ib =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+ final int n = arr.valuesLength();
int[] tmp = new int[n];
+ IntBuffer ib =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
ib.get(tmp);
Integer[] out = new Integer[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? tmp[i] : null;
+ out[i] = view.isValid(i) ? tmp[i] : null;
}
return out;
}
static Long[] toInt64Array(ArrayCellView view) {
Int64Array arr = view.asInt64FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- LongBuffer lb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+ final int n = arr.valuesLength();
long[] tmp = new long[n];
+ LongBuffer lb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
lb.get(tmp);
Long[] out = new Long[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? tmp[i] : null;
+ out[i] = view.isValid(i) ? tmp[i] : null;
}
return out;
}
static Float[] toFloatArray(ArrayCellView view) {
FloatArray arr = view.asFloatFB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- FloatBuffer fb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
+ final int n = arr.valuesLength();
float[] tmp = new float[n];
+ FloatBuffer fb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
fb.get(tmp);
Float[] out = new Float[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? tmp[i] : null;
+ out[i] = view.isValid(i) ? tmp[i] : null;
}
return out;
}
static Double[] toDoubleArray(ArrayCellView view) {
DoubleArray arr = view.asDoubleFB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- DoubleBuffer db =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asDoubleBuffer();
+ final int n = arr.valuesLength();
double[] tmp = new double[n];
+ DoubleBuffer db =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asDoubleBuffer();
db.get(tmp);
Double[] out = new Double[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? tmp[i] : null;
+ out[i] = view.isValid(i) ? tmp[i] : null;
}
return out;
}
@@ -184,11 +169,10 @@ class ArrayCellViewHelper {
static String[] toStringArray(ArrayCellView view) {
StringArray arr = view.asStringFB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
+ final int n = arr.valuesLength();
String[] out = new String[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? arr.values(i) : null;
+ out[i] = view.isValid(i) ? arr.values(i) : null;
}
return out;
}
@@ -200,12 +184,11 @@ class ArrayCellViewHelper {
static byte[][] toBinaryArray(ArrayCellView view) {
BinaryArray arr = view.asBinaryFB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
+ final int n = arr.valuesLength();
byte[][] out = new byte[n][];
UInt8Array elem = new UInt8Array();
for (int i = 0; i < n; i++) {
- if (!v[i]) {
+ if (!view.isValid(i)) {
out[i] = null;
continue;
}
@@ -223,28 +206,26 @@ class ArrayCellViewHelper {
static Date[] toDateArray(ArrayCellView view) {
Int32Array arr = view.asInt32FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
- IntBuffer ib =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+ final int n = arr.valuesLength();
int[] tmp = new int[n];
+ IntBuffer ib =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
ib.get(tmp);
Date[] out = new Date[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? DateUtil.epochDaysToSqlDate(tmp[i]) : null;
+ out[i] = view.isValid(i) ? DateUtil.epochDaysToSqlDate(tmp[i]) : null;
}
return out;
}
static Timestamp[] toTimestampArray(ArrayCellView view) {
Int64Array arr = view.asInt64FB();
- int n = arr.valuesLength();
- boolean[] v = validity(view, n);
LongBuffer lb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+ final int n = arr.valuesLength();
long[] tmp = new long[n];
lb.get(tmp);
Timestamp[] out = new Timestamp[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? TimestampUtil.microsToTimestamp(tmp[i]) : null;
+ out[i] = view.isValid(i) ? TimestampUtil.microsToTimestamp(tmp[i]) :
null;
}
return out;
}
@@ -258,25 +239,23 @@ class ArrayCellViewHelper {
BigDecimal[] out;
if (precision <= 9) {
Int32Array arr = view.asInt32FB();
- int n = arr.valuesLength();
- v = validity(view, n);
IntBuffer ib =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+ final int n = arr.valuesLength();
int[] tmp = new int[n];
ib.get(tmp);
out = new BigDecimal[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? BigDecimal.valueOf(tmp[i], scale) : null;
+ out[i] = view.isValid(i) ? BigDecimal.valueOf(tmp[i], scale) : null;
}
} else if (precision <= 18) {
Int64Array arr = view.asInt64FB();
- int n = arr.valuesLength();
- v = validity(view, n);
LongBuffer lb =
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+ final int n = arr.valuesLength();
long[] tmp = new long[n];
lb.get(tmp);
out = new BigDecimal[n];
for (int i = 0; i < n; i++) {
- out[i] = v[i] ? BigDecimal.valueOf(tmp[i], scale) : null;
+ out[i] = view.isValid(i) ? BigDecimal.valueOf(tmp[i], scale) : null;
}
} else {
throw new IllegalStateException("DECIMAL128 arrays not yet supported");
diff --git
a/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
b/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
index 58d5fb819..63b2d2df0 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
@@ -192,6 +192,34 @@ public class TestPartialRow {
assertArrayDataEquals(view, row.getArrayData("ints"), view::getInt8);
}
+ @Test
+ public void testAddAndGetInt8ArrayAllNulls() {
+ Schema schema = new Schema(Arrays.asList(
+ new ColumnSchema.ColumnSchemaBuilder("tiny_ints",
Type.INT8).array(true).build()
+ ));
+
+ // Test various number of NULL elements in the array to check various
boundary
+ // conditions on the internal representation of the validity vector in
ArrayCellView.
+ final int[] numberOfElements = new int[]{ 0, 1, 3, 7, 8, 9, 100, 127};
+ for (int elemNum : numberOfElements) {
+ PartialRow row = schema.newPartialRow();
+
+ byte[] vals = new byte[elemNum];
+ Arrays.fill(vals, (byte)0);
+
+ boolean[] validity = new boolean[elemNum];
+ Arrays.fill(validity, false);
+
+ row.addArrayInt8("tiny_ints", vals, validity);
+
+ ArrayCellView view = (ArrayCellView) row.getObject("tiny_ints");
+ assertEquals(elemNum, view.length());
+ for (int idx = 0; idx < elemNum; ++idx) {
+ assertFalse("elemNum: " + elemNum + " idx: " + idx, view.isValid(idx));
+ }
+ }
+ }
+
@Test
public void testAddAndGetInt16Array() {
Schema schema = new Schema(Arrays.asList(
diff --git a/src/kudu/common/array_cell_view.h
b/src/kudu/common/array_cell_view.h
index 663cc8390..5699c4ee5 100644
--- a/src/kudu/common/array_cell_view.h
+++ b/src/kudu/common/array_cell_view.h
@@ -90,6 +90,7 @@ class ArrayCellMetadataView final {
: data_(buf),
size_(size),
content_(nullptr),
+ elem_num_(0),
is_initialized_(false),
has_nulls_(false) {
}
@@ -98,6 +99,7 @@ class ArrayCellMetadataView final {
DCHECK(!is_initialized_);
if (size_ == 0) {
content_ = nullptr;
+ elem_num_ = 0;
is_initialized_ = true;
has_nulls_ = false;
return Status::OK();
@@ -141,36 +143,25 @@ class ArrayCellMetadataView final {
return Status::IllegalState("null flatbuffers of non-zero size");
}
- const size_t values_size = elem_num_impl();
+ elem_num_ = GetElemNum();
const size_t validity_size = content_->validity() ?
content_->validity()->size() : 0;
- if (validity_size != 0 && values_size != validity_size) {
- return Status::Corruption("number of data and validity elements differ");
+ if (validity_size != 0 && BitmapSize(elem_num_) != validity_size) {
+ return Status::Corruption("'validity' and 'data' fields not in sync");
}
has_nulls_ = false;
if (validity_size != 0) {
// If the validity vector is supplied and with all its elements non-zero.
const auto& validity = *(DCHECK_NOTNULL(content_->validity()));
- has_nulls_ = std::any_of(validity.cbegin(), validity.cend(),
- [&](uint8_t e) { return e == 0; });
+ has_nulls_ = !BitmapIsAllSet(validity.Data(), 0, elem_num_);
}
if (has_nulls_) {
- const size_t bit_num = values_size;
- DCHECK_GT(bit_num, 0);
- bitmap_.reset(new uint8_t[BitmapSize(bit_num)]);
- // Assuming the most of the elements are non-null/valid, it's usually
less
- // calls to flip particular bits from 1 to 0 than flipping them from 0
to 1.
- // TODO(aserbin): consider counting alternating strategy based on the
number
- // of non-zero values that comes from computing
'has_nulls_'
- auto* bm = bitmap_.get();
- memset(bm, 0xff, BitmapSize(bit_num));
- const auto& v = *(DCHECK_NOTNULL(content_->validity()));
- for (size_t idx = 0; idx < bit_num; ++idx) {
- if (v.Get(idx) == 0) {
- BitmapClear(bm, idx);
- }
- }
+ DCHECK_GT(validity_size, 0);
+ DCHECK_GT(elem_num_, 0);
+ DCHECK_EQ(validity_size, BitmapSize(elem_num_));
+ bitmap_.reset(new uint8_t[validity_size]);
+ memcpy(bitmap_.get(), content_->validity()->Data(), validity_size);
}
const auto data_type = content_->data_type();
if (data_type != serdes::ScalarArray::BinaryArray &&
@@ -209,7 +200,7 @@ class ArrayCellMetadataView final {
// Number of elements in the array.
size_t elem_num() const {
DCHECK(is_initialized_);
- return elem_num_impl();
+ return elem_num_;
}
bool empty() const {
@@ -284,7 +275,7 @@ class ArrayCellMetadataView final {
return content_ ? content_->data_as<T>()->values()->Data() : nullptr;
}
- size_t elem_num_impl() const {
+ size_t GetElemNum() const {
if (!content_) {
return 0;
}
@@ -336,6 +327,9 @@ class ArrayCellMetadataView final {
// for an empty (size_ == 0) buffer.
const serdes::Content* content_;
+ // Number of elements in the underlying flatbuffers buffer.
+ size_t elem_num_;
+
// A bitmap built of the boolean validity vector.
// TODO(aserbin): switch array1d to bitfield instead of bool vector for
validity?
std::unique_ptr<uint8_t[]> bitmap_;
diff --git a/src/kudu/common/array_type_serdes-test.cc
b/src/kudu/common/array_type_serdes-test.cc
index 7e8284489..1345799b5 100644
--- a/src/kudu/common/array_type_serdes-test.cc
+++ b/src/kudu/common/array_type_serdes-test.cc
@@ -81,6 +81,7 @@ TEST(ArrayTypeSerdesTest, Basic) {
const auto* view_validity_bitmap = view.not_null_bitmap();
ASSERT_NE(nullptr, view_validity_bitmap);
ASSERT_TRUE(view.has_nulls());
+ ASSERT_EQ(0, memcmp(view_validity_bitmap, validity_bitmap, 1));
// Verify the data matches the source.
{
@@ -111,6 +112,7 @@ TEST(ArrayTypeSerdesTest, AllNonNullElements) {
const vector<bool> kEmpty{};
for (const uint8_t* validity_bitmap : { kNull, kThreeOnes }) {
+ SCOPED_TRACE(validity_bitmap ? "all ones" : "null");
const vector<int16_t> val{ 0, 1, 2, };
const vector<bool>& validity_vector =
validity_bitmap ? BitmapToVector(validity_bitmap, val.size()) : kEmpty;
diff --git a/src/kudu/common/array_type_serdes.h
b/src/kudu/common/array_type_serdes.h
index 9acd64b4e..f45073558 100644
--- a/src/kudu/common/array_type_serdes.h
+++ b/src/kudu/common/array_type_serdes.h
@@ -40,13 +40,18 @@ template<DataType KUDU_DATA_TYPE, typename FB_TYPE>
void BuildFlatbuffers(
const uint8_t* column_data,
size_t nrows,
- const std::vector<bool>& validity,
+ const uint8_t* validity_bitmap,
flatbuffers::FlatBufferBuilder* bld) {
typedef typename DataTypeTraits<KUDU_DATA_TYPE>::cpp_type ElementType;
DCHECK(bld);
auto& builder = *bld;
+ if (validity_bitmap && BitmapIsAllSet(validity_bitmap, 0, nrows)) {
+ // All bits in the validity bitmap are set: it's equivalent of null bitmap.
+ validity_bitmap = nullptr;
+ }
+
std::vector<ElementType> val;
val.resize(nrows);
if (column_data != nullptr) {
@@ -55,7 +60,8 @@ void BuildFlatbuffers(
static const Slice kEmptySlice(static_cast<uint8_t*>(nullptr), 0);
const Slice* ptr = reinterpret_cast<const Slice*>(column_data);
for (size_t idx = 0; idx < nrows; ++idx) {
- val[idx] = (validity.empty() || validity[idx]) ? *(ptr + idx) :
kEmptySlice;
+ val[idx] = (!validity_bitmap || BitmapTest(validity_bitmap, idx))
+ ? *(ptr + idx) : kEmptySlice;
}
} else {
static_assert(!std::is_same<Slice, ElementType>::value,
@@ -64,15 +70,21 @@ void BuildFlatbuffers(
}
}
- const auto validity_vector =
- !validity.empty() ? builder.CreateVector(validity) : 0;
+ std::vector<uint8_t> validity_vector;
+ if (validity_bitmap) {
+ validity_vector.resize(BitmapSize(nrows));
+ memcpy(validity_vector.data(), validity_bitmap, validity_vector.size());
+ }
+ const auto validity_fb =
+ validity_vector.empty() ? 0 : builder.CreateVector(validity_vector);
+
if constexpr (KUDU_DATA_TYPE == STRING) {
auto values = FB_TYPE::Traits::Create(
builder, builder.CreateVectorOfStrings<ElementType>(val));
builder.Finish(CreateContent(builder,
KuduToScalarArrayType(KUDU_DATA_TYPE),
values.Union(),
- validity_vector));
+ validity_fb));
} else if constexpr (KUDU_DATA_TYPE == BINARY) {
std::vector<flatbuffers::Offset<serdes::UInt8Array>> offsets;
offsets.reserve(val.size());
@@ -86,14 +98,14 @@ void BuildFlatbuffers(
builder.Finish(CreateContent(builder,
KuduToScalarArrayType(KUDU_DATA_TYPE),
values.Union(),
- validity_vector));
+ validity_fb));
} else {
auto values = FB_TYPE::Traits::Create(
builder, builder.CreateVector<ElementType>(val));
builder.Finish(CreateContent(builder,
KuduToScalarArrayType(KUDU_DATA_TYPE),
values.Union(),
- validity_vector));
+ validity_fb));
}
}
@@ -116,7 +128,16 @@ Status Serialize(
flatbuffers::FlatBufferBuilder builder(
nrows * sizeof(ElementType) + nrows + FLATBUFFERS_MIN_BUFFER_SIZE);
- BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, validity,
&builder);
+
+ const uint8_t* validity_bitmap = nullptr;
+ std::vector<uint8_t> validity_vector;
+ if (!validity.empty() && std::any_of(validity.begin(), validity.end(),
+ [&](bool e) { return !e; })) {
+ validity_vector.resize(BitmapSize(nrows));
+ VectorToBitmap(validity, validity_vector.data());
+ validity_bitmap = validity_vector.data();
+ }
+ BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows,
validity_bitmap, &builder);
DCHECK(builder.GetBufferPointer());
// TODO(aserbin): would it be better to copy the data from the builder
@@ -142,7 +163,6 @@ Status SerializeIntoArena(
size_t nrows,
Arena* arena,
Slice* out) {
- static const std::vector<bool> kAllValid{};
typedef typename DataTypeTraits<KUDU_DATA_TYPE>::cpp_type ElementType;
DCHECK(arena);
@@ -150,9 +170,7 @@ Status SerializeIntoArena(
flatbuffers::FlatBufferBuilder builder(
nrows * sizeof(ElementType) + nrows + FLATBUFFERS_MIN_BUFFER_SIZE);
- const std::vector<bool>& validity =
- validity_bitmap ? BitmapToVector(validity_bitmap, nrows) : kAllValid;
- BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, validity,
&builder);
+ BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows,
validity_bitmap, &builder);
// Copy the serialized data into the arena.
//
diff --git a/src/kudu/common/serdes/array1d.fbs
b/src/kudu/common/serdes/array1d.fbs
index 3f43aaa58..4fc612383 100644
--- a/src/kudu/common/serdes/array1d.fbs
+++ b/src/kudu/common/serdes/array1d.fbs
@@ -52,14 +52,14 @@ union ScalarArray {
// This is to represent a one-dimensional array of a particular scalar type
// where some of the array's elements might be null/non-valid. The 'data'
// field is a placeholder for the array's elements, and the 'validity' field
-// contains information on their validity/non-nullness: 'true' means 'valid',
-// 'false' means 'non-valid' (a.k.a. 'null').
-//
-// TODO(aserbin): consider storing 1 bit per data element in the 'validity'
-// field instead of using whole 'ubyte/uint8'
+// contains information on their validity/non-nullness: one bit per element
+// in 'data'. A set bit ('1') in the 'validity' field means 'non-null'
+// (a.k.a. 'valid'), 'false' means 'null' (a.k.a. 'non-valid'). If all the
+// elements in the 'data' field are non-null/valid, the 'validity' field
+// may be empty or absent to optimize for memory/space.
table Content {
data:ScalarArray;
- validity:[bool];
+ validity:[uint8];
}
root_type Content;
diff --git a/src/kudu/util/bitmap-test.cc b/src/kudu/util/bitmap-test.cc
index e2a33e17b..fe613912e 100644
--- a/src/kudu/util/bitmap-test.cc
+++ b/src/kudu/util/bitmap-test.cc
@@ -386,12 +386,19 @@ TEST(TestBitMap, TestCopy) {
TEST(TestBitMap, BitmapToVector) {
constexpr size_t kNumBytes = 8;
constexpr size_t kNumBits = kNumBytes * 8;
- constexpr uint8_t kAllZeroes[kNumBytes] = { 0 };
+ constexpr uint8_t kAllZeroes[kNumBytes] = { 0x00 };
+ constexpr uint8_t kAllOnes[kNumBytes] =
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
{
const auto v = BitmapToVector(kAllZeroes, kNumBits);
ASSERT_EQ(kNumBits, v.size());
ASSERT_TRUE(std::all_of(v.begin(), v.end(), [](bool e) { return !e; }));
+
+ ASSERT_EQ(kNumBytes, BitmapSize(kNumBits));
+ uint8_t buf[BitmapSize(kNumBits)] = { 0xff };
+ ASSERT_EQ(kNumBits, VectorToBitmap(v, buf));
+ ASSERT_EQ(0, memcmp(kAllZeroes, buf, sizeof(kAllZeroes)));
}
{
uint8_t input[kNumBytes];
@@ -401,6 +408,11 @@ TEST(TestBitMap, BitmapToVector) {
const auto v = BitmapToVector(input, kNumBits);
ASSERT_EQ(kNumBits, v.size());
ASSERT_TRUE(std::all_of(v.begin(), v.end(), [](bool e) { return e; }));
+
+ ASSERT_EQ(kNumBytes, BitmapSize(kNumBits));
+ uint8_t buf[BitmapSize(kNumBits)] = { 0x00 };
+ ASSERT_EQ(kNumBits, VectorToBitmap(v, buf));
+ ASSERT_EQ(0, memcmp(kAllOnes, buf, sizeof(kAllOnes)));
}
{
uint8_t input[kNumBytes];
@@ -412,12 +424,21 @@ TEST(TestBitMap, BitmapToVector) {
auto it_half = v.begin() + (v.size() / 2);
ASSERT_TRUE(std::all_of(v.begin(), it_half, [](bool e) { return e; }));
ASSERT_TRUE(std::all_of(it_half, v.end(), [](bool e) { return !e; }));
+
+ uint8_t output[kNumBytes] = { 0 };
+ ASSERT_EQ(kNumBits, VectorToBitmap(v, output));
+ ASSERT_EQ(0, memcmp(input, output, kNumBytes));
}
{
uint8_t input[1] = { 0b00000001 };
const auto v = BitmapToVector(input, 1);
ASSERT_EQ(1, v.size());
ASSERT_TRUE(v.front());
+
+ ASSERT_EQ(1, BitmapSize(1));
+ uint8_t output[1] = { 0x00 };
+ ASSERT_EQ(1, VectorToBitmap(v, output));
+ ASSERT_EQ(0, memcmp(input, output, 1));
}
{
uint8_t input[1] = { 0b10000001 };
@@ -426,6 +447,11 @@ TEST(TestBitMap, BitmapToVector) {
ASSERT_TRUE(v.front());
ASSERT_TRUE(v.back());
ASSERT_TRUE(std::all_of(v.begin() + 1, v.end() - 1, [](bool e) { return
!e; }));
+
+ ASSERT_EQ(1, BitmapSize(8));
+ uint8_t output[1] = { 0x00 };
+ ASSERT_EQ(8, VectorToBitmap(v, output));
+ ASSERT_EQ(0, memcmp(input, output, 1));
}
}
diff --git a/src/kudu/util/bitmap.cc b/src/kudu/util/bitmap.cc
index 17c6e268c..20b583c48 100644
--- a/src/kudu/util/bitmap.cc
+++ b/src/kudu/util/bitmap.cc
@@ -169,6 +169,19 @@ vector<bool> BitmapToVector(const uint8_t* bitmap, size_t
num_bits) {
return result;
}
+size_t VectorToBitmap(const std::vector<bool>& v, uint8_t* bitmap) {
+ DCHECK(bitmap);
+ const size_t num_bits = v.size();
+ for (size_t idx = 0; idx < num_bits; ++idx) {
+ if (v[idx]) {
+ BitmapSet(bitmap, idx);
+ } else {
+ BitmapClear(bitmap, idx);
+ }
+ }
+ return num_bits;
+}
+
string BitmapToString(const uint8_t* bitmap, size_t num_bits) {
string s;
size_t index = 0;
diff --git a/src/kudu/util/bitmap.h b/src/kudu/util/bitmap.h
index c3feb92f8..9a520b6e3 100644
--- a/src/kudu/util/bitmap.h
+++ b/src/kudu/util/bitmap.h
@@ -33,7 +33,7 @@
namespace kudu {
// Return the number of bytes necessary to store the given number of bits.
-inline size_t BitmapSize(size_t num_bits) {
+constexpr size_t BitmapSize(size_t num_bits) {
return (num_bits + 7) / 8;
}
@@ -126,6 +126,7 @@ void BitmapCopy(uint8_t* dst, size_t dst_offset,
size_t num_bits);
std::vector<bool> BitmapToVector(const uint8_t* bitmap, size_t num_bits);
+size_t VectorToBitmap(const std::vector<bool>& v, uint8_t* bitmap);
std::string BitmapToString(const uint8_t* bitmap, size_t num_bits);