This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 7d19ee843 KUDU-1261 use bits not bytes for validity in array1d.fbs
7d19ee843 is described below

commit 7d19ee84339208f5e982fabf6b9c98188875918e
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Oct 22 00:01:02 2025 -0700

    KUDU-1261 use bits not bytes for validity in array1d.fbs
    
    This changelist switches to using one-validity-bit-per-array-element
    approach in encoding one-dimensional arrays.  It takes care both of
    the C++ and the Java side, so now they both speak the same language
    w.r.t. the array element validity.
    
    I added a few new sub-scenarios to cover some of the edge cases with
    the new approach, but the existing tests also provide good enough
    coverage for this update.
    
    Change-Id: I09d281f7e76f6ead3fac8cb853eea503b8fb0bfe
    Reviewed-on: http://gerrit.cloudera.org:8080/23571
    Reviewed-by: Abhishek Chennaka <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 .../java/org/apache/kudu/client/Array1dSerdes.java | 44 +++++++----
 .../java/org/apache/kudu/client/ArrayCellView.java | 68 ++++++++---------
 .../apache/kudu/client/ArrayCellViewHelper.java    | 85 ++++++++--------------
 .../org/apache/kudu/client/TestPartialRow.java     | 28 +++++++
 src/kudu/common/array_cell_view.h                  | 38 ++++------
 src/kudu/common/array_type_serdes-test.cc          |  2 +
 src/kudu/common/array_type_serdes.h                | 42 ++++++++---
 src/kudu/common/serdes/array1d.fbs                 | 12 +--
 src/kudu/util/bitmap-test.cc                       | 28 ++++++-
 src/kudu/util/bitmap.cc                            | 13 ++++
 src/kudu/util/bitmap.h                             |  3 +-
 11 files changed, 217 insertions(+), 146 deletions(-)

diff --git 
a/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java 
b/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
index ee32dc82d..bd3b0b830 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/Array1dSerdes.java
@@ -23,6 +23,8 @@ import java.nio.ByteOrder;
 import java.util.Arrays;
 import java.util.Objects;
 
+import com.google.common.base.Preconditions;
+import com.google.flatbuffers.ByteVector;
 import com.google.flatbuffers.FlatBufferBuilder;
 
 // FlatBuffers generated classes
@@ -84,21 +86,31 @@ public class Array1dSerdes {
     if (validity == null || validity.length == 0) {
       return 0;
     }
-
-    // If all entries are true, omit encoding
-    boolean allTrue = true;
-    for (boolean v : validity) {
-      if (!v) {
-        allTrue = false;
+    // If all elements are valid, the validity field isn't populated,
+    // similar to null and empty validity vectors.
+    boolean allValid = true;
+    for (boolean elem : validity) {
+      if (!elem) {
+        allValid = false;
         break;
       }
     }
-    if (allTrue) {
+    if (allValid) {
       return 0;
     }
 
-    // Only encode if at least one false
-    return Content.createValidityVector(b, validity);
+    // NOTE: java.util.BitSet is unusable here since it automatically
+    //   truncates all trailing zero bits after the last set bit
+    final int validityBitNum = validity.length;
+    final int validityByteNum = (validityBitNum + 7) / 8;
+    byte[] validityBytes = new byte[validityByteNum];
+    Arrays.fill(validityBytes, (byte)0);
+    for (int idx = 0; idx < validityBitNum; ++idx) {
+      if (validity[idx]) {
+        validityBytes[idx >> 3] |= 1 << (idx & 7);
+      }
+    }
+    return Content.createValidityVector(b, validityBytes);
   }
 
   private static int finishContent(FlatBufferBuilder b, int discriminator,
@@ -125,12 +137,18 @@ public class Array1dSerdes {
       Arrays.fill(validity, true);
       return validity;
     }
-    if (n != m) {
+
+    final int expected_validity_len = (m + 7) / 8;
+    if (n != expected_validity_len) {
       throw new IllegalArgumentException(
-          String.format("Invalid validity length %d (expected %d)", n, m));
+          String.format("invalid validity length %d: expected %d for %d 
elements in array",
+                        n, expected_validity_len, m));
     }
-    for (int i = 0; i < m; i++) {
-      validity[i] = c.validity(i);
+    final ByteVector vv = c.validityVector();
+    Preconditions.checkState(vv.length() == n,
+        "unexpected length of validityVector: %d (expected %d)", vv.length(), 
n);
+    for (int idx = 0; idx < m; ++idx) {
+      validity[idx] = ((vv.get(idx >> 3) & (1 << (idx & 7))) != 0);
     }
     return validity;
   }
diff --git 
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java 
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
index 62a0266e6..f1f120ba1 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellView.java
@@ -20,10 +20,10 @@ package org.apache.kudu.client;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.function.IntFunction;
 
 import org.apache.kudu.ColumnSchema;
-import org.apache.kudu.Type;
 import org.apache.kudu.serdes.BinaryArray;
 import org.apache.kudu.serdes.Content;
 import org.apache.kudu.serdes.DoubleArray;
@@ -47,6 +47,8 @@ class ArrayCellView {
   private final byte[] rawBytes;
   private final Content content;
   private final byte typeTag;
+  private final BitSet validityBitSet;
+  private final int elemNum;
 
   private static final String NULL_ELEMENT_MSG = "Element %d is NULL";
 
@@ -62,6 +64,24 @@ class ArrayCellView {
     ByteBuffer bb = ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN);
     this.content = Content.getRootAsContent(bb);
     this.typeTag = content.dataType();
+    this.elemNum = getElemNum();
+
+    // Build validity/not-null BitSet.
+    final int validityElemNum = content.validityLength(); // number of bytes 
in the array
+    if (validityElemNum == 0) {
+      // Setting the validity bitset explicitly to null: all array elements 
are valid.
+      this.validityBitSet = null;
+    } else {
+      // One validity bit per element of the 'data' array.
+      final int dataElemNum = length();
+      final int expectedValidityElemNum = (dataElemNum + 7) / 8;
+      if (validityElemNum != expectedValidityElemNum) {
+        throw new IllegalArgumentException(
+            String.format("invalid validity length %d: expected %d for %d 
elements in array)",
+                validityElemNum, expectedValidityElemNum, dataElemNum));
+      }
+      this.validityBitSet = BitSet.valueOf(content.validityAsByteBuffer());
+    }
   }
 
   /** Return the underlying FlatBuffer bytes exactly as passed in. */
@@ -70,8 +90,13 @@ class ArrayCellView {
     return rawBytes;
   }
 
-  /** Number of logical elements (driven by the data vector). */
+  /** Number of logical elements in the array, including null/non-valid 
elements. */
   int length() {
+    return elemNum;
+  }
+
+  /** Get the number of array elements in the underlying flatbuffers contents. 
*/
+  private int getElemNum() {
     switch (typeTag) {
       case ScalarArray.Int8Array: {
         Int8Array arr = new Int8Array();
@@ -141,49 +166,16 @@ class ArrayCellView {
 
   /** Returns true iff element i is valid (non-null). */
   boolean isValid(int i) {
-    int n = length();
+    final int n = length();
     if (i < 0 || i >= n) {
       throw new IndexOutOfBoundsException(
           String.format("Index %d out of bounds for array length %d", i, n));
     }
-
-    int vlen = content.validityLength();
-    if (vlen == 0) {
+    if (validityBitSet == null) {
       // No validity vector present => all elements are valid
       return true;
     }
-
-    if (i >= vlen) {
-      throw new IllegalStateException(
-          String.format("Validity vector shorter than array: i=%d, vlen=%d", 
i, vlen));
-    }
-
-    return content.validity(i);
-  }
-
-  /**
-   * Returns a boolean array of element validity flags.
-   *
-   * @param n expected number of elements in the values vector
-   * @throws IllegalStateException if validity length and value length mismatch
-   */
-
-  boolean[] validityOrAllTrue(int n) {
-    int len = content.validityLength();
-    if (len == 0) {
-      boolean[] allTrue = new boolean[n];
-      java.util.Arrays.fill(allTrue, true);
-      return allTrue;
-    }
-    if (len != n) {
-      throw new IllegalStateException(
-          String.format("Validity length %d does not match values length %d", 
len, n));
-    }
-    boolean[] v = new boolean[n];
-    for (int i = 0; i < n; i++) {
-      v[i] = content.validity(i);
-    }
-    return v;
+    return validityBitSet.get(i);
   }
 
   // ----------------------------------------------------------------------
diff --git 
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
 
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
index b753cf9c2..937b26779 100644
--- 
a/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
+++ 
b/java/kudu-client/src/main/java/org/apache/kudu/client/ArrayCellViewHelper.java
@@ -50,14 +50,6 @@ class ArrayCellViewHelper {
   private ArrayCellViewHelper() {
   }
 
-  // ----------------------------------------------------------------------
-  // Common helpers
-  // ----------------------------------------------------------------------
-
-  private static boolean[] validity(ArrayCellView view, int n) {
-    return view.validityOrAllTrue(n);
-  }
-
   // ----------------------------------------------------------------------
   // Boxed numeric arrays with null (validity) handling
   // ----------------------------------------------------------------------
@@ -70,14 +62,13 @@ class ArrayCellViewHelper {
   // which avoids position drift and unnecessary buffer slicing while 
remaining allocation-free.
   static Boolean[] toBoolArray(ArrayCellView view) {
     UInt8Array arr = view.asUInt8FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
+    final int n = arr.valuesLength();
     ByteBuffer bb = arr.valuesAsByteBuffer();
     bb.order(ByteOrder.LITTLE_ENDIAN);
     int base = bb.position();
     Boolean[] out = new Boolean[n];
     for (int i = 0; i < n; i++) {
-      if (!v[i]) {
+      if (!view.isValid(i)) {
         out[i] = null;
         continue;
       }
@@ -96,84 +87,78 @@ class ArrayCellViewHelper {
   // Null elements are filled with Java nulls according to the validity bitmap.
   static Byte[] toInt8Array(ArrayCellView view) {
     Int8Array arr = view.asInt8FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
     ByteBuffer bb = arr.valuesAsByteBuffer();
     bb.order(ByteOrder.LITTLE_ENDIAN);
+    final int n = arr.valuesLength();
     int base = bb.position();
     Byte[] out = new Byte[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? bb.get(base + i) : null;
+      out[i] = view.isValid(i) ? bb.get(base + i) : null;
     }
     return out;
   }
 
   static Short[] toInt16Array(ArrayCellView view) {
     Int16Array arr = view.asInt16FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    ShortBuffer sb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asShortBuffer();
+    final int n = arr.valuesLength();
     short[] tmp = new short[n];
+    ShortBuffer sb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asShortBuffer();
     sb.get(tmp);
     Short[] out = new Short[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? tmp[i] : null;
+      out[i] = view.isValid(i) ? tmp[i] : null;
     }
     return out;
   }
 
   static Integer[] toInt32Array(ArrayCellView view) {
     Int32Array arr = view.asInt32FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    IntBuffer ib = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+    final int n = arr.valuesLength();
     int[] tmp = new int[n];
+    IntBuffer ib = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
     ib.get(tmp);
     Integer[] out = new Integer[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? tmp[i] : null;
+      out[i] = view.isValid(i) ? tmp[i] : null;
     }
     return out;
   }
 
   static Long[] toInt64Array(ArrayCellView view) {
     Int64Array arr = view.asInt64FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    LongBuffer lb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+    final int n = arr.valuesLength();
     long[] tmp = new long[n];
+    LongBuffer lb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
     lb.get(tmp);
     Long[] out = new Long[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? tmp[i] : null;
+      out[i] = view.isValid(i) ? tmp[i] : null;
     }
     return out;
   }
 
   static Float[] toFloatArray(ArrayCellView view) {
     FloatArray arr = view.asFloatFB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    FloatBuffer fb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
+    final int n = arr.valuesLength();
     float[] tmp = new float[n];
+    FloatBuffer fb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
     fb.get(tmp);
     Float[] out = new Float[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? tmp[i] : null;
+      out[i] = view.isValid(i) ? tmp[i] : null;
     }
     return out;
   }
 
   static Double[] toDoubleArray(ArrayCellView view) {
     DoubleArray arr = view.asDoubleFB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    DoubleBuffer db = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asDoubleBuffer();
+    final int n = arr.valuesLength();
     double[] tmp = new double[n];
+    DoubleBuffer db = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asDoubleBuffer();
     db.get(tmp);
     Double[] out = new Double[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? tmp[i] : null;
+      out[i] = view.isValid(i) ? tmp[i] : null;
     }
     return out;
   }
@@ -184,11 +169,10 @@ class ArrayCellViewHelper {
 
   static String[] toStringArray(ArrayCellView view) {
     StringArray arr = view.asStringFB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
+    final int n = arr.valuesLength();
     String[] out = new String[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? arr.values(i) : null;
+      out[i] = view.isValid(i) ? arr.values(i) : null;
     }
     return out;
   }
@@ -200,12 +184,11 @@ class ArrayCellViewHelper {
 
   static byte[][] toBinaryArray(ArrayCellView view) {
     BinaryArray arr = view.asBinaryFB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
+    final int n = arr.valuesLength();
     byte[][] out = new byte[n][];
     UInt8Array elem = new UInt8Array();
     for (int i = 0; i < n; i++) {
-      if (!v[i]) {
+      if (!view.isValid(i)) {
         out[i] = null;
         continue;
       }
@@ -223,28 +206,26 @@ class ArrayCellViewHelper {
 
   static Date[] toDateArray(ArrayCellView view) {
     Int32Array arr = view.asInt32FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
-    IntBuffer ib = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+    final int n = arr.valuesLength();
     int[] tmp = new int[n];
+    IntBuffer ib = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
     ib.get(tmp);
     Date[] out = new Date[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? DateUtil.epochDaysToSqlDate(tmp[i]) : null;
+      out[i] = view.isValid(i) ? DateUtil.epochDaysToSqlDate(tmp[i]) : null;
     }
     return out;
   }
 
   static Timestamp[] toTimestampArray(ArrayCellView view) {
     Int64Array arr = view.asInt64FB();
-    int n = arr.valuesLength();
-    boolean[] v = validity(view, n);
     LongBuffer lb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+    final int n = arr.valuesLength();
     long[] tmp = new long[n];
     lb.get(tmp);
     Timestamp[] out = new Timestamp[n];
     for (int i = 0; i < n; i++) {
-      out[i] = v[i] ? TimestampUtil.microsToTimestamp(tmp[i]) : null;
+      out[i] = view.isValid(i) ? TimestampUtil.microsToTimestamp(tmp[i]) : 
null;
     }
     return out;
   }
@@ -258,25 +239,23 @@ class ArrayCellViewHelper {
     BigDecimal[] out;
     if (precision <= 9) {
       Int32Array arr = view.asInt32FB();
-      int n = arr.valuesLength();
-      v = validity(view, n);
       IntBuffer ib = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+      final int n = arr.valuesLength();
       int[] tmp = new int[n];
       ib.get(tmp);
       out = new BigDecimal[n];
       for (int i = 0; i < n; i++) {
-        out[i] = v[i] ? BigDecimal.valueOf(tmp[i], scale) : null;
+        out[i] = view.isValid(i) ? BigDecimal.valueOf(tmp[i], scale) : null;
       }
     } else if (precision <= 18) {
       Int64Array arr = view.asInt64FB();
-      int n = arr.valuesLength();
-      v = validity(view, n);
       LongBuffer lb = 
arr.valuesAsByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
+      final int n = arr.valuesLength();
       long[] tmp = new long[n];
       lb.get(tmp);
       out = new BigDecimal[n];
       for (int i = 0; i < n; i++) {
-        out[i] = v[i] ? BigDecimal.valueOf(tmp[i], scale) : null;
+        out[i] = view.isValid(i) ? BigDecimal.valueOf(tmp[i], scale) : null;
       }
     } else {
       throw new IllegalStateException("DECIMAL128 arrays not yet supported");
diff --git 
a/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java 
b/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
index 58d5fb819..63b2d2df0 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestPartialRow.java
@@ -192,6 +192,34 @@ public class TestPartialRow {
     assertArrayDataEquals(view, row.getArrayData("ints"), view::getInt8);
   }
 
+  @Test
+  public void testAddAndGetInt8ArrayAllNulls() {
+    Schema schema = new Schema(Arrays.asList(
+        new ColumnSchema.ColumnSchemaBuilder("tiny_ints", 
Type.INT8).array(true).build()
+    ));
+
+    // Test various number of NULL elements in the array to check various 
boundary
+    // conditions on the internal representation of the validity vector in 
ArrayCellView.
+    final int[] numberOfElements = new int[]{ 0, 1, 3, 7, 8, 9, 100, 127};
+    for (int elemNum : numberOfElements) {
+      PartialRow row = schema.newPartialRow();
+
+      byte[] vals = new byte[elemNum];
+      Arrays.fill(vals, (byte)0);
+
+      boolean[] validity = new boolean[elemNum];
+      Arrays.fill(validity, false);
+
+      row.addArrayInt8("tiny_ints", vals, validity);
+
+      ArrayCellView view = (ArrayCellView) row.getObject("tiny_ints");
+      assertEquals(elemNum, view.length());
+      for (int idx = 0; idx < elemNum; ++idx) {
+        assertFalse("elemNum: " + elemNum + " idx: " + idx, view.isValid(idx));
+      }
+    }
+  }
+
   @Test
   public void testAddAndGetInt16Array() {
     Schema schema = new Schema(Arrays.asList(
diff --git a/src/kudu/common/array_cell_view.h 
b/src/kudu/common/array_cell_view.h
index 663cc8390..5699c4ee5 100644
--- a/src/kudu/common/array_cell_view.h
+++ b/src/kudu/common/array_cell_view.h
@@ -90,6 +90,7 @@ class ArrayCellMetadataView final {
        : data_(buf),
          size_(size),
          content_(nullptr),
+         elem_num_(0),
          is_initialized_(false),
          has_nulls_(false) {
   }
@@ -98,6 +99,7 @@ class ArrayCellMetadataView final {
     DCHECK(!is_initialized_);
     if (size_ == 0) {
       content_ = nullptr;
+      elem_num_ = 0;
       is_initialized_ = true;
       has_nulls_ = false;
       return Status::OK();
@@ -141,36 +143,25 @@ class ArrayCellMetadataView final {
       return Status::IllegalState("null flatbuffers of non-zero size");
     }
 
-    const size_t values_size = elem_num_impl();
+    elem_num_ = GetElemNum();
     const size_t validity_size = content_->validity() ? 
content_->validity()->size() : 0;
-    if (validity_size != 0 && values_size != validity_size) {
-      return Status::Corruption("number of data and validity elements differ");
+    if (validity_size != 0 && BitmapSize(elem_num_) != validity_size) {
+      return Status::Corruption("'validity' and 'data' fields not in sync");
     }
 
     has_nulls_ = false;
     if (validity_size != 0) {
       // If the validity vector is supplied and with all its elements non-zero.
       const auto& validity = *(DCHECK_NOTNULL(content_->validity()));
-      has_nulls_ = std::any_of(validity.cbegin(), validity.cend(),
-                               [&](uint8_t e) { return e == 0; });
+      has_nulls_ = !BitmapIsAllSet(validity.Data(), 0, elem_num_);
     }
 
     if (has_nulls_) {
-      const size_t bit_num = values_size;
-      DCHECK_GT(bit_num, 0);
-      bitmap_.reset(new uint8_t[BitmapSize(bit_num)]);
-      // Assuming the most of the elements are non-null/valid, it's usually 
less
-      // calls to flip particular bits from 1 to 0 than flipping them from 0 
to 1.
-      // TODO(aserbin): consider counting alternating strategy based on the 
number
-      //                of non-zero values that comes from computing 
'has_nulls_'
-      auto* bm = bitmap_.get();
-      memset(bm, 0xff, BitmapSize(bit_num));
-      const auto& v = *(DCHECK_NOTNULL(content_->validity()));
-      for (size_t idx = 0; idx < bit_num; ++idx) {
-        if (v.Get(idx) == 0) {
-          BitmapClear(bm, idx);
-        }
-      }
+      DCHECK_GT(validity_size, 0);
+      DCHECK_GT(elem_num_, 0);
+      DCHECK_EQ(validity_size, BitmapSize(elem_num_));
+      bitmap_.reset(new uint8_t[validity_size]);
+      memcpy(bitmap_.get(), content_->validity()->Data(), validity_size);
     }
     const auto data_type = content_->data_type();
     if (data_type != serdes::ScalarArray::BinaryArray &&
@@ -209,7 +200,7 @@ class ArrayCellMetadataView final {
   // Number of elements in the array.
   size_t elem_num() const {
     DCHECK(is_initialized_);
-    return elem_num_impl();
+    return elem_num_;
   }
 
   bool empty() const {
@@ -284,7 +275,7 @@ class ArrayCellMetadataView final {
     return content_ ? content_->data_as<T>()->values()->Data() : nullptr;
   }
 
-  size_t elem_num_impl() const {
+  size_t GetElemNum() const {
     if (!content_) {
       return 0;
     }
@@ -336,6 +327,9 @@ class ArrayCellMetadataView final {
   // for an empty (size_ == 0) buffer.
   const serdes::Content* content_;
 
+  // Number of elements in the underlying flatbuffers buffer.
+  size_t elem_num_;
+
   // A bitmap built of the boolean validity vector.
   // TODO(aserbin): switch array1d to bitfield instead of bool vector for 
validity?
   std::unique_ptr<uint8_t[]> bitmap_;
diff --git a/src/kudu/common/array_type_serdes-test.cc 
b/src/kudu/common/array_type_serdes-test.cc
index 7e8284489..1345799b5 100644
--- a/src/kudu/common/array_type_serdes-test.cc
+++ b/src/kudu/common/array_type_serdes-test.cc
@@ -81,6 +81,7 @@ TEST(ArrayTypeSerdesTest, Basic) {
   const auto* view_validity_bitmap = view.not_null_bitmap();
   ASSERT_NE(nullptr, view_validity_bitmap);
   ASSERT_TRUE(view.has_nulls());
+  ASSERT_EQ(0, memcmp(view_validity_bitmap, validity_bitmap, 1));
 
   // Verify the data matches the source.
   {
@@ -111,6 +112,7 @@ TEST(ArrayTypeSerdesTest, AllNonNullElements) {
   const vector<bool> kEmpty{};
 
   for (const uint8_t* validity_bitmap : { kNull, kThreeOnes }) {
+    SCOPED_TRACE(validity_bitmap ? "all ones" : "null");
     const vector<int16_t> val{ 0, 1, 2, };
     const vector<bool>& validity_vector =
         validity_bitmap ? BitmapToVector(validity_bitmap, val.size()) : kEmpty;
diff --git a/src/kudu/common/array_type_serdes.h 
b/src/kudu/common/array_type_serdes.h
index 9acd64b4e..f45073558 100644
--- a/src/kudu/common/array_type_serdes.h
+++ b/src/kudu/common/array_type_serdes.h
@@ -40,13 +40,18 @@ template<DataType KUDU_DATA_TYPE, typename FB_TYPE>
 void BuildFlatbuffers(
     const uint8_t* column_data,
     size_t nrows,
-    const std::vector<bool>& validity,
+    const uint8_t* validity_bitmap,
     flatbuffers::FlatBufferBuilder* bld) {
   typedef typename DataTypeTraits<KUDU_DATA_TYPE>::cpp_type ElementType;
 
   DCHECK(bld);
   auto& builder = *bld;
 
+  if (validity_bitmap && BitmapIsAllSet(validity_bitmap, 0, nrows)) {
+    // All bits in the validity bitmap are set: it's equivalent of null bitmap.
+    validity_bitmap = nullptr;
+  }
+
   std::vector<ElementType> val;
   val.resize(nrows);
   if (column_data != nullptr) {
@@ -55,7 +60,8 @@ void BuildFlatbuffers(
       static const Slice kEmptySlice(static_cast<uint8_t*>(nullptr), 0);
       const Slice* ptr = reinterpret_cast<const Slice*>(column_data);
       for (size_t idx = 0; idx < nrows; ++idx) {
-        val[idx] = (validity.empty() || validity[idx]) ? *(ptr + idx) : 
kEmptySlice;
+        val[idx] = (!validity_bitmap || BitmapTest(validity_bitmap, idx))
+            ? *(ptr + idx) : kEmptySlice;
       }
     } else {
       static_assert(!std::is_same<Slice, ElementType>::value,
@@ -64,15 +70,21 @@ void BuildFlatbuffers(
     }
   }
 
-  const auto validity_vector =
-      !validity.empty() ? builder.CreateVector(validity) : 0;
+  std::vector<uint8_t> validity_vector;
+  if (validity_bitmap) {
+    validity_vector.resize(BitmapSize(nrows));
+    memcpy(validity_vector.data(), validity_bitmap, validity_vector.size());
+  }
+  const auto validity_fb =
+      validity_vector.empty() ? 0 : builder.CreateVector(validity_vector);
+
   if constexpr (KUDU_DATA_TYPE == STRING) {
     auto values = FB_TYPE::Traits::Create(
         builder, builder.CreateVectorOfStrings<ElementType>(val));
     builder.Finish(CreateContent(builder,
                                  KuduToScalarArrayType(KUDU_DATA_TYPE),
                                  values.Union(),
-                                 validity_vector));
+                                 validity_fb));
   } else if constexpr (KUDU_DATA_TYPE == BINARY) {
     std::vector<flatbuffers::Offset<serdes::UInt8Array>> offsets;
     offsets.reserve(val.size());
@@ -86,14 +98,14 @@ void BuildFlatbuffers(
     builder.Finish(CreateContent(builder,
                                  KuduToScalarArrayType(KUDU_DATA_TYPE),
                                  values.Union(),
-                                 validity_vector));
+                                 validity_fb));
   } else {
     auto values = FB_TYPE::Traits::Create(
         builder, builder.CreateVector<ElementType>(val));
     builder.Finish(CreateContent(builder,
                                  KuduToScalarArrayType(KUDU_DATA_TYPE),
                                  values.Union(),
-                                 validity_vector));
+                                 validity_fb));
   }
 }
 
@@ -116,7 +128,16 @@ Status Serialize(
 
   flatbuffers::FlatBufferBuilder builder(
       nrows * sizeof(ElementType) + nrows + FLATBUFFERS_MIN_BUFFER_SIZE);
-  BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, validity, 
&builder);
+
+  const uint8_t* validity_bitmap = nullptr;
+  std::vector<uint8_t> validity_vector;
+  if (!validity.empty() && std::any_of(validity.begin(), validity.end(),
+                                       [&](bool e) { return !e; })) {
+    validity_vector.resize(BitmapSize(nrows));
+    VectorToBitmap(validity, validity_vector.data());
+    validity_bitmap = validity_vector.data();
+  }
+  BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, 
validity_bitmap, &builder);
   DCHECK(builder.GetBufferPointer());
 
   // TODO(aserbin): would it be better to copy the data from the builder
@@ -142,7 +163,6 @@ Status SerializeIntoArena(
     size_t nrows,
     Arena* arena,
     Slice* out) {
-  static const std::vector<bool> kAllValid{};
   typedef typename DataTypeTraits<KUDU_DATA_TYPE>::cpp_type ElementType;
 
   DCHECK(arena);
@@ -150,9 +170,7 @@ Status SerializeIntoArena(
 
   flatbuffers::FlatBufferBuilder builder(
       nrows * sizeof(ElementType) + nrows + FLATBUFFERS_MIN_BUFFER_SIZE);
-  const std::vector<bool>& validity =
-      validity_bitmap ? BitmapToVector(validity_bitmap, nrows) : kAllValid;
-  BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, validity, 
&builder);
+  BuildFlatbuffers<KUDU_DATA_TYPE, FB_TYPE>(column_data, nrows, 
validity_bitmap, &builder);
 
   // Copy the serialized data into the arena.
   //
diff --git a/src/kudu/common/serdes/array1d.fbs 
b/src/kudu/common/serdes/array1d.fbs
index 3f43aaa58..4fc612383 100644
--- a/src/kudu/common/serdes/array1d.fbs
+++ b/src/kudu/common/serdes/array1d.fbs
@@ -52,14 +52,14 @@ union ScalarArray {
 // This is to represent a one-dimensional array of a particular scalar type
 // where some of the array's elements might be null/non-valid. The 'data'
 // field is a placeholder for the array's elements, and the 'validity' field
-// contains information on their validity/non-nullness: 'true' means 'valid',
-// 'false' means 'non-valid' (a.k.a. 'null').
-//
-// TODO(aserbin): consider storing 1 bit per data element in the 'validity'
-//                field instead of using whole 'ubyte/uint8'
+// contains information on their validity/non-nullness: one bit per element
+// in 'data'.  A set bit ('1') in the 'validity' field means 'non-null'
+// (a.k.a. 'valid'), 'false' means 'null' (a.k.a. 'non-valid'). If all the
+// elements in the 'data' field are non-null/valid, the 'validity' field
+// may be empty or absent to optimize for memory/space.
 table Content {
   data:ScalarArray;
-  validity:[bool];
+  validity:[uint8];
 }
 
 root_type Content;
diff --git a/src/kudu/util/bitmap-test.cc b/src/kudu/util/bitmap-test.cc
index e2a33e17b..fe613912e 100644
--- a/src/kudu/util/bitmap-test.cc
+++ b/src/kudu/util/bitmap-test.cc
@@ -386,12 +386,19 @@ TEST(TestBitMap, TestCopy) {
 TEST(TestBitMap, BitmapToVector) {
   constexpr size_t kNumBytes = 8;
   constexpr size_t kNumBits = kNumBytes * 8;
-  constexpr uint8_t kAllZeroes[kNumBytes] = { 0 };
+  constexpr uint8_t kAllZeroes[kNumBytes] = { 0x00 };
+  constexpr uint8_t kAllOnes[kNumBytes] =
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
   {
     const auto v = BitmapToVector(kAllZeroes, kNumBits);
     ASSERT_EQ(kNumBits, v.size());
     ASSERT_TRUE(std::all_of(v.begin(), v.end(), [](bool e) { return !e; }));
+
+    ASSERT_EQ(kNumBytes, BitmapSize(kNumBits));
+    uint8_t buf[BitmapSize(kNumBits)] = { 0xff };
+    ASSERT_EQ(kNumBits, VectorToBitmap(v, buf));
+    ASSERT_EQ(0, memcmp(kAllZeroes, buf, sizeof(kAllZeroes)));
   }
   {
     uint8_t input[kNumBytes];
@@ -401,6 +408,11 @@ TEST(TestBitMap, BitmapToVector) {
     const auto v = BitmapToVector(input, kNumBits);
     ASSERT_EQ(kNumBits, v.size());
     ASSERT_TRUE(std::all_of(v.begin(), v.end(), [](bool e) { return e; }));
+
+    ASSERT_EQ(kNumBytes, BitmapSize(kNumBits));
+    uint8_t buf[BitmapSize(kNumBits)] = { 0x00 };
+    ASSERT_EQ(kNumBits, VectorToBitmap(v, buf));
+    ASSERT_EQ(0, memcmp(kAllOnes, buf, sizeof(kAllOnes)));
   }
   {
     uint8_t input[kNumBytes];
@@ -412,12 +424,21 @@ TEST(TestBitMap, BitmapToVector) {
     auto it_half = v.begin() + (v.size() / 2);
     ASSERT_TRUE(std::all_of(v.begin(), it_half, [](bool e) { return e; }));
     ASSERT_TRUE(std::all_of(it_half, v.end(), [](bool e) { return !e; }));
+
+    uint8_t output[kNumBytes] = { 0 };
+    ASSERT_EQ(kNumBits, VectorToBitmap(v, output));
+    ASSERT_EQ(0, memcmp(input, output, kNumBytes));
   }
   {
     uint8_t input[1] = { 0b00000001 };
     const auto v = BitmapToVector(input, 1);
     ASSERT_EQ(1, v.size());
     ASSERT_TRUE(v.front());
+
+    ASSERT_EQ(1, BitmapSize(1));
+    uint8_t output[1] = { 0x00 };
+    ASSERT_EQ(1, VectorToBitmap(v, output));
+    ASSERT_EQ(0, memcmp(input, output, 1));
   }
   {
     uint8_t input[1] = { 0b10000001 };
@@ -426,6 +447,11 @@ TEST(TestBitMap, BitmapToVector) {
     ASSERT_TRUE(v.front());
     ASSERT_TRUE(v.back());
     ASSERT_TRUE(std::all_of(v.begin() + 1, v.end() - 1, [](bool e) { return 
!e; }));
+
+    ASSERT_EQ(1, BitmapSize(8));
+    uint8_t output[1] = { 0x00 };
+    ASSERT_EQ(8, VectorToBitmap(v, output));
+    ASSERT_EQ(0, memcmp(input, output, 1));
   }
 }
 
diff --git a/src/kudu/util/bitmap.cc b/src/kudu/util/bitmap.cc
index 17c6e268c..20b583c48 100644
--- a/src/kudu/util/bitmap.cc
+++ b/src/kudu/util/bitmap.cc
@@ -169,6 +169,19 @@ vector<bool> BitmapToVector(const uint8_t* bitmap, size_t 
num_bits) {
   return result;
 }
 
+size_t VectorToBitmap(const std::vector<bool>& v, uint8_t* bitmap) {
+  DCHECK(bitmap);
+  const size_t num_bits = v.size();
+  for (size_t idx = 0; idx < num_bits; ++idx) {
+    if (v[idx]) {
+      BitmapSet(bitmap, idx);
+    } else {
+      BitmapClear(bitmap, idx);
+    }
+  }
+  return num_bits;
+}
+
 string BitmapToString(const uint8_t* bitmap, size_t num_bits) {
   string s;
   size_t index = 0;
diff --git a/src/kudu/util/bitmap.h b/src/kudu/util/bitmap.h
index c3feb92f8..9a520b6e3 100644
--- a/src/kudu/util/bitmap.h
+++ b/src/kudu/util/bitmap.h
@@ -33,7 +33,7 @@
 namespace kudu {
 
 // Return the number of bytes necessary to store the given number of bits.
-inline size_t BitmapSize(size_t num_bits) {
+constexpr size_t BitmapSize(size_t num_bits) {
   return (num_bits + 7) / 8;
 }
 
@@ -126,6 +126,7 @@ void BitmapCopy(uint8_t* dst, size_t dst_offset,
                 size_t num_bits);
 
 std::vector<bool> BitmapToVector(const uint8_t* bitmap, size_t num_bits);
+size_t VectorToBitmap(const std::vector<bool>& v, uint8_t* bitmap);
 
 std::string BitmapToString(const uint8_t* bitmap, size_t num_bits);
 

Reply via email to