felipecrv commented on code in PR #34408:
URL: https://github.com/apache/arrow/pull/34408#discussion_r1123232448


##########
cpp/src/arrow/array/data.h:
##########
@@ -229,15 +256,88 @@ struct ARROW_EXPORT ArrayData {
 
   void SetNullCount(int64_t v) { null_count.store(v); }
 
-  /// \brief Return null count, or compute and set it if it's not known
+  /// \brief Return physical null count, or compute and set it if it's not known
   int64_t GetNullCount() const;
 
+  /// \brief Return true if the data has a validity bitmap and the physical null
+  /// count is known to be non-zero or not yet known.
+  ///
+  /// Note that this is not the same as MayHaveLogicalNulls, which also checks
+  /// for the presence of nulls in child data for types like unions and run-end
+  /// encoded types.
+  ///
+  /// \see HasValidityBitmap
+  /// \see MayHaveLogicalNulls
   bool MayHaveNulls() const {
     // If an ArrayData is slightly malformed it may have kUnknownNullCount set
     // but no buffer
     return null_count.load() != 0 && buffers[0] != NULLPTR;
   }
 
+  /// \brief Return true if the data has a validity bitmap
+  bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
+
+  /// \brief Return true if the validity bitmap may have 0's in it, or if the
+  /// child arrays (in the case of types without a validity bitmap) may have
+  /// nulls
+  ///
+  /// This is not a drop-in replacement for MayHaveNulls, as historically
+  /// MayHaveNulls() has been used to check for the presence of a validity
+  /// bitmap that needs to be checked.
+  ///
+  /// Code that previously used MayHaveNulls() and then dealt with the validity
+  /// bitmap directly can be fixed to handle all types correctly without
+  /// performance degradation when handling most types by adopting
+  /// HasValidityBitmap and MayHaveLogicalNulls.
+  ///
+  /// BEFORE:
+  ///
+  ///   uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+  ///   for (int64_t i = 0; i < array.length; ++i) {
+  ///     if (validity && !bit_util::GetBit(validity, i)) {
+  ///       continue;  // skip a NULL
+  ///     }
+  ///     ...
+  ///   }
+  ///
+  /// AFTER:
+  ///
+  ///   bool all_valid = !array.MayHaveLogicalNulls();
+  ///   uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
+  ///   for (int64_t i = 0; i < array.length; ++i) {
+  ///     bool is_valid = all_valid ||
+  ///                     (validity && bit_util::GetBit(validity, i)) ||
+  ///                     array.IsValid(i);
+  ///     if (!is_valid) {
+  ///       continue;  // skip a NULL
+  ///     }
+  ///     ...
+  ///   }
+  bool MayHaveLogicalNulls() const {
+    if (buffers[0] != NULLPTR) {
+      return null_count.load() != 0;
+    }
+    const auto t = type->id();
+    if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
+      return internal::UnionMayHaveLogicalNulls(*this);
+    }
+    if (t == Type::RUN_END_ENCODED) {
+      return internal::RunEndEncodedMayHaveLogicalNulls(*this);
+    }
+    return false;
+  }
+
+  /// \brief Computes the logical null count for arrays of all types including
+  /// those that do not have a validity bitmap like union and run-end encoded
+  /// arrays
+  ///
+  /// If the array has a validity bitmap, this function behaves the same as
+  /// GetNullCount. For types that have no validity bitmap, this function will
+  /// recompute the null count every time it is called.

Review Comment:
   It would require adding a field to all these classes and updating it correctly when slicing arrays (resetting it back to kUnknownNullCount). That adds more to think about in these central classes. I think it's premature to cache this value; it can be done later if we see it becoming an issue in practice.
   
   For most types, this re-computation just returns the cached `null_count_`, and, depending on how the other types implement the calculation, they will benefit from the `null_count_` cache in child arrays.
   
   In the case of REEs, if we scan the validity bitmap of the values array and get a null_count of 0, we won't have to do any processing when `ComputeLogicalNullCount` is called again. And if REE compression is effective, the values array is much smaller than the logical length, making the re-computation cheap.
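   
   And a rough sketch of the REE case (the child layout and helper name here are assumptions for illustration, not the PR's implementation):

```cpp
// Hypothetical sketch: for a run-end encoded array, only the values child
// (assumed here to be child_data[1], with run ends at child_data[0]) needs to
// be inspected, and its length is the number of runs, not the logical length
// of the array. Offset/length (slicing) handling is elided.
#include <cstdint>

#include "arrow/array/data.h"

int64_t ReeLogicalNullCountSketch(const arrow::ArrayData& ree_data) {
  const arrow::ArrayData& values = *ree_data.child_data[1];
  if (values.GetNullCount() == 0) {
    // GetNullCount() caches its result, so once we know the values array has
    // no nulls, repeated logical-null-count calls return 0 without any work.
    return 0;
  }
  // Otherwise each null value run contributes its run length; that walk is
  // bounded by the number of runs, which is small when compression is
  // effective.
  // ... run-end walk elided in this sketch ...
  return arrow::kUnknownNullCount;
}
```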


