felipecrv commented on code in PR #34408:
URL: https://github.com/apache/arrow/pull/34408#discussion_r1128724311


##########
cpp/src/arrow/array/data.h:
##########
@@ -229,15 +256,88 @@ struct ARROW_EXPORT ArrayData {
 
   void SetNullCount(int64_t v) { null_count.store(v); }
 
-  /// \brief Return null count, or compute and set it if it's not known
+  /// \brief Return physical null count, or compute and set it if it's not 
known
   int64_t GetNullCount() const;
 
+  /// \brief Return true if the data has a validity bitmap and the physical 
null
+  /// count is known to be non-zero or not yet known.
+  ///
+  /// Note that this is not the same as MayHaveLogicalNulls, which also checks
+  /// for the presence of nulls in child data for types like unions and run-end
+  /// encoded types.
+  ///
+  /// \see HasValidityBitmap
+  /// \see MayHaveLogicalNulls
   bool MayHaveNulls() const {
     // If an ArrayData is slightly malformed it may have kUnknownNullCount set
     // but no buffer
     return null_count.load() != 0 && buffers[0] != NULLPTR;
   }
 
+  /// \brief Return true if the data has a validity bitmap
+  bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
+
+  /// \brief Return true if the validity bitmap may have 0's in it, or if the
+  /// child arrays (in the case of types without a validity bitmap) may have
+  /// nulls
+  ///
+  /// This is not a drop-in replacement for MayHaveNulls, as historically
+  /// MayHaveNulls() has been used to check for the presence of a validity
+  /// bitmap that needs to be checked.
+  ///
+  /// Code that previously used MayHaveNulls() and then dealt with the validity
+  /// bitmap directly can be fixed to handle all types correctly without
+  /// performance degradation when handling most types by adopting
+  /// HasValidityBitmap and MayHaveLogicalNulls.
+  ///
+  /// Before:
+  ///
+  ///     uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : 
NULLPTR;
+  ///     for (int64_t i = 0; i < array.length; ++i) {
+  ///       if (validity && !bit_util::GetBit(validity, i)) {
+  ///         continue;  // skip a NULL
+  ///       }
+  ///       ...
+  ///     }
+  ///
+  /// After:
+  ///
+  ///     bool all_valid = !array.MayHaveLogicalNulls();
+  ///     uint8_t* validity = array.HasValidityBitmap() ? 
array.buffers[0].data : NULLPTR;
+  ///     for (int64_t i = 0; i < array.length; ++i) {
+  ///       bool is_valid = all_valid ||
+  ///                       (validity && bit_util::GetBit(validity, i)) ||

Review Comment:
   Yes, performance. Avoid all the branches inside `IsValid` and allows the 
compiler to create specializations of the loop.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to