This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 39af73f2ad GH-41909: [C++] Add arrow::ArrayStatistics (#43273)
39af73f2ad is described below

commit 39af73f2ada90b2d66c1410f9c591b25544b711f
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sun Aug 4 16:39:21 2024 +0900

    GH-41909: [C++] Add arrow::ArrayStatistics (#43273)
    
    ### Rationale for this change
    
    We're discussing the API on the mailing list 
https://lists.apache.org/thread/kcpyq9npnh346pw90ljwbg0wxq6hwxxh and GH-41909.
    
    If we have `arrow::ArrayStatistics`, we can attach statistics read from 
Apache Parquet to `arrow::Array`s.
    
    This only includes `arrow::ArrayStatistics`. See GH-42133 for how to use 
`arrow::ArrayStatistics` for Apache Parquet's statistics.
    
    ### What changes are included in this PR?
    
    This only adds `arrow::ArrayStatistics` and its tests.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #41909
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/CMakeLists.txt           |   2 +
 cpp/src/arrow/array/statistics.cc      |  21 +++++++
 cpp/src/arrow/array/statistics.h       |  76 ++++++++++++++++++++++++
 cpp/src/arrow/array/statistics_test.cc | 103 +++++++++++++++++++++++++++++++++
 4 files changed, 202 insertions(+)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6dc8358f50..9c66a58c54 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -412,6 +412,7 @@ arrow_add_object_library(ARROW_ARRAY
                          array/concatenate.cc
                          array/data.cc
                          array/diff.cc
+                         array/statistics.cc
                          array/util.cc
                          array/validate.cc)
 
@@ -1168,6 +1169,7 @@ add_arrow_test(array_test
                array/array_struct_test.cc
                array/array_union_test.cc
                array/array_view_test.cc
+               array/statistics_test.cc
                PRECOMPILED_HEADERS
                "$<$<COMPILE_LANGUAGE:CXX>:arrow/testing/pch.h>")
 
diff --git a/cpp/src/arrow/array/statistics.cc 
b/cpp/src/arrow/array/statistics.cc
new file mode 100644
index 0000000000..b661c9fbaf
--- /dev/null
+++ b/cpp/src/arrow/array/statistics.cc
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This otherwise empty .cc file exists so that the non-inlined symbols
+// of arrow::ArrayStatistics are embedded into libarrow.
+
+#include "arrow/array/statistics.h"
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
new file mode 100644
index 0000000000..7357e27f41
--- /dev/null
+++ b/cpp/src/arrow/array/statistics.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include "arrow/util/float16.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Statistics for an Array
+///
+/// The Apache Arrow format doesn't have statistics, but data sources
+/// such as Apache Parquet may have them. Statistics associated with a
+/// data source can be read through a unified API via this class.
+struct ARROW_EXPORT ArrayStatistics {
+  using ValueType =
+      std::variant<bool, int8_t, uint8_t, int16_t, uint16_t, int32_t, 
uint32_t, int64_t,
+                   uint64_t, util::Float16, float, double, std::string, 
std::string_view>;
+
+  ArrayStatistics() = default;
+  ~ArrayStatistics() = default;
+
+  /// \brief The number of null values, may not be set
+  std::optional<int64_t> null_count = std::nullopt;
+
+  /// \brief The number of distinct values, may not be set
+  std::optional<int64_t> distinct_count = std::nullopt;
+
+  /// \brief The minimum value, may not be set
+  std::optional<ValueType> min = std::nullopt;
+
+  /// \brief Whether the minimum value is exact or not, may not be set
+  std::optional<bool> is_min_exact = std::nullopt;
+
+  /// \brief The maximum value, may not be set
+  std::optional<ValueType> max = std::nullopt;
+
+  /// \brief Whether the maximum value is exact or not, may not be set
+  std::optional<bool> is_max_exact = std::nullopt;
+
+  /// \brief Check two statistics for equality
+  bool Equals(const ArrayStatistics& other) const {
+    return null_count == other.null_count && distinct_count == 
other.distinct_count &&
+           min == other.min && is_min_exact == other.is_min_exact && max == 
other.max &&
+           is_max_exact == other.is_max_exact;
+  }
+
+  /// \brief Check two statistics for equality
+  bool operator==(const ArrayStatistics& other) const { return Equals(other); }
+
+  /// \brief Check two statistics for inequality
+  bool operator!=(const ArrayStatistics& other) const { return !Equals(other); 
}
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/statistics_test.cc 
b/cpp/src/arrow/array/statistics_test.cc
new file mode 100644
index 0000000000..a465ac0bc2
--- /dev/null
+++ b/cpp/src/arrow/array/statistics_test.cc
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/statistics.h"
+
+namespace arrow {
+
+TEST(ArrayStatisticsTest, TestNullCount) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.null_count.has_value());
+  statistics.null_count = 29;
+  ASSERT_TRUE(statistics.null_count.has_value());
+  ASSERT_EQ(29, statistics.null_count.value());
+}
+
+TEST(ArrayStatisticsTest, TestDistinctCount) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.distinct_count.has_value());
+  statistics.distinct_count = 29;
+  ASSERT_TRUE(statistics.distinct_count.has_value());
+  ASSERT_EQ(29, statistics.distinct_count.value());
+}
+
+TEST(ArrayStatisticsTest, TestMin) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.min.has_value());
+  ASSERT_FALSE(statistics.is_min_exact.has_value());
+  statistics.min = static_cast<int32_t>(29);
+  statistics.is_min_exact = true;
+  ASSERT_TRUE(statistics.min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int32_t>(statistics.min.value()));
+  ASSERT_EQ(29, std::get<int32_t>(statistics.min.value()));
+  ASSERT_TRUE(statistics.is_min_exact.has_value());
+  ASSERT_TRUE(statistics.is_min_exact.value());
+}
+
+TEST(ArrayStatisticsTest, TestMax) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.max.has_value());
+  ASSERT_FALSE(statistics.is_max_exact.has_value());
+  statistics.max = std::string("hello");
+  statistics.is_max_exact = false;
+  ASSERT_TRUE(statistics.max.has_value());
+  ASSERT_TRUE(std::holds_alternative<std::string>(statistics.max.value()));
+  ASSERT_EQ("hello", std::get<std::string>(statistics.max.value()));
+  ASSERT_TRUE(statistics.is_max_exact.has_value());
+  ASSERT_FALSE(statistics.is_max_exact.value());
+}
+
+TEST(ArrayStatisticsTest, TestEquality) {
+  ArrayStatistics statistics1;
+  ArrayStatistics statistics2;
+
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.null_count = 29;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.null_count = 29;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.distinct_count = 2929;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.distinct_count = 2929;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.min = std::string_view("world");
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.min = std::string_view("world");
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.is_min_exact = false;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.is_min_exact = false;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.max = arrow::util::Float16(-29);
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.max = arrow::util::Float16(-29);
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.is_max_exact = true;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.is_max_exact = true;
+  ASSERT_EQ(statistics1, statistics2);
+}
+
+}  // namespace arrow

Reply via email to