pitrou commented on code in PR #45459:
URL: https://github.com/apache/arrow/pull/45459#discussion_r2068140399


##########
cpp/src/parquet/geospatial/statistics.h:
##########
@@ -0,0 +1,194 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet::geospatial {
+
+/// \brief The maximum number of dimensions represented by a geospatial type
+/// (i.e., X, Y, Z, and M)
+inline constexpr int kMaxDimensions = 4;
+
+/// \brief NaN, used to represent bounds for which predicate pushdown cannot
+/// be applied (e.g., because a writer did not provide bounds for a given dimension)
+inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN();
+
+/// \brief Structure representing encoded statistics to be written to and read
+/// from Parquet serialized metadata.
+///
+/// See the Parquet Thrift definition and GeoStatistics for the specific 
definition
+/// of field values.
+struct PARQUET_EXPORT EncodedGeoStatistics {
+  // A default-constructed value carries no statistics: every *_present flag is
+  // false, every bound is kNaN, and geospatial_types is empty.
+
+  // X/Y bounds. NOTE(review): xy_bounds_present presumably indicates whether
+  // xmin/xmax/ymin/ymax were populated -- confirm against the encoder/decoder.
+  bool xy_bounds_present{false};
+  double xmin{kNaN};
+  double xmax{kNaN};
+  double ymin{kNaN};
+  double ymax{kNaN};
+
+  // Z bounds. NOTE(review): z_bounds_present presumably indicates whether
+  // zmin/zmax were populated -- confirm against the encoder/decoder.
+  bool z_bounds_present{false};
+  double zmin{kNaN};
+  double zmax{kNaN};
+
+  // M bounds. NOTE(review): m_bounds_present presumably indicates whether
+  // mmin/mmax were populated -- confirm against the encoder/decoder.
+  bool m_bounds_present{false};
+  double mmin{kNaN};
+  double mmax{kNaN};
+
+  /// \brief Return true if geospatial_types contains at least one entry
+  bool geospatial_types_present() const { return !geospatial_types.empty(); }
+  // NOTE(review): presumably the geometry type codes described by the Parquet
+  // Thrift definition -- confirm. Also note that <vector> is not among the
+  // standard headers included at the top of this file; std::vector is presumably
+  // reaching this header transitively -- consider including it directly.
+  std::vector<int32_t> geospatial_types;
+};
+
+class GeoStatisticsImpl;
+
+/// \brief Base type for computing geospatial column statistics while writing 
a file
+/// or representing them when reading a file
+///
+/// Note that NaN values that were encountered within coordinates are omitted; 
however,
+/// NaN values that were obtained via decoding encoded statistics are 
propagated. This
+/// behaviour ensures C++ clients that are inspecting statistics via the 
column metadata
+/// can detect the case where a writer generated NaNs (even though this 
implementation
+/// does not generate them).
+///
+/// The handling of NaN values in coordinates is not well-defined among 
bounding
+/// implementations except for the WKB convention for POINT EMPTY, which is 
consistently
+/// represented as a point whose ordinates are all NaN. Any other geometry 
that contains
+/// NaNs cannot expect defined behaviour here or elsewhere; however, a row 
group that
+/// contains both NaN-containing and normal (completely finite) geometries 
should not be
+/// excluded from predicate pushdown.
+///
+/// EXPERIMENTAL
+class PARQUET_EXPORT GeoStatistics {
+ public:
+  GeoStatistics();
+  explicit GeoStatistics(const EncodedGeoStatistics& encoded);
+
+  ~GeoStatistics();
+
+  /// \brief Return true if bounds, geometry types, and validity are identical
+  bool Equals(const GeoStatistics& other) const;
+
+  /// \brief Update these statistics based on previously calculated or decoded 
statistics
+  ///
+  /// Merging statistics with wraparound X values is not currently supported. 
Merging
+  /// two GeoStatistics where one or both has a wraparound X range will result 
in these
+  /// statistics having an X dimension marked as invalid.
+  void Merge(const GeoStatistics& other);
+
+  /// \brief Update these statistics based on values
+  void Update(const ByteArray* values, int64_t num_values);
+
+  /// \brief Update these statistics based on the non-null elements of values
+  void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+                    int64_t valid_bits_offset, int64_t num_spaced_values,
+                    int64_t num_values);
+
+  /// \brief Update these statistics based on the non-null elements of values
+  ///
+  /// Currently, BinaryArray and LargeBinaryArray input is supported.
+  void Update(const ::arrow::Array& values);
+
+  /// \brief Return these statistics to an empty state
+  void Reset();
+
+  /// \brief Encode the statistics for serializing to Thrift
+  ///
+  /// If invalid WKB was encountered or if the statistics contain NaN
+  /// for any reason, Encode() will return nullopt to indicate that
+  /// statistics should not be written to thrift.
+  std::optional<EncodedGeoStatistics> Encode() const;
+
+  /// \brief Returns false if invalid WKB was encountered
+  bool is_valid() const;
+
+  /// \brief Reset existing statistics and populate them from 
previously-encoded ones
+  void Decode(const EncodedGeoStatistics& encoded);
+
+  /// \brief Minimum values in XYZM order
+  ///
+  /// For dimensions where dimension_valid() is false, the value will be NaN. 
For
+  /// dimensions where dimension_empty() is true, the value will be +Inf.
+  ///
+  /// For the first dimension (X) only, wraparound bounds apply where xmin > 
xmax. In this
+  /// case, these bounds represent the union of the intervals [xmax, Inf] and 
[-Inf,
+  /// xmin]. This implementation does not yet generate these types of bounds 
but they may
+  /// be encountered in statistics when reading a Parquet file.
+  std::array<double, kMaxDimensions> lower_bound() const;
+
+  /// \brief Maximum values in XYZM order
+  ///
+  /// For dimensions where dimension_valid() is false, the value will be NaN. 
For
+  /// dimensions where dimension_empty() is true, the value will be -Inf.
+  ///
+  /// For the first dimension (X) only, wraparound bounds apply where xmin > 
xmax. In this
+  /// case, these bounds represent the union of the intervals [xmax, Inf] and 
[-Inf,
+  /// xmin]. This implementation does not yet generate these types of bounds 
but they may
+  /// be encountered in statistics when reading a Parquet file.
+  std::array<double, kMaxDimensions> upper_bound() const;
+
+  /// \brief Dimension emptiness in XYZM order
+  ///
+  /// True for a given dimension if and only if zero non-NaN values were 
encountered
+  /// in that dimension and dimension_valid() is true for that dimension.

Review Comment:
   @paleolimbot You misunderstood this comment.
   
   If a method is named `dimension_empty`, then returning `true` should mean the
dimension is "empty" and returning `false` should mean the dimension is not
"empty" (regardless of the meaning).
   
   This method does the reverse, can you change it in a followup PR? 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to