wgtmac commented on code in PR #45459:
URL: https://github.com/apache/arrow/pull/45459#discussion_r2038875059


##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1230,11 +1242,19 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
     current_dict_encoder_ =
         dynamic_cast<DictEncoder<ParquetType>*>(current_encoder_.get());
 
+    // Geometry/Geography are the first non-nested logical types to have a
+    // SortOrder::UNKNOWN. Currently, the presence of statistics is tied to
+    // having a known sort order and so null counts will be missing.
     if (properties->statistics_enabled(descr_->path()) &&
         (SortOrder::UNKNOWN != descr_->sort_order())) {
       page_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
       chunk_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
+    } else if (properties->statistics_enabled(descr_->path()) &&

Review Comment:
   What above the below to avoid checking `statistics_enabled` twice:
   
   ```
       if (properties->statistics_enabled(descr_->path())) {
         if (SortOrder::UNKNOWN != descr_->sort_order()) {
           page_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
           chunk_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
         }
         if (descr_->logical_type() != nullptr &&
             descr_->logical_type()->is_geometry()) {
           chunk_geospatial_statistics_ = std::make_shared<GeoStatistics>();
         }
       }
   ```



##########
cpp/src/parquet/geospatial/statistics.h:
##########
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// \brief Structure represented encoded statistics to be written to and read 
from Parquet
+/// serialized metadata.
+///
+/// See the Parquet Thrift definition and GeoStatistics for the specific 
definition
+/// of field values.
+struct PARQUET_EXPORT EncodedGeoStatistics {
+  static constexpr double kInf = std::numeric_limits<double>::infinity();
+
+  double xmin{kInf};
+  double xmax{-kInf};
+  double ymin{kInf};
+  double ymax{-kInf};
+  double zmin{kInf};
+  double zmax{-kInf};
+  double mmin{kInf};
+  double mmax{-kInf};
+  std::vector<int32_t> geospatial_types;
+
+  bool has_x() const { return !std::isinf(xmin - xmax); }
+  bool has_y() const { return !std::isinf(ymin - ymax); }
+  bool has_z() const { return !std::isinf(zmin - zmax); }
+  bool has_m() const { return !std::isinf(mmin - mmax); }
+
+  bool is_empty() const {
+    return !geospatial_types.empty() || has_x() || has_y() || has_z() || 
has_m();

Review Comment:
   ```suggestion
       return !(!geospatial_types.empty() || has_x() || has_y() || has_z() || 
has_m());
   ```



##########
cpp/src/parquet/metadata.cc:
##########
@@ -309,6 +321,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
                                                  descr_->sort_order());
   }
 
+  inline bool is_geo_stats_set() const {
+    if (possible_geo_stats_ == nullptr &&
+        column_metadata_->__isset.geospatial_statistics) {
+      possible_geo_stats_ = MakeColumnGeometryStats(*column_metadata_, descr_);

Review Comment:
   Is it worth adding a mutex for both `is_stats_set` and `is_geo_stats_set`?



##########
cpp/src/parquet/geospatial/util_json_internal.cc:
##########
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/geospatial/util_json_internal.h"
+
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/json/rapidjson_defs.h"  // IWYU pragma: keep
+#include "arrow/result.h"
+#include "arrow/util/string.h"
+
+#include <rapidjson/document.h>
+#include <rapidjson/writer.h>
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace {
+::arrow::Result<std::shared_ptr<const LogicalType>> ParseGeoArrowJSON(
+    std::string_view serialized_data);
+}
+
+::arrow::Result<std::shared_ptr<const LogicalType>> 
LogicalTypeFromGeoArrowMetadata(
+    std::string_view serialized_data) {
+  // Handle a few hard-coded cases so that the tests can run/users can write 
the default
+  // LogicalType::Geometry() even if ARROW_JSON is not defined.
+  if (serialized_data.empty() || serialized_data == "{}") {
+    return LogicalType::Geometry();
+  } else if (
+      serialized_data ==
+      R"({"edges": "spherical", "crs": "OGC:CRS84", "crs_type": 
"authority_code"})") {

Review Comment:
   Does it require some sort of normalization just in case there are spaces or 
keys are in a different order?



##########
cpp/src/parquet/geospatial/statistics_test.cc:
##########
@@ -0,0 +1,307 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/compute/api.h"
+#include "arrow/testing/gtest_util.h"
+
+#include "parquet/geospatial/statistics.h"
+#include "parquet/test_util.h"
+
+static constexpr double kInf = std::numeric_limits<double>::infinity();
+
+namespace parquet::geometry {

Review Comment:
   Why some files (e.g. geospatial/statistics.h/.cc) are in the `namespace 
parquet` while others are in the `namespace parquet::geometry`? Should we be 
consistent? And should it be `namespace parquet::geospatial` instead if we want 
a nested one?



##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -373,6 +385,8 @@ add_parquet_test(internals-test
                  statistics_test.cc
                  encoding_test.cc
                  metadata_test.cc
+                 geospatial/statistics_test.cc

Review Comment:
   Sort alphabetically?



##########
cpp/src/parquet/geospatial/util_internal.h:
##########
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/logging_internal.h"
+#include "parquet/platform.h"
+
+namespace parquet::geometry {
+
+/// \brief Infinity, used to define bounds of empty bounding boxes
+constexpr double kInf = std::numeric_limits<double>::infinity();

Review Comment:
   It has a duplicate definition in the `statistics.h`



##########
cpp/src/parquet/parquet.thrift:
##########
@@ -281,7 +281,7 @@ struct Statistics {
     */
    1: optional binary max;
    2: optional binary min;
-   /** 
+   /**

Review Comment:
   We can rebase to get rid of changes on these files now.



##########
cpp/src/parquet/geospatial/statistics.cc:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/geospatial/statistics.h"
+
+#include <cmath>
+#include <memory>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/geospatial/util_internal.h"
+
+using arrow::util::SafeLoad;
+
+namespace parquet {
+
+class GeoStatisticsImpl {
+ public:
+  bool Equals(const GeoStatisticsImpl& other) const {
+    return is_valid_ == other.is_valid_ &&
+           bounder_.GeometryTypes() == other.bounder_.GeometryTypes() &&
+           bounder_.Bounds() == other.bounder_.Bounds();
+  }
+
+  void Merge(const GeoStatisticsImpl& other) {
+    is_valid_ = is_valid_ && other.is_valid_;
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x() || other.is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not supported by 
GeoStatistics::Merge()");
+    }
+
+    bounder_.MergeBox(other.bounder_.Bounds());
+    std::vector<int32_t> other_geometry_types = other.bounder_.GeometryTypes();
+    bounder_.MergeGeometryTypes(other_geometry_types);
+  }
+
+  void Update(const ByteArray* values, int64_t num_values) {
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    for (int64_t i = 0; i < num_values; i++) {
+      const ByteArray& item = values[i];
+      try {
+        bounder_.MergeGeometry({reinterpret_cast<const char*>(item.ptr), 
item.len});
+      } catch (ParquetException&) {
+        is_valid_ = false;
+        return;
+      }
+    }
+  }
+
+  void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+                    int64_t valid_bits_offset, int64_t num_spaced_values,
+                    int64_t num_values) {
+    DCHECK_GT(num_spaced_values, 0);
+
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    ::arrow::Status status = ::arrow::internal::VisitSetBitRuns(
+        valid_bits, valid_bits_offset, num_spaced_values,
+        [&](int64_t position, int64_t length) {
+          for (int64_t i = 0; i < length; i++) {
+            ByteArray item = SafeLoad(values + i + position);
+            PARQUET_CATCH_NOT_OK(bounder_.MergeGeometry(
+                {reinterpret_cast<const char*>(item.ptr), item.len}));
+          }
+
+          return ::arrow::Status::OK();
+        });
+
+    if (!status.ok()) {
+      is_valid_ = false;
+    }
+  }
+
+  void Update(const ::arrow::Array& values) {
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    // Note that ::arrow::Type::EXTENSION seems to be handled before this is 
called
+    switch (values.type_id()) {
+      case ::arrow::Type::BINARY:
+        UpdateArrayImpl<::arrow::BinaryArray>(values);
+        break;
+      case ::arrow::Type::LARGE_BINARY:
+        UpdateArrayImpl<::arrow::LargeBinaryArray>(values);
+        break;
+      // This does not currently handle run-end encoded, dictionary encodings, 
or views
+      default:
+        throw ParquetException("Unsupported Array type in 
GeoStatistics::Update(Array): ",
+                               values.type()->ToString());
+    }
+  }
+
+  void Reset() {
+    bounder_.Reset();
+    is_valid_ = true;
+  }
+
+  EncodedGeoStatistics Encode() const {
+    if (!is_valid_) {
+      return {};
+    }
+
+    const geometry::BoundingBox::XYZM& mins = bounder_.Bounds().min;
+    const geometry::BoundingBox::XYZM& maxes = bounder_.Bounds().max;
+
+    EncodedGeoStatistics out;
+    out.geospatial_types = bounder_.GeometryTypes();
+
+    if (!bound_empty(0)) {
+      out.xmin = mins[0];
+      out.xmax = maxes[0];
+    }
+
+    if (!bound_empty(1)) {
+      out.ymin = mins[1];
+      out.ymax = maxes[1];
+    }
+
+    if (!bound_empty(2)) {
+      out.zmin = mins[2];
+      out.zmax = maxes[2];
+    }
+
+    if (!bound_empty(3)) {
+      out.mmin = mins[3];
+      out.mmax = maxes[3];
+    }
+
+    return out;
+  }
+
+  void Update(const EncodedGeoStatistics& encoded) {
+    if (!is_valid_) {
+      return;
+    }
+
+    // We can create GeoStatistics from a wraparound bounding box, but we can't
+    // update an existing one because the merge logic is not yet implemented.
+    if (!bounds_empty() &&
+        (is_wraparound_x() || is_wraparound(encoded.xmin, encoded.xmax) ||
+         is_wraparound(encoded.ymin, encoded.ymax))) {

Review Comment:
   ```suggestion
           (is_wraparound_x() || is_wraparound(encoded.xmin, encoded.xmax))) {
   ```



##########
cpp/src/parquet/geospatial/statistics.cc:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/geospatial/statistics.h"
+
+#include <cmath>
+#include <memory>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/geospatial/util_internal.h"
+
+using arrow::util::SafeLoad;
+
+namespace parquet {
+
+class GeoStatisticsImpl {
+ public:
+  bool Equals(const GeoStatisticsImpl& other) const {
+    return is_valid_ == other.is_valid_ &&
+           bounder_.GeometryTypes() == other.bounder_.GeometryTypes() &&
+           bounder_.Bounds() == other.bounder_.Bounds();
+  }
+
+  void Merge(const GeoStatisticsImpl& other) {
+    is_valid_ = is_valid_ && other.is_valid_;
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x() || other.is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not supported by 
GeoStatistics::Merge()");
+    }
+
+    bounder_.MergeBox(other.bounder_.Bounds());
+    std::vector<int32_t> other_geometry_types = other.bounder_.GeometryTypes();
+    bounder_.MergeGeometryTypes(other_geometry_types);
+  }
+
+  void Update(const ByteArray* values, int64_t num_values) {
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    for (int64_t i = 0; i < num_values; i++) {
+      const ByteArray& item = values[i];
+      try {
+        bounder_.MergeGeometry({reinterpret_cast<const char*>(item.ptr), 
item.len});
+      } catch (ParquetException&) {
+        is_valid_ = false;
+        return;
+      }
+    }
+  }
+
+  void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+                    int64_t valid_bits_offset, int64_t num_spaced_values,
+                    int64_t num_values) {
+    DCHECK_GT(num_spaced_values, 0);
+
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    ::arrow::Status status = ::arrow::internal::VisitSetBitRuns(
+        valid_bits, valid_bits_offset, num_spaced_values,
+        [&](int64_t position, int64_t length) {
+          for (int64_t i = 0; i < length; i++) {
+            ByteArray item = SafeLoad(values + i + position);
+            PARQUET_CATCH_NOT_OK(bounder_.MergeGeometry(
+                {reinterpret_cast<const char*>(item.ptr), item.len}));
+          }
+
+          return ::arrow::Status::OK();
+        });
+
+    if (!status.ok()) {
+      is_valid_ = false;
+    }
+  }
+
+  void Update(const ::arrow::Array& values) {
+    if (!is_valid_) {
+      return;
+    }
+
+    if (is_wraparound_x()) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    // Note that ::arrow::Type::EXTENSION seems to be handled before this is 
called
+    switch (values.type_id()) {
+      case ::arrow::Type::BINARY:
+        UpdateArrayImpl<::arrow::BinaryArray>(values);
+        break;
+      case ::arrow::Type::LARGE_BINARY:
+        UpdateArrayImpl<::arrow::LargeBinaryArray>(values);
+        break;
+      // This does not currently handle run-end encoded, dictionary encodings, 
or views
+      default:
+        throw ParquetException("Unsupported Array type in 
GeoStatistics::Update(Array): ",
+                               values.type()->ToString());
+    }
+  }
+
+  void Reset() {
+    bounder_.Reset();
+    is_valid_ = true;
+  }
+
+  EncodedGeoStatistics Encode() const {
+    if (!is_valid_) {
+      return {};
+    }
+
+    const geometry::BoundingBox::XYZM& mins = bounder_.Bounds().min;
+    const geometry::BoundingBox::XYZM& maxes = bounder_.Bounds().max;
+
+    EncodedGeoStatistics out;
+    out.geospatial_types = bounder_.GeometryTypes();
+
+    if (!bound_empty(0)) {
+      out.xmin = mins[0];
+      out.xmax = maxes[0];
+    }
+
+    if (!bound_empty(1)) {
+      out.ymin = mins[1];
+      out.ymax = maxes[1];
+    }
+
+    if (!bound_empty(2)) {
+      out.zmin = mins[2];
+      out.zmax = maxes[2];
+    }
+
+    if (!bound_empty(3)) {
+      out.mmin = mins[3];
+      out.mmax = maxes[3];
+    }
+
+    return out;
+  }
+
+  void Update(const EncodedGeoStatistics& encoded) {
+    if (!is_valid_) {
+      return;
+    }
+
+    // We can create GeoStatistics from a wraparound bounding box, but we can't
+    // update an existing one because the merge logic is not yet implemented.
+    if (!bounds_empty() &&
+        (is_wraparound_x() || is_wraparound(encoded.xmin, encoded.xmax) ||
+         is_wraparound(encoded.ymin, encoded.ymax))) {
+      throw ParquetException("Wraparound X is not suppored by 
GeoStatistics::Update()");
+    }
+
+    geometry::BoundingBox box;
+
+    if (encoded.has_x()) {
+      box.min[0] = encoded.xmin;
+      box.max[0] = encoded.xmax;
+    }
+
+    if (encoded.has_y()) {
+      box.min[1] = encoded.ymin;
+      box.max[1] = encoded.ymax;
+    }
+
+    if (encoded.has_z()) {
+      box.min[2] = encoded.zmin;
+      box.max[2] = encoded.zmax;
+    }
+
+    if (encoded.has_m()) {
+      box.min[3] = encoded.mmin;
+      box.max[3] = encoded.mmax;
+    }
+
+    bounder_.MergeBox(box);
+    bounder_.MergeGeometryTypes(encoded.geospatial_types);
+  }
+
+  bool is_wraparound_x() const {
+    return is_wraparound(lower_bound()[0], upper_bound()[0]);
+  }
+
+  bool is_valid() const { return is_valid_; }
+
+  bool bounds_empty() const {
+    for (int i = 0; i < 4; i++) {

Review Comment:
   Should we declare `constexpr int kNumDimensions = 4` and use it here and 
below for `std::array<double, 4>`?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -788,6 +788,9 @@ class ColumnWriterImpl {
   // Plain-encoded statistics of the whole chunk
   virtual StatisticsPair GetChunkStatistics() = 0;
 
+  // Plain-encoded geometry statistics of the whole chunk

Review Comment:
   ```suggestion
     // Geospatial statistics of the whole chunk
   ```
   
   This has nothing to do with plain encoding.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to