wgtmac commented on code in PR #45459:
URL: https://github.com/apache/arrow/pull/45459#discussion_r1957011653
##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -171,6 +171,8 @@ set(PARQUET_SRCS
exception.cc
file_reader.cc
file_writer.cc
+ geometry_statistics.cc
Review Comment:
We have renamed `GeometryStatistics` to `GeospatialStatistics` in the
Parquet spec to avoid confusion that it does not support geography type.
Perhaps we should rename the file and class names too.
##########
cpp/src/parquet/types.cc:
##########
@@ -1619,6 +1668,204 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+
+ private:
+ explicit Geometry(std::string crs)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)) {}
+
+ std::string crs_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (!crs_.empty()) {
+ // TODO(paleolimbot): For documented cases the CRS shouldn't contain
quotes,
+ // but we should probably escape the value of crs_ for backslash and quotes
+ // to be safe.
+ json << R"(, "crs": ")" << crs_ << R"(")";
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (!crs_.empty()) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ type.__set_GEOMETRY(geometry_type);
+ return type;
+}
+
+bool LogicalType::Impl::Geometry::Equals(const LogicalType& other) const {
+ if (other.is_geometry()) {
+ const auto& other_geometry = checked_cast<const
GeometryLogicalType&>(other);
+ return crs() == other_geometry.crs();
+ } else {
+ return false;
+ }
+}
+
+const std::string& GeometryLogicalType::crs() const {
+ return (dynamic_cast<const LogicalType::Impl::Geometry&>(*impl_)).crs();
+}
+
+std::shared_ptr<const LogicalType> GeometryLogicalType::Make(std::string crs) {
+ auto* logical_type = new GeometryLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Geometry(std::move(crs)));
+ return std::shared_ptr<const LogicalType>(logical_type);
+}
+
+class LogicalType::Impl::Geography final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeographyLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm() const {
+ return algorithm_;
+ }
+
+ const char* algorithm_name() const {
Review Comment:
```suggestion
std::string_view algorithm_name() const {
```
##########
cpp/src/parquet/thrift_internal.h:
##########
@@ -232,6 +233,29 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1
aesGcmCtrV1) {
aesGcmCtrV1.supply_aad_prefix};
}
+static inline EncodedGeospatialStatistics FromThrift(
+ const format::GeospatialStatistics& geometry_stats) {
Review Comment:
```suggestion
const format::GeospatialStatistics& geospatial_stats) {
```
or `geo_stats` for short?
##########
cpp/src/parquet/metadata.h:
##########
@@ -31,6 +31,20 @@
namespace parquet {
+class ColumnDescriptor;
Review Comment:
Please use `parquet/type_fwd.h` and add missing ones there.
##########
cpp/src/parquet/metadata.h:
##########
@@ -141,8 +155,10 @@ class PARQUET_EXPORT ColumnChunkMetaData {
int64_t num_values() const;
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
bool is_stats_set() const;
+ bool is_geometry_stats_set() const;
std::shared_ptr<Statistics> statistics() const;
std::shared_ptr<SizeStatistics> size_statistics() const;
+ std::shared_ptr<GeospatialStatistics> geometry_statistics() const;
Review Comment:
```suggestion
std::shared_ptr<GeospatialStatistics> geospatial_statistics() const;
```
##########
cpp/src/parquet/metadata.cc:
##########
@@ -111,6 +112,17 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count);
}
+static std::shared_ptr<GeospatialStatistics> MakeColumnGeometryStats(
Review Comment:
```suggestion
static std::shared_ptr<GeospatialStatistics> MakeColumnGeospatialStats(
```
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -110,8 +111,62 @@ Result<std::shared_ptr<ArrowType>>
MakeArrowTimestamp(const LogicalType& logical
}
}
+Result<std::string> MakeGeoArrowCrsMetadata(
+ const std::string& crs,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ std::string srid_prefix{"srid:"};
+ std::string projjson_prefix{"projjson:"};
+
+ if (crs.empty()) {
+ return R"("crs": "OGC:CRS84", "crs_type": "authority_code")";
+ } else if (crs.rfind(srid_prefix, 0) == 0) {
+ return R"("crs": ")" + crs.substr(srid_prefix.size()) + R"(", "crs_type":
"srid")";
+ } else if (crs.rfind(projjson_prefix, 0) == 0) {
+ std::string metadata_field = crs.substr(projjson_prefix.size());
+ if (metadata && metadata->Contains(metadata_field)) {
+ ARROW_ASSIGN_OR_RAISE(std::string projjson_value,
metadata->Get(metadata_field));
+ return R"("crs": )" + projjson_value + R"(, "crs_type": "projjson")";
+ } else {
+ // Pass on the value of the field so the user can sort this out if needed
+ return R"("crs": )" + metadata_field + R"(, "crs_type": "projjson")";
+ }
+ } else {
+ return Status::Invalid("Can't convert invalid Parquet CRS string to
GeoArrow: ", crs);
Review Comment:
Does it mean that we should enforce srid/projjson prefix from the `crs`
string if we want to use GeoArrow extension type? If true, we need to document
this expectation somewhere.
##########
cpp/src/parquet/properties.h:
##########
@@ -1112,6 +1113,12 @@ class PARQUET_EXPORT ArrowWriterProperties {
return this;
}
+ /// Write GEOMETRY and GEOGRAPHY logical types where possible.
Review Comment:
Why do we need this?
##########
cpp/src/parquet/types.cc:
##########
@@ -479,6 +479,38 @@ std::shared_ptr<const LogicalType> LogicalType::FromThrift(
return UUIDLogicalType::Make();
} else if (type.__isset.FLOAT16) {
return Float16LogicalType::Make();
+ } else if (type.__isset.GEOMETRY) {
+ std::string crs;
+ if (type.GEOMETRY.__isset.crs) {
+ crs = type.GEOMETRY.crs;
+ }
+
+ return GeometryLogicalType::Make(crs);
+ } else if (type.__isset.GEOGRAPHY) {
+ std::string crs;
+ if (type.GEOGRAPHY.__isset.crs) {
+ crs = type.GEOGRAPHY.crs;
+ }
+
+ LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm =
+ LogicalType::EdgeInterpolationAlgorithm::UNKNOWN;
+ if (!type.GEOGRAPHY.__isset.algorithm ||
+ type.GEOGRAPHY.algorithm ==
format::EdgeInterpolationAlgorithm::SPHERICAL) {
+ algorithm = LogicalType::EdgeInterpolationAlgorithm::SPHERICAL;
Review Comment:
nit: replace these lines by defining `FromThrift` and `ToThrift` functions
in the `thrift_internal.h`
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -110,8 +111,62 @@ Result<std::shared_ptr<ArrowType>>
MakeArrowTimestamp(const LogicalType& logical
}
}
+Result<std::string> MakeGeoArrowCrsMetadata(
+ const std::string& crs,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ std::string srid_prefix{"srid:"};
+ std::string projjson_prefix{"projjson:"};
Review Comment:
```suggestion
const std::string kSridPrefix{"srid:"};
const std::string kProjjsonPrefix{"projjson:"};
```
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -110,8 +111,62 @@ Result<std::shared_ptr<ArrowType>>
MakeArrowTimestamp(const LogicalType& logical
}
}
+Result<std::string> MakeGeoArrowCrsMetadata(
+ const std::string& crs,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ std::string srid_prefix{"srid:"};
+ std::string projjson_prefix{"projjson:"};
+
+ if (crs.empty()) {
+ return R"("crs": "OGC:CRS84", "crs_type": "authority_code")";
+ } else if (crs.rfind(srid_prefix, 0) == 0) {
+ return R"("crs": ")" + crs.substr(srid_prefix.size()) + R"(", "crs_type":
"srid")";
+ } else if (crs.rfind(projjson_prefix, 0) == 0) {
+ std::string metadata_field = crs.substr(projjson_prefix.size());
+ if (metadata && metadata->Contains(metadata_field)) {
+ ARROW_ASSIGN_OR_RAISE(std::string projjson_value,
metadata->Get(metadata_field));
+ return R"("crs": )" + projjson_value + R"(, "crs_type": "projjson")";
+ } else {
+ // Pass on the value of the field so the user can sort this out if needed
+ return R"("crs": )" + metadata_field + R"(, "crs_type": "projjson")";
+ }
+ } else {
+ return Status::Invalid("Can't convert invalid Parquet CRS string to
GeoArrow: ", crs);
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeGeoArrowGeometryType(
+ const LogicalType& logical_type,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ // Check if we have a registered GeoArrow type to read into
+ std::shared_ptr<::arrow::ExtensionType> maybe_geoarrow_wkb =
+ ::arrow::GetExtensionType("geoarrow.wkb");
+ if (!maybe_geoarrow_wkb) {
+ return ::arrow::binary();
+ }
+
+ if (logical_type.is_geometry()) {
+ const auto& geospatial_type = checked_cast<const
GeometryLogicalType&>(logical_type);
+ ARROW_ASSIGN_OR_RAISE(std::string crs_metadata,
+ MakeGeoArrowCrsMetadata(geospatial_type.crs(),
metadata));
+
+ std::string serialized_data = std::string("{") + crs_metadata + "}";
+ return maybe_geoarrow_wkb->Deserialize(::arrow::binary(), serialized_data);
Review Comment:
CMIW, the GeoArrow extension is responsible for deserializing the json
metadata?
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -20,6 +20,7 @@
#include "arrow/extension/json.h"
#include "arrow/type.h"
+#include "arrow/util/key_value_metadata.h"
#include "parquet/properties.h"
Review Comment:
```suggestion
#include "arrow/extension/json.h"
#include "arrow/type.h"
#include "arrow/util/key_value_metadata.h"
#include "parquet/properties.h"
```
##########
cpp/src/parquet/types.h:
##########
@@ -446,6 +467,30 @@ class PARQUET_EXPORT Float16LogicalType : public
LogicalType {
Float16LogicalType() = default;
};
+class PARQUET_EXPORT GeometryLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(std::string crs = "");
+
+ const std::string& crs() const;
+
+ private:
+ GeometryLogicalType() = default;
+};
+
+class PARQUET_EXPORT GeographyLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(
+ std::string crs = "", LogicalType::EdgeInterpolationAlgorithm::algorithm
algorithm =
+ EdgeInterpolationAlgorithm::SPHERICAL);
+
+ const std::string& crs() const;
+ LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm() const;
+ const char* algorithm_name() const;
Review Comment:
Why do we need this? BTW it is better to use std::string_view IMHO.
Perhaps we can remove it by providing `std::string_view
EdgeInterpolationAlgorithmToString(EdgeInterpolationAlgorithm t)` similar to
other `XXXToString` functions below in this file?
##########
cpp/src/parquet/metadata.h:
##########
@@ -141,8 +155,10 @@ class PARQUET_EXPORT ColumnChunkMetaData {
int64_t num_values() const;
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
bool is_stats_set() const;
+ bool is_geometry_stats_set() const;
Review Comment:
```suggestion
bool is_geo_stats_set() const;
```
##########
cpp/src/parquet/metadata.h:
##########
@@ -440,6 +456,9 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
void SetStatistics(const EncodedStatistics& stats);
void SetSizeStatistics(const SizeStatistics& size_stats);
+ // column geometry statistics
+ void SetGeospatialStatistics(const EncodedGeospatialStatistics&
geometry_stats);
Review Comment:
```suggestion
void SetGeospatialStatistics(const EncodedGeospatialStatistics& geo_stats);
```
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -110,8 +111,62 @@ Result<std::shared_ptr<ArrowType>>
MakeArrowTimestamp(const LogicalType& logical
}
}
+Result<std::string> MakeGeoArrowCrsMetadata(
+ const std::string& crs,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ std::string srid_prefix{"srid:"};
+ std::string projjson_prefix{"projjson:"};
+
+ if (crs.empty()) {
+ return R"("crs": "OGC:CRS84", "crs_type": "authority_code")";
+ } else if (crs.rfind(srid_prefix, 0) == 0) {
+ return R"("crs": ")" + crs.substr(srid_prefix.size()) + R"(", "crs_type":
"srid")";
+ } else if (crs.rfind(projjson_prefix, 0) == 0) {
+ std::string metadata_field = crs.substr(projjson_prefix.size());
+ if (metadata && metadata->Contains(metadata_field)) {
+ ARROW_ASSIGN_OR_RAISE(std::string projjson_value,
metadata->Get(metadata_field));
+ return R"("crs": )" + projjson_value + R"(, "crs_type": "projjson")";
+ } else {
+ // Pass on the value of the field so the user can sort this out if needed
+ return R"("crs": )" + metadata_field + R"(, "crs_type": "projjson")";
+ }
+ } else {
+ return Status::Invalid("Can't convert invalid Parquet CRS string to
GeoArrow: ", crs);
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeGeoArrowGeometryType(
+ const LogicalType& logical_type,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ // Check if we have a registered GeoArrow type to read into
+ std::shared_ptr<::arrow::ExtensionType> maybe_geoarrow_wkb =
+ ::arrow::GetExtensionType("geoarrow.wkb");
+ if (!maybe_geoarrow_wkb) {
+ return ::arrow::binary();
+ }
+
+ if (logical_type.is_geometry()) {
+ const auto& geospatial_type = checked_cast<const
GeometryLogicalType&>(logical_type);
+ ARROW_ASSIGN_OR_RAISE(std::string crs_metadata,
+ MakeGeoArrowCrsMetadata(geospatial_type.crs(),
metadata));
+
+ std::string serialized_data = std::string("{") + crs_metadata + "}";
+ return maybe_geoarrow_wkb->Deserialize(::arrow::binary(), serialized_data);
+ } else {
Review Comment:
Should we check `logical_type.is_geography()` here and throw in the else
branch?
##########
cpp/src/parquet/metadata.cc:
##########
@@ -309,6 +322,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
descr_->sort_order());
}
+ inline bool is_geometry_stats_set() const {
+ DCHECK(writer_version_ != nullptr);
Review Comment:
```suggestion
```
We don't need this actually.
##########
cpp/src/parquet/thrift_internal.h:
##########
@@ -332,6 +356,27 @@ static inline format::SortingColumn ToThrift(SortingColumn
sorting_column) {
return thrift_sorting_column;
}
+static inline format::GeospatialStatistics ToThrift(
+ const EncodedGeospatialStatistics& encoded_geometry_stats) {
Review Comment:
ditto
##########
cpp/src/parquet/types.cc:
##########
@@ -1619,6 +1668,204 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+
+ private:
+ explicit Geometry(std::string crs)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)) {}
+
+ std::string crs_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (!crs_.empty()) {
+ // TODO(paleolimbot): For documented cases the CRS shouldn't contain
quotes,
+ // but we should probably escape the value of crs_ for backslash and quotes
+ // to be safe.
+ json << R"(, "crs": ")" << crs_ << R"(")";
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (!crs_.empty()) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ type.__set_GEOMETRY(geometry_type);
+ return type;
+}
+
+bool LogicalType::Impl::Geometry::Equals(const LogicalType& other) const {
+ if (other.is_geometry()) {
+ const auto& other_geometry = checked_cast<const
GeometryLogicalType&>(other);
+ return crs() == other_geometry.crs();
+ } else {
+ return false;
+ }
+}
+
+const std::string& GeometryLogicalType::crs() const {
+ return (dynamic_cast<const LogicalType::Impl::Geometry&>(*impl_)).crs();
+}
+
+std::shared_ptr<const LogicalType> GeometryLogicalType::Make(std::string crs) {
+ auto* logical_type = new GeometryLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Geometry(std::move(crs)));
+ return std::shared_ptr<const LogicalType>(logical_type);
+}
+
+class LogicalType::Impl::Geography final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeographyLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm() const {
+ return algorithm_;
+ }
+
+ const char* algorithm_name() const {
+ switch (algorithm_) {
+ case LogicalType::EdgeInterpolationAlgorithm::SPHERICAL:
+ return "spherical";
+ case LogicalType::EdgeInterpolationAlgorithm::VINCENTY:
+ return "vincenty";
+ case LogicalType::EdgeInterpolationAlgorithm::THOMAS:
+ return "thomas";
+ case LogicalType::EdgeInterpolationAlgorithm::ANDOYER:
+ return "andoyer";
+ case LogicalType::EdgeInterpolationAlgorithm::KARNEY:
+ return "karney";
+ default:
+ return "unknown";
+ }
+ }
+
+ private:
+ Geography(std::string crs,
LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm)
+ : LogicalType::Impl(LogicalType::Type::GEOGRAPHY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ algorithm_(algorithm) {}
+
+ std::string crs_;
+ LogicalType::EdgeInterpolationAlgorithm::algorithm algorithm_;
+};
+
+std::string LogicalType::Impl::Geography::ToString() const {
+ std::stringstream type;
+ type << "Geography(crs=" << crs_ << ", algorithm=" << algorithm_name() <<
")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geography::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geography")";
+
+ if (!crs_.empty()) {
+ // TODO(paleolimbot): For documented cases the CRS shouldn't contain
quotes,
+ // but we should probably escape the value of crs_ for backslash and quotes
+ // to be safe.
+ json << R"(, "crs": ")" << crs_ << R"(")";
+ }
+
+ if (algorithm_ != LogicalType::EdgeInterpolationAlgorithm::SPHERICAL) {
+ json << R"(, "algorithm": ")" << algorithm_name() << R"(")";
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geography::ToThrift() const {
+ format::LogicalType type;
+ format::GeographyType geography_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (!crs_.empty()) {
+ geography_type.__set_crs(crs_);
+ }
+
+ if (algorithm_ == LogicalType::EdgeInterpolationAlgorithm::SPHERICAL) {
Review Comment:
ditto
##########
cpp/src/parquet/metadata.cc:
##########
@@ -309,6 +322,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
descr_->sort_order());
}
+ inline bool is_geometry_stats_set() const {
+ DCHECK(writer_version_ != nullptr);
Review Comment:
Perhaps we even don't need `is_geometry_stats_set()`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]