szehon-ho commented on code in PR #53227: URL: https://github.com/apache/spark/pull/53227#discussion_r2663019704
########## sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/geo/WkbConstants.java: ########## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.util.geo; + +/** + * Utility class for WKB-related constants and helper methods. + */ +public class WkbConstants { + // WKB byte order values + public static final byte BIG_ENDIAN = 0; + public static final byte LITTLE_ENDIAN = 1; + + // WKB base type codes (2D geometries) - matches GeoTypeId values + public static final int WKB_POINT = 1; + public static final int WKB_LINESTRING = 2; + public static final int WKB_POLYGON = 3; + public static final int WKB_MULTIPOINT = 4; + public static final int WKB_MULTILINESTRING = 5; + public static final int WKB_MULTIPOLYGON = 6; + public static final int WKB_GEOMETRYCOLLECTION = 7; + + // Min and max valid base type codes + public static final int WKB_MIN_TYPE = WKB_POINT; + public static final int WKB_MAX_TYPE = WKB_GEOMETRYCOLLECTION; + + // Dimensional offset multipliers (following OGC WKB spec) + // Type = BaseType + DIM_OFFSET * DimFactor + public static final int DIM_OFFSET_2D = 0; + public static final int DIM_OFFSET_Z = 1000; + public static final int DIM_OFFSET_M = 2000; + public static final int DIM_OFFSET_ZM = 3000; + + // WKB type codes for 3DZ geometries (Z coordinate) + public static final int WKB_POINT_Z = WKB_POINT + DIM_OFFSET_Z; + public static final int WKB_LINESTRING_Z = WKB_LINESTRING + DIM_OFFSET_Z; + public static final int WKB_POLYGON_Z = WKB_POLYGON + DIM_OFFSET_Z; + public static final int WKB_MULTIPOINT_Z = WKB_MULTIPOINT + DIM_OFFSET_Z; + public static final int WKB_MULTILINESTRING_Z = WKB_MULTILINESTRING + DIM_OFFSET_Z; + public static final int WKB_MULTIPOLYGON_Z = WKB_MULTIPOLYGON + DIM_OFFSET_Z; + public static final int WKB_GEOMETRYCOLLECTION_Z = WKB_GEOMETRYCOLLECTION + DIM_OFFSET_Z; + + // WKB type codes for 3DM geometries (M coordinate) + public static final int WKB_POINT_M = WKB_POINT + DIM_OFFSET_M; + public static final int WKB_LINESTRING_M = WKB_LINESTRING + DIM_OFFSET_M; + public static final int WKB_POLYGON_M = WKB_POLYGON + DIM_OFFSET_M; + public static final int WKB_MULTIPOINT_M = WKB_MULTIPOINT + DIM_OFFSET_M; + public static final int WKB_MULTILINESTRING_M = WKB_MULTILINESTRING + DIM_OFFSET_M; + public static final int WKB_MULTIPOLYGON_M = WKB_MULTIPOLYGON + DIM_OFFSET_M; + public static final int WKB_GEOMETRYCOLLECTION_M = WKB_GEOMETRYCOLLECTION + DIM_OFFSET_M; + + // WKB type codes for 4D geometries (Z and M coordinates) + public static final int WKB_POINT_ZM = WKB_POINT + DIM_OFFSET_ZM; + public static final int WKB_LINESTRING_ZM = WKB_LINESTRING + DIM_OFFSET_ZM; + public static final int WKB_POLYGON_ZM = WKB_POLYGON + DIM_OFFSET_ZM; + public static final int WKB_MULTIPOINT_ZM = WKB_MULTIPOINT + DIM_OFFSET_ZM; + public static final int WKB_MULTILINESTRING_ZM = WKB_MULTILINESTRING + DIM_OFFSET_ZM; + public static final int WKB_MULTIPOLYGON_ZM = WKB_MULTIPOLYGON + DIM_OFFSET_ZM; + public static final int WKB_GEOMETRYCOLLECTION_ZM = WKB_GEOMETRYCOLLECTION + DIM_OFFSET_ZM; + + // Size constants + public static final int BYTE_SIZE = 1; + public static final int INT_SIZE = 4; + public static final int DOUBLE_SIZE = 8; + + // Default SRID + public static final int DEFAULT_SRID = 0; Review Comment: Yea its for the case of the pre-existing Geometry.fromWKB (the version without srid). Removed in favor of Geometry.DEFAULT_SRID ########## sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/geo/WkbReader.java: ########## @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.util.geo; + + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.spark.sql.catalyst.util.geo.WkbConstants.DEFAULT_SRID; + +/** + * Reader for parsing Well-Known Binary (WKB) format geometries. + * This class implements the OGC Simple Features specification for WKB parsing. + */ +public class WkbReader { + private ByteBuffer buffer; + private final int validationLevel; + private byte[] currentWkb; + + public WkbReader() { + this(1); + } + + public WkbReader(int validationLevel) { + this.validationLevel = validationLevel; + } + + // ========== Coordinate Validation Helpers ========== + + /** + * Returns true if the coordinate value is valid for a non-empty point. + * A valid coordinate is finite (not NaN and not Infinity). + */ + private static boolean isValidCoordinate(double value) { + return Double.isFinite(value); + } + + /** + * Returns true if the coordinate value is valid for a point that may be empty. + * A valid coordinate is either finite or NaN (for empty points). + * Infinity values are not allowed. + */ + private static boolean isValidCoordinateAllowEmpty(double value) { + return Double.isFinite(value) || Double.isNaN(value); + } + + /** + * Reads a geometry from WKB bytes. + */ + public GeometryModel read(byte[] wkb) { + try { + currentWkb = wkb; + return readGeometry(DEFAULT_SRID); + } finally { + // Clear references to allow garbage collection + buffer = null; + currentWkb = null; + } + } + + /** + * Reads a geometry from WKB bytes with a specified SRID. + */ + public GeometryModel read(byte[] wkb, int srid) { + try { + currentWkb = wkb; + return readGeometry(srid); + } finally { + // Clear references to allow garbage collection + buffer = null; + currentWkb = null; + } + } + + private void checkNotAtEnd(long pos) { + if (buffer.position() >= buffer.limit()) { + throw new WkbParseException("Unexpected end of WKB buffer", pos, currentWkb); + } + } + + private ByteOrder readEndianness() { + checkNotAtEnd(buffer.position()); + byte endianValue = buffer.get(); + if (endianValue != WkbConstants.BIG_ENDIAN && endianValue != WkbConstants.LITTLE_ENDIAN) { + throw new WkbParseException("Invalid byte order " + endianValue, buffer.position() - 1, + currentWkb); + } + return endianValue == WkbConstants.LITTLE_ENDIAN ? + ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN; + } + + private int readInt() { + if (buffer.remaining() < WkbConstants.INT_SIZE) { + checkNotAtEnd(buffer.limit()); + } + return buffer.getInt(); + } + + /** + * Reads a double coordinate value, allowing NaN for empty points. + */ + private double readDoubleAllowEmpty() { + if (buffer.remaining() < WkbConstants.DOUBLE_SIZE) { + checkNotAtEnd(buffer.limit()); + } + double value = buffer.getDouble(); + if (!isValidCoordinateAllowEmpty(value)) { + throw new WkbParseException("Invalid coordinate value found", buffer.position() - 8, + currentWkb); + } + return value; + } + + /** + * Reads a double coordinate value, not allowing NaN (for non-point coordinates like rings). + */ + private double readDoubleNoEmpty() { + if (buffer.remaining() < WkbConstants.DOUBLE_SIZE) { + checkNotAtEnd(buffer.limit()); + } + double value = buffer.getDouble(); + if (!isValidCoordinate(value)) { + throw new WkbParseException("Invalid coordinate value found", buffer.position() - 8, + currentWkb); + } + return value; + } + + /** + * Reads a geometry from WKB bytes with a specified SRID. + * + * @param defaultSrid srid to use if not specified in WKB + * @return Geometry object + */ + private GeometryModel readGeometry(int defaultSrid) { + // Check that we have at least one byte for endianness + if (currentWkb == null || currentWkb.length < 1) { + throw new WkbParseException("WKB data is empty or null", 0, currentWkb); + } + + // Read endianness directly from the first byte + byte endianValue = currentWkb[0]; + if (endianValue > 1) { + throw new WkbParseException("Invalid byte order " + endianValue, 0, currentWkb); + } + ByteOrder byteOrder = endianValue == 1 ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN; + + // Check that we have enough bytes for the rest of the data + if (currentWkb.length < 5) { + throw new WkbParseException("WKB data too short", 0, currentWkb); + } + + // Create a new buffer wrapping the rest of the byte array (after the endianness byte) + buffer = ByteBuffer.wrap(currentWkb, 1, currentWkb.length - 1); + buffer.order(byteOrder); + + return readGeometryInternal(defaultSrid, true); + } + + /** + * Reads a nested geometry and validates that its dimensions match the parent's dimensions. + * The check happens immediately after reading the type, before reading coordinate data. + */ + private GeometryModel readNestedGeometryWithDimensionCheck( + int defaultSrid, + boolean expectedHasZ, + boolean expectedHasM) { + return readNestedGeometryInternal(defaultSrid, true, expectedHasZ, expectedHasM); + } + + /** + * Internal method to read a nested geometry with optional dimension validation. + */ + private GeometryModel readNestedGeometryInternal( + int defaultSrid, + boolean validateDimensions, + boolean expectedHasZ, + boolean expectedHasM) { + // Read endianness from the current buffer position + ByteOrder byteOrder = readEndianness(); + + // Save the current byte order and temporarily set to the nested geometry's byte order + ByteOrder savedByteOrder = buffer.order(); + buffer.order(byteOrder); + long typeStartPos = buffer.position(); + + // Read type and dimension + int typeAndDim = readInt(); + + // Parse WKB format (geotype and dimension) + if (!WkbConstants.isValidWkbType(typeAndDim)) { + throw new WkbParseException("Invalid or unsupported type " + typeAndDim, typeStartPos, + currentWkb); + } + + int baseType = WkbConstants.getBaseType(typeAndDim); + GeoTypeId geoType = GeoTypeId.fromValue(baseType); + int dimensionCount = WkbConstants.getDimensionCount(typeAndDim); + boolean hasZ = WkbConstants.hasZ(typeAndDim); + boolean hasM = WkbConstants.hasM(typeAndDim); + + // Validate dimensions match parent if required + if (validateDimensions && (hasZ != expectedHasZ || hasM != expectedHasM)) { + throw new WkbParseException( + "Invalid or unsupported type " + typeAndDim, typeStartPos, currentWkb); + } + + // Dispatch to appropriate reader based on geometry type + GeometryModel result = readGeometryData(geoType, defaultSrid, dimensionCount, hasZ, hasM, + typeStartPos); + + // Restore the saved byte order + buffer.order(savedByteOrder); + + return result; + } + + /** + * Internal method to read geometry with optional endianness reading. + * + * @param defaultSrid srid to use if not specified in WKB + * @param isRootGeometry if true, assumes endianness has already been read and buffer is set up; + * if false, reads endianness from current buffer position + * @return Geometry object + */ + private GeometryModel readGeometryInternal(int defaultSrid, boolean isRootGeometry) { + ByteOrder savedByteOrder = null; + long typeStartPos; + + if (isRootGeometry) { Review Comment: removed this branch -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
