szehon-ho commented on code in PR #53227:
URL: https://github.com/apache/spark/pull/53227#discussion_r2663015392


##########
sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/geo/WkbReader.java:
##########
@@ -0,0 +1,475 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.util.geo;
+
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.spark.sql.catalyst.util.geo.WkbConstants.DEFAULT_SRID;
+
+/**
+ * Reader for parsing Well-Known Binary (WKB) format geometries.
+ * This class implements the OGC Simple Features specification for WKB parsing.
+ */
+public class WkbReader {
+  private ByteBuffer buffer;
+  private final int validationLevel;
+  private byte[] currentWkb;
+
+  public WkbReader() {
+    this(1);
+  }
+
+  public WkbReader(int validationLevel) {
+    this.validationLevel = validationLevel;
+  }
+
+  // ========== Coordinate Validation Helpers ==========
+
+  /**
+   * Returns true if the coordinate value is valid for a non-empty point.
+   * A valid coordinate is finite (not NaN and not Infinity).
+   */
+  private static boolean isValidCoordinate(double value) {
+    return Double.isFinite(value);
+  }
+
+  /**
+   * Returns true if the coordinate value is valid for a point that may be 
empty.
+   * A valid coordinate is either finite or NaN (for empty points).
+   * Infinity values are not allowed.
+   */
+  private static boolean isValidCoordinateAllowEmpty(double value) {
+    return Double.isFinite(value) || Double.isNaN(value);
+  }
+
+  /**
+   * Reads a geometry from WKB bytes.
+   */
+  public GeometryModel read(byte[] wkb) {
+    try {
+      currentWkb = wkb;
+      return readGeometry(DEFAULT_SRID);
+    } finally {
+      // Clear references to allow garbage collection
+      buffer = null;
+      currentWkb = null;
+    }
+  }
+
+  /**
+   * Reads a geometry from WKB bytes with a specified SRID.
+   */
+  public GeometryModel read(byte[] wkb, int srid) {
+    try {
+      currentWkb = wkb;
+      return readGeometry(srid);
+    } finally {
+      // Clear references to allow garbage collection
+      buffer = null;
+      currentWkb = null;
+    }
+  }
+
+  private void checkNotAtEnd(long pos) {
+    if (buffer.position() >= buffer.limit()) {
+      throw new WkbParseException("Unexpected end of WKB buffer", pos, 
currentWkb);
+    }
+  }
+
+  private ByteOrder readEndianness() {
+    checkNotAtEnd(buffer.position());
+    byte endianValue = buffer.get();
+    if (endianValue != WkbConstants.BIG_ENDIAN && endianValue != 
WkbConstants.LITTLE_ENDIAN) {
+      throw new WkbParseException("Invalid byte order " + endianValue, 
buffer.position() - 1,
+        currentWkb);
+    }
+    return endianValue == WkbConstants.LITTLE_ENDIAN ?
+      ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN;
+  }
+
+  private int readInt() {
+    if (buffer.remaining() < WkbConstants.INT_SIZE) {
+      checkNotAtEnd(buffer.limit());
+    }
+    return buffer.getInt();
+  }
+
+  /**
+   * Reads a double coordinate value, allowing NaN for empty points.
+   */
+  private double readDoubleAllowEmpty() {
+    if (buffer.remaining() < WkbConstants.DOUBLE_SIZE) {
+      checkNotAtEnd(buffer.limit());
+    }
+    double value = buffer.getDouble();
+    if (!isValidCoordinateAllowEmpty(value)) {
+      throw new WkbParseException("Invalid coordinate value found", 
buffer.position() - 8,
+        currentWkb);
+    }
+    return value;
+  }
+
+  /**
+   * Reads a double coordinate value, not allowing NaN (for non-point 
coordinates like rings).
+   */
+  private double readDoubleNoEmpty() {
+    if (buffer.remaining() < WkbConstants.DOUBLE_SIZE) {
+      checkNotAtEnd(buffer.limit());
+    }
+    double value = buffer.getDouble();
+    if (!isValidCoordinate(value)) {
+      throw new WkbParseException("Invalid coordinate value found", 
buffer.position() - 8,
+        currentWkb);
+    }
+    return value;
+  }
+
+  /**
+   * Reads a geometry from WKB bytes with a specified SRID.
+   *
+   * @param defaultSrid srid to use if not specified in WKB
+   * @return Geometry object
+   */
+  private GeometryModel readGeometry(int defaultSrid) {
+    // Check that we have at least one byte for endianness
+    if (currentWkb == null || currentWkb.length < 1) {
+      throw new WkbParseException("WKB data is empty or null", 0, currentWkb);
+    }
+
+    // Read endianness directly from the first byte
+    byte endianValue = currentWkb[0];
+    if (endianValue > 1) {
+      throw new WkbParseException("Invalid byte order " + endianValue, 0, 
currentWkb);
+    }
+    ByteOrder byteOrder = endianValue == 1 ? ByteOrder.LITTLE_ENDIAN : 
ByteOrder.BIG_ENDIAN;
+
+    // Check that we have enough bytes for the rest of the data
+    if (currentWkb.length < 5) {
+      throw new WkbParseException("WKB data too short", 0, currentWkb);
+    }
+
+    // Create a new buffer wrapping the rest of the byte array (after the 
endianness byte)
+    buffer = ByteBuffer.wrap(currentWkb, 1, currentWkb.length - 1);
+    buffer.order(byteOrder);
+
+    return readGeometryInternal(defaultSrid, true);
+  }
+
+  /**
+   * Reads a nested geometry and validates that its dimensions match the 
parent's dimensions.
+   * The check happens immediately after reading the type, before reading 
coordinate data.
+   */
+  private GeometryModel readNestedGeometryWithDimensionCheck(
+      int defaultSrid,
+      boolean expectedHasZ,
+      boolean expectedHasM) {
+    return readNestedGeometryInternal(defaultSrid, true, expectedHasZ, 
expectedHasM);
+  }
+
+  /**
+   * Internal method to read a nested geometry with optional dimension 
validation.
+   */
+  private GeometryModel readNestedGeometryInternal(
+      int defaultSrid,
+      boolean validateDimensions,
+      boolean expectedHasZ,
+      boolean expectedHasM) {
+    // Read endianness from the current buffer position
+    ByteOrder byteOrder = readEndianness();
+
+    // Save the current byte order and temporarily set to the nested 
geometry's byte order
+    ByteOrder savedByteOrder = buffer.order();
+    buffer.order(byteOrder);
+    long typeStartPos = buffer.position();
+
+    // Read type and dimension
+    int typeAndDim = readInt();
+
+    // Parse WKB format (geotype and dimension)
+    if (!WkbConstants.isValidWkbType(typeAndDim)) {
+      throw new WkbParseException("Invalid or unsupported type " + typeAndDim, 
typeStartPos,
+        currentWkb);
+    }
+
+    int baseType = WkbConstants.getBaseType(typeAndDim);
+    GeoTypeId geoType = GeoTypeId.fromValue(baseType);
+    int dimensionCount = WkbConstants.getDimensionCount(typeAndDim);
+    boolean hasZ = WkbConstants.hasZ(typeAndDim);
+    boolean hasM = WkbConstants.hasM(typeAndDim);
+
+    // Validate dimensions match parent if required
+    if (validateDimensions && (hasZ != expectedHasZ || hasM != expectedHasM)) {
+      throw new WkbParseException(
+          "Invalid or unsupported type " + typeAndDim, typeStartPos, 
currentWkb);
+    }
+
+    // Dispatch to appropriate reader based on geometry type
+    GeometryModel result = readGeometryData(geoType, defaultSrid, 
dimensionCount, hasZ, hasM,
+        typeStartPos);
+
+    // Restore the saved byte order
+    buffer.order(savedByteOrder);
+
+    return result;
+  }
+
+  /**
+   * Internal method to read geometry with optional endianness reading.
+   *
+   * @param defaultSrid srid to use if not specified in WKB
+   * @param isRootGeometry if true, assumes endianness has already been read 
and buffer is set up;
+   *                       if false, reads endianness from current buffer 
position
+   * @return Geometry object
+   */
+  private GeometryModel readGeometryInternal(int defaultSrid, boolean 
isRootGeometry) {

Review Comment:
   Yea I was being overly cautious.  It looks we dont need this, even for 
nested geometries, adding a test in: testGeometryCollectionDifferentEndianness



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to