iverase commented on code in PR #1017:
URL: https://github.com/apache/lucene/pull/1017#discussion_r928955670


##########
lucene/core/src/java/org/apache/lucene/document/ShapeDocValues.java:
##########
@@ -0,0 +1,907 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import org.apache.lucene.document.ShapeField.DecodedTriangle.TYPE;
+import org.apache.lucene.document.SpatialQuery.EncodedRectangle;
+import org.apache.lucene.geo.Component2D;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteBuffersDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * A binary doc values format representation for {@link LatLonShape} and 
{@link XYShape}
+ *
+ * <p>Note that this class cannot be instantiated directly due to different 
encodings {@link
+ * org.apache.lucene.geo.XYEncodingUtils} and {@link 
org.apache.lucene.geo.GeoEncodingUtils}
+ *
+ * <p>Concrete Implementations include: {@link LatLonShapeDocValues} and 
{@link XYShapeDocValues}
+ *
+ * @lucene.experimental
+ */
+abstract class ShapeDocValues {
+  /** doc value format version; used to support bwc for any encoding changes */
+  protected static final byte VERSION = 0;
+  /** the binary doc value */
+  private final BytesRef data;
+  /** the geometry comparator used to check relations */
+  protected final ShapeComparator shapeComparator;
+
+  /**
+   * Creates a {@ShapeDocValues} instance from a shape tessellation
+   *
+   * @param tessellation The tessellation (must not be null)
+   */
+  ShapeDocValues(List<ShapeField.DecodedTriangle> tessellation) {
+    this.data = computeBinaryValue(tessellation);
+    try {
+      this.shapeComparator = new ShapeComparator(this.data);
+    } catch (IOException e) {
+      throw new IllegalArgumentException("unable to read binary shape doc 
value field. ", e);
+    }
+  }
+
+  /** Creates a {@code ShapeDocValues} instance from a given serialized value 
*/
+  ShapeDocValues(BytesRef binaryValue) {
+    this.data = binaryValue;
+    try {
+      this.shapeComparator = new ShapeComparator(this.data);
+    } catch (IOException e) {
+      throw new IllegalArgumentException("unable to read binary shape doc 
value field. ", e);
+    }
+  }
+
+  /** returns the encoded doc values field as a {@link BytesRef} */
+  protected BytesRef binaryValue() {
+    return this.data;
+  }
+
+  /** Returns the number of terms (tessellated triangles) for this shape */
+  public int numberOfTerms() {
+    return shapeComparator.numberOfTerms();
+  }
+
+  /** returns the min x value for the shape's bounding box */
+  public int getMinX() {
+    return shapeComparator.getMinX();
+  }
+
+  /** returns the min y value for the shape's bounding box */
+  public int getMinY() {
+    return shapeComparator.getMinY();
+  }
+
+  /** returns the max x value for the shape's bounding box */
+  public int getMaxX() {
+    return shapeComparator.getMaxX();
+  }
+
+  /** returns the max y value for the shape's bounding box */
+  public int getMaxY() {
+    return shapeComparator.getMaxY();
+  }
+
+  /** Retrieves the x centroid location for the geometry(s) */
+  public int getCentroidX() {
+    return shapeComparator.getCentroidX();
+  }
+
+  /** Retrieves the y centroid location for the geometry(s) */
+  public int getCentroidY() {
+    return shapeComparator.getCentroidY();
+  }
+
+  /**
+   * Retrieves the highest dimensional type (POINT, LINE, TRIANGLE) for 
computing the geometry(s)
+   * centroid
+   */
+  public TYPE getHighestDimension() {
+    return shapeComparator.getHighestDimension();
+  }
+
+  private BytesRef computeBinaryValue(List<ShapeField.DecodedTriangle> 
tessellation) {
+    try {
+      // dfs order serialization
+      List<TreeNode> dfsSerialized = new ArrayList<>(tessellation.size());
+      buildTree(tessellation, dfsSerialized);
+      Writer w = new Writer(dfsSerialized);
+      return w.getBytesRef();
+    } catch (IOException e) {
+      throw new RuntimeException("Internal error building 
LatLonShapeDocValues. Got ", e);
+    }
+  }
+
+  /** Creates a geometry query for shape docvalues */
+  public static Query newGeometryQuery(
+      final String field, final ShapeField.QueryRelation relation, Object... 
geometries) {
+    return null;
+    // TODO
+    //  return new ShapeDocValuesQuery(field, relation, geometries);
+  }
+
+  public Relation relate(final Component2D component) throws IOException {
+    return shapeComparator.relate(component);
+  }
+
+  protected interface Encoder {
+    double decodeX(int encoded);
+
+    double decodeY(int encoded);
+  }
+
+  protected abstract Encoder getEncoder();
+
+  /** main entry point to build the tessellation tree * */
+  private TreeNode buildTree(
+      List<ShapeField.DecodedTriangle> tessellation, List<TreeNode> 
dfsSerialized)
+      throws IOException {
+    if (tessellation.size() == 1) {
+      ShapeField.DecodedTriangle t = tessellation.get(0);
+      TreeNode node = new TreeNode(t);
+      if (t.type == ShapeField.DecodedTriangle.TYPE.LINE) {
+        node.midX /= node.length;
+        node.midY /= node.length;
+      } else if (t.type == ShapeField.DecodedTriangle.TYPE.TRIANGLE) {
+        node.midX /= node.signedArea;
+        node.midY /= node.signedArea;
+      }
+      node.highestType = t.type;
+      dfsSerialized.add(node);
+      return node;
+    }
+    TreeNode[] triangles = new TreeNode[tessellation.size()];
+    int i = 0;
+    int minY = Integer.MAX_VALUE;
+    int minX = Integer.MAX_VALUE;
+    int maxY = Integer.MIN_VALUE;
+    int maxX = Integer.MIN_VALUE;
+
+    // running stats for computing centroid
+    double totalSignedArea = 0;
+    double totalLength = 0;
+    double numXPnt = 0;
+    double numYPnt = 0;
+    double numXLin = 0;
+    double numYLin = 0;
+    double numXPly = 0;
+    double numYPly = 0;
+    ShapeField.DecodedTriangle.TYPE highestType = 
ShapeField.DecodedTriangle.TYPE.POINT;
+
+    for (ShapeField.DecodedTriangle t : tessellation) {
+      TreeNode node = new TreeNode(t);
+      triangles[i++] = node;
+      // compute the bbox values up front
+      minY = Math.min(minY, node.minY);
+      minX = Math.min(minX, node.minX);
+      maxY = Math.max(maxY, node.maxY);
+      maxX = Math.max(maxX, node.maxX);
+
+      // compute the running centroid stats
+      totalSignedArea += node.signedArea; // non-zero if any components are 
triangles
+      totalLength += node.length; // non-zero if any components are line 
segments
+      if (t.type == ShapeField.DecodedTriangle.TYPE.POINT) {
+        numXPnt += node.midX;
+        numYPnt += node.midY;
+      } else if (t.type == ShapeField.DecodedTriangle.TYPE.LINE) {
+        if (highestType == ShapeField.DecodedTriangle.TYPE.POINT) {
+          highestType = ShapeField.DecodedTriangle.TYPE.LINE;
+        }
+        numXLin += node.midX;
+        numYLin += node.midY;
+      } else {
+        if (highestType != ShapeField.DecodedTriangle.TYPE.TRIANGLE) {
+          highestType = ShapeField.DecodedTriangle.TYPE.TRIANGLE;
+        }
+        numXPly += node.midX;
+        numYPly += node.midY;
+      }
+    }
+    TreeNode root = createTree(triangles, 0, triangles.length - 1, false, 
null, dfsSerialized);
+
+    // pull up min values for the root node so the bbox is consistent
+    root.minY = minY;
+    root.minX = minX;
+
+    // set the highest dimensional type
+    root.highestType = highestType;
+
+    // compute centroid values for the root node so the centroid is consistent
+    if (highestType == ShapeField.DecodedTriangle.TYPE.POINT) {
+      root.midX = numXPnt / i;
+      root.midY = numYPnt / i;
+    } else if (highestType == ShapeField.DecodedTriangle.TYPE.LINE) {
+      // numerator is sum of segment midPoints times segment length
+      // divide by total length per
+      // https://www.ae.msstate.edu/vlsm/shape/centroid_of_a_line/straight.htm
+      root.midX = numXLin / totalLength;
+      root.midY = numYLin / totalLength;
+    } else {
+      // numerator is sum of triangle centroids times triangle signed area
+      // divide by total signed area per 
http://www.faqs.org/faqs/graphics/algorithms-faq/
+      root.midX = numXPly / totalSignedArea;
+      root.midY = numYPly / totalSignedArea;
+    }
+
+    return root;
+  }
+
+  /** creates the tree */
+  private TreeNode createTree(
+      TreeNode[] triangles,
+      int low,
+      int high,
+      boolean splitX,
+      TreeNode parent,
+      List<TreeNode> dfsSerialized) {
+    if (low > high) {
+      return null;
+    }
+    // add midpoint
+    int mid = (low + high) >>> 1;
+    if (low < high) {
+      Comparator<TreeNode> comparator =
+          splitX
+              ? Comparator.comparingInt((TreeNode left) -> left.minX)
+                  .thenComparingInt(left -> left.maxX)
+              : Comparator.comparingInt((TreeNode left) -> left.minY)
+                  .thenComparingInt(left -> left.maxY);
+      ArrayUtil.select(triangles, low, high + 1, mid, comparator);
+    }
+    TreeNode newNode = triangles[mid];
+    dfsSerialized.add(newNode);
+    // set parent
+    newNode.parent = parent;
+
+    // add children
+    newNode.left = createTree(triangles, low, mid - 1, !splitX, newNode, 
dfsSerialized);
+    newNode.right = createTree(triangles, mid + 1, high, !splitX, newNode, 
dfsSerialized);
+    // pull up values to this node
+    if (newNode.left != null) {
+      newNode.minX = Math.min(newNode.minX, newNode.left.minX);
+      newNode.minY = Math.min(newNode.minY, newNode.left.minY);
+      newNode.maxX = Math.max(newNode.maxX, newNode.left.maxX);
+      newNode.maxY = Math.max(newNode.maxY, newNode.left.maxY);
+    }
+    if (newNode.right != null) {
+      newNode.minX = Math.min(newNode.minX, newNode.right.minX);
+      newNode.minY = Math.min(newNode.minY, newNode.right.minY);
+      newNode.maxX = Math.max(newNode.maxX, newNode.right.maxX);
+      newNode.maxY = Math.max(newNode.maxY, newNode.right.maxY);
+    }
+
+    // adjust byteSize based on new parent bbox values
+    if (newNode.left != null) {
+      // bounding box size
+      newNode.left.byteSize += vLongSize((long) newNode.maxX - 
newNode.left.minX);
+      newNode.left.byteSize += vLongSize((long) newNode.maxY - 
newNode.left.minY);
+      newNode.left.byteSize += vLongSize((long) newNode.maxX - 
newNode.left.maxX);
+      newNode.left.byteSize += vLongSize((long) newNode.maxY - 
newNode.left.maxY);
+      // component size
+      newNode.left.byteSize += computeComponentSize(newNode.left, 
newNode.maxX, newNode.maxY);
+      // include byte size (if needed to be skipped)
+      newNode.byteSize += vIntSize(newNode.left.byteSize) + 
newNode.left.byteSize;
+    }
+    if (newNode.right != null) {
+      // bounding box size
+      newNode.right.byteSize += vLongSize((long) newNode.maxX - 
newNode.right.minX);
+      newNode.right.byteSize += vLongSize((long) newNode.maxY - 
newNode.right.minY);
+      newNode.right.byteSize += vLongSize((long) newNode.maxX - 
newNode.right.maxX);
+      newNode.right.byteSize += vLongSize((long) newNode.maxY - 
newNode.right.maxY);
+      // component size
+      newNode.right.byteSize += computeComponentSize(newNode.right, 
newNode.maxX, newNode.maxY);
+      // include byte size (if needed to be skipped)
+      newNode.byteSize += vIntSize(newNode.right.byteSize) + 
newNode.right.byteSize;
+    }
+    return newNode;
+  }
+
+  private int computeComponentSize(TreeNode node, int maxX, int maxY) {
+    int size = 0;
+    ShapeField.DecodedTriangle t = node.triangle;
+    size += vLongSize((long) maxX - t.aX);
+    size += vLongSize((long) maxY - t.aY);
+    if (t.type == ShapeField.DecodedTriangle.TYPE.LINE
+        || t.type == ShapeField.DecodedTriangle.TYPE.TRIANGLE) {
+      size += vLongSize((long) maxX - t.bX);
+      size += vLongSize((long) maxY - t.bY);
+    }
+    if (t.type == ShapeField.DecodedTriangle.TYPE.TRIANGLE) {
+      size += vLongSize((long) maxX - t.cX);
+      size += vLongSize((long) maxY - t.cY);
+    }
+    return size;
+  }
+
+  /**
+   * Builds an in memory binary tree of tessellated triangles. This logic 
comes from {@code
+   * org.apache.lucene.geo.ComponentTree} which originated from {@code
+   * org.apache.lucene.geo.EdgeTree}
+   *
+   * <p>The tree is serialized on disk in a variable format which becomes a 
compressed
+   * representation of the doc value format for the Geometry Component Tree
+   */
+  private final class TreeNode {
+    /** the triangle for this tree node */
+    private final ShapeField.DecodedTriangle triangle;
+
+    /** centroid running stats (in encoded space) for this tree node */
+    private double midX;
+
+    private double midY;
+
+    private final double
+        signedArea; // Units are encoded space. This is only used to compute 
centroid
+    // in encoded space. DO NOT USE THIS FOR GEOGRAPHICAL SPATIAL AREA
+    // triangles are guaranteed CCW so this will always be positive
+    // unless the component is a point or line
+    private final double length; // Units are encoded space. This is only used 
to compute centroid
+    // in encoded space. DO NOT USE THIS FOR GEOGRAPHICAL DISTANCE
+    // this will always be positive unless the component is a
+    // point or triangle
+    private ShapeField.DecodedTriangle.TYPE highestType; // the highest 
dimensional type
+
+    /** the bounding box for the tree */
+    private int minX;
+
+    private int maxX;
+    private int minY;
+    private int maxY;
+
+    // left and right branch
+    private TreeNode left;
+    private TreeNode right;
+    private TreeNode parent;
+
+    private int byteSize = 1; // header size is one byte; remaining is 
accumulated on construction
+
+    private TreeNode(ShapeField.DecodedTriangle t) {
+      this.minX = StrictMath.min(StrictMath.min(t.aX, t.bX), t.cX);
+      this.minY = StrictMath.min(StrictMath.min(t.aY, t.bY), t.cY);
+      this.maxX = StrictMath.max(StrictMath.max(t.aX, t.bX), t.cX);
+      this.maxY = StrictMath.max(StrictMath.max(t.aY, t.bY), t.cY);
+      this.triangle = t;
+      this.left = null;
+      this.right = null;
+
+      // compute the component level centroid, encoded area, or length based 
on type
+      if (t.type == ShapeField.DecodedTriangle.TYPE.POINT) {
+        this.midX = t.aX;
+        this.midY = t.aY;
+        this.signedArea = 0;
+        this.length = 0;
+      } else if (t.type == ShapeField.DecodedTriangle.TYPE.LINE) {
+        this.length = Math.hypot(t.aX - t.bX, t.aY - t.bY);
+        this.midX = (0.5d * (t.aX + t.bX)) * length; // weight by length
+        this.midY = (0.5d * (t.aY + t.bY)) * length; // weight by length
+        this.signedArea = 0;
+      } else {
+        this.signedArea =
+            Math.abs(0.5d * ((t.bX - t.aX) * (t.cY - t.aY) - (t.cX - t.aX) * 
(t.bY - t.aY)));
+        this.midX = ((t.aX + t.bX + t.cX) / 3) * signedArea; // weight by 
signed area
+        this.midY = ((t.aY + t.bY + t.cY) / 3) * signedArea; // weight by 
signed area

Review Comment:
   The issue with the centroid computation is here, this addition might easily 
overflow!
   
   More over, you are computing centroids on the encoded space, would that work 
for XYShapes where the encoding is not linear? 
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to