This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 71dd81b4a ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating ColumnStatistics with incorrect values
71dd81b4a is described below

commit 71dd81b4a2ea16cec2f68c8ca85b91d8a8532698
Author: Bradley <bradley.b.p...@gmail.com>
AuthorDate: Tue Jul 8 08:51:12 2025 -0700

    ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating ColumnStatistics with incorrect values
    
    ### What changes were proposed in this pull request?
    Fix incorrect values in column statistics for geometry type.
    
    ### Why are the changes needed?
    `GeospatialTreeWriter#writeBatch` uses incorrect values to update column statistics when `vector.isRepeating` is false.
    
    ### How was this patch tested?
    The unit test `TestWriterImpl#testGeospatialColumnStatistics` covers cases where the `offset` parameter in `GeospatialTreeWriter#writeBatch` is either 0 or greater than 0.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #2319 from usberkeley/ORC-1948.
    
    Authored-by: Bradley <bradley.b.p...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../orc/impl/writer/GeospatialTreeWriter.java      |  6 +--
 .../test/org/apache/orc/impl/TestWriterImpl.java   | 60 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
index e9a0aa70b..676ca32a9 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
@@ -20,7 +20,6 @@ package org.apache.orc.impl.writer;
 
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.io.BytesWritable;
 import org.apache.orc.OrcProto;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.impl.CryptoUtils;
@@ -96,10 +95,9 @@ public class GeospatialTreeWriter extends TreeWriterBase {
                   vec.start[offset + i], vec.length[offset + i]);
           this.length.write(vec.length[offset + i]);
           rawDataSize += vec.length[offset + i];
-          BytesWritable bw = new BytesWritable();
-          bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]);
           if (isGeometry) {
-            indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]);
+            indexStatistics.updateGeometry(vec.vector[offset + i],
+                    vec.start[offset + i], vec.length[offset + i]);
           }
           if (createBloomFilter) {
             if (bloomFilter != null) {
diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
index 903e4e80c..58236502d 100644
--- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.OrcConf;
@@ -29,9 +30,15 @@ import org.apache.orc.Reader;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
 import org.apache.orc.*;
+import org.apache.orc.geospatial.BoundingBox;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBWriter;
+import org.locationtech.jts.io.WKTReader;
 
 import java.io.IOException;
 
@@ -178,6 +185,59 @@ public class TestWriterImpl implements TestConf {
     assertEquals(10, w.getStripes().size());
   }
 
+  @ParameterizedTest
+  @ValueSource(booleans = {true, false})
+  public void testGeospatialColumnStatistics(boolean useFilter) throws IOException, ParseException {
+    conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
+    // Use the Geometry type
+    schema = TypeDescription.createGeometry();
+    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    BytesColumnVector geomColumn = (BytesColumnVector) batch.cols[0];
+
+    WKTReader wktReader = new WKTReader();
+    WKBWriter wkbWriter = new WKBWriter();
+    byte[] point1 = wkbWriter.write(wktReader.read("POINT (1 2)"));
+    byte[] point2 = wkbWriter.write(wktReader.read("POINT (3 4)"));
+    byte[] point3 = wkbWriter.write(wktReader.read("POINT (5 6)"));
+    byte[] point4 = wkbWriter.write(wktReader.read("POINT (7 8)"));
+
+    geomColumn.setVal(0, point1);
+    geomColumn.setVal(1, point2);
+    geomColumn.setVal(2, point3);
+    geomColumn.setVal(3, point4);
+
+    if (useFilter) {
+      int[] selected = {2};
+      batch.setFilterContext(true, selected, selected.length);
+    } else {
+      batch.size = 4;
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf));
+    ColumnStatistics[] statistics = reader.getStatistics();
+    GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) statistics[0];
+    BoundingBox bbox = geometryStatistics.getBoundingBox();
+    if (useFilter) {
+      assertEquals(5.0, bbox.getXMin());
+      assertEquals(5.0, bbox.getXMax());
+      assertEquals(6.0, bbox.getYMin());
+      assertEquals(6.0, bbox.getYMax());
+    } else {
+      assertEquals(1.0, bbox.getXMin());
+      assertEquals(7.0, bbox.getXMax());
+      assertEquals(2.0, bbox.getYMin());
+      assertEquals(8.0, bbox.getYMax());
+    }
+    assertEquals(Double.NaN, bbox.getZMin());
+    assertEquals(Double.NaN, bbox.getZMax());
+    assertEquals(Double.NaN, bbox.getMMin());
+    assertEquals(Double.NaN, bbox.getMMax());
+    reader.close();
+  }
+
   @Test
   public void testCloseIsIdempotent() throws IOException {
     conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");

Reply via email to