This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push: new 71dd81b4a ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating ColumnStatistics with incorrect values 71dd81b4a is described below commit 71dd81b4a2ea16cec2f68c8ca85b91d8a8532698 Author: Bradley <bradley.b.p...@gmail.com> AuthorDate: Tue Jul 8 08:51:12 2025 -0700 ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating ColumnStatistics with incorrect values ### What changes were proposed in this pull request? Fix incorrect values in column statistics for geometry type. ### Why are the changes needed? When `vector.isRepeating` is false, `GeospatialTreeWriter#writeBatch` updates column statistics with incorrect values: it indexes `vec.vector`, `vec.start`, and `vec.length` with `i` instead of `offset + i`, so the geometry statistics are computed from the wrong rows whenever `offset` is greater than 0. ### How was this patch tested? The unit test `TestWriterImpl#testGeospatialColumnStatistics` covers cases where the `offset` parameter in `GeospatialTreeWriter#writeBatch` is either 0 or greater than 0. ### Was this patch authored or co-authored using generative AI tooling? No Closes #2319 from usberkeley/ORC-1948. 
Authored-by: Bradley <bradley.b.p...@gmail.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../orc/impl/writer/GeospatialTreeWriter.java | 6 +-- .../test/org/apache/orc/impl/TestWriterImpl.java | 60 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java index e9a0aa70b..676ca32a9 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java @@ -20,7 +20,6 @@ package org.apache.orc.impl.writer; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; -import org.apache.hadoop.io.BytesWritable; import org.apache.orc.OrcProto; import org.apache.orc.TypeDescription; import org.apache.orc.impl.CryptoUtils; @@ -96,10 +95,9 @@ public class GeospatialTreeWriter extends TreeWriterBase { vec.start[offset + i], vec.length[offset + i]); this.length.write(vec.length[offset + i]); rawDataSize += vec.length[offset + i]; - BytesWritable bw = new BytesWritable(); - bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); if (isGeometry) { - indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]); + indexStatistics.updateGeometry(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); } if (createBloomFilter) { if (bloomFilter != null) { diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java index 903e4e80c..58236502d 100644 --- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java @@ -21,6 +21,7 @@ package org.apache.orc.impl; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import 
org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcConf; @@ -29,9 +30,15 @@ import org.apache.orc.Reader; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.*; +import org.apache.orc.geospatial.BoundingBox; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBWriter; +import org.locationtech.jts.io.WKTReader; import java.io.IOException; @@ -178,6 +185,59 @@ public class TestWriterImpl implements TestConf { assertEquals(10, w.getStripes().size()); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testGeospatialColumnStatistics(boolean useFilter) throws IOException, ParseException { + conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true"); + // Use the Geometry type + schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geomColumn = (BytesColumnVector) batch.cols[0]; + + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + byte[] point1 = wkbWriter.write(wktReader.read("POINT (1 2)")); + byte[] point2 = wkbWriter.write(wktReader.read("POINT (3 4)")); + byte[] point3 = wkbWriter.write(wktReader.read("POINT (5 6)")); + byte[] point4 = wkbWriter.write(wktReader.read("POINT (7 8)")); + + geomColumn.setVal(0, point1); + geomColumn.setVal(1, point2); + geomColumn.setVal(2, point3); + geomColumn.setVal(3, point4); + + if (useFilter) { + int[] selected = {2}; + batch.setFilterContext(true, selected, 
selected.length); + } else { + batch.size = 4; + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); + ColumnStatistics[] statistics = reader.getStatistics(); + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) statistics[0]; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + if (useFilter) { + assertEquals(5.0, bbox.getXMin()); + assertEquals(5.0, bbox.getXMax()); + assertEquals(6.0, bbox.getYMin()); + assertEquals(6.0, bbox.getYMax()); + } else { + assertEquals(1.0, bbox.getXMin()); + assertEquals(7.0, bbox.getXMax()); + assertEquals(2.0, bbox.getYMin()); + assertEquals(8.0, bbox.getYMax()); + } + assertEquals(Double.NaN, bbox.getZMin()); + assertEquals(Double.NaN, bbox.getZMax()); + assertEquals(Double.NaN, bbox.getMMin()); + assertEquals(Double.NaN, bbox.getMMax()); + reader.close(); + } + @Test public void testCloseIsIdempotent() throws IOException { conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");