This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 6d478cb6f ORC-1740: Avoid the dump tool repeatedly parsing
ColumnStatistics
6d478cb6f is described below
commit 6d478cb6fb810d220406b26f6e4162234ebec269
Author: sychen <[email protected]>
AuthorDate: Wed Jul 10 15:51:30 2024 -0700
ORC-1740: Avoid the dump tool repeatedly parsing ColumnStatistics
### What changes were proposed in this pull request?
This PR aims to avoid the dump tool repeatedly parsing ColumnStatistics.
### Why are the changes needed?
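`org.apache.orc.StripeStatistics#getColumnStatistics` generates statistical
information for all columns every time it is called. The dump tools called it
in the loop condition and again in the loop body for each column, so when there
are many columns, the parsing performance decreases.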
https://github.com/apache/orc/blob/c38e20d862ce19395558e092dd42033a000fe22d/java/core/src/java/org/apache/orc/StripeStatistics.java#L57-L66
### How was this patch tested?
Tested locally and with the existing unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1972 from cxzl25/ORC-1740.
Authored-by: sychen <[email protected]>
Signed-off-by: William Hyun <[email protected]>
---
java/tools/src/java/org/apache/orc/tools/FileDump.java | 5 +++--
java/tools/src/java/org/apache/orc/tools/JsonFileDump.java | 5 +++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index c23505310..55016ddcd 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -357,9 +357,10 @@ public final class FileDump {
for (int n = 0; n < stripeStats.size(); n++) {
System.out.println(" Stripe " + (n + 1) + ":");
StripeStatistics ss = stripeStats.get(n);
- for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
+ ColumnStatistics[] columnStatistics = ss.getColumnStatistics();
+ for (int i = 0; i < columnStatistics.length; ++i) {
System.out.println(" Column " + i + ": " +
- ss.getColumnStatistics()[i].toString());
+ columnStatistics[i].toString());
}
}
ColumnStatistics[] stats = reader.getStatistics();
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 9737222da..7d893a54c 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -112,10 +112,11 @@ public class JsonFileDump {
writer.name("stripeNumber").value(n + 1);
StripeStatistics ss = stripeStatistics.get(n);
writer.name("columnStatistics").beginArray();
- for (int i = 0; i < ss.getColumnStatistics().length; i++) {
+ ColumnStatistics[] columnStatistics = ss.getColumnStatistics();
+ for (int i = 0; i < columnStatistics.length; i++) {
writer.beginObject();
writer.name("columnId").value(i);
- writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
+ writeColumnStatistics(writer, columnStatistics[i]);
writer.endObject();
}
writer.endArray();