This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 95cb283f3 ORC-1631: Support `summary` output in `sizes` command
95cb283f3 is described below
commit 95cb283f3fc9a56007d8983378609c77bf0e6132
Author: sychen <[email protected]>
AuthorDate: Sun Feb 25 23:20:51 2024 -0800
ORC-1631: Support `summary` output in `sizes` command
### What changes were proposed in this pull request?
Add support for summarizing the number of files, the total file size, and
the total number of rows in the `sizes` command.
### Why are the changes needed?
When the `sizes` command reports the size of each column, it shows only each
column's percentage and average bytes per row, not the overall totals.
### How was this patch tested?
Tested locally:
```bash
java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -h
usage: sizes
-h,--help Print help message
-i,--ignoreExtension Ignore ORC file extension
-s,--summary Summarize the number of files, file sizes, and
file rows
```
```
java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -s
```
```
Total Files: 5
Total Sizes: 4803687270
Total Rows: 39820045
Percent Bytes/Row Name
26.41 31.86
```
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1816 from cxzl25/ORC-1631.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit f46e55af8dd11f978f6ced35f8b59789101cd780)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/tools/ColumnSizes.java | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
index b9cfb081b..2347ac744 100644
--- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -126,7 +126,7 @@ public class ColumnSizes {
}
}
- private void printResults(PrintStream out) {
+ private void printResults(PrintStream out, boolean summary) {
List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
for(int column = 0; column < columnSizes.length; ++column) {
if (columnSizes[column] > 0) {
@@ -153,6 +153,10 @@ public class ColumnSizes {
// sort by descending size, ascending name
sizes.sort((x, y) -> x.size != y.size ?
Long.compare(y.size, x.size) : x.name.compareTo(y.name));
+ if (summary) {
+ out.printf("Total Sizes: %d%n", totalSize);
+ out.printf("Total Rows: %d%n", rows);
+ }
out.println("Percent Bytes/Row Name");
for (StringLongPair item: sizes) {
out.println(String.format(" %-5.2f %-9.2f %s",
@@ -169,9 +173,11 @@ public class ColumnSizes {
return;
}
boolean ignoreExtension = cli.hasOption("ignoreExtension");
+ boolean summary = cli.hasOption("summary");
String[] files = cli.getArgs();
ColumnSizes result = null;
+ int totalFiles = 0;
int badFiles = 0;
for(String root: files) {
Path rootPath = new Path(root);
@@ -179,6 +185,7 @@ public class ColumnSizes {
for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath,
true); itr.hasNext(); ) {
LocatedFileStatus status = itr.next();
if (status.isFile() && (ignoreExtension ||
status.getPath().getName().endsWith(".orc"))) {
+ totalFiles += 1;
try {
if (result == null) {
result = new ColumnSizes(conf, status);
@@ -197,7 +204,10 @@ public class ColumnSizes {
if (result == null) {
System.err.println("No files found");
} else {
- result.printResults(System.out);
+ if (summary) {
+ System.out.printf("Total Files: %d%n", totalFiles);
+ }
+ result.printResults(System.out, summary);
}
if (badFiles > 0) {
System.err.println(badFiles + " bad ORC files found.");
@@ -217,6 +227,11 @@ public class ColumnSizes {
.desc("Ignore ORC file extension")
.build());
+ result.addOption(Option.builder("s")
+ .longOpt("summary")
+ .desc("Summarize the number of files, file sizes, and file rows")
+ .build());
+
result.addOption(Option.builder("h")
.longOpt("help")
.desc("Print help message")