This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
     new f46e55af8 ORC-1631: Support `summary` output in `sizes` command
f46e55af8 is described below

commit f46e55af8dd11f978f6ced35f8b59789101cd780
Author: sychen <syc...@ctrip.com>
AuthorDate: Sun Feb 25 23:20:51 2024 -0800

ORC-1631: Support `summary` output in `sizes` command

### What changes were proposed in this pull request?
Add support for summarizing the number of files, file sizes, and file rows in the `sizes` command.

### Why are the changes needed?
When we count the size of each field, we only know its percentage and its average size per row, but we do not know the overall totals.

### How was this patch tested?
local test

```bash
java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -h
usage: sizes
 -h,--help              Print help message
 -i,--ignoreExtension   Ignore ORC file extension
 -s,--summary           Summarize the number of files, file sizes, and file rows
```

```
java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -s
```

```
Total Files: 5
Total Sizes: 4803687270
Total Rows: 39820045
Percent Bytes/Row Name
 26.41 31.86
```

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1816 from cxzl25/ORC-1631.
Authored-by: sychen <syc...@ctrip.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../src/java/org/apache/orc/tools/ColumnSizes.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java index b9cfb081b..2347ac744 100644 --- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java +++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java @@ -126,7 +126,7 @@ public class ColumnSizes { } } - private void printResults(PrintStream out) { + private void printResults(PrintStream out, boolean summary) { List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5); for(int column = 0; column < columnSizes.length; ++column) { if (columnSizes[column] > 0) { @@ -153,6 +153,10 @@ public class ColumnSizes { // sort by descending size, ascending name sizes.sort((x, y) -> x.size != y.size ? Long.compare(y.size, x.size) : x.name.compareTo(y.name)); + if (summary) { + out.printf("Total Sizes: %d%n", totalSize); + out.printf("Total Rows: %d%n", rows); + } out.println("Percent Bytes/Row Name"); for (StringLongPair item: sizes) { out.println(String.format(" %-5.2f %-9.2f %s", @@ -169,9 +173,11 @@ public class ColumnSizes { return; } boolean ignoreExtension = cli.hasOption("ignoreExtension"); + boolean summary = cli.hasOption("summary"); String[] files = cli.getArgs(); ColumnSizes result = null; + int totalFiles = 0; int badFiles = 0; for(String root: files) { Path rootPath = new Path(root); @@ -179,6 +185,7 @@ public class ColumnSizes { for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) { LocatedFileStatus status = itr.next(); if (status.isFile() && (ignoreExtension || status.getPath().getName().endsWith(".orc"))) { + totalFiles += 1; try { if (result == null) { result = new ColumnSizes(conf, status); @@ -197,7 +204,10 @@ public class ColumnSizes { if (result == null) 
{ System.err.println("No files found"); } else { - result.printResults(System.out); + if (summary) { + System.out.printf("Total Files: %d%n", totalFiles); + } + result.printResults(System.out, summary); } if (badFiles > 0) { System.err.println(badFiles + " bad ORC files found."); @@ -217,6 +227,11 @@ public class ColumnSizes { .desc("Ignore ORC file extension") .build()); + result.addOption(Option.builder("s") + .longOpt("summary") + .desc("Summarize the number of files, file sizes, and file rows") + .build()); + result.addOption(Option.builder("h") .longOpt("help") .desc("Print help message")