This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new f46e55af8 ORC-1631: Support `summary` output in `sizes` command
f46e55af8 is described below

commit f46e55af8dd11f978f6ced35f8b59789101cd780
Author: sychen <syc...@ctrip.com>
AuthorDate: Sun Feb 25 23:20:51 2024 -0800

    ORC-1631: Support `summary` output in `sizes` command
    
    ### What changes were proposed in this pull request?
    Add a `summary` option to the `sizes` command that reports the number of files, the total file size, and the total number of rows.
    
    ### Why are the changes needed?
    The `sizes` command currently reports only each column's percentage of the total size and its average bytes per row; it does not report the overall totals.
    
    ### How was this patch tested?
    Tested locally.
    
    ```bash
    java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -h
    usage: sizes
     -h,--help              Print help message
     -i,--ignoreExtension   Ignore ORC file extension
     -s,--summary           Summarize the number of files, file sizes, and
                            file rows
    ```
    
    ```
    java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -s
    ```
    
    ```
    Total Files: 5
    Total Sizes: 4803687270
    Total Rows: 39820045
    Percent  Bytes/Row  Name
      26.41  31.86
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1816 from cxzl25/ORC-1631.
    
    Authored-by: sychen <syc...@ctrip.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../src/java/org/apache/orc/tools/ColumnSizes.java    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
index b9cfb081b..2347ac744 100644
--- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -126,7 +126,7 @@ public class ColumnSizes {
     }
   }
 
-  private void printResults(PrintStream out) {
+  private void printResults(PrintStream out, boolean summary) {
     List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
     for(int column = 0; column < columnSizes.length; ++column) {
       if (columnSizes[column] > 0) {
@@ -153,6 +153,10 @@ public class ColumnSizes {
     // sort by descending size, ascending name
     sizes.sort((x, y) -> x.size != y.size ?
         Long.compare(y.size, x.size) : x.name.compareTo(y.name));
+    if (summary) {
+      out.printf("Total Sizes: %d%n", totalSize);
+      out.printf("Total Rows: %d%n", rows);
+    }
     out.println("Percent  Bytes/Row  Name");
     for (StringLongPair item: sizes) {
       out.println(String.format("  %-5.2f  %-9.2f  %s",
@@ -169,9 +173,11 @@ public class ColumnSizes {
       return;
     }
     boolean ignoreExtension = cli.hasOption("ignoreExtension");
+    boolean summary = cli.hasOption("summary");
     String[] files = cli.getArgs();
 
     ColumnSizes result = null;
+    int totalFiles = 0;
     int badFiles = 0;
     for(String root: files) {
       Path rootPath = new Path(root);
@@ -179,6 +185,7 @@ public class ColumnSizes {
       for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, 
true); itr.hasNext(); ) {
         LocatedFileStatus status = itr.next();
         if (status.isFile() && (ignoreExtension || 
status.getPath().getName().endsWith(".orc"))) {
+          totalFiles += 1;
           try {
             if (result == null) {
               result = new ColumnSizes(conf, status);
@@ -197,7 +204,10 @@ public class ColumnSizes {
     if (result == null) {
       System.err.println("No files found");
     } else {
-      result.printResults(System.out);
+      if (summary) {
+        System.out.printf("Total Files: %d%n", totalFiles);
+      }
+      result.printResults(System.out, summary);
     }
     if (badFiles > 0) {
       System.err.println(badFiles + " bad ORC files found.");
@@ -217,6 +227,11 @@ public class ColumnSizes {
         .desc("Ignore ORC file extension")
         .build());
 
+    result.addOption(Option.builder("s")
+        .longOpt("summary")
+        .desc("Summarize the number of files, file sizes, and file rows")
+        .build());
+
     result.addOption(Option.builder("h")
         .longOpt("help")
         .desc("Print help message")

Reply via email to