This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 95cb283f3 ORC-1631: Support `summary` output in `sizes` command
95cb283f3 is described below

commit 95cb283f3fc9a56007d8983378609c77bf0e6132
Author: sychen <syc...@ctrip.com>
AuthorDate: Sun Feb 25 23:20:51 2024 -0800

    ORC-1631: Support `summary` output in `sizes` command
    
    ### What changes were proposed in this pull request?
    Add support for summarizing the number of files, the total file size, and 
the total row count in the `sizes` command.
    
    ### Why are the changes needed?
    When the `sizes` command reports the size of each column, it shows only the 
percentage and the average bytes per row; the overall totals are not shown.
    
    ### How was this patch tested?
    Tested locally with the commands below.
    
    ```bash
    java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -h
    usage: sizes
     -h,--help              Print help message
     -i,--ignoreExtension   Ignore ORC file extension
     -s,--summary           Summarize the number of files, file sizes, and
                            file rows
    ```
    
    ```
    java -jar orc-tools-2.1.0-SNAPSHOT-uber.jar sizes -s
    ```
    
    ```
    Total Files: 5
    Total Sizes: 4803687270
    Total Rows: 39820045
    Percent  Bytes/Row  Name
      26.41  31.86
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1816 from cxzl25/ORC-1631.
    
    Authored-by: sychen <syc...@ctrip.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
    (cherry picked from commit f46e55af8dd11f978f6ced35f8b59789101cd780)
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../src/java/org/apache/orc/tools/ColumnSizes.java    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java 
b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
index b9cfb081b..2347ac744 100644
--- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -126,7 +126,7 @@ public class ColumnSizes {
     }
   }
 
-  private void printResults(PrintStream out) {
+  private void printResults(PrintStream out, boolean summary) {
     List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
     for(int column = 0; column < columnSizes.length; ++column) {
       if (columnSizes[column] > 0) {
@@ -153,6 +153,10 @@ public class ColumnSizes {
     // sort by descending size, ascending name
     sizes.sort((x, y) -> x.size != y.size ?
         Long.compare(y.size, x.size) : x.name.compareTo(y.name));
+    if (summary) {
+      out.printf("Total Sizes: %d%n", totalSize);
+      out.printf("Total Rows: %d%n", rows);
+    }
     out.println("Percent  Bytes/Row  Name");
     for (StringLongPair item: sizes) {
       out.println(String.format("  %-5.2f  %-9.2f  %s",
@@ -169,9 +173,11 @@ public class ColumnSizes {
       return;
     }
     boolean ignoreExtension = cli.hasOption("ignoreExtension");
+    boolean summary = cli.hasOption("summary");
     String[] files = cli.getArgs();
 
     ColumnSizes result = null;
+    int totalFiles = 0;
     int badFiles = 0;
     for(String root: files) {
       Path rootPath = new Path(root);
@@ -179,6 +185,7 @@ public class ColumnSizes {
       for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, 
true); itr.hasNext(); ) {
         LocatedFileStatus status = itr.next();
         if (status.isFile() && (ignoreExtension || 
status.getPath().getName().endsWith(".orc"))) {
+          totalFiles += 1;
           try {
             if (result == null) {
               result = new ColumnSizes(conf, status);
@@ -197,7 +204,10 @@ public class ColumnSizes {
     if (result == null) {
       System.err.println("No files found");
     } else {
-      result.printResults(System.out);
+      if (summary) {
+        System.out.printf("Total Files: %d%n", totalFiles);
+      }
+      result.printResults(System.out, summary);
     }
     if (badFiles > 0) {
       System.err.println(badFiles + " bad ORC files found.");
@@ -217,6 +227,11 @@ public class ColumnSizes {
         .desc("Ignore ORC file extension")
         .build());
 
+    result.addOption(Option.builder("s")
+        .longOpt("summary")
+        .desc("Summarize the number of files, file sizes, and file rows")
+        .build());
+
     result.addOption(Option.builder("h")
         .longOpt("help")
         .desc("Print help message")

Reply via email to