This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 291a7de3f ORC-1567: Support `-ignoreExtension` option at `sizes` and 
`count` commands of orc-tools
291a7de3f is described below

commit 291a7de3f21538a65f8de86be34df794fc3313dd
Author: sychen <[email protected]>
AuthorDate: Thu Jan 4 22:57:18 2024 -0800

    ORC-1567: Support `-ignoreExtension` option at `sizes` and `count` commands of orc-tools
    
    ### What changes were proposed in this pull request?
    Add the `--ignoreExtension` option.
    
    ```bash
    java -jar orc-tools-2.0.0-SNAPSHOT-uber.jar sizes --ignoreExtension path
    java -jar orc-tools-2.0.0-SNAPSHOT-uber.jar count --ignoreExtension path
    ```
    
    ### Why are the changes needed?
    The `count` and `sizes` commands provided by `orc-tools` currently require
    that files have the `.orc` suffix.
    However, files in ORC format do not necessarily have the `.orc` suffix, so
    such files are skipped, which makes the commands inconvenient to use.
    
    ### How was this patch tested?
    
    Closes #1722 from cxzl25/ORC-1567.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../src/java/org/apache/orc/tools/ColumnSizes.java | 38 +++++++++++++++++++---
 .../src/java/org/apache/orc/tools/RowCount.java    | 38 +++++++++++++++++++---
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java 
b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
index 79b24304d..b9cfb081b 100644
--- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -18,6 +18,11 @@
 
 package org.apache.orc.tools;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
@@ -155,15 +160,25 @@ public class ColumnSizes {
     }
   }
 
-  public static void main(Configuration conf, String[] args) throws 
IOException {
+  public static void main(Configuration conf, String[] args) throws Exception {
+    Options opts = createOptions();
+    CommandLine cli = new DefaultParser().parse(opts, args);
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("sizes", opts);
+      return;
+    }
+    boolean ignoreExtension = cli.hasOption("ignoreExtension");
+    String[] files = cli.getArgs();
+
     ColumnSizes result = null;
     int badFiles = 0;
-    for(String root: args) {
+    for(String root: files) {
       Path rootPath = new Path(root);
       FileSystem fs = rootPath.getFileSystem(conf);
       for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, 
true); itr.hasNext(); ) {
         LocatedFileStatus status = itr.next();
-        if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+        if (status.isFile() && (ignoreExtension || 
status.getPath().getName().endsWith(".orc"))) {
           try {
             if (result == null) {
               result = new ColumnSizes(conf, status);
@@ -190,7 +205,22 @@ public class ColumnSizes {
     }
   }
 
-  public static void main(String[] args) throws IOException {
+  public static void main(String[] args) throws Exception {
     main(new Configuration(), args);
   }
+
+  private static Options createOptions() {
+    Options result = new Options();
+
+    result.addOption(Option.builder("i")
+        .longOpt("ignoreExtension")
+        .desc("Ignore ORC file extension")
+        .build());
+
+    result.addOption(Option.builder("h")
+        .longOpt("help")
+        .desc("Print help message")
+        .build());
+    return result;
+  }
 }
diff --git a/java/tools/src/java/org/apache/orc/tools/RowCount.java 
b/java/tools/src/java/org/apache/orc/tools/RowCount.java
index 3d9d0681b..c7c6014a5 100644
--- a/java/tools/src/java/org/apache/orc/tools/RowCount.java
+++ b/java/tools/src/java/org/apache/orc/tools/RowCount.java
@@ -18,6 +18,11 @@
 
 package org.apache.orc.tools;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
@@ -32,14 +37,24 @@ import java.io.IOException;
  * Given a set of paths, finds all of the "*.orc" files under them and prints 
the number of rows in each file.
  */
 public class RowCount {
-  public static void main(Configuration conf, String[] args) throws 
IOException {
+  public static void main(Configuration conf, String[] args) throws Exception {
+    Options opts = createOptions();
+    CommandLine cli = new DefaultParser().parse(opts, args);
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("count", opts);
+      return;
+    }
+    boolean ignoreExtension = cli.hasOption("ignoreExtension");
+    String[] files = cli.getArgs();
+
     int bad = 0;
-    for(String root: args) {
+    for(String root: files) {
       Path rootPath = new Path(root);
       FileSystem fs = rootPath.getFileSystem(conf);
       for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, 
true); itr.hasNext(); ) {
         LocatedFileStatus status = itr.next();
-        if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+        if (status.isFile() && (ignoreExtension || 
status.getPath().getName().endsWith(".orc"))) {
           Path filename = status.getPath();
           try (Reader reader = OrcFile.createReader(filename, 
OrcFile.readerOptions(conf))) {
             System.out.println(String.format("%s %d",
@@ -54,7 +69,22 @@ public class RowCount {
     System.exit(bad == 0 ? 0 : 1);
   }
 
-  public static void main(String[] args) throws IOException {
+  public static void main(String[] args) throws Exception {
     main(new Configuration(), args);
   }
+
+  private static Options createOptions() {
+    Options result = new Options();
+
+    result.addOption(Option.builder("i")
+        .longOpt("ignoreExtension")
+        .desc("Ignore ORC file extension")
+        .build());
+
+    result.addOption(Option.builder("h")
+        .longOpt("help")
+        .desc("Print help message")
+        .build());
+    return result;
+  }
 }

Reply via email to