This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 291a7de3f ORC-1567: Support `-ignoreExtension` option at `sizes` and
`count` commands of orc-tools
291a7de3f is described below
commit 291a7de3f21538a65f8de86be34df794fc3313dd
Author: sychen <[email protected]>
AuthorDate: Thu Jan 4 22:57:18 2024 -0800
ORC-1567: Support `-ignoreExtension` option at `sizes` and `count` commands
of orc-tools
### What changes were proposed in this pull request?
Add the `--ignoreExtension` option to the `sizes` and `count` commands.
```bash
java -jar orc-tools-2.0.0-SNAPSHOT-uber.jar sizes --ignoreExtension path
java -jar orc-tools-2.0.0-SNAPSHOT-uber.jar count --ignoreExtension path
```
### Why are the changes needed?
The `count` and `sizes` commands provided by `orc-tools` currently only
process files whose names end with the `.orc` suffix.
However, ORC files do not necessarily have the `.orc` suffix, which makes
these commands inconvenient to use on such files.
### How was this patch tested?
Closes #1722 from cxzl25/ORC-1567.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/tools/ColumnSizes.java | 38 +++++++++++++++++++---
.../src/java/org/apache/orc/tools/RowCount.java | 38 +++++++++++++++++++---
2 files changed, 68 insertions(+), 8 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
index 79b24304d..b9cfb081b 100644
--- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -18,6 +18,11 @@
package org.apache.orc.tools;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
@@ -155,15 +160,25 @@ public class ColumnSizes {
}
}
- public static void main(Configuration conf, String[] args) throws
IOException {
+ public static void main(Configuration conf, String[] args) throws Exception {
+ Options opts = createOptions();
+ CommandLine cli = new DefaultParser().parse(opts, args);
+ if (cli.hasOption('h')) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("sizes", opts);
+ return;
+ }
+ boolean ignoreExtension = cli.hasOption("ignoreExtension");
+ String[] files = cli.getArgs();
+
ColumnSizes result = null;
int badFiles = 0;
- for(String root: args) {
+ for(String root: files) {
Path rootPath = new Path(root);
FileSystem fs = rootPath.getFileSystem(conf);
for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath,
true); itr.hasNext(); ) {
LocatedFileStatus status = itr.next();
- if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+ if (status.isFile() && (ignoreExtension ||
status.getPath().getName().endsWith(".orc"))) {
try {
if (result == null) {
result = new ColumnSizes(conf, status);
@@ -190,7 +205,22 @@ public class ColumnSizes {
}
}
- public static void main(String[] args) throws IOException {
+ public static void main(String[] args) throws Exception {
main(new Configuration(), args);
}
+
+ private static Options createOptions() {
+ Options result = new Options();
+
+ result.addOption(Option.builder("i")
+ .longOpt("ignoreExtension")
+ .desc("Ignore ORC file extension")
+ .build());
+
+ result.addOption(Option.builder("h")
+ .longOpt("help")
+ .desc("Print help message")
+ .build());
+ return result;
+ }
}
diff --git a/java/tools/src/java/org/apache/orc/tools/RowCount.java
b/java/tools/src/java/org/apache/orc/tools/RowCount.java
index 3d9d0681b..c7c6014a5 100644
--- a/java/tools/src/java/org/apache/orc/tools/RowCount.java
+++ b/java/tools/src/java/org/apache/orc/tools/RowCount.java
@@ -18,6 +18,11 @@
package org.apache.orc.tools;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
@@ -32,14 +37,24 @@ import java.io.IOException;
* Given a set of paths, finds all of the "*.orc" files under them and prints
the number of rows in each file.
*/
public class RowCount {
- public static void main(Configuration conf, String[] args) throws
IOException {
+ public static void main(Configuration conf, String[] args) throws Exception {
+ Options opts = createOptions();
+ CommandLine cli = new DefaultParser().parse(opts, args);
+ if (cli.hasOption('h')) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("count", opts);
+ return;
+ }
+ boolean ignoreExtension = cli.hasOption("ignoreExtension");
+ String[] files = cli.getArgs();
+
int bad = 0;
- for(String root: args) {
+ for(String root: files) {
Path rootPath = new Path(root);
FileSystem fs = rootPath.getFileSystem(conf);
for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath,
true); itr.hasNext(); ) {
LocatedFileStatus status = itr.next();
- if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+ if (status.isFile() && (ignoreExtension ||
status.getPath().getName().endsWith(".orc"))) {
Path filename = status.getPath();
try (Reader reader = OrcFile.createReader(filename,
OrcFile.readerOptions(conf))) {
System.out.println(String.format("%s %d",
@@ -54,7 +69,22 @@ public class RowCount {
System.exit(bad == 0 ? 0 : 1);
}
- public static void main(String[] args) throws IOException {
+ public static void main(String[] args) throws Exception {
main(new Configuration(), args);
}
+
+ private static Options createOptions() {
+ Options result = new Options();
+
+ result.addOption(Option.builder("i")
+ .longOpt("ignoreExtension")
+ .desc("Ignore ORC file extension")
+ .build());
+
+ result.addOption(Option.builder("h")
+ .longOpt("help")
+ .desc("Print help message")
+ .build());
+ return result;
+ }
}