This is an automated email from the ASF dual-hosted git repository.

shangxinli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new d057b39d9 PARQUET-2195: Add scan command to parquet-cli (#998)
d057b39d9 is described below

commit d057b39d93014fe40f5067ee4a33621e65c91552
Author: Gang Wu <[email protected]>
AuthorDate: Tue Nov 8 01:21:20 2022 +0800

    PARQUET-2195: Add scan command to parquet-cli (#998)
    
    * PARQUET-2195: Add scan command to parquet-cli
    
    * Add ScanCommandTest
    
    * fix argument to use single file name
---
 parquet-cli/README.md                              | 16 ++++
 .../src/main/java/org/apache/parquet/cli/Main.java |  2 +
 .../apache/parquet/cli/commands/ScanCommand.java   | 91 ++++++++++++++++++++++
 .../parquet/cli/commands/ScanCommandTest.java      | 38 +++++++++
 4 files changed, 147 insertions(+)

diff --git a/parquet-cli/README.md b/parquet-cli/README.md
index da1d12319..73e512a0d 100644
--- a/parquet-cli/README.md
+++ b/parquet-cli/README.md
@@ -101,6 +101,22 @@ Usage: parquet [options] [command] [command options]
         Print the first N records from a file
     head
         Print the first N records from a file
+    column-index
+        Prints the column and offset indexes of a Parquet file
+    column-size
+        Print the column sizes of a parquet file
+    prune
+        Prune column(s) in a Parquet file and save it to a new file. The columns left are not changed.
+    trans-compression
+        Translate the compression from one to another (It doesn't support bloom filter feature yet).
+    masking
+        Replace columns with masked values and write to a new Parquet file
+    footer
+        Print the Parquet file footer in json format
+    bloom-filter
+        Check bloom filters for a Parquet column
+    scan
+        Scan all records from a file
 
   Examples:
 
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 450fa6e01..d1da6ee00 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -34,6 +34,7 @@ import org.apache.parquet.cli.commands.ConvertCSVCommand;
 import org.apache.parquet.cli.commands.ConvertCommand;
 import org.apache.parquet.cli.commands.ParquetMetadataCommand;
 import org.apache.parquet.cli.commands.PruneColumnsCommand;
+import org.apache.parquet.cli.commands.ScanCommand;
 import org.apache.parquet.cli.commands.SchemaCommand;
 import org.apache.parquet.cli.commands.ShowBloomFilterCommand;
 import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
@@ -101,6 +102,7 @@ public class Main extends Configured implements Tool {
     jc.addCommand("masking", new ColumnMaskingCommand(console));
     jc.addCommand("footer", new ShowFooterCommand(console));
     jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
+    jc.addCommand("scan", new ScanCommand(console));
   }
 
   @Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java
new file mode 100644
index 000000000..0b226ab3c
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import org.apache.avro.Schema;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.cli.util.Expressions;
+import org.slf4j.Logger;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.List;
+
+@Parameters(commandDescription = "Scan all records from a file")
+public class ScanCommand extends BaseCommand {
+
+  @Parameter(description = "<file>")
+  String sourceFile;
+
+  @Parameter(
+    names = {"-c", "--column", "--columns"},
+    description = "List of columns")
+  List<String> columns;
+
+  public ScanCommand(Logger console) {
+    super(console);
+  }
+
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(
+      sourceFile != null && !sourceFile.isEmpty(),
+      "Missing file name");
+
+    Schema schema = getAvroSchema(sourceFile);
+    Schema projection = Expressions.filterSchema(schema, columns);
+
+    long startTime = System.currentTimeMillis();
+    Iterable<Object> reader = openDataFile(sourceFile, projection);
+    boolean threw = true;
+    long count = 0;
+    try {
+      for (Object record : reader) {
+        count += 1;
+      }
+      threw = false;
+    } catch (RuntimeException e) {
+      throw new RuntimeException("Failed on record " + count, e);
+    } finally {
+      if (reader instanceof Closeable) {
+        Closeables.close((Closeable) reader, threw);
+      }
+    }
+    long endTime = System.currentTimeMillis();
+
+    console.info("Scanned " + count + " records from " + sourceFile);
+    console.info("Time: " + (endTime - startTime) / 1000.0 + " s");
+    return 0;
+  }
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+      "# Scan all the records from file \"data.avro\":",
+      "data.avro",
+      "# Scan all the records from file \"data.parquet\":",
+      "data.parquet"
+    );
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java
new file mode 100644
index 000000000..dbe1f889e
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+
+public class ScanCommandTest extends ParquetFileTest {
+  @Test
+  public void testScanCommand() throws IOException {
+    File file = parquetFile();
+    ScanCommand command = new ScanCommand(createLogger());
+    command.sourceFile = file.getAbsolutePath();
+    command.setConf(new Configuration());
+    Assert.assertEquals(0, command.run());
+  }
+}

Reply via email to