This is an automated email from the ASF dual-hosted git repository.
shangxinli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new d057b39d9 PARQUET-2195: Add scan command to parquet-cli (#998)
d057b39d9 is described below
commit d057b39d93014fe40f5067ee4a33621e65c91552
Author: Gang Wu <[email protected]>
AuthorDate: Tue Nov 8 01:21:20 2022 +0800
PARQUET-2195: Add scan command to parquet-cli (#998)
* PARQUET-2195: Add scan command to parquet-cli
* Add ScanCommandTest
* fix argument to use single file name
---
parquet-cli/README.md | 16 ++++
.../src/main/java/org/apache/parquet/cli/Main.java | 2 +
.../apache/parquet/cli/commands/ScanCommand.java | 91 ++++++++++++++++++++++
.../parquet/cli/commands/ScanCommandTest.java | 38 +++++++++
4 files changed, 147 insertions(+)
diff --git a/parquet-cli/README.md b/parquet-cli/README.md
index da1d12319..73e512a0d 100644
--- a/parquet-cli/README.md
+++ b/parquet-cli/README.md
@@ -101,6 +101,22 @@ Usage: parquet [options] [command] [command options]
Print the first N records from a file
head
Print the first N records from a file
+ column-index
+ Prints the column and offset indexes of a Parquet file
+ column-size
+ Print the column sizes of a parquet file
+ prune
+ Prune column(s) in a Parquet file and save it to a new file. The columns left are not changed.
+ trans-compression
+ Translate the compression from one to another (It doesn't support bloom filter feature yet).
+ masking
+ Replace columns with masked values and write to a new Parquet file
+ footer
+ Print the Parquet file footer in json format
+ bloom-filter
+ Check bloom filters for a Parquet column
+ scan
+ Scan all records from a file
Examples:
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 450fa6e01..d1da6ee00 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -34,6 +34,7 @@ import org.apache.parquet.cli.commands.ConvertCSVCommand;
import org.apache.parquet.cli.commands.ConvertCommand;
import org.apache.parquet.cli.commands.ParquetMetadataCommand;
import org.apache.parquet.cli.commands.PruneColumnsCommand;
+import org.apache.parquet.cli.commands.ScanCommand;
import org.apache.parquet.cli.commands.SchemaCommand;
import org.apache.parquet.cli.commands.ShowBloomFilterCommand;
import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
@@ -101,6 +102,7 @@ public class Main extends Configured implements Tool {
jc.addCommand("masking", new ColumnMaskingCommand(console));
jc.addCommand("footer", new ShowFooterCommand(console));
jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
+ jc.addCommand("scan", new ScanCommand(console));
}
@Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java
new file mode 100644
index 000000000..0b226ab3c
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ScanCommand.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import org.apache.avro.Schema;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.cli.util.Expressions;
+import org.slf4j.Logger;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * CLI command that iterates over every record of a data file and logs how
+ * many records were read and how long the scan took.
+ *
+ * <p>Records are only counted, never printed. The schema obtained from the
+ * file is passed through {@link Expressions#filterSchema} together with
+ * {@code columns} before the file is opened with the resulting projection.
+ */
+@Parameters(commandDescription = "Scan all records from a file")
+public class ScanCommand extends BaseCommand {
+
+  /** Name of the file to scan; the command's main (unnamed) argument. */
+  @Parameter(description = "<file>")
+  String sourceFile;
+
+  /** Column names handed to the schema filter to derive the projection. */
+  @Parameter(
+      names = {"-c", "--column", "--columns"},
+      description = "List of columns")
+  List<String> columns;
+
+  /**
+   * Creates the command.
+   *
+   * @param console logger forwarded to {@link BaseCommand}
+   */
+  public ScanCommand(Logger console) {
+    super(console);
+  }
+
+  /**
+   * Reads all records of {@code sourceFile} and logs the record count and
+   * the elapsed time in seconds.
+   *
+   * @return {@code 0} once the whole file has been iterated
+   * @throws IllegalArgumentException if {@code sourceFile} is null or empty
+   * @throws RuntimeException if iteration fails; the message carries the
+   *     number of records read so far and the original failure is the cause
+   * @throws IOException declared because schema lookup, opening the file or
+   *     closing the reader may fail with it
+   */
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(
+        sourceFile != null && !sourceFile.isEmpty(),
+        "Missing file name");
+
+    Schema schema = getAvroSchema(sourceFile);
+    Schema projection = Expressions.filterSchema(schema, columns);
+
+    // nanoTime is not tied to the wall clock, so the measured duration cannot
+    // be skewed (or turn negative) if the system time is adjusted mid-scan.
+    long startNanos = System.nanoTime();
+    Iterable<Object> reader = openDataFile(sourceFile, projection);
+    boolean threw = true;
+    long count = 0;
+    try {
+      for (Object ignored : reader) {
+        count += 1;
+      }
+      threw = false;
+    } catch (RuntimeException e) {
+      throw new RuntimeException("Failed on record " + count, e);
+    } finally {
+      if (reader instanceof Closeable) {
+        // Swallow a close() failure only when the scan itself already failed,
+        // so that the original exception is the one that propagates.
+        Closeables.close((Closeable) reader, threw);
+      }
+    }
+    // Truncate to whole milliseconds to keep the precision the message had.
+    long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
+
+    console.info("Scanned " + count + " records from " + sourceFile);
+    console.info("Time: " + elapsedMillis / 1000.0 + " s");
+    return 0;
+  }
+
+  /**
+   * Returns the usage examples for this command, as alternating comment
+   * lines and argument lines.
+   */
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Scan all the records from file \"data.avro\":",
+        "data.avro",
+        "# Scan all the records from file \"data.parquet\":",
+        "data.parquet"
+    );
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java
new file mode 100644
index 000000000..dbe1f889e
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ScanCommandTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Smoke test for {@link ScanCommand}: runs the command against a Parquet
+ * file and checks that it reports success.
+ */
+public class ScanCommandTest extends ParquetFileTest {
+
+  /** Scanning the sample file with no column selection must return 0. */
+  @Test
+  public void testScanCommand() throws IOException {
+    // parquetFile() and createLogger() are not defined in this class;
+    // presumably they come from the ParquetFileTest hierarchy.
+    final File sample = parquetFile();
+    final ScanCommand scan = new ScanCommand(createLogger());
+
+    scan.sourceFile = sample.getAbsolutePath();
+    scan.setConf(new Configuration());
+
+    final int exitCode = scan.run();
+    Assert.assertEquals(0, exitCode);
+  }
+}