This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 3781dfe78 PARQUET-2428: Make RawPagesReader accept specified columns. (#1269)
3781dfe78 is described below
commit 3781dfe78979997308809e9e9ca19bed31c4eb9c
Author: Yujiang Zhong <[email protected]>
AuthorDate: Sun Feb 18 13:36:55 2024 +0800
PARQUET-2428: Make RawPagesReader accept specified columns. (#1269)
---
.../org/apache/parquet/cli/commands/ShowPagesCommand.java | 2 +-
.../org/apache/parquet/cli/rawpages/RawPagesReader.java | 13 +++++++++++++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
index 6f6e67d82..faee61815 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
@@ -83,7 +83,7 @@ public class ShowPagesCommand extends BaseCommand {
// command.
if (raw) {
try (RawPagesReader reader =
- new RawPagesReader(HadoopInputFile.fromPath(qualifiedPath(targets.get(0)), getConf()))) {
+ new RawPagesReader(HadoopInputFile.fromPath(qualifiedPath(targets.get(0)), getConf()), columns)) {
reader.listPages(console);
}
return 0;
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/rawpages/RawPagesReader.java b/parquet-cli/src/main/java/org/apache/parquet/cli/rawpages/RawPagesReader.java
index d53fea539..febd94eba 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/rawpages/RawPagesReader.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/rawpages/RawPagesReader.java
@@ -21,6 +21,9 @@ package org.apache.parquet.cli.rawpages;
import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
import org.apache.parquet.cli.util.RawUtils;
import org.apache.parquet.format.CliUtils;
import org.apache.parquet.format.ColumnChunk;
@@ -37,8 +40,13 @@ public class RawPagesReader implements AutoCloseable {
private final SeekableInputStream input;
private final FileMetaData footer;
+ private final Set<String> columns;
public RawPagesReader(InputFile file) throws IOException {
+ this(file, null);
+ }
+
+ public RawPagesReader(InputFile file, List<String> cols) throws IOException {
long fileLen = file.getLength();
if (fileLen < MAGIC.length + 4 + MAGIC.length) {
@@ -47,6 +55,7 @@ public class RawPagesReader implements AutoCloseable {
input = file.newStream();
footer = RawUtils.readFooter(input, fileLen);
+ columns = cols == null || cols.isEmpty() ? null : new HashSet<>(cols);
}
public void listPages(Logger console) throws IOException {
@@ -55,6 +64,10 @@ public class RawPagesReader implements AutoCloseable {
for (ColumnChunk columnChunk : rowGroup.getColumns()) {
ColumnMetaData metaData = columnChunk.getMeta_data();
String path = String.join(".", metaData.getPath_in_schema());
+ if (columns != null && !columns.contains(path)) {
+ continue;
+ }
+
long totalSize = metaData.getTotal_compressed_size();
long dictOffset = metaData.getDictionary_page_offset();
long seekTo = metaData.getData_page_offset();