This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new be5ada2e9 GH-3125: Add CLI for SizeStatistics (#3126)
be5ada2e9 is described below

commit be5ada2e9ade661287a7f07432f13e7990a67e4f
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 17:08:38 2025 +0800

    GH-3125: Add CLI for SizeStatistics (#3126)
---
 parquet-cli/README.md                              |   2 +
 .../src/main/java/org/apache/parquet/cli/Main.java |   2 +
 .../cli/commands/ShowSizeStatisticsCommand.java    | 116 +++++++++++++++++++++
 .../commands/ShowSizeStatisticsCommandTest.java    |  37 +++++++
 .../column/columnindex/ColumnIndexBuilder.java     |  35 ++++++-
 .../column/columnindex/OffsetIndexBuilder.java     |  11 +-
 6 files changed, 197 insertions(+), 6 deletions(-)

diff --git a/parquet-cli/README.md b/parquet-cli/README.md
index fb02b0817..963e4f171 100644
--- a/parquet-cli/README.md
+++ b/parquet-cli/README.md
@@ -119,6 +119,8 @@ Usage: parquet [options] [command] [command options]
         Scan all records from a file
     rewrite
         Rewrite one or more Parquet files to a new Parquet file
+    size-stats
+        Print size statistics for a Parquet file
 
   Examples:
 
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 74ecb4408..c39e3b8e5 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -51,6 +51,7 @@ import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
 import org.apache.parquet.cli.commands.ShowDictionaryCommand;
 import org.apache.parquet.cli.commands.ShowFooterCommand;
 import org.apache.parquet.cli.commands.ShowPagesCommand;
+import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand;
 import org.apache.parquet.cli.commands.ToAvroCommand;
 import org.apache.parquet.cli.commands.TransCompressionCommand;
 import org.slf4j.Logger;
@@ -105,6 +106,7 @@ public class Main extends Configured implements Tool {
     jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
     jc.addCommand("scan", new ScanCommand(console));
     jc.addCommand("rewrite", new RewriteCommand(console));
+    jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console));
   }
 
   @Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java
new file mode 100644
index 000000000..0821d260e
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import static org.apache.parquet.cli.Util.humanReadable;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.util.List;
+import org.apache.commons.text.TextStringBuilder;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.column.statistics.SizeStatistics;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.MessageType;
+import org.slf4j.Logger;
+
+@Parameters(commandDescription = "Print size statistics for a Parquet file")
+public class ShowSizeStatisticsCommand extends BaseCommand {
+
+  public ShowSizeStatisticsCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description = "<parquet path>")
+  List<String> targets;
+
+  @Override
+  @SuppressWarnings("unchecked")
+  public int run() throws IOException {
+    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
+    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
+
+    String source = targets.get(0);
+    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
+      ParquetMetadata footer = reader.getFooter();
+      MessageType schema = footer.getFileMetaData().getSchema();
+
+      console.info("\nFile path: {}", source);
+
+      List<BlockMetaData> rowGroups = footer.getBlocks();
+      for (int index = 0, n = rowGroups.size(); index < n; index++) {
+        printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
+        console.info("");
+      }
+    }
+
+    return 0;
+  }
+
+  private void printRowGroupSizeStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
+    int maxColumnWidth = Math.max(
+        "column".length(),
+        rowGroup.getColumns().stream()
+            .map(col -> col.getPath().toString().length())
+            .max(Integer::compare)
+            .orElse(0));
+
+    console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-')));
+
+    String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", maxColumnWidth);
+    console.info(
+        String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));
+
+    for (ColumnChunkMetaData column : rowGroup.getColumns()) {
+      printColumnSizeStats(console, column, schema, maxColumnWidth);
+    }
+  }
+
+  private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, MessageType schema, int columnWidth) {
+    SizeStatistics stats = column.getSizeStatistics();
+
+    if (stats != null && stats.isValid()) {
+      String unencodedBytes = stats.getUnencodedByteArrayDataBytes().isPresent()
+          ? humanReadable(stats.getUnencodedByteArrayDataBytes().get())
+          : "-";
+      List<Long> repLevels = stats.getRepetitionLevelHistogram();
+      String repLevelsString = (repLevels != null && !repLevels.isEmpty()) ? repLevels.toString() : "-";
+      List<Long> defLevels = stats.getDefinitionLevelHistogram();
+      String defLevelsString = (defLevels != null && !defLevels.isEmpty()) ? defLevels.toString() : "-";
+      String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth);
+      console.info(
+          String.format(formatString, column.getPath(), unencodedBytes, repLevelsString, defLevelsString));
+    } else {
+      String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth);
+      console.info(String.format(formatString, column.getPath(), "-", "-", "-"));
+    }
+  }
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java
new file mode 100644
index 000000000..55d4f9d6e
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class ShowSizeStatisticsCommandTest extends ParquetFileTest {
+  @Test
+  public void testShowSizeStatisticsCommand() throws IOException {
+    File file = parquetFile();
+    ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
+    command.targets = Arrays.asList(file.getAbsolutePath());
+    command.setConf(new Configuration());
+    Assert.assertEquals(0, command.run());
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index ffbb82197..e78b2ceae 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -182,14 +182,39 @@ public abstract class ColumnIndexBuilder {
       return LongLists.unmodifiable(LongArrayList.wrap(defLevelHistogram));
     }
 
+    private String formatHistogram(long[] histogram, int pageIndex) {
+      if (histogram != null && histogram.length > 0) {
+        int numLevelsPerPage = histogram.length / nullPages.length;
+        int offset = pageIndex * numLevelsPerPage;
+        StringBuilder sb = new StringBuilder();
+        sb.append('[');
+        for (int j = 0; j < numLevelsPerPage; j++) {
+          if (j > 0) {
+            sb.append(",");
+          }
+          sb.append(histogram[offset + j]);
+        }
+        sb.append(']');
+        return sb.toString();
+      }
+      return TOSTRING_MISSING_VALUE_MARKER;
+    }
+
     @Override
     public String toString() {
       try (Formatter formatter = new Formatter()) {
         formatter.format("Boundary order: %s\n", boundaryOrder);
         String minMaxPart =
-            "  %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s  %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n";
-        formatter.format("%-10s  %20s" + minMaxPart, "", "null count", "min", "max");
-        String format = "page-%-5d  %20s" + minMaxPart;
+            "  %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s  %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s";
+        formatter.format(
+            "%-10s  %20s" + minMaxPart + "  %20s  %20s\n",
+            "",
+            "null count",
+            "min",
+            "max",
+            "rep level histogram",
+            "def level histogram");
+        String format = "page-%-5d  %20s" + minMaxPart + "  %20s  %20s\n";
         int arrayIndex = 0;
         for (int i = 0, n = nullPages.length; i < n; ++i) {
           String nullCount =
@@ -201,7 +226,9 @@ public abstract class ColumnIndexBuilder {
             min = truncate(getMinValueAsString(arrayIndex));
             max = truncate(getMaxValueAsString(arrayIndex++));
           }
-          formatter.format(format, i, nullCount, min, max);
+          String repLevelHist = formatHistogram(repLevelHistogram, i);
+          String defLevelHist = formatHistogram(defLevelHistogram, i);
+          formatter.format(format, i, nullCount, min, max, repLevelHist, defLevelHist);
         }
         return formatter.toString();
       }
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java
index b56f58d6f..bd729ad97 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java
@@ -39,10 +39,17 @@ public class OffsetIndexBuilder {
     @Override
     public String toString() {
       try (Formatter formatter = new Formatter()) {
-        formatter.format("%-10s  %20s  %16s  %20s\n", "", "offset", "compressed size", "first row index");
+        formatter.format(
+            "%-10s  %20s  %20s  %20s  %20s\n",
+            "", "offset", "compressed size", "first row index", "unencoded bytes");
         for (int i = 0, n = offsets.length; i < n; ++i) {
+          String unencodedBytes =
+              (unencodedByteArrayDataBytes != null && unencodedByteArrayDataBytes.length > 0)
+                  ? String.valueOf(unencodedByteArrayDataBytes[i])
+                  : "-";
           formatter.format(
-              "page-%-5d  %20d  %16d  %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]);
+              "page-%-5d  %20d  %20d  %20d  %20s\n",
+              i, offsets[i], compressedPageSizes[i], firstRowIndexes[i], unencodedBytes);
         }
         return formatter.toString();
       }

Reply via email to