[ https://issues.apache.org/jira/browse/PARQUET-2261?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17789309#comment-17789309 ]
ASF GitHub Bot commented on PARQUET-2261: ----------------------------------------- wgtmac commented on code in PR #1177: URL: https://github.com/apache/parquet-mr/pull/1177#discussion_r1403906654 ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java: ########## @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import org.apache.parquet.Preconditions; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine-grained filter push down on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). 
+ */ +public class SizeStatistics { + + private PrimitiveType type; + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + private long unencodedByteArrayDataBytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. + */ + private List<Long> repetitionLevelHistogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. + */ + private List<Long> definitionLevelHistogram; + + /** + * Whether the statistics has valid value. + * + * It is true by default. Only set to false while it fails to merge statistics. + */ + private boolean hasValue = true; + + /** + * Builder to create a SizeStatistics. 
+ */ + public static class Builder { + private PrimitiveType type; + private long unencodedByteArrayDataBytes; + private long[] repetitionLevelHistogram; + private long[] definitionLevelHistogram; + + /** + * Create a builder to create a SizeStatistics. + * + * @param type physical type of the column associated with this statistics + * @param maxRepetitionLevel maximum repetition level of the column + * @param maxDefinitionLevel maximum definition level of the column + */ + public Builder(PrimitiveType type, int maxRepetitionLevel, int maxDefinitionLevel) { + this.type = type; + this.unencodedByteArrayDataBytes = 0L; + repetitionLevelHistogram = new long[maxRepetitionLevel + 1]; + definitionLevelHistogram = new long[maxDefinitionLevel + 1]; + Arrays.fill(repetitionLevelHistogram, 0L); + Arrays.fill(definitionLevelHistogram, 0L); + } + + /** + * Add repetition and definition level of a value to the statistics. + * It is called when value is null, or the column is not of BYTE_ARRAY type. + * + * @param repetitionLevel repetition level of the value + * @param definitionLevel definition level of the value + */ + public void add(int repetitionLevel, int definitionLevel) { + Preconditions.checkArgument(0 <= repetitionLevel && repetitionLevel < repetitionLevelHistogram.length, + "repetitionLevel %s is out of range [0, %s]", repetitionLevel, repetitionLevelHistogram.length - 1); + Preconditions.checkArgument(definitionLevel < definitionLevelHistogram.length, + "definitionLevel %s is out of range [0, %s]", definitionLevel, definitionLevelHistogram.length - 1); + repetitionLevelHistogram[repetitionLevel]++; + definitionLevelHistogram[definitionLevel]++; + } + + /** + * Add repetition and definition level of a value to the statistics. + * It is called when value is null, or the column is not of BYTE_ARRAY type. Review Comment: The check here simply ignores Binary values of the FIXED_LEN_BYTE_ARRAY type, which we do not care about.
> [Format] Add statistics that reflect decoded size to metadata > ------------------------------------------------------------- > > Key: PARQUET-2261 > URL: https://issues.apache.org/jira/browse/PARQUET-2261 > Project: Parquet > Issue Type: New Feature > Components: parquet-format > Reporter: Micah Kornfield > Assignee: Micah Kornfield > Priority: Major > Fix For: format-2.10.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)