zeroshade commented on a change in pull request #10951: URL: https://github.com/apache/arrow/pull/10951#discussion_r700409674
########## File path: go/parquet/metadata/statistics.go ########## @@ -0,0 +1,517 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metadata + +import ( + "bytes" + "encoding/binary" + "math" + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" +) + +//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl + +// EncodedStatistics are raw statistics with encoded values that will be written +// to the parquet file, or was read from the parquet file. +type EncodedStatistics struct { + HasMax bool + Max []byte + HasMin bool + Min []byte + Signed bool + HasNullCount bool + NullCount int64 + HasDistinctCount bool + DistinctCount int64 +} + +// ApplyStatSizeLimits sets the maximum size of the min/max values. 
+// +// from parquet-mr +// we don't write stats larger than the max size rather than truncating. +// the rationale is that some engines may use the minimum value in the page +// as the true minimum for aggregations and there is no way to mark that +// a value has been truncated and is a lower bound and not in the page +func (e *EncodedStatistics) ApplyStatSizeLimits(length int) { + if len(e.Max) > length { + e.HasMax = false + } + if len(e.Min) > length { + e.HasMin = false + } +} + +// IsSet returns true iff one of the Has* values is true. +func (e *EncodedStatistics) IsSet() bool { + return e.HasMin || e.HasMax || e.HasNullCount || e.HasDistinctCount +} + +// SetMax sets the encoded Max value to val and sets HasMax to true +func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics { + e.Max = val[:] + e.HasMax = true + return e +} + +// SetMin sets the encoded Min value to val, and sets HasMin to true +func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics { + e.Min = val[:] + e.HasMin = true + return e +} + +// SetNullCount sets the NullCount to val and sets HasNullCount to true +func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics { + e.NullCount = val + e.HasNullCount = true + return e +} + +// SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true +func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics { + e.DistinctCount = val + e.HasDistinctCount = true + return e +} + +func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) { + stats = format.NewStatistics() + if e.HasMin { + stats.MinValue = e.Min + // if sort order is SIGNED then the old min value must be set too for backwards compatibility + if e.Signed { + stats.Min = e.Min + } + } + if e.HasMax { + stats.MaxValue = e.Max + // if sort order is SIGNED then the old max value must be set too for backwards compatibility + if e.Signed { + stats.Max = e.Max + } + } + if e.HasNullCount { + stats.NullCount = &e.NullCount + } + if e.HasDistinctCount { 
+ stats.DistinctCount = &e.DistinctCount + } + return +} + +// TypedStatistics is the base interface for dealing with stats as +// they are being populated +type TypedStatistics interface { + // Type is the underlying physical type for this stat block + Type() parquet.Type + // Returns true if there is a min and max value set for this stat object + HasMinMax() bool + // Returns true if a nullcount has been set + HasNullCount() bool + // returns true only if a distinct count has been set + // the current implementation of the writer does not automatically populate + // the distinct count right now. + HasDistinctCount() bool + NullCount() int64 + DistinctCount() int64 + NumValues() int64 + // return the column descriptor that this stat object was initialized with + Descr() *schema.Column + + // Encode the current min value and return the bytes. ByteArray does not + // include the len in the encoded bytes, otherwise this is identical to + // plain encoding + EncodeMin() []byte + // Encode the current max value and return the bytes. ByteArray does not + // include the len in the encoded bytes, otherwise this is identical to + // plain encoding + EncodeMax() []byte + // Populate an EncodedStatistics object from the current stats + Encode() (EncodedStatistics, error) + // Resets all values to 0 to enable reusing this stat object for multiple + // columns, by calling Encode to get the finished values and then calling + // reset + Reset() + // Merge the min/max/nullcounts and distinct count from the passed stat object + // into this one. 
+ Merge(TypedStatistics) +} + +type statistics struct { + descr *schema.Column + hasMinMax bool + hasNullCount bool + hasDistinctCount bool + mem memory.Allocator + nvalues int64 + stats EncodedStatistics + order schema.SortOrder + + encoder encoding.TypedEncoder +} + +func (s *statistics) incNulls(n int64) { + s.stats.NullCount += n + s.hasNullCount = true +} +func (s *statistics) incDistinct(n int64) { + s.stats.DistinctCount += n + s.hasDistinctCount = true +} + +func (s *statistics) Descr() *schema.Column { return s.descr } +func (s *statistics) Type() parquet.Type { return s.descr.PhysicalType() } +func (s *statistics) HasDistinctCount() bool { return s.hasDistinctCount } +func (s *statistics) HasMinMax() bool { return s.hasMinMax } +func (s *statistics) HasNullCount() bool { return s.hasNullCount } +func (s *statistics) NullCount() int64 { return s.stats.NullCount } +func (s *statistics) DistinctCount() int64 { return s.stats.DistinctCount } +func (s *statistics) NumValues() int64 { return s.nvalues } + +func (s *statistics) Reset() { + s.stats.NullCount = 0 + s.stats.DistinctCount = 0 + s.nvalues = 0 + s.hasMinMax = false + s.hasDistinctCount = false + s.hasNullCount = false +} + +func (s *statistics) merge(other TypedStatistics) { + s.nvalues += other.NumValues() + if other.HasNullCount() { + s.stats.NullCount += other.NullCount() + } + if other.HasDistinctCount() { + s.stats.DistinctCount += other.DistinctCount() Review comment: Funnily enough, this is also how the C++ implementation works, which I believe doesn't currently set the distinct count to anything yet anyway. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org