zeroshade commented on a change in pull request #10951:
URL: https://github.com/apache/arrow/pull/10951#discussion_r700409674



##########
File path: go/parquet/metadata/statistics.go
##########
@@ -0,0 +1,517 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadata
+
+import (
+       "bytes"
+       "encoding/binary"
+       "math"
+       "unsafe"
+
+       "github.com/apache/arrow/go/arrow"
+       "github.com/apache/arrow/go/arrow/array"
+       "github.com/apache/arrow/go/arrow/memory"
+       "github.com/apache/arrow/go/parquet"
+       "github.com/apache/arrow/go/parquet/internal/encoding"
+       format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet"
+       "github.com/apache/arrow/go/parquet/internal/utils"
+       "github.com/apache/arrow/go/parquet/schema"
+)
+
+//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl
+
+// EncodedStatistics are raw statistics with encoded values that will be
+// written to the parquet file, or were read from the parquet file.
+//
+// Each value is paired with a Has* flag; ToThrift only emits a value whose
+// flag is true.
+type EncodedStatistics struct {
+       // HasMax reports whether Max should be treated as set.
+       HasMax           bool
+       // Max is the encoded maximum value.
+       Max              []byte
+       // HasMin reports whether Min should be treated as set.
+       HasMin           bool
+       // Min is the encoded minimum value.
+       Min              []byte
+       // Signed marks a SIGNED sort order; when true, ToThrift also fills in
+       // the old Min/Max thrift fields for backwards compatibility.
+       Signed           bool
+       // HasNullCount reports whether NullCount should be treated as set.
+       HasNullCount     bool
+       NullCount        int64
+       // HasDistinctCount reports whether DistinctCount should be treated as set.
+       HasDistinctCount bool
+       DistinctCount    int64
+}
+
+// ApplyStatSizeLimits drops the min and/or max value when its encoded form is
+// longer than length bytes, by clearing the matching Has* flag. The byte
+// slices themselves are left untouched.
+//
+// This follows parquet-mr: an oversized stat is omitted rather than
+// truncated, because some engines use the minimum value in the page as the
+// true minimum for aggregations and there is no way to mark a value as having
+// been truncated, i.e. as being only a bound and not a value in the page.
+func (e *EncodedStatistics) ApplyStatSizeLimits(length int) {
+       e.HasMax = e.HasMax && len(e.Max) <= length
+       e.HasMin = e.HasMin && len(e.Min) <= length
+}
+
+// IsSet reports whether at least one of the Has* flags is true.
+func (e *EncodedStatistics) IsSet() bool {
+       switch {
+       case e.HasMin, e.HasMax, e.HasNullCount, e.HasDistinctCount:
+               return true
+       }
+       return false
+}
+
+// SetMax stores val as the encoded Max value and marks it as present by
+// setting HasMax. The slice is stored as-is (not copied). The receiver is
+// returned so calls can be chained.
+func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics {
+       e.HasMax, e.Max = true, val
+       return e
+}
+
+// SetMin stores val as the encoded Min value and marks it as present by
+// setting HasMin. The slice is stored as-is (not copied). The receiver is
+// returned so calls can be chained.
+func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics {
+       e.HasMin, e.Min = true, val
+       return e
+}
+
+// SetNullCount records val as the NullCount and marks it as present by
+// setting HasNullCount. The receiver is returned so calls can be chained.
+func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics {
+       e.HasNullCount, e.NullCount = true, val
+       return e
+}
+
+// SetDistinctCount records val as the DistinctCount and marks it as present
+// by setting HasDistinctCount. The receiver is returned so calls can be
+// chained.
+func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics {
+       e.HasDistinctCount, e.DistinctCount = true, val
+       return e
+}
+
+// ToThrift builds a thrift Statistics object from the values whose Has* flag
+// is set. Byte slices are shared with, and the count pointers point into, the
+// receiver; nothing is copied.
+func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) {
+       stats = format.NewStatistics()
+
+       if e.HasMin {
+               stats.MinValue = e.Min
+       }
+       if e.HasMax {
+               stats.MaxValue = e.Max
+       }
+
+       // if the sort order is SIGNED then the old min/max values must be set
+       // too, for backwards compatibility
+       if e.Signed {
+               if e.HasMin {
+                       stats.Min = e.Min
+               }
+               if e.HasMax {
+                       stats.Max = e.Max
+               }
+       }
+
+       if e.HasNullCount {
+               stats.NullCount = &e.NullCount
+       }
+       if e.HasDistinctCount {
+               stats.DistinctCount = &e.DistinctCount
+       }
+       return stats
+}
+
+// TypedStatistics is the base interface for dealing with stats as
+// they are being populated
+type TypedStatistics interface {
+       // Type is the underlying physical type for this stat block
+       Type() parquet.Type
+       // Returns true if there is a min and max value set for this stat object
+       HasMinMax() bool
+       // Returns true if a nullcount has been set
+       HasNullCount() bool
+       // Returns true only if a distinct count has been set. The current
+       // implementation of the writer does not automatically populate
+       // the distinct count right now.
+       HasDistinctCount() bool
+       // NullCount returns the current null count
+       NullCount() int64
+       // DistinctCount returns the current distinct count
+       DistinctCount() int64
+       // NumValues returns the current number of values
+       NumValues() int64
+       // Return the column descriptor that this stat object was initialized
+       // with
+       Descr() *schema.Column
+
+       // Encode the current min value and return the bytes. ByteArray does not
+       // include the len in the encoded bytes, otherwise this is identical to
+       // plain encoding
+       EncodeMin() []byte
+       // Encode the current max value and return the bytes. ByteArray does not
+       // include the len in the encoded bytes, otherwise this is identical to
+       // plain encoding
+       EncodeMax() []byte
+       // Populate an EncodedStatistics object from the current stats
+       Encode() (EncodedStatistics, error)
+       // Resets all values to 0 to enable reusing this stat object for
+       // multiple columns, by calling Encode to get the finished values and
+       // then calling Reset
+       Reset()
+       // Merge the min/max/nullcounts and distinct count from the passed stat
+       // object into this one.
+       Merge(TypedStatistics)
+}
+
+// statistics carries the bookkeeping behind the non-encoding TypedStatistics
+// accessors (Descr, Type, Has*, NullCount, DistinctCount, NumValues).
+type statistics struct {
+       // descr is returned by Descr; its PhysicalType backs Type.
+       descr            *schema.Column
+       // has* flags back the matching Has* accessors and are cleared by Reset.
+       hasMinMax        bool
+       hasNullCount     bool
+       hasDistinctCount bool
+       // mem is not used in the part of the file visible here.
+       mem              memory.Allocator
+       // nvalues is the count returned by NumValues.
+       nvalues          int64
+       // stats.NullCount and stats.DistinctCount hold the running counts.
+       stats            EncodedStatistics
+       // order is not used in the part of the file visible here.
+       order            schema.SortOrder
+
+       // NOTE(review): presumably used by EncodeMin/EncodeMax — confirm.
+       encoder encoding.TypedEncoder
+}
+
+// incNulls adds n to the running null count and marks the null count as set.
+func (s *statistics) incNulls(n int64) {
+       s.hasNullCount = true
+       s.stats.NullCount += n
+}
+// incDistinct adds n to the running distinct count and marks the distinct
+// count as set.
+func (s *statistics) incDistinct(n int64) {
+       s.hasDistinctCount = true
+       s.stats.DistinctCount += n
+}
+
+// Descr returns the column descriptor held by this stat object.
+func (s *statistics) Descr() *schema.Column {
+       return s.descr
+}
+
+// Type returns the physical type reported by the column descriptor.
+func (s *statistics) Type() parquet.Type {
+       return s.descr.PhysicalType()
+}
+
+// HasDistinctCount reports whether a distinct count has been set.
+func (s *statistics) HasDistinctCount() bool {
+       return s.hasDistinctCount
+}
+
+// HasMinMax reports whether the min/max flag has been set.
+func (s *statistics) HasMinMax() bool {
+       return s.hasMinMax
+}
+
+// HasNullCount reports whether a null count has been set.
+func (s *statistics) HasNullCount() bool {
+       return s.hasNullCount
+}
+
+// NullCount returns the running null count.
+func (s *statistics) NullCount() int64 {
+       return s.stats.NullCount
+}
+
+// DistinctCount returns the running distinct count.
+func (s *statistics) DistinctCount() int64 {
+       return s.stats.DistinctCount
+}
+
+// NumValues returns the running value count.
+func (s *statistics) NumValues() int64 {
+       return s.nvalues
+}
+
+// Reset zeroes the null, distinct and value counts and clears the
+// hasMinMax, hasDistinctCount and hasNullCount flags. No other field is
+// touched.
+func (s *statistics) Reset() {
+       s.stats.NullCount, s.stats.DistinctCount, s.nvalues = 0, 0, 0
+       s.hasMinMax, s.hasDistinctCount, s.hasNullCount = false, false, false
+}
+
+func (s *statistics) merge(other TypedStatistics) {
+       s.nvalues += other.NumValues()
+       if other.HasNullCount() {
+               s.stats.NullCount += other.NullCount()
+       }
+       if other.HasDistinctCount() {
+               s.stats.DistinctCount += other.DistinctCount()

Review comment:
       Funnily enough, this is also how the C++ implementation works, which I
believe doesn't currently set the distinct count to anything yet anyway.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to