emkornfield commented on a change in pull request #10071: URL: https://github.com/apache/arrow/pull/10071#discussion_r619882824
########## File path: go/parquet/schema/converted_types.go ########## @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package schema + +import ( + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" +) + +// ConvertedType corresponds to the ConvertedType in the parquet.Thrift, +// with added values of None and NA for handling when these values are not +// set in the metadata +type ConvertedType format.ConvertedType + +var ( + // ConvertedTypes is a struct containing the constants for the types + // to make it easy to reference them while making it clear what they are + ConvertedTypes = struct { + None ConvertedType + UTF8 ConvertedType + Map ConvertedType + MapKeyValue ConvertedType + List ConvertedType + Enum ConvertedType + Decimal ConvertedType + Date ConvertedType + TimeMillis ConvertedType + TimeMicros ConvertedType + TimestampMillis ConvertedType + TimestampMicros ConvertedType + Uint8 ConvertedType + Uint16 ConvertedType + Uint32 ConvertedType + Uint64 ConvertedType + Int8 ConvertedType + Int16 ConvertedType + Int32 ConvertedType + Int64 ConvertedType + JSON ConvertedType + BSON ConvertedType + Interval ConvertedType + NA ConvertedType + }{ + None: -1, // thrift enum starts at 
0, so we know this will not be used + UTF8: ConvertedType(format.ConvertedType_UTF8), + Map: ConvertedType(format.ConvertedType_MAP), + MapKeyValue: ConvertedType(format.ConvertedType_MAP_KEY_VALUE), + List: ConvertedType(format.ConvertedType_LIST), + Enum: ConvertedType(format.ConvertedType_ENUM), + Decimal: ConvertedType(format.ConvertedType_DECIMAL), + Date: ConvertedType(format.ConvertedType_DATE), + TimeMillis: ConvertedType(format.ConvertedType_TIME_MILLIS), + TimeMicros: ConvertedType(format.ConvertedType_TIME_MICROS), + TimestampMillis: ConvertedType(format.ConvertedType_TIMESTAMP_MILLIS), + TimestampMicros: ConvertedType(format.ConvertedType_TIMESTAMP_MICROS), + Uint8: ConvertedType(format.ConvertedType_UINT_8), + Uint16: ConvertedType(format.ConvertedType_UINT_16), + Uint32: ConvertedType(format.ConvertedType_UINT_32), + Uint64: ConvertedType(format.ConvertedType_UINT_64), + Int8: ConvertedType(format.ConvertedType_INT_8), + Int16: ConvertedType(format.ConvertedType_INT_16), + Int32: ConvertedType(format.ConvertedType_INT_32), + Int64: ConvertedType(format.ConvertedType_INT_64), + JSON: ConvertedType(format.ConvertedType_JSON), + BSON: ConvertedType(format.ConvertedType_BSON), + Interval: ConvertedType(format.ConvertedType_INTERVAL), + NA: 24, // should always be the last values after Interval + } +) + +func (p ConvertedType) String() string { + switch p { + case ConvertedTypes.None: + return "NONE" + case ConvertedTypes.NA: + return "UNKNOWN" + default: + return format.ConvertedType(p).String() + } +} + +// ToLogicalType returns the correct LogicalType for the given ConvertedType, using the decimal +// metadata provided to define the precision/scale if necessary +func (p ConvertedType) ToLogicalType(convertedDecimal DecimalMetadata) LogicalType { + switch p { + case ConvertedTypes.UTF8: + return StringLogicalType{} + case ConvertedTypes.Map, ConvertedTypes.MapKeyValue: + return MapLogicalType{} + case ConvertedTypes.List: + return ListLogicalType{} + 
case ConvertedTypes.Enum: + return EnumLogicalType{} + case ConvertedTypes.Decimal: + return NewDecimalLogicalType(convertedDecimal.Precision, convertedDecimal.Scale) + case ConvertedTypes.Date: + return DateLogicalType{} + case ConvertedTypes.TimeMillis: + return NewTimeLogicalType(true, TimeUnitMillis) + case ConvertedTypes.TimeMicros: + return NewTimeLogicalType(true, TimeUnitMicros) + case ConvertedTypes.TimestampMillis: + t := NewTimestampLogicalType(true, TimeUnitMillis) + t.(*TimestampLogicalType).fromConverted = true + return t + case ConvertedTypes.TimestampMicros: + t := NewTimestampLogicalType(true, TimeUnitMicros) + t.(*TimestampLogicalType).fromConverted = true + return t + case ConvertedTypes.Interval: + return IntervalLogicalType{} + case ConvertedTypes.Int8: + return NewIntLogicalType(8, true) + case ConvertedTypes.Int16: + return NewIntLogicalType(16, true) + case ConvertedTypes.Int32: + return NewIntLogicalType(32, true) + case ConvertedTypes.Int64: + return NewIntLogicalType(64, true) + case ConvertedTypes.Uint8: + return NewIntLogicalType(8, false) + case ConvertedTypes.Uint16: + return NewIntLogicalType(16, false) + case ConvertedTypes.Uint32: + return NewIntLogicalType(32, false) + case ConvertedTypes.Uint64: + return NewIntLogicalType(64, false) + case ConvertedTypes.JSON: + return JSONLogicalType{} + case ConvertedTypes.BSON: + return BSONLogicalType{} + case ConvertedTypes.None: + return NoLogicalType{} + case ConvertedTypes.NA: + fallthrough + default: + return UnknownLogicalType{} + } +} + +// GetSortOrder defaults to the sort order based on the physical type if convert +// is ConvertedTypes.None, otherwise determines the sort order by the converted type. 
+func GetSortOrder(convert ConvertedType, primitive format.Type) SortOrder { + if convert == ConvertedTypes.None { + return DefaultSortOrder(primitive) + } + switch convert { + case ConvertedTypes.Int8, + ConvertedTypes.Int16, + ConvertedTypes.Int32, + ConvertedTypes.Int64, + ConvertedTypes.Date, + ConvertedTypes.TimeMicros, + ConvertedTypes.TimeMillis, + ConvertedTypes.TimestampMicros, + ConvertedTypes.TimestampMillis: + return SortSIGNED + case ConvertedTypes.Uint8, + ConvertedTypes.Uint16, + ConvertedTypes.Uint32, + ConvertedTypes.Uint64, + ConvertedTypes.Enum, + ConvertedTypes.UTF8, + ConvertedTypes.BSON, + ConvertedTypes.JSON: + return SortUNSIGNED + case ConvertedTypes.Decimal, Review comment: This doesn't look right for decimal. I believe it should be signed? Or it might vary based on the underlying type. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
