zeroshade commented on a change in pull request #10951: URL: https://github.com/apache/arrow/pull/10951#discussion_r700459055
########## File path: go/parquet/metadata/file.go ########## @@ -0,0 +1,478 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metadata + +import ( + "bytes" + "context" + "io" + "reflect" + + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/compress" + "github.com/apache/arrow/go/parquet/internal/encryption" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/thrift" + "github.com/apache/arrow/go/parquet/schema" + "golang.org/x/xerrors" +) + +// DefaultCompressionType is used unless a different compression is specified +// in the properties +var DefaultCompressionType = compress.Codecs.Uncompressed + +// FileMetaDataBuilder is a proxy for more easily constructing file metadata +// particularly used when writing a file out. 
+type FileMetaDataBuilder struct { + metadata *format.FileMetaData + props *parquet.WriterProperties + schema *schema.Schema + rowGroups []*format.RowGroup + currentRgBldr *RowGroupMetaDataBuilder + kvmeta KeyValueMetadata + cryptoMetadata *format.FileCryptoMetaData +} + +// NewFileMetadataBuilder will use the default writer properties if nil is passed for +// the writer properties and nil is allowable for the key value metadata. +func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder { + var crypto *format.FileCryptoMetaData + if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() { + crypto = format.NewFileCryptoMetaData() + } + return &FileMetaDataBuilder{ + metadata: format.NewFileMetaData(), + props: props, + schema: schema, + kvmeta: kvmeta, + cryptoMetadata: crypto, + } +} + +// GetFileCryptoMetaData returns the cryptographic information for encrypting/ +// decrypting the file. +func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata { + if f.cryptoMetadata == nil { + return nil + } + + props := f.props.FileEncryptionProperties() + f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift() + keyMetadata := props.FooterKeyMetadata() + if keyMetadata != "" { + f.cryptoMetadata.KeyMetadata = []byte(keyMetadata) + } + + return &FileCryptoMetadata{f.cryptoMetadata, 0} +} + +// AppendRowGroup adds a rowgroup to the list and returns a builder +// for that row group +func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder { + if f.rowGroups == nil { + f.rowGroups = make([]*format.RowGroup, 0, 1) + } + + rg := format.NewRowGroup() + f.rowGroups = append(f.rowGroups, rg) + f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg) + return f.currentRgBldr +} + +// Finish will finalize the metadata of the number of rows, row groups, +// version etc. 
This will clear out this filemetadatabuilder so it can +// be re-used +func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) { + totalRows := int64(0) + for _, rg := range f.rowGroups { + totalRows += rg.NumRows + } + f.metadata.NumRows = totalRows + f.metadata.RowGroups = f.rowGroups + switch f.props.Version() { + case parquet.V1: + f.metadata.Version = 1 + case parquet.V2: + f.metadata.Version = 2 + default: + f.metadata.Version = 0 + } + createdBy := f.props.CreatedBy() + f.metadata.CreatedBy = &createdBy + + // Users cannot set the `ColumnOrder` since we do not have user defined sort order + // in the spec yet. + // We always default to `TYPE_DEFINED_ORDER`. We can expose it in + // the API once we have user defined sort orders in the Parquet format. + // TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType + typeDefined := format.NewTypeDefinedOrder() + colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined} + f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns()) + for idx := range f.metadata.ColumnOrders { + f.metadata.ColumnOrders[idx] = colOrder + } + + fileEncProps := f.props.FileEncryptionProperties() + if fileEncProps != nil && !fileEncProps.EncryptedFooter() { Review comment: Ah, I misunderstood the question here. By default, `FileEncryptionProperties()` is nil, which indicates no encryption. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org