shangxinli commented on a change in pull request #928: URL: https://github.com/apache/parquet-mr/pull/928#discussion_r720460937
########## File path: parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/ColumnEncryptor.java ########## @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop.util; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.HadoopReadOptions; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.crypto.AesCipher; +import org.apache.parquet.crypto.FileEncryptionProperties; +import org.apache.parquet.crypto.InternalColumnEncryptionSetup; +import org.apache.parquet.crypto.InternalFileEncryptor; +import org.apache.parquet.format.BlockCipher; +import org.apache.parquet.format.DataPageHeader; +import org.apache.parquet.format.DataPageHeaderV2; +import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.PageHeader; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.parquet.column.ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; +import static org.apache.parquet.column.ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH; +import static org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER; +import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE; +import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT; + +/** + * This class enables fast rewriting of an existing file with column encryption. + * + * For columns to be encrypted, all pages of those columns are read without decompression/decoding; + * each page is encrypted immediately and written back. + * + * For columns not to be encrypted, the whole column chunk is appended directly to the writer.
+ */ +public class ColumnEncryptor { + private static class EncryptorRunTime { + private final InternalColumnEncryptionSetup colEncrSetup; + private final BlockCipher.Encryptor dataEncryptor; + private final BlockCipher.Encryptor metaDataEncryptor; + private final byte[] fileAAD; + + private byte[] dataPageHeaderAAD; + private byte[] dataPageAAD; + private byte[] dictPageHeaderAAD; + private byte[] dictPageAAD; + + public EncryptorRunTime(InternalFileEncryptor fileEncryptor, ColumnChunkMetaData chunk, + int blockId, int columnId) throws IOException { + if (fileEncryptor == null) { + this.colEncrSetup = null; + this.dataEncryptor = null; + this.metaDataEncryptor = null; + + this.fileAAD = null; + this.dataPageHeaderAAD = null; + this.dataPageAAD = null; + this.dictPageHeaderAAD = null; + this.dictPageAAD = null; + } else { + this.colEncrSetup = fileEncryptor.getColumnSetup(chunk.getPath(), true, columnId); + this.dataEncryptor = colEncrSetup.getDataEncryptor(); + this.metaDataEncryptor = colEncrSetup.getMetaDataEncryptor(); + + this.fileAAD = fileEncryptor.getFileAAD(); + this.dataPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPageHeader, blockId, columnId, 0); + this.dataPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPage, blockId, columnId, 0); + this.dictPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPageHeader, blockId, columnId, 0); + this.dictPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPage, blockId, columnId, 0); + } + } + + public BlockCipher.Encryptor getDataEncryptor() { + return this.dataEncryptor; + } + + public BlockCipher.Encryptor getMetaDataEncryptor() { + return this.metaDataEncryptor; + } + + public byte[] getDataPageHeaderAAD() { + return this.dataPageHeaderAAD; + } + + public byte[] getDataPageAAD() { + return this.dataPageAAD; + } + + public byte[] getDictPageHeaderAAD() { + return this.dictPageHeaderAAD; + } + + public byte[] getDictPageAAD() { + return this.dictPageAAD; + } + } + + private Configuration conf; + + public ColumnEncryptor(Configuration conf) { + this.conf = conf; + } + + /** + * Given the input file, encrypt the columns specified by paths, and write the output file.
+ * The encryption settings are specified via the fileEncryptionProperties parameter. + * @param inputFile Input file + * @param outputFile Output file + * @param paths columns to be encrypted + * @param fileEncryptionProperties FileEncryptionProperties of the file + * @throws IOException + */ + public void encryptColumns(String inputFile, String outputFile, List<String> paths, FileEncryptionProperties fileEncryptionProperties) throws IOException { + Path inPath = new Path(inputFile); + Path outPath = new Path(outputFile); + + ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER); + MessageType schema = metaData.getFileMetaData().getSchema(); + + ParquetFileWriter writer = new ParquetFileWriter(HadoopOutputFile.fromPath(outPath, conf), schema, ParquetFileWriter.Mode.OVERWRITE, + DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_STATISTICS_TRUNCATE_LENGTH, + ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED, fileEncryptionProperties); + writer.start(); + + try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) { + processBlocks(reader, writer, metaData, schema, paths); + } + writer.end(metaData.getFileMetaData().getKeyValueMetaData()); + } + + private void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, + MessageType schema, List<String> encryptPaths) throws IOException { + Set<ColumnPath> encryptColumnsPath = convertToColumnPaths(encryptPaths); + int blockId = 0; + PageReadStore store = reader.readNextRowGroup(); + + while (store != null) { + writer.startBlock(store.getRowCount()); + + List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockId).getColumns(); + Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect( + Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x)); + + for (int i = 0; i < columnsInOrder.size(); i += 1) { + ColumnChunkMetaData chunk = columnsInOrder.get(i); + ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath()); + processChunk(descriptor, chunk, reader, writer, encryptColumnsPath, blockId, i, meta.getFileMetaData().getCreatedBy()); + } + + writer.endBlock(); + store = reader.readNextRowGroup(); + blockId++; + } + } + + private void processChunk(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, TransParquetFileReader reader, ParquetFileWriter writer, + Set<ColumnPath> encryptPaths, int blockId, int columnId, String createdBy) throws IOException { + reader.setStreamPosition(chunk.getStartingPos()); + writer.startColumn(descriptor, chunk.getValueCount(), chunk.getCodec()); + processPages(reader, chunk, writer, createdBy, blockId, columnId, encryptPaths.contains(chunk.getPath())); + writer.endColumn(); + } + + private void processPages(TransParquetFileReader reader, ColumnChunkMetaData chunk, ParquetFileWriter writer, + String createdBy, int blockId, int columnId, boolean encrypt) throws IOException { + int pageOrdinal = 0; + EncryptorRunTime encryptorRunTime = new EncryptorRunTime(writer.getEncryptor(), chunk, blockId, columnId); + DictionaryPage dictionaryPage = null; + long readValues = 0; + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + OffsetIndex offsetIndex = reader.readOffsetIndex(chunk); + reader.setStreamPosition(chunk.getStartingPos()); + long totalChunkValues = chunk.getValueCount(); + while (readValues < totalChunkValues) { + PageHeader pageHeader =
reader.readPageHeader(); + byte[] pageLoad; + switch (pageHeader.type) { + case DICTIONARY_PAGE: + if (dictionaryPage != null) { + throw new IOException("Column chunk has more than one dictionary page"); + } + // No quickUpdatePageAAD needed for dictionary page + DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header; + pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(), encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDictPageAAD(), encrypt); Review comment: Inside processPayload() we treat the two cases differently. That is why Gabor commented earlier that the method name should change from encryptPayload() to processPayload().
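For context, processPayload() itself falls outside the quoted hunk. A minimal sketch of the behavior the comment describes, reconstructed from the call sites above (the exact body and the blockRead() usage are assumptions, not the PR's verbatim code):

  // Either passes the raw page bytes through untouched or encrypts them;
  // the payload is never decompressed or decoded, hence the name
  // processPayload() rather than encryptPayload().
  private byte[] processPayload(TransParquetFileReader reader, int payloadLength,
                                BlockCipher.Encryptor dataEncryptor, byte[] AAD,
                                boolean encrypt) throws IOException {
    byte[] data = new byte[payloadLength];
    reader.blockRead(data, 0, payloadLength);  // read the compressed page payload as-is
    if (!encrypt) {
      return data;                             // pass-through branch for plaintext columns
    }
    return dataEncryptor.encrypt(data, AAD);   // AES module encryption with the page AAD
  }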
########## File path: parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/ColumnEncryptor.java ########## @@ -0,0 +1,317 @@ + while (readValues < totalChunkValues) { + PageHeader pageHeader =
reader.readPageHeader(); + byte[] pageLoad; + switch (pageHeader.type) { + case DICTIONARY_PAGE: + if (dictionaryPage != null) { + throw new IOException("Column chunk has more than one dictionary page"); + } + // No quickUpdatePageAAD needed for dictionary page + DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header; + pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(), encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDictPageAAD(), encrypt); + writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad), + pageHeader.getUncompressed_page_size(), + dictPageHeader.getNum_values(), + converter.getEncoding(dictPageHeader.getEncoding())), + encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDictPageHeaderAAD()); + break; + case DATA_PAGE: + if (encrypt) { + AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageHeaderAAD(), pageOrdinal); + AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageAAD(), pageOrdinal); + } + DataPageHeader headerV1 = pageHeader.data_page_header; + pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(), encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDataPageAAD(), encrypt); Review comment: Same as above. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
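For readers trying the new API end to end, a minimal, hypothetical usage sketch of encryptColumns(); the key bytes, column name, and file paths below are placeholders for illustration, not values from the PR:

  import java.util.Collections;
  import java.util.HashMap;
  import java.util.Map;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.parquet.crypto.ColumnEncryptionProperties;
  import org.apache.parquet.crypto.FileEncryptionProperties;
  import org.apache.parquet.hadoop.metadata.ColumnPath;
  import org.apache.parquet.hadoop.util.ColumnEncryptor;

  Configuration conf = new Configuration();
  byte[] footerKey = new byte[16];                   // placeholder 128-bit AES footer key
  ColumnEncryptionProperties colProps =
      ColumnEncryptionProperties.builder("ssn")      // hypothetical column name
          .withKey(new byte[16])                     // placeholder column key
          .build();
  Map<ColumnPath, ColumnEncryptionProperties> columnProps = new HashMap<>();
  columnProps.put(colProps.getPath(), colProps);
  FileEncryptionProperties props = FileEncryptionProperties.builder(footerKey)
      .withEncryptedColumns(columnProps)
      .build();
  // Rewrites input.parquet into output.parquet with the "ssn" column encrypted.
  new ColumnEncryptor(conf).encryptColumns(
      "/tmp/input.parquet", "/tmp/output.parquet",   // placeholder paths
      Collections.singletonList("ssn"), props);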
