gszadovszky commented on a change in pull request #928:
URL: https://github.com/apache/parquet-mr/pull/928#discussion_r710846455



##########
File path: parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/ColumnEncryptor.java
##########
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.hadoop.util;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.HadoopReadOptions;
+import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.ParquetProperties;
+import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.column.values.bloomfilter.BloomFilter;
+import org.apache.parquet.crypto.AesCipher;
+import org.apache.parquet.crypto.FileEncryptionProperties;
+import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
+import org.apache.parquet.crypto.InternalFileEncryptor;
+import org.apache.parquet.format.BlockCipher;
+import org.apache.parquet.format.DataPageHeader;
+import org.apache.parquet.format.DataPageHeaderV2;
+import org.apache.parquet.format.DictionaryPageHeader;
+import org.apache.parquet.format.PageHeader;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetFileWriter;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.io.ParquetEncodingException;
+import org.apache.parquet.schema.MessageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.apache.parquet.column.ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
+import static org.apache.parquet.column.ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH;
+import static org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;
+import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
+import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
+
+public class ColumnEncryptor {
+  private static final Logger LOG = LoggerFactory.getLogger(ColumnEncryptor.class);
+
+  private static class EncryptorRunTime {
+    private final InternalColumnEncryptionSetup colEncrSetup;
+    private final BlockCipher.Encryptor dataEncryptor;
+    private final BlockCipher.Encryptor metaDataEncryptor;
+    private final byte[] fileAAD ;
+
+    private byte[] dataPageHeaderAAD;
+    private byte[] dataPageAAD;
+    private byte[] dictPageHeaderAAD;
+    private byte[] dictPageAAD;
+
+    public EncryptorRunTime(InternalFileEncryptor fileEncryptor, ColumnChunkMetaData chunk,
+                            int blockId, int columnId) throws IOException  {
+      this.colEncrSetup = fileEncryptor.getColumnSetup(chunk.getPath(), true, toShortWithCheck(columnId));
+      this.dataEncryptor = colEncrSetup.getDataEncryptor();
+      this.metaDataEncryptor = colEncrSetup.getMetaDataEncryptor();
+
+      this.fileAAD = fileEncryptor.getFileAAD();
+      this.dataPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPageHeader,
+        toShortWithCheck(blockId), toShortWithCheck(columnId), (short) 0);
+      this.dataPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPage,
+        toShortWithCheck(blockId), toShortWithCheck(columnId), (short) 0);
+      this.dictPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPageHeader,
+        toShortWithCheck(blockId), toShortWithCheck(columnId), (short) 0);
+      this.dictPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPage,
+        toShortWithCheck(blockId), toShortWithCheck(columnId), (short) 0);
+    }
+
+    public InternalColumnEncryptionSetup getColEncrSetup() {
+      return this.colEncrSetup;
+    }
+
+    public BlockCipher.Encryptor getDataEncryptor() {
+      return this.dataEncryptor;
+    }
+
+    public BlockCipher.Encryptor getMetaDataEncryptor() {
+      return this.metaDataEncryptor;
+    }
+
+    public byte[] getFileAAD() {
+      return this.fileAAD;
+    }
+
+    public byte[] getDataPageHeaderAAD() {
+      return this.dataPageHeaderAAD;
+    }
+
+    public byte[] getDataPageAAD() {
+      return this.dataPageAAD;
+    }
+
+    public byte[] getDictPageHeaderAAD() {
+      return this.dictPageHeaderAAD;
+    }
+
+    public byte[] getDictPageAAD() {
+      return this.dictPageAAD;
+    }
+  }
+
+  private final int pageBufferSize = ParquetProperties.DEFAULT_PAGE_SIZE * 2;
+  private byte[] pageBuffer;
+  private Configuration conf;
+
+  public ColumnEncryptor(Configuration conf) {
+    this.pageBuffer = new byte[pageBufferSize];
+    this.conf = conf;
+  }
+
+  /**
+   * Encrypts the given columns of the input file and writes the result to the output file.
+   * @param inputFile Input file
+   * @param outputFile Output file
+   * @param paths columns to be encrypted
+   * @param fileEncryptionProperties the encryption properties to be used
+   * @throws IOException if the input cannot be read or the output cannot be written
+   */
+  public void encryptColumns(String inputFile, String outputFile, List<String> paths, FileEncryptionProperties fileEncryptionProperties) throws IOException {
+    Path inPath = new Path(inputFile);
+    Path outPath = new Path(outputFile);
+
+    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
+    MessageType schema = metaData.getFileMetaData().getSchema();
+
+    ParquetFileWriter writer = new ParquetFileWriter(HadoopOutputFile.fromPath(outPath, conf), schema, ParquetFileWriter.Mode.OVERWRITE,
+      DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_STATISTICS_TRUNCATE_LENGTH,
+      ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED, fileEncryptionProperties);
+    writer.start();
+
+    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
+      processBlocks(reader, writer, metaData, schema, paths);
+    } catch (Exception e) {
+      LOG.error("Exception happened while processing blocks: ", e);
+    } finally {
+      try {
+        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
+      } catch (Exception e) {
+        LOG.error("Exception happened when ending the writer: ", e);
+      }
+    }
+  }
+
+  public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
+                            MessageType schema, List<String> paths) throws IOException {
+    Set<ColumnPath> nullifyColumns = convertToColumnPaths(paths);
+    int blockId = 0;
+    PageReadStore store = reader.readNextRowGroup();
+
+    while (store != null) {
+      writer.startBlock(store.getRowCount());
+
+      List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockId).getColumns();
+      Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
+        Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
+
+      for (int i = 0; i < columnsInOrder.size(); i += 1) {
+        ColumnChunkMetaData chunk = columnsInOrder.get(i);
+        ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath());
+        processChunk(descriptor, chunk, reader, writer, nullifyColumns, blockId, i, meta.getFileMetaData().getCreatedBy());

Review comment:
       Actually, I've suggested only one additional field; the other would be the already
existing `created_by`. The question is which value should go into `created_by` and which
one into the key-value metadata. Since we do not decode the data (so we cannot validate
it), I think it is good to copy the `created_by` field from the original file and add the
"rewriter version" to the key-value metadata. But it is a tricky question. Let's say we
have a bug in one of the compression codecs, so we generate invalid compressed pages with
a certain version of parquet-mr. Now we have a parquet file whose pages we
"trans-compress" with a tool that uses this parquet-mr version and codec. Then we use
another tool to encrypt the file. I think this topic is more for parquet-format and the
whole Parquet community: we should introduce the discussed and accepted solution for any
tool that "rewrites" a file while keeping the original data "undecoded".
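
   For illustration only, a minimal sketch of what I mean, assuming we record the rewriter in the key-value metadata when the writer is closed. The key `parquet.rewriter.version` and the value format are placeholders for this discussion, not a proposal for the final form:

   ```java
   // Sketch only (not a final proposal): copy the original key-value metadata and
   // add a "rewriter version" entry before closing the writer. `metaData` and
   // `writer` are the same objects as in encryptColumns above; a java.util.HashMap
   // import would be needed in addition.
   Map<String, String> keyValueMetaData =
       new HashMap<>(metaData.getFileMetaData().getKeyValueMetaData());
   keyValueMetaData.put("parquet.rewriter.version",
       "column-encryptor " + org.apache.parquet.Version.FULL_VERSION);
   writer.end(keyValueMetaData);
   ```

   How the original `created_by` value would be carried over is exactly one of the open questions below.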
   
   Some questions (about a common rewriter) we need to answer before stepping forward:
   * Where should the rewriter versions live? (A new dedicated field or the key-value metadata? Which key shall we use?)
   * Shall we somehow also record what the rewriter has done? How?
   * At what level shall we copy the original `created_by` field, and at what level shall we write the rewriter's version into that field instead? (What different levels are possible?)
   * Once such a rewriter field is introduced, any writer-version-dependent fix will have to check this field as well, not only `created_by`.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

