[GitHub] [hudi] nsivabalan commented on a change in pull request #4067: [HUDI-2763] Metadata table records key deduplication

GitBox Wed, 24 Nov 2021 17:00:40 -0800


nsivabalan commented on a change in pull request #4067:
URL: https://github.com/apache/hudi/pull/4067#discussion_r754513054




##########
File path: 
hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java
##########
@@ -104,27 +106,30 @@ public HoodieLogBlockType getBlockType() {
     HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig)
         .withOutputStream(ostream).withFileContext(context).create();
 
-    // Serialize records into bytes
+    // Serialize records into bytes, sort them and write to HFile
     Map<String, byte[]> sortedRecordsMap = new TreeMap<>();
     Iterator<IndexedRecord> itr = records.iterator();
     boolean useIntegerKey = false;
     int key = 0;
     int keySize = 0;
-    Field keyField = records.get(0).getSchema().getField(this.keyField);
-    if (keyField == null) {
-      // Missing key metadata field so we should use an integer sequence key
+
+    // Build the record key
+    final Field schemaKeyField = 
records.get(0).getSchema().getField(this.keyField);
+    if (schemaKeyField == null) {
+      // Missing key metadata field. Use an integer sequence key instead.

Review comment:
       can you create a follow up ticket on this integer sequence key. On what 
case we use this and whats the purpose. May be clean it up if required. 

##########
File path: 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
##########
@@ -360,11 +361,15 @@ protected void 
appendDataAndDeleteBlocks(Map<HeaderMetadataType, String> header)
       header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, 
writeSchemaWithMetaFields.toString());
       List<HoodieLogBlock> blocks = new ArrayList<>(2);
       if (recordList.size() > 0) {
+        final boolean withMetadataKeyDeDuplication = 
config.getMetadataConfig().getRecordKeyDeDuplicate()
+            && 
HoodieTableMetadata.isMetadataTable(hoodieTable.getMetaClient().getBasePath());

Review comment:
       this is getting bit out of hand. Can you file a ticket around 
HoodieDataBlock.getblock. may be we should think about adding a builder instead 
of n no of overloaded methods. 

##########
File path: 
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataHFileReader.java
##########
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.metadata;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.io.storage.HoodieHFileReader;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * HFile reader for the Metadata table log files. Metadata table records in the
+ * HFile data blocks have the redundant key field in the record payload 
trimmed.
+ * So, when the log reader is reading records, materialization of such trimmed
+ * records must be done before handing the records to the callers. This class
+ * takes care of Metadata table record materialization, any needed.
+ *
+ * @param <R> Metadata table record type.
+ */
+public class HoodieMetadataHFileReader<R extends IndexedRecord> extends 
HoodieHFileReader<R> {
+
+  private static final Logger LOG = 
LogManager.getLogger(HoodieMetadataHFileReader.class);
+
+  public HoodieMetadataHFileReader(Configuration configuration, Path path, 
CacheConfig cacheConfig) throws IOException {
+    super(configuration, path, cacheConfig);
+  }
+
+  public HoodieMetadataHFileReader(Configuration configuration, Path path, 
CacheConfig cacheConfig, FileSystem inlineFs,
+                                   String keyField) throws IOException {
+    super(configuration, path, cacheConfig, inlineFs, keyField);
+  }
+
+  public HoodieMetadataHFileReader(final byte[] content, final String 
keyField) throws IOException {
+    super(content, keyField);
+  }
+
+  /**
+   * Materialize the record key field.
+   *
+   * @param keyField - Key field in the schema
+   * @param keyBytes - Key byte array
+   * @param record   - Record to materialize
+   */
+  @Override
+  protected void materializeRecordIfNeeded(final Option<String> keyField, 
final ByteBuffer keyBytes, R record) {
+    if (!keyField.isPresent()) {
+      return;
+    }
+
+    final Schema.Field keySchemaField = 
record.getSchema().getField(keyField.get());

Review comment:
       if we have the schema handy during instantiation of 
HoodieMetadataHFileReader, can we use that instead of fetching from record 
everytime?
   

##########
File path: 
hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java
##########
@@ -282,12 +286,35 @@ public Builder withPartition(String partitionName) {
       return this;
     }
 
+    public Builder enableFullScan(boolean enableFullScan) {
+      this.enableFullScan = enableFullScan;
+      return this;
+    }
+
     @Override
     public HoodieMergedLogRecordScanner build() {
+      if (HoodieTableMetadata.isMetadataTable(basePath)) {

Review comment:
       shouldn't we be using HoodieMetadataMergedLogRecordReader for any 
metadata based reading ? can you help me understand this please

##########
File path: 
hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java
##########
@@ -259,6 +254,11 @@ private HoodieLogBlock readBlock() throws IOException {
               contentPosition, contentLength, blockEndPos, readerSchema, 
header, footer, keyField);
         }
       case HFILE_DATA_BLOCK:
+        if (HoodieTableMetadata.isMetadataTable(logFile) && keyDeDuplication) {
+          return new HoodieMetadataHFileDataBlock(logFile, inputStream, 
Option.ofNullable(content), readBlockLazily,

Review comment:
       Can you clarify w/ vinoth on this. I remember he suggesting to avoid 
adding a new block altogether.

##########
File path: hudi-common/src/main/avro/HoodieMetadata.avsc
##########
@@ -23,7 +23,10 @@
     "fields": [
         {
             "name": "key",
-            "type": "string"
+            "type": [

Review comment:
       we might want to think how would old reader(090) might behave with this 
payload schema change? 

##########
File path: 
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataHFileDataBlock.java
##########
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.metadata;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.io.storage.HoodieHFileReader;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.jetbrains.annotations.NotNull;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * HFile data block for the Metadata table records. Since the backing log 
format for metadata table
+ * is the HFile KeyValue and since the key field in the metadata record 
payload is a duplicate
+ * of the Key in the Cell, the redundant key field in the record can be 
nullified to save on the
+ * cost. Such trimmed metadata records need to re-materialized with the key 
field during deserialization.
+ */
+public class HoodieMetadataHFileDataBlock extends HoodieHFileDataBlock {
+
+  private static final Logger LOG = 
LogManager.getLogger(HoodieMetadataHFileDataBlock.class);
+
+  public HoodieMetadataHFileDataBlock(HoodieLogFile logFile, FSDataInputStream 
inputStream, Option<byte[]> content,
+                                      boolean readBlockLazily, long position, 
long blockSize, long blockEndPos,
+                                      Schema readerSchema, 
Map<HeaderMetadataType, String> header,
+                                      Map<HeaderMetadataType, String> footer, 
boolean enableInlineReading,
+                                      String keyField) {
+    super(logFile, inputStream, content, readBlockLazily, position, blockSize, 
blockEndPos, readerSchema, header,
+        footer, enableInlineReading, keyField);
+  }
+
+  public HoodieMetadataHFileDataBlock(@NotNull List<IndexedRecord> records,
+                                      @NotNull Map<HeaderMetadataType, String> 
header, String keyField) {
+    super(records, header, keyField);
+  }
+
+  /**
+   * Serialize the metadata table record to byte buffer after any field 
trimming if needed.
+   *
+   * @param record         - Record to serialize
+   * @param schemaKeyField - Key field in the schema
+   * @return Serialized byte array of the metadata trimmed record
+   */
+  @Override
+  protected ByteBuffer serializeRecord(final IndexedRecord record, final 
Option<Schema.Field> schemaKeyField) {
+    if (!schemaKeyField.isPresent()) {
+      return super.serializeRecord(record, schemaKeyField);
+    }
+
+    ValidationUtils.checkArgument(record.getSchema() != null, "Unknown schema 
for the record!");
+    record.put(schemaKeyField.get().pos(), null);
+    return super.serializeRecord(record, schemaKeyField);

Review comment:
       Wondering if there will be cost with this extra step during 
serialization and deserialization ? 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] nsivabalan commented on a change in pull request #4067: [HUDI-2763] Metadata table records key deduplication

Reply via email to