njalan opened a new issue #2791:
URL: https://github.com/apache/hudi/issues/2791


   I am facing a performance issue caused by slow S3 file listing, so I tried 
enabling the Hudi metadata table to improve performance.
   
   **Environment Description**
   
   Hudi version : 0.7
   
   Spark version : 3.0.1
   
   Hive version : 3.1.2
   
   Hadoop version : 3.2.1
   
   Storage (HDFS/S3/GCS..) : S3
   
   Running on Docker? (yes/no) : no
   
    
   
   **Additional context**
   
   Below is my Hudi configuration:
   
       df.write.format("org.apache.hudi")
           .options(getQuickstartWriteConfigs)
           .option(HIVE_URL_OPT_KEY, hive_jbdc_url)
           .option(HIVE_USER_OPT_KEY, hive_user)
           .option(HIVE_PASS_OPT_KEY, "")
           .option(HIVE_DATABASE_OPT_KEY, dataBase)
           .option(HIVE_TABLE_OPT_KEY, tableName)
           .option(HIVE_SYNC_ENABLED_OPT_KEY, true)
           .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "")
           .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "")
           .option(HoodieWriteConfig.TABLE_NAME, tableName)
           .option("hoodie.upsert.shuffle.parallelism", "8")
           .option("hoodie.insert.shuffle.parallelism", "8")
           .option("hoodie.cleaner.commits.retained",2)
           .option("hoodie.keep.min.commits",3)
           .option("hoodie.keep.max.commits",4)
           .option("hoodie.metadata.enable",true)
           .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, 
KEY_GENERATOR_Non_Partition)
           .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, 
classOf[DefaultHoodieRecordPayload].getName)
           .option(HoodiePayloadProps.DEFAULT_PAYLOAD_ORDERING_FIELD_VAL, 
combineKey)
           .option(PRECOMBINE_FIELD_OPT_KEY, combineKey)
           .option(RECORDKEY_FIELD_OPT_KEY, key)
           .option(HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, 
EXTRACTOR_CLASS_Non_Partition)
           .option("hoodie.datasource.hive_sync.support_timestamp", true)
           .mode(mode)
           .save(basePath)
   
    
   
   
   **Stacktrace**
   
   Caused by: org.apache.hudi.exception.HoodieUpsertException: Failed to merge 
old record into new file for key  from old file 
xx_2/.hoodie/metadata/files/6bd6d5c7-712f-4580-b895-79c471bf6dab-0_0-5-5_20210408215411.hfile
 to new file 
xx_2/.hoodie/metadata/files/6bd6d5c7-712f-4580-b895-79c471bf6dab-0_0-44-95_20210408221019001.hfile
 with writerSchema {
     "type" : "record",
     "name" : "HoodieMetadataRecord",
     "namespace" : "org.apache.hudi.avro.model",
     "doc" : "A record saved within the Metadata Table",
     "fields" : [ {
       "name" : "_hoodie_commit_time",
       "type" : [ "null", "string" ],
       "doc" : "",
       "default" : null
     }, {
       "name" : "_hoodie_commit_seqno",
       "type" : [ "null", "string" ],
       "doc" : "",
       "default" : null
     }, {
       "name" : "_hoodie_record_key",
       "type" : [ "null", "string" ],
       "doc" : "",
       "default" : null
     }, {
       "name" : "_hoodie_partition_path",
       "type" : [ "null", "string" ],
       "doc" : "",
       "default" : null
     }, {
       "name" : "_hoodie_file_name",
       "type" : [ "null", "string" ],
       "doc" : "",
       "default" : null
     }, {
       "name" : "key",
       "type" : {
         "type" : "string",
         "avro.java.string" : "String"
       }
     }, {
       "name" : "type",
       "type" : "int",
       "doc" : "Type of the metadata record"
     }, {
       "name" : "filesystemMetadata",
       "type" : [ "null", {
         "type" : "map",
         "values" : {
           "type" : "record",
           "name" : "HoodieMetadataFileInfo",
           "fields" : [ {
             "name" : "size",
             "type" : "long",
             "doc" : "Size of the file"
           }, {
             "name" : "isDeleted",
             "type" : "boolean",
             "doc" : "True if this file has been deleted"
           } ]
         },
         "avro.java.string" : "String"
       } ],
       "doc" : "Contains information about partitions and files within the 
dataset"
     } ]
   }
        at 
org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:256)
        at 
org.apache.hudi.io.HoodieSortedMergeHandle.write(HoodieSortedMergeHandle.java:101)
        at 
org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:122)
        at 
org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:112)
        at 
org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer.consume(BoundedInMemoryQueueConsumer.java:37)
        at 
org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.lambda$null$2(BoundedInMemoryExecutor.java:121)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        ... 3 more
   Caused by: java.lang.IllegalArgumentException: key length must be > 0
        at org.apache.hadoop.util.bloom.HashFunction.hash(HashFunction.java:114)
        at org.apache.hadoop.util.bloom.BloomFilter.add(BloomFilter.java:122)
        at 
org.apache.hudi.common.bloom.SimpleBloomFilter.add(SimpleBloomFilter.java:83)
        at 
org.apache.hudi.io.storage.HoodieHFileWriter.writeAvro(HoodieHFileWriter.java:123)
        at 
org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:251)
        ... 9 more
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to