[ https://issues.apache.org/jira/browse/HUDI-314?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
ASF GitHub Bot updated HUDI-314: -------------------------------- Labels: pull-request-available (was: ) > Unable to query a multi-partitions MOR realtime table > ----------------------------------------------------- > > Key: HUDI-314 > URL: https://issues.apache.org/jira/browse/HUDI-314 > Project: Apache Hudi (incubating) > Issue Type: Bug > Reporter: Wenning Ding > Priority: Major > Labels: pull-request-available > > h3. Description > I created a Hudi MOR table with multiple partition keys. The partition keys > are "year", "month" and "day". > When I try to query its realtime table in Hive like this: > {code:java} > SELECT * FROM hudi_multi_partitions_test_rt; > {code} > It returns: > {code:java} > java.lang.Exception: java.io.IOException: > org.apache.avro.SchemaParseException: Illegal character in: year/month/day > at > org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489) > ~[hadoop-mapreduce-client-common-2.8.4.jar:?] > at > org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:549) > ~[hadoop-mapreduce-client-common-2.8.4.jar:?] > Caused by: java.io.IOException: org.apache.avro.SchemaParseException: Illegal > character in: year/month/day > at > org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97) > ~[hive-exec-2.3.3.jar:2.3.3] > at > org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57) > ~[hive-exec-2.3.3.jar:2.3.3] > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:379) > ~[hive-exec-2.3.3.jar:2.3.3] > at > org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:169) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] > at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:432) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] 
> at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] > at > org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270) > ~[hadoop-mapreduce-client-common-2.8.4.jar:?] > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > ~[?:1.8.0_212] > at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_212] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > [?:1.8.0_212] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > [?:1.8.0_212] > at java.lang.Thread.run(Thread.java:748) [?:1.8.0_212] > Caused by: org.apache.avro.SchemaParseException: Illegal character in: > year/month/day > at org.apache.avro.Schema.validateName(Schema.java:1083) > ~[avro-1.7.7.jar:1.7.7] > at org.apache.avro.Schema.access$200(Schema.java:79) > ~[avro-1.7.7.jar:1.7.7] > at org.apache.avro.Schema$Field.<init>(Schema.java:372) > ~[avro-1.7.7.jar:1.7.7] > at org.apache.avro.Schema$Field.<init>(Schema.java:367) > ~[avro-1.7.7.jar:1.7.7] > at > org.apache.hudi.common.util.HoodieAvroUtils.appendNullSchemaFields(HoodieAvroUtils.java:166) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.AbstractRealtimeRecordReader.addPartitionFields(AbstractRealtimeRecordReader.java:305) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.AbstractRealtimeRecordReader.init(AbstractRealtimeRecordReader.java:328) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.AbstractRealtimeRecordReader.<init>(AbstractRealtimeRecordReader.java:103) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.RealtimeCompactedRecordReader.<init>(RealtimeCompactedRecordReader.java:48) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > 
org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.constructRecordReader(HoodieRealtimeRecordReader.java:67) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.<init>(HoodieRealtimeRecordReader.java:45) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat.getRecordReader(HoodieParquetRealtimeInputFormat.java:233) > ~[hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar:0.5.1-SNAPSHOT] > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:376) > ~[hive-exec-2.3.3.jar:2.3.3] > at > org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:169) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] > at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:432) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] > at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343) > ~[hadoop-mapreduce-client-core-2.8.4.jar:?] > at > org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270) > ~[hadoop-mapreduce-client-common-2.8.4.jar:?] > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > ~[?:1.8.0_212] > at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_212] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > ~[?:1.8.0_212] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > ~[?:1.8.0_212] > at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_212] > {code} > h3. 
Reproduction > {code:java} > import org.apache.hudi.DataSourceWriteOptions > import org.apache.hudi.config.HoodieWriteConfig > import org.apache.spark.sql.SaveMode > var tableName = "hudi_multi_partitions_test" > var tablePath = "s3://emr-users/wenningd/hudi/tables/events/" + tableName > var tableType = "MERGE_ON_READ" > val inputDF2 = Seq( > ("100", "event_name_897", "2015-01-01T23:52:39.340396Z", "type1", "2015", > "01", "01"), > ("101", "event_name_236", "2015-01-01T22:14:58.597216Z", "type2", "2015", > "01", "01"), > ("104", "event_name_764", "2015-02-01T12:15:00.512679Z", "type1", "2015", > "01", "01"), > ("105", "event_name_675", "2015-02-01T13:51:42.248818Z", "type2", "2015", > "01", "01"), > ("106", "event_name_337", "2015-02-01T13:51:42.248818Z", "type2", "2015", > "03", "16"), > ("107", "event_name_452", "2015-02-01T13:51:42.248818Z", "type2", "2015", > "03", "16"), > ("108", "event_name_234", "2015-02-01T13:51:42.248818Z", "type2", "2015", > "03", "16"), > ("199", "event_name_011", "2015-02-01T13:51:42.248818Z", "type2", "2015", > "03", "16") > ).toDF("_row_key", "event_name", "timestamp", "event_type", "year", > "month", "day") > inputDF2.write.format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", > "2") > .option("hoodie.upsert.shuffle.parallelism", "2") > .option(HoodieWriteConfig.TABLE_NAME, tableName) > .option(DataSourceWriteOptions.OPERATION_OPT_KEY, > DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) > .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY, > DataSourceWriteOptions.MOR_STORAGE_TYPE_OPT_VAL) > .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key") > .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, > "year,month,day") > .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "timestamp") > .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true") > .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, > "org.apache.hudi.ComplexKeyGenerator") > 
.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, tableName) > .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, > "org.apache.hudi.hive.MultiPartKeysValueExtractor") > .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, > "year,month,day") > .mode(SaveMode.Append) > .save(tablePath) > {code} > h3. Investigations > For a realtime table, Hudi reads the partition keys from jobConf and splits > them on a comma, like this: > {code:java} > String partitionFields = > jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); > List<String> partitioningFields = partitionFields.length() > 0 ? > Arrays.stream(partitionFields.split(",")).collect(Collectors.toList()) : new > ArrayList<>(); > {code} > The problem is that Hudi splits partitionFields on a comma, but Hive uses a > slash to separate multiple partition keys, e.g. "/year/month/day". I also checked > previous versions of Hive; they all use a slash as the separator. > So my idea is to simply split on a slash instead of a comma. -- This message was sent by Atlassian Jira (v8.3.4#803005)