[GitHub] [hudi] vinothchandar commented on pull request #2899: [HUDI-1865] Make embedded time line service singleton
vinothchandar commented on pull request #2899: URL: https://github.com/apache/hudi/pull/2899#issuecomment-837816065 > each task manager is, sending RPC calls to the timeline server running locally? each write task is sending a RPC call to itself? Could you confirm that. >we start a write client for each task and we use that client always. So its a singleton per JVM for a given table? >if i request the embedded timeline server with a new instant time, the server should sync itself first, The incremental timeline sync should be efficient. Is that what you are referring to? Thanks for answering patiently. This model is very different. :) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xiarixiaoyao commented on pull request #2720: [HUDI-1719]hive on spark/mr,Incremental query of the mor table, the partition field is incorrect
xiarixiaoyao commented on pull request #2720: URL: https://github.com/apache/hudi/pull/2720#issuecomment-837735008 @vinothchandar i will do it today, thanks -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xiarixiaoyao commented on pull request #2722: [HUDI-1722]hive beeline/spark-sql query specified field on mor table occur NPE
xiarixiaoyao commented on pull request #2722: URL: https://github.com/apache/hudi/pull/2722#issuecomment-837732845 @vinothchandar i will rebase this pr, thanks -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] garyli1019 merged pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
garyli1019 merged pull request #2930: URL: https://github.com/apache/hudi/pull/2930 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated: [HUDI-1818] Validate required fields for Flink HoodieTable (#2930)
This is an automated email from the ASF dual-hosted git repository. garyli pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 7a5af80 [HUDI-1818] Validate required fields for Flink HoodieTable (#2930) 7a5af80 is described below commit 7a5af806cfe54474014db5c70641c0c6269fff03 Author: hiscat <46845236+mylanpan...@users.noreply.github.com> AuthorDate: Tue May 11 11:11:19 2021 +0800 [HUDI-1818] Validate required fields for Flink HoodieTable (#2930) --- .../org/apache/hudi/table/HoodieTableFactory.java | 31 +++ .../apache/hudi/table/TestHoodieTableFactory.java | 46 ++ 2 files changed, 77 insertions(+) diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index 1566aa6..02ab280 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -40,9 +40,11 @@ import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; /** * Hoodie data source/sink factory. 
@@ -59,6 +61,7 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab Configuration conf = (Configuration) helper.getOptions(); TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); +validateRequiredFields(conf, schema); setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema); Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> @@ -75,6 +78,7 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab public DynamicTableSink createDynamicTableSink(Context context) { Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); +validateRequiredFields(conf, schema); setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema); return new HoodieTableSink(conf, schema); } @@ -98,6 +102,33 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab // Utilities // - + /** Validate required options. e.g record key and pre combine key. + * + * @param conf The table options + * @param schema The table schema + */ + private void validateRequiredFields(Configuration conf, TableSchema schema) { +List fields = Arrays.stream(schema.getFieldNames()).collect(Collectors.toList()); + +// validate record key in pk absence. +if (!schema.getPrimaryKey().isPresent()) { + Arrays.stream(conf.get(FlinkOptions.RECORD_KEY_FIELD).split(",")) + .filter(field -> !fields.contains(field)) + .findAny() + .ifPresent(e -> { +throw new ValidationException("The " + e + " field not exists in table schema." 
++ "Please define primary key or modify hoodie.datasource.write.recordkey.field option."); + }); +} + +// validate pre combine key +String preCombineField = conf.get(FlinkOptions.PRECOMBINE_FIELD); +if (!fields.contains(preCombineField)) { + throw new ValidationException("The " + preCombineField + " field not exists in table schema." + + "Please check write.precombine.field option."); +} + } + /** * Setup the config options based on the table definition, for e.g the table name, primary key. * diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java index b7f4429..d7ec693 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -28,6 +28,7 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.ValidationException; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.catalog.CatalogTableImpl; import org.apache.flink.table.catalog.ObjectIdentifier; @@ -44,7 +45,9 @@ import java.util.Objects; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static
[GitHub] [hudi] pengzhiwei2018 edited a comment on issue #2935: [Support] HoodieFileIndex get a error when there is no partition path in table storage
pengzhiwei2018 edited a comment on issue #2935: URL: https://github.com/apache/hudi/issues/2935#issuecomment-837684290 Hi @shenbinglife , Thanks for your feedback on this question. We have noticed this issue and after https://github.com/apache/hudi/pull/2893 merged, this will be solved. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] pengzhiwei2018 commented on issue #2935: [Support] HoodieFileIndex get a error when there is no partition path in table storage
pengzhiwei2018 commented on issue #2935: URL: https://github.com/apache/hudi/issues/2935#issuecomment-837684290 Hi @shenbinglife , Thanks for your feedback on this question. We have noticed this issue and after https://github.com/apache/hudi/pull/2893 is merged, this will be solved. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] shenbinglife commented on issue #2935: HoodieFileIndex get a error when there is no partition path in table storage
shenbinglife commented on issue #2935: URL: https://github.com/apache/hudi/issues/2935#issuecomment-837680457 I created an issue at https://issues.apache.org/jira/browse/HUDI-1893 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-1893) HoodieFileIndex get a error when there is no partition path in table storage
shenbing created HUDI-1893: -- Summary: HoodieFileIndex get a error when there is no partition path in table storage Key: HUDI-1893 URL: https://issues.apache.org/jira/browse/HUDI-1893 Project: Apache Hudi Issue Type: Bug Reporter: shenbing https://github.com/apache/hudi/blob/42ec7e30d74104a15ff76cb004fe7a0c2649abe6/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala#L319 Version: hudi-0.9.0-snapshot May it should be : spark.sparkContext.parallelize(pathToFetch, Math.max(1, Math.min(pathToFetch.size, maxListParallelism))) -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] shenbinglife opened a new issue #2935: HoodieFileIndex get a error when there is no partition path in table storage
shenbinglife opened a new issue #2935: URL: https://github.com/apache/hudi/issues/2935 https://github.com/apache/hudi/blob/42ec7e30d74104a15ff76cb004fe7a0c2649abe6/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala#L319 Maybe it should be: spark.sparkContext.parallelize(pathToFetch, Math.max(1, Math.min(pathToFetch.size, maxListParallelism))) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-1892) NullPointerException when using OverwriteNonDefaultsWithLatestAvroPayload at hudi 0.9.0
shenbing created HUDI-1892: -- Summary: NullPointerException when using OverwriteNonDefaultsWithLatestAvroPayload at hudi 0.9.0 Key: HUDI-1892 URL: https://issues.apache.org/jira/browse/HUDI-1892 Project: Apache Hudi Issue Type: Bug Reporter: shenbing using compiled hudi 0.9.0 with hadoop3.0.0 and hive3.1.1 after resolving dependency conflicts, I import hudi-spark-bundle_2.11-0.9.0-SNAPSHOT.jar into my project. When I using OverwriteNonDefaultsWithLatestAvroPayload to update field with new value, I got the error. {code:java} Caused by: java.util.concurrent.ExecutionException: org.apache.hudi.exception.HoodieUpsertException: Failed to combine/merge new record with old value in storage, for new record {HoodieRecord{key=HoodieKey { recordKey=1 partitionPath=date=1}, currentLocation='HoodieRecordLocation {instantTime=20210510160355, fileId=9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0}', newLocation='HoodieRecordLocation {instantTime=20210510160400, fileId=9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0}'}}, old value {{"_hoodie_commit_time": "20210510160355", "_hoodie_commit_seqno": "20210510160355_0_50", "_hoodie_record_key": "1", "_hoodie_partition_path": "date=1", "_hoodie_file_name": "9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0_0-1502-1519_20210510160355.parquet", "uuid": "1", "name": "jerry", "age": 10, "date": "1", "update_time": "1"}} at java.util.concurrent.FutureTask.report(FutureTask.java:122) at java.util.concurrent.FutureTask.get(FutureTask.java:192) at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.execute(BoundedInMemoryExecutor.java:141) ... 
34 more Caused by: org.apache.hudi.exception.HoodieUpsertException: Failed to combine/merge new record with old value in storage, for new record {HoodieRecord{key=HoodieKey { recordKey=1 partitionPath=date=1}, currentLocation='HoodieRecordLocation {instantTime=20210510160355, fileId=9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0}', newLocation='HoodieRecordLocation {instantTime=20210510160400, fileId=9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0}'}}, old value {{"_hoodie_commit_time": "20210510160355", "_hoodie_commit_seqno": "20210510160355_0_50", "_hoodie_record_key": "1", "_hoodie_partition_path": "date=1", "_hoodie_file_name": "9a0fcb8e-8cd9-4c9c-bea8-46bbf509035e-0_0-1502-1519_20210510160355.parquet", "uuid": "1", "name": "jerry", "age": 10, "date": "1", "update_time": "1"}} at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:290) at org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:122) at org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:112) at org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer.consume(BoundedInMemoryQueueConsumer.java:37) at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.lambda$null$2(BoundedInMemoryExecutor.java:121) at java.util.concurrent.FutureTask.run(FutureTask.java:266) ... 
3 more Caused by: java.lang.NullPointerException at org.apache.hudi.common.model.OverwriteWithLatestAvroPayload.overwriteField(OverwriteWithLatestAvroPayload.java:97) at org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload.lambda$combineAndGetUpdateValue$0(OverwriteNonDefaultsWithLatestAvroPayload.java:67) at java.util.ArrayList.forEach(ArrayList.java:1259) at org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload.combineAndGetUpdateValue(OverwriteNonDefaultsWithLatestAvroPayload.java:64) at org.apache.hudi.common.model.HoodieRecordPayload.combineAndGetUpdateValue(HoodieRecordPayload.java:81) at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:276) ... 8 more {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] danny0405 edited a comment on pull request #2899: [HUDI-1865] Make embedded time line service singleton
danny0405 edited a comment on pull request #2899: URL: https://github.com/apache/hudi/pull/2899#issuecomment-837659380 > > each write task write out these records buffer (grouping by file group id) using a write client there. > > Understood. So each task manager is, sending RPC calls to the timeline server running locally? This is very different from Spark. What's the life cycle of the `write client` in each JVM process? i.e how often do we re-initialize it? (it can't be per record, so wondering) > > I get why you want a singleton now. Let's work together to figure something out. The life cycle of the flink `write client` is: it is living until the job ends, we start a write client for each task and we use that client always. In my thoughts, if i request the embedded timeline server with a new instant time, the server should sync itself first, it is more easy for server to keep the freshness than client(i.e each client should figure out how and when to re-initialize the fs view server). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on pull request #2899: [HUDI-1865] Make embedded time line service singleton
danny0405 commented on pull request #2899: URL: https://github.com/apache/hudi/pull/2899#issuecomment-837659380 > > each write task write out these records buffer (grouping by file group id) using a write client there. > > Understood. So each task manager is, sending RPC calls to the timeline server running locally? This is very different from Spark. What's the life cycle of the `write client` in each JVM process? i.e how often do we re-initialize it? (it can't be per record, so wondering) > > I get why you want a singleton now. Let's work together to figure something out. The life cycle of the `write client` is: it is living until the job ends, we start a write client for each task and we use that client always. In my thoughts, if i request the embedded timeline server with a new instant time, the server should sync itself first, it is more easy for server to keep the freshness than client(i.e each client should figure out how and when to re-initialize the fs view server). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] pengzhiwei2018 edited a comment on pull request #2645: [HUDI-1659] Basic Implementation Of Spark Sql Support
pengzhiwei2018 edited a comment on pull request #2645: URL: https://github.com/apache/hudi/pull/2645#issuecomment-835134388 Hi @vinothchandar @umehrot2 , The PR has updated, mainly with the follow changes: - Support atomic for CTAS - Support use timestamp type as partition field - Fix exception when partition column is not in the rightest of select list for CTAS. - Add `TestSqlStatement` which does a sequence of statements. see [sql-statements.sql](https://github.com/apache/hudi/blob/171d607b1adc3972aa2c9e3efce5362368599d00/hudi-spark-datasource/hudi-spark/src/test/resources/sql-statements.sql) - Add more test case for CTAS & partitioned table. - Change the `SparkSqlAdpater` to `SparkAdapter` For other issues your have mentioned above, I have filed a JIRA for each. - Support Truncate Command For Hoodie [1883](https://issues.apache.org/jira/browse/HUDI-1883) - Support Partial Update For MergeInto [1884](https://issues.apache.org/jira/browse/HUDI-1884) - Support Delete/Update Non-pk table [1885](https://issues.apache.org/jira/browse/HUDI-1885) After this first pr has merged, we can continue to solve these JIRAs. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] pengzhiwei2018 edited a comment on pull request #2645: [HUDI-1659] Basic Implementation Of Spark Sql Support
pengzhiwei2018 edited a comment on pull request #2645: URL: https://github.com/apache/hudi/pull/2645#issuecomment-835134388 Hi @vinothchandar @umehrot2 , The PR has updated, mainly with the follow changes: - Support atomic for CTAS - Support use timestamp type as partition field - Fix exception when partition column is not in the rightest of select list for CTAS. - Add `TestSqlStatement` which does a sequence of statements. see [sql-statements.sql](https://github.com/apache/hudi/pull/2645/files#diff-71c005a921dcea9f712db30bd3376fbc5707d09e9777c583b933072cc64276fd) - Add more test case for CTAS & partitioned table. - Change the `SparkSqlAdpater` to `SparkAdapter` For other issues your have mentioned above, I have filed a JIRA for each. - Support Truncate Command For Hoodie [1883](https://issues.apache.org/jira/browse/HUDI-1883) - Support Partial Update For MergeInto [1884](https://issues.apache.org/jira/browse/HUDI-1884) - Support Delete/Update Non-pk table [1885](https://issues.apache.org/jira/browse/HUDI-1885) After this first pr has merged, we can continue to solve these JIRAs. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-1891) Jetty Dependency conflict when upgrade to hive3.1.1 and hadoop3.0.0
shenbing created HUDI-1891: -- Summary: Jetty Dependency conflict when upgrade to hive3.1.1 and hadoop3.0.0 Key: HUDI-1891 URL: https://issues.apache.org/jira/browse/HUDI-1891 Project: Apache Hudi Issue Type: Bug Reporter: shenbing when package hudi 0.7.0 or 0.9.0-SNAPSHOT using {code:java} mvn clean install -DskipTests -DskipITs -Dcheckstyle.skip=true -Drat.skip=true -Dhadoop.version=3.0.0 -Dhive.version=3.1.1{code} and then import hudi-spark-bundle_2.11-0.9.0-SNAPSHOT.jar into my project. I got a error : {code:java} org.apache.hudi.org.apache.jetty.server.session.SessionHandler.setHttpOnly(Z)Vjava.lang.NoSuchMethodError: org.apache.hudi.org.apache.jetty.server.session.SessionHandler.setHttpOnly(Z)V at io.javalin.core.util.JettyServerUtil.defaultSessionHandler(JettyServerUtil.kt:50) at io.javalin.Javalin.(Javalin.java:94) at io.javalin.Javalin.create(Javalin.java:107) at org.apache.hudi.timeline.service.TimelineService.startService(TimelineService.java:156) at org.apache.hudi.client.embedded.EmbeddedTimelineService.startServer(EmbeddedTimelineService.java:88) at org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper.createEmbeddedTimelineService(EmbeddedTimelineServerHelper.java:56) at org.apache.hudi.client.AbstractHoodieClient.startEmbeddedServerView(AbstractHoodieClient.java:109) at org.apache.hudi.client.AbstractHoodieClient.(AbstractHoodieClient.java:77) at org.apache.hudi.client.AbstractHoodieWriteClient.(AbstractHoodieWriteClient.java:132) at org.apache.hudi.client.AbstractHoodieWriteClient.(AbstractHoodieWriteClient.java:120) at org.apache.hudi.client.SparkRDDWriteClient.(SparkRDDWriteClient.java:84) {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] vinothchandar commented on a change in pull request #2819: [HUDI-1794] Moved static COMMIT_FORMATTER to thread local variable as SimpleDateFormat is not thread safe.
vinothchandar commented on a change in pull request #2819: URL: https://github.com/apache/hudi/pull/2819#discussion_r629798021 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java ## @@ -73,6 +71,16 @@ private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class); protected HoodieTableMetaClient metaClient; private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); + private static ThreadLocal commitFormatHolder = new ThreadLocal() { Review comment: yes. we can make a `LocalDate` from `Date` easily. (https://www.baeldung.com/java-date-to-localdate-and-localdatetime) +1 on the new static method. In fact, we can even have a separate class `HoodieInstantTimeGenerator` and keep an singleton instance in HoodieActiveTimeline. We can then add methods to `HoodieInstantTimeGenerator` as needed, to support date, ms etc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on a change in pull request #2819: [HUDI-1794] Moved static COMMIT_FORMATTER to thread local variable as SimpleDateFormat is not thread safe.
vinothchandar commented on a change in pull request #2819: URL: https://github.com/apache/hudi/pull/2819#discussion_r629798021 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java ## @@ -73,6 +71,16 @@ private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class); protected HoodieTableMetaClient metaClient; private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); + private static ThreadLocal commitFormatHolder = new ThreadLocal() { Review comment: yes. we can make a `LocalDate` from `Date` easily. (https://www.baeldung.com/java-date-to-localdate-and-localdatetime) +1 on the new static method. In fact, we can even have a separate class `HoodieInstantTimeGenerator` and keep an singleton instance in HoodieActiveTimeline. We can then add methods to `HoodieInstantTimeGenerator` as needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on pull request #2720: [HUDI-1719]hive on spark/mr,Incremental query of the mor table, the partition field is incorrect
vinothchandar commented on pull request #2720: URL: https://github.com/apache/hudi/pull/2720#issuecomment-837638216 @xiarixiaoyao do you intend to add a UT for this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on pull request #2879: [HUDI-1848] Adding support for HMS for running DDL queries in hive-sy…
vinothchandar commented on pull request #2879: URL: https://github.com/apache/hudi/pull/2879#issuecomment-837634888 @jsbali @satishkotha do we need #2532 to land for this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-1887) Make schema post processor's default as disabled
[ https://issues.apache.org/jira/browse/HUDI-1887?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-1887: - Labels: pull-request-available (was: ) > Make schema post processor's default as disabled > > > Key: HUDI-1887 > URL: https://issues.apache.org/jira/browse/HUDI-1887 > Project: Apache Hudi > Issue Type: Task >Reporter: sivabalan narayanan >Assignee: sivabalan narayanan >Priority: Major > Labels: pull-request-available > > With default value [fix|https://github.com/apache/hudi/pull/2765], schema > post processor is not required as mandatory. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] vinothchandar commented on pull request #2911: [HUDI-1887] Setting default value to false for enabling schema post processor
vinothchandar commented on pull request #2911: URL: https://github.com/apache/hudi/pull/2911#issuecomment-837632064 @nsivabalan aren't there any tests affected by this change? Also do we even need this post processor feature anymore? Should we can the entire feature instead of just making it not enabled by default? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Closed] (HUDI-1890) FlinkCreateHandle and FlinkAppendHandle canWrite should always return true
[ https://issues.apache.org/jira/browse/HUDI-1890?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] vinoyang closed HUDI-1890. -- Resolution: Fixed 42ec7e30d74104a15ff76cb004fe7a0c2649abe6 > FlinkCreateHandle and FlinkAppendHandle canWrite should always return true > -- > > Key: HUDI-1890 > URL: https://issues.apache.org/jira/browse/HUDI-1890 > Project: Apache Hudi > Issue Type: Bug > Components: Flink Integration >Reporter: Danny Chen >Assignee: Danny Chen >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > The method {{canWrite}} should always return true because they can already > write based on file size, e.g. the {{BucketAssigner}}. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[hudi] branch master updated (aa398f7 -> 42ec7e3)
This is an automated email from the ASF dual-hosted git repository. vinoyang pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from aa398f7 [HUDI-1789] Support reading older snapshots (#2809) add 42ec7e3 [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should always return true (#2933) No new revisions were added by this update. Summary of changes: .../src/main/java/org/apache/hudi/io/FlinkAppendHandle.java | 5 + .../src/main/java/org/apache/hudi/io/FlinkCreateHandle.java | 6 ++ 2 files changed, 11 insertions(+)
[GitHub] [hudi] yanghua merged pull request #2933: [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should a…
yanghua merged pull request #2933: URL: https://github.com/apache/hudi/pull/2933 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
nsivabalan commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629763845 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -353,6 +360,89 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema)); } + /** + * Get latest schema either from incoming schema or table schema. + * @param incomingSchema incoming batch's schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted to add namespace. false otherwise. + * @param converterFn converter function to be called over table schema. + * @return the latest schema. + */ + public Schema getLatestSchema(Schema incomingSchema, boolean convertTableSchemaToAddNamespace, + Function1 converterFn) { +Schema latestSchema = incomingSchema; +try { + if (isTimelineNonEmpty()) { +Schema tableSchema = getTableAvroSchemaWithoutMetadataFields(); +if (convertTableSchemaToAddNamespace) { + tableSchema = converterFn.apply(tableSchema); +} +if (incomingSchema.getFields().size() < tableSchema.getFields().size() && isSchemaSubset(tableSchema, incomingSchema)) { Review comment: I guess one could evolve int -> long in a compatible way. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on pull request #2722: [HUDI-1722]hive beeline/spark-sql query specified field on mor table occur NPE
vinothchandar commented on pull request #2722: URL: https://github.com/apache/hudi/pull/2722#issuecomment-837528970 @xiarixiaoyao Could you please rebase this PR. I tried doing this myself; it seems tricky -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on a change in pull request #2722: [HUDI-1722]hive beeline/spark-sql query specified field on mor table occur NPE
vinothchandar commented on a change in pull request #2722: URL: https://github.com/apache/hudi/pull/2722#discussion_r629744627 ## File path: hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java ## @@ -85,12 +85,14 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job // risk of experiencing race conditions. Hence, we synchronize on the JobConf object here. There is negligible // latency incurred here due to the synchronization since get record reader is called once per spilt before the // actual heavy lifting of reading the parquet files happen. -if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) { +if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null +|| (!realtimeSplit.getDeltaLogPaths().isEmpty() && !HoodieRealtimeInputFormatUtils.requiredProjectionFieldsExistInConf(jobConf))) { synchronized (jobConf) { LOG.info( "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); -if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) { +if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null +|| (!realtimeSplit.getDeltaLogPaths().isEmpty() && !HoodieRealtimeInputFormatUtils.requiredProjectionFieldsExistInConf(jobConf))) { Review comment: can we pull this check into a small util method, that we can call in both places? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on pull request #2899: [HUDI-1865] Make embedded time line service singleton
vinothchandar commented on pull request #2899: URL: https://github.com/apache/hudi/pull/2899#issuecomment-837465319 >each write task write out these records buffer (grouping by file group id) using a write client there. Understood. So each task manager is, sending RPC calls to the timeline server running locally? This is very different from Spark. What's the life cycle of the `write client` in each JVM process? i.e how often do we re-initialize it? (it can't be per record, so wondering) I get why you want a singleton now. Let's work together to figure something out. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-1723) DFSPathSelector skips files with the same modify date when read up to source limit
[ https://issues.apache.org/jira/browse/HUDI-1723?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17342178#comment-17342178 ] Vinoth Chandar commented on HUDI-1723: -- yes. [~xushiyan] can we file an umbrella issue and file one for s3 and one for gcs [https://cloud.google.com/storage/docs/object-change-notification] > DFSPathSelector skips files with the same modify date when read up to source > limit > -- > > Key: HUDI-1723 > URL: https://issues.apache.org/jira/browse/HUDI-1723 > Project: Apache Hudi > Issue Type: Bug > Components: DeltaStreamer >Reporter: Raymond Xu >Assignee: Raymond Xu >Priority: Blocker > Labels: pull-request-available, sev:critical, user-support-issues > Fix For: 0.9.0 > > Attachments: Screen Shot 2021-03-26 at 1.42.42 AM.png > > > org.apache.hudi.utilities.sources.helpers.DFSPathSelector#listEligibleFiles > filters the input files based on last saved checkpoint, which was the > modification date from last read file. However, the last read file's > modification date could be duplicated for multiple files and resulted in > skipping a few of them when reading up to source limit. An illustration is > shown in the attached picture. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] xushiyan commented on pull request #2845: [HUDI-1723] Fix path selector listing files with the same mod date
xushiyan commented on pull request #2845: URL: https://github.com/apache/hudi/pull/2845#issuecomment-837449159 @vinothchandar sounds good. i'll add some tests in these 2 days -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on pull request #2845: [HUDI-1723] Fix path selector listing files with the same mod date
vinothchandar commented on pull request #2845: URL: https://github.com/apache/hudi/pull/2845#issuecomment-837447578 @xushiyan @nsivabalan This looks like a reasonable fix, if we can add couple tests and land this. That would be great. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] vinothchandar commented on a change in pull request #2903: [HUDI-1850] Fixing read of a empty table but with failed write
vinothchandar commented on a change in pull request #2903: URL: https://github.com/apache/hudi/pull/2903#discussion_r629724089 ## File path: hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala ## @@ -105,7 +105,9 @@ class DefaultSource extends RelationProvider val tableType = metaClient.getTableType val queryType = parameters(QUERY_TYPE_OPT_KEY) log.info(s"Is bootstrapped table => $isBootstrappedTable, tableType is: $tableType") - +val schemaUtil = new TableSchemaResolver(metaClient) Review comment: why though? we can create the using the test table API and read from data source right? ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -130,7 +131,11 @@ private MessageType getTableParquetSchemaFromDataFile() throws Exception { + " for file " + filePathWithFormat.getLeft()); } } else { -return readSchemaFromLastCompaction(lastCompactionCommit); +if (lastCompactionCommit.isPresent()) { Review comment: how about using `lastCompactionCommit.map().orElseThrow()`. instead of the if-else -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated (8a48d16 -> aa398f7)
This is an automated email from the ASF dual-hosted git repository. vinoth pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 8a48d16 [HUDI-1707] Reduces log level for too verbose messages from info to debug level. (#2714) add aa398f7 [HUDI-1789] Support reading older snapshots (#2809) No new revisions were added by this update. Summary of changes: .../apache/hudi/hadoop/utils/HoodieHiveUtils.java | 56 ++ .../hudi/hadoop/TestHoodieParquetInputFormat.java | 56 +- .../hudi/hadoop/testutils/InputFormatTestUtil.java | 17 +-- 3 files changed, 115 insertions(+), 14 deletions(-)
[GitHub] [hudi] vinothchandar merged pull request #2809: [HUDI-1789] Support reading older snapshots
vinothchandar merged pull request #2809: URL: https://github.com/apache/hudi/pull/2809 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] 01/01: Trying to reprod the schema evolution test break
This is an automated email from the ASF dual-hosted git repository. vinoth pushed a commit to branch flaky-ci in repository https://gitbox.apache.org/repos/asf/hudi.git commit 12cf04d53eb7f10be51c11888c946f5e88d11486 Author: Vinoth Chandar AuthorDate: Mon May 10 14:18:42 2021 -0700 Trying to reprod the schema evolution test break --- .../apache/hudi/functional/HoodieSparkSqlWriterSuite.scala | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala index 606435a..a4fb4ca 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala @@ -17,10 +17,6 @@ package org.apache.hudi.functional -import java.time.Instant -import java.util -import java.util.{Collections, Date, UUID} - import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.Path import org.apache.hudi.DataSourceWriteOptions._ @@ -29,8 +25,8 @@ import org.apache.hudi.common.model.{HoodieRecord, HoodieRecordPayload} import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig} import org.apache.hudi.exception.HoodieException -import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.hive.HiveSyncConfig +import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.testutils.DataSourceTestUtils import org.apache.hudi.{AvroConversionUtils, DataSourceUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils} import org.apache.spark.SparkContext @@ -38,12 +34,15 @@ import org.apache.spark.api.java.JavaSparkContext import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext, SaveMode, SparkSession} +import org.junit.jupiter.api.Assertions.assertEquals import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.{spy, times, verify} import org.scalatest.{FunSuite, Matchers} +import java.time.Instant +import java.util +import java.util.{Collections, Date, UUID} import scala.collection.JavaConversions._ -import org.junit.jupiter.api.Assertions.assertEquals class HoodieSparkSqlWriterSuite extends FunSuite with Matchers { @@ -459,6 +458,9 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers { .drop(HoodieRecord.HOODIE_META_COLUMNS.get(2)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(3)) .drop(HoodieRecord.HOODIE_META_COLUMNS.get(4)) + updatesDf.printSchema() + trimmedDf2.printSchema() + // ensure 2nd batch of updates matches. assert(updatesDf.intersect(trimmedDf2).except(updatesDf).count() == 0)
[hudi] branch flaky-ci created (now 12cf04d)
This is an automated email from the ASF dual-hosted git repository. vinoth pushed a change to branch flaky-ci in repository https://gitbox.apache.org/repos/asf/hudi.git. at 12cf04d Trying to reprod the schema evolution test break This branch includes the following new commits: new 12cf04d Trying to reprod the schema evolution test break The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629687494 ## File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java ## @@ -429,8 +432,55 @@ public static SchemaProvider createRowBasedSchemaProvider(StructType structType, return wrapSchemaProviderWithPostProcessor(rowSchemaProvider, cfg, jssc, null); } + /** + * Create latest schema provider for Target schema. + * @param structType spark data type of incoming batch. + * @param jssc instance of {@link JavaSparkContext}. + * @param fs instance of {@link FileSystem}. + * @param basePath base path of the table. + * @return the schema provider where target schema refers to latest schema(either incoming schema or table schema). + */ + public static SchemaProvider createLatestSchemaProvider(StructType structType, + JavaSparkContext jssc, FileSystem fs, String basePath) { +SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType); +Schema incomingSchema = rowSchemaProvider.getTargetSchema(); +Schema latestSchema = incomingSchema; + +try { + if (fs.exists(new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME))) { +HoodieTableMetaClient tableMetaClient = HoodieTableMetaClient.builder().setConf(jssc.sc().hadoopConfiguration()).setBasePath(basePath).build(); +TableSchemaResolver +tableSchemaResolver = new TableSchemaResolver(tableMetaClient); +latestSchema = tableSchemaResolver.getLatestSchema(incomingSchema, true, new Function1() { + @Override + public Schema apply(Schema v1) throws IOException { +return AvroConversionUtils.convertStructTypeToAvroSchema( +AvroConversionUtils.convertAvroSchemaToStructType(v1), RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, +RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE); + } +}); + } +} catch (IOException e) { + LOG.debug("Swallowing exception while trying to fetch table's latest schema. 
Falling back to incoming schema"); +} + +final Schema finalLatestSchema = latestSchema; +return new SchemaProvider(null) { + @Override + public Schema getSourceSchema() { +return rowSchemaProvider.getSourceSchema(); + } + + @Override + public Schema getTargetSchema() { +return finalLatestSchema; + } +}; + } + @FunctionalInterface public interface CheckedSupplier { + Review comment: remove this -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629687388 ## File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java ## @@ -24,6 +24,9 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.DFSPropertiesConfiguration; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.TableSchemaResolver.Function1; Review comment: Okay, I see this Function1 is used here. There is already a function1 -> https://github.com/apache/hudi/blob/03668dbaf1a60428d7e0d68c6622605e0809150a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java#L37. Can we re-use this ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629686478 ## File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java ## @@ -429,8 +432,55 @@ public static SchemaProvider createRowBasedSchemaProvider(StructType structType, return wrapSchemaProviderWithPostProcessor(rowSchemaProvider, cfg, jssc, null); } + /** + * Create latest schema provider for Target schema. + * @param structType spark data type of incoming batch. + * @param jssc instance of {@link JavaSparkContext}. + * @param fs instance of {@link FileSystem}. + * @param basePath base path of the table. + * @return the schema provider where target schema refers to latest schema(either incoming schema or table schema). + */ + public static SchemaProvider createLatestSchemaProvider(StructType structType, + JavaSparkContext jssc, FileSystem fs, String basePath) { +SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType); +Schema incomingSchema = rowSchemaProvider.getTargetSchema(); +Schema latestSchema = incomingSchema; + +try { + if (fs.exists(new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME))) { Review comment: Is there a better way to check if the table is present on the basePath? Maybe add a static method to HoodieTableMetaClient? Would like to contain explicit use of `fs.` as much as possible in specific classes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629685145 ## File path: hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala ## @@ -483,6 +483,17 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers { // ensure 2nd batch of updates matches. assert(df3.intersect(trimmedDf3).except(df3).count() == 0) + // ingest new batch with old schema. + records = DataSourceTestUtils.generateRandomRows(10) + recordsSeq = convertRowListToSeq(records) + val df4 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df4) + + val snapshotDF4 = spark.read.format("org.apache.hudi") +.load(path.toAbsolutePath.toString + "/*/*/*/*") + assertEquals(25, snapshotDF4.count()) Review comment: Can we also validate the schema of the newly written files is the same as the latest schema and not the older schema from the records ? That will also help test the avro -> df -> avro conversion flow or may be add a specific test for that too in HoodieSparkUtils ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629685145 ## File path: hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/HoodieSparkSqlWriterSuite.scala ## @@ -483,6 +483,17 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers { // ensure 2nd batch of updates matches. assert(df3.intersect(trimmedDf3).except(df3).count() == 0) + // ingest new batch with old schema. + records = DataSourceTestUtils.generateRandomRows(10) + recordsSeq = convertRowListToSeq(records) + val df4 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df4) + + val snapshotDF4 = spark.read.format("org.apache.hudi") +.load(path.toAbsolutePath.toString + "/*/*/*/*") + assertEquals(25, snapshotDF4.count()) Review comment: Can we also validate the schema of the newly written files is the same as the latest schema and not the older schema from the records ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629684633 ## File path: hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala ## @@ -103,4 +106,34 @@ class TestHoodieSparkUtils { assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString)) spark.stop() } + + @Test + def testCreateRdd(@TempDir tempDir: File): Unit = { Review comment: Can you rename this method to something that you're trying to test about createRDD (here schema evolution) ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629684346 ## File path: hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala ## @@ -103,4 +106,34 @@ class TestHoodieSparkUtils { assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString)) spark.stop() } + + @Test + def testCreateRdd(@TempDir tempDir: File): Unit = { +val spark = SparkSession.builder + .appName("Hoodie Datasource test") + .master("local[2]") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .getOrCreate + +val schema = DataSourceTestUtils.getStructTypeExampleSchema +val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) +var records = DataSourceTestUtils.generateRandomRows(5) +var recordsSeq = convertRowListToSeq(records) +val df1 = spark.createDataFrame(spark.sparkContext.parallelize(recordsSeq), structType) + +var genRecRDD = HoodieSparkUtils.createRdd(df1, schema,"test_struct_name", "test_namespace") +genRecRDD.collect() + +val evolSchema = DataSourceTestUtils.getStructTypeExampleEvolvedSchema +records = DataSourceTestUtils.generateRandomRowsEvolvedSchema(5) +recordsSeq = convertRowListToSeq(records) + +genRecRDD = HoodieSparkUtils.createRddWithLatestSchema(df1, schema, evolSchema, "test_struct_name", "test_namespace") +val genRecs = genRecRDD.collect() +// if this succeeds w/o throwing any exception, test succeeded. Review comment: Can you add some more details for future folks as to what you are trying to test here and why w/o exception is expected ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629683323 ## File path: hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala ## @@ -111,6 +112,34 @@ object HoodieSparkUtils { } } + def createRddWithLatestSchema(df: DataFrame, latestSchema: Schema, structName: String, recordNamespace: String): RDD[GenericRecord] = { +val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace) +// if schema generated from df.schema is same as latest schema, no special handling is required. +if(TableSchemaResolver.isSchemaEquals(avroSchema, latestSchema)) { + createRdd(df, avroSchema, structName, recordNamespace) +} else { // if not, it means that table schema got evolved, but this batch of records were generated with an older + // schema. + createRddWithLatestSchema(df, avroSchema, latestSchema, structName, recordNamespace) +} + } + + def createRddWithLatestSchema(df: DataFrame, avroSchema: Schema, latestSchema: Schema, structName: String, recordNamespace: String) + : RDD[GenericRecord] = { +// Use the Avro schema to derive the StructType which has the correct nullability information +val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] +val encoder = RowEncoder.apply(dataType).resolveAndBind() +val deserializer = HoodieSparkUtils.createRowSerDe(encoder) +val latestDataType = SchemaConverters.toSqlType(latestSchema).dataType.asInstanceOf[StructType] Review comment: Let's fold this into a single method to avoid more confusions down the road. We can sync f2f today. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629682943 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -432,4 +522,16 @@ public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws } return null; } + + /** + * Converter function to convert table schema to add namespace. + * @param return the table schema with namespace added. + * @param original table schema. + */ + @FunctionalInterface Review comment: Where do we use this ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629682415 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -353,6 +360,89 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema)); } + /** + * Get latest schema either from incoming schema or table schema. + * @param incomingSchema incoming batch's schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted to add namespace. false otherwise. + * @param converterFn converter function to be called over table schema. + * @return the latest schema. + */ + public Schema getLatestSchema(Schema incomingSchema, boolean convertTableSchemaToAddNamespace, + Function1 converterFn) { +Schema latestSchema = incomingSchema; +try { + if (isTimelineNonEmpty()) { +Schema tableSchema = getTableAvroSchemaWithoutMetadataFields(); +if (convertTableSchemaToAddNamespace) { + tableSchema = converterFn.apply(tableSchema); +} +if (incomingSchema.getFields().size() < tableSchema.getFields().size() && isSchemaSubset(tableSchema, incomingSchema)) { Review comment: We don't allow changing the data types of columns. We just add new columns when using AVRO schema rules for Hudi so implementing comparator should be possible. I'm OK to skip implementing the comparator for this PR in the interest to land it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629680557 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -353,6 +360,89 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema)); } + /** + * Get latest schema either from incoming schema or table schema. + * @param incomingSchema incoming batch's schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted to add namespace. false otherwise. + * @param converterFn converter function to be called over table schema. Review comment: Can you enhance this -> `converter function to be called over table schema ` to explain what is the intention of the converter function -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629679886 ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -353,6 +360,89 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema)); } + /** + * Get latest schema either from incoming schema or table schema. + * @param incomingSchema incoming batch's schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted to add namespace. false otherwise. Review comment: nit: {@code true} if table schema needs to be converted to add namespace else {@code false}. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629679339 ## File path: hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala ## @@ -139,6 +139,7 @@ object AvroConversionHelper { val length = struct.fields.length val converters = new Array[AnyRef => AnyRef](length) val avroFieldIndexes = new Array[Int](length) + //val avroFieldNames = new Array[String](length) Review comment: Left a comment below to do a quick test before taking this call. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629679024 ## File path: hudi-utilities/src/test/resources/delta-streamer-config/source.avsc ## @@ -70,7 +70,7 @@ "name" : "height", "type" : { "type" : "fixed", - "name" : "abc", + "name" : "fixed", Review comment: Do we still need this fix ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash commented on a change in pull request #2927: [HUDI-1129] Adding support to ingest records with old schema after table's schema is evolved
n3nash commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629678909 ## File path: hudi-utilities/src/test/resources/delta-streamer-config/source_evolved.avsc ## @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { +"name" : "timestamp", +"type" : "long" + }, { +"name" : "_row_key", +"type" : "string" + }, { +"name" : "rider", +"type" : "string" + }, { +"name" : "driver", +"type" : "string" + }, { +"name" : "begin_lat", +"type" : "double" + }, { +"name" : "begin_lon", +"type" : "double" + }, { +"name" : "end_lat", +"type" : "double" + }, { +"name" : "end_lon", +"type" : "double" + }, { +"name" : "distance_in_meters", +"type" : "int" + }, { +"name" : "seconds_since_epoch", +"type" : "long" + }, { +"name" : "weight", +"type" : "float" + },{ Review comment: Can we insert a field here and see if the position based resolution works as expected ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] satishkotha commented on pull request #2918: [HUDI-1877] Add support in clustering to not change record location
satishkotha commented on pull request #2918: URL: https://github.com/apache/hudi/pull/2918#issuecomment-837283028 > @satishkotha hello , have some doubt > > 1. Just see add a test strategy . Will a formal strategy be added later? > 2. This PR is to support which Index? > 3. If every file group just transform to a same name file group. If the small files can not merge ? @lw309637554 1. Yes, the actual strategy can be added easily if we agree on high level change 2. This is to support HBaseIndex, which does not support update for record location 3. yes, you are right. merging strategy cannot be applied to tables that use HBaseIndex. We can still do local 'file-level' sorting i.e., sorting records in each data file by specified column so only one block (row group) needs to be read for queries. Let me know if you have any other questions/comments. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] satishkotha commented on a change in pull request #2918: [HUDI-1877] Add support in clustering to not change record location
satishkotha commented on a change in pull request #2918: URL: https://github.com/apache/hudi/pull/2918#discussion_r629658906 ## File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/ClusteringIdentityTestExecutionStrategy.java ## @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi; + +import org.apache.avro.Schema; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.execution.SparkLazyInsertIterable; +import org.apache.hudi.io.CreateFixedFileHandleFactory; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.HoodieSparkMergeOnReadTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +/** + * Sample clustering strategy for testing. This actually doesnt transform data, but simply rewrites the same data + * in a new file. 
+ */ +public class ClusteringIdentityTestExecutionStrategy> +extends ClusteringExecutionStrategy>, JavaRDD, JavaRDD> { + + private static final Logger LOG = LogManager.getLogger(ClusteringIdentityTestExecutionStrategy.class); + + public ClusteringIdentityTestExecutionStrategy(HoodieSparkCopyOnWriteTable table, + HoodieSparkEngineContext engineContext, + HoodieWriteConfig writeConfig) { +super(table, engineContext, writeConfig); + } + + public ClusteringIdentityTestExecutionStrategy(HoodieSparkMergeOnReadTable table, + HoodieSparkEngineContext engineContext, + HoodieWriteConfig writeConfig) { +super(table, engineContext, writeConfig); + } + + @Override + public JavaRDD performClustering( + final JavaRDD> inputRecords, + final int numOutputGroups, + final String instantTime, + final Map strategyParams, + final Schema schema, + final List inputFileIds) { +if (inputRecords.getNumPartitions() != 1 || inputFileIds.size() != 1) { Review comment: yes, this is enforced by setting group size limit to a small number. See unit test added `.withClusteringMaxBytesInGroup(10) // set small number so each file is considered as separate clustering group` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon edited a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837270834 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon commented on issue #2123: Timestamp not parsed correctly on Athena
l-jhon commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837270834 > > For example, the output of Dataframe .show is: 2017-05-09 08:21:35, but output at Athena is +49134-01-07 05:30:00.000 > > What is the schema of the table through `DESCRIBE table`? Follows describe: _hoodie_commit_time string _hoodie_commit_seqno string _hoodie_record_key string _hoodie_partition_path string _hoodie_file_name string origin string created_at timestamp updated_at timestamp id string channel_order_code string purchase_timestamp timestamp # Partition Information year string month string day string -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jackye1995 commented on issue #2123: Timestamp not parsed correctly on Athena
jackye1995 commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837249804 > For example, the output of Dataframe .show is: 2017-05-09 08:21:35, but output at Athena is +49134-01-07 05:30:00.000 What is the schema of the table through `DESCRIBE table`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon removed a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon removed a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837241384 For example, the output of Dataframe .show is: `2017-05-09 08:21:35`, but output at Athena is `+49134-01-07 05:30:00.000` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon commented on issue #2123: Timestamp not parsed correctly on Athena
l-jhon commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837242269 > @l-jhon could you provide more details on what your problem is? What is the table you create in Athena/Glue, and what query did you run? For example, the output of Dataframe .show is: 2017-05-09 08:21:35, but output at Athena is +49134-01-07 05:30:00.000 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon commented on issue #2123: Timestamp not parsed correctly on Athena
l-jhon commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837241384 For example, the output of Dataframe .show is: `2017-05-09 08:21:35`, but output at Athena is `+49134-01-07 05:30:00.000` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jackye1995 edited a comment on issue #2123: Timestamp not parsed correctly on Athena
jackye1995 edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837234410 @l-jhon could you provide more details on what your problem is? What is the table you create in Athena/Glue, and what query did you run? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jackye1995 commented on issue #2123: Timestamp not parsed correctly on Athena
jackye1995 commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837234410 @l-jhon could you provide more details on what your problem is? What is the table you create in Athena/Glue, and what query did you run? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon edited a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837229546 @jackye1995 Is it necessary to change something to make it work? Because I still have a problem. My code: `spark = SparkSession \` `.builder \` `.appName("data_ingestion_incremental_dev") \` `.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \` `.config("spark.sql.hive.convertMetastoreParquet", "false") \` `.config("spark.sql.session.timeZone", "America/Sao_Paulo") \` `.config("spark.hadoop.mapred.output.committer.class","com.appsflyer.spark.DirectOutputCommitter") \` `.config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") \` `.config("spark.dynamicAllocation.enabled", "true") \` `.enableHiveSupport() \` `.getOrCreate()` `table.write \` `.format("org.apache.hudi") \` `.option("hoodie.table.name", f"{data_source}_{table_name}") \` `.option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \` `.option("hoodie.datasource.write.operation", "INSERT") \` `.option("hoodie.datasource.write.recordkey.field", pk_column) \` `.option("hoodie.datasource.write.precombine.field", incremental_column) \` `.option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator") \` `.option("hoodie.datasource.write.partitionpath.field", "year:SIMPLE, month:SIMPLE, day:SIMPLE") \` `.option("hoodie.datasource.hive_sync.enable", True) \` `.option("hoodie.datasource.hive_sync.support_timestamp", True) \` `.option("hoodie.datasource.hive_sync.database", database_athena) \` `.option("hoodie.datasource.hive_sync.table", f"{data_source}_{table_name}") \` `.option("hoodie.datasource.hive_sync.partition_fields", "year, month, day") \` `.option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \` `.option("hoodie.consistency.check.enabled", True) \` `.option("hoodie.parquet.max.file.size", 1073741824) \` `.option("hoodie.parquet.small.file.limit", 629145600) 
\` `.option("hoodie.parquet.compression.codec", "snappy") \` `.option("hoodie.copyonwrite.insert.split.size", 50) \` `.option("hoodie.copyonwrite.record.size.estimate", 1024) \` `.mode("overwrite") \` `.save(path_s3)` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon edited a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837229546 @jackye1995 Is it necessary to change something to make it work? Because I still have a problem. My code: `spark = SparkSession \ .builder \ .appName("data_ingestion_incremental_dev") \ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ .config("spark.sql.hive.convertMetastoreParquet", "false") \ .config("spark.sql.session.timeZone", "America/Sao_Paulo") \ .config("spark.hadoop.mapred.output.committer.class","com.appsflyer.spark.DirectOutputCommitter") \ .config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") \ .config("spark.dynamicAllocation.enabled", "true") \ .enableHiveSupport() \ .getOrCreate()` `table.write \ .format("org.apache.hudi") \ .option("hoodie.table.name", f"{data_source}_{table_name}") \ .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \ .option("hoodie.datasource.write.operation", "INSERT") \ .option("hoodie.datasource.write.recordkey.field", pk_column) \ .option("hoodie.datasource.write.precombine.field", incremental_column) \ .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator") \ .option("hoodie.datasource.write.partitionpath.field", "year:SIMPLE, month:SIMPLE, day:SIMPLE") \ .option("hoodie.datasource.hive_sync.enable", True) \ .option("hoodie.datasource.hive_sync.support_timestamp", True) \ .option("hoodie.datasource.hive_sync.database", database_athena) \ .option("hoodie.datasource.hive_sync.table", f"{data_source}_{table_name}") \ .option("hoodie.datasource.hive_sync.partition_fields", "year, month, day") \ .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \ .option("hoodie.consistency.check.enabled", True) \ .option("hoodie.parquet.max.file.size", 1073741824) \ .option("hoodie.parquet.small.file.limit", 629145600) \ .option("hoodie.parquet.compression.codec", "snappy") 
\ .option("hoodie.copyonwrite.insert.split.size", 50) \ .option("hoodie.copyonwrite.record.size.estimate", 1024) \ .mode("overwrite") \ .save(path_s3)` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon edited a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837229546 @jackye1995 Is it necessary to change something to make it work? Because I still have a problem. My code: `spark = SparkSession \ .builder \ .appName("data_ingestion_incremental_dev") \ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ .config("spark.sql.hive.convertMetastoreParquet", "false") \ .config("spark.sql.session.timeZone", "America/Sao_Paulo") \ .config("spark.hadoop.mapred.output.committer.class","com.appsflyer.spark.DirectOutputCommitter") \ .config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") \ .config("spark.dynamicAllocation.enabled", "true") \ .enableHiveSupport() \ .getOrCreate() table.write \ .format("org.apache.hudi") \ .option("hoodie.table.name", f"{data_source}_{table_name}") \ .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \ .option("hoodie.datasource.write.operation", "INSERT") \ .option("hoodie.datasource.write.recordkey.field", pk_column) \ .option("hoodie.datasource.write.precombine.field", incremental_column) \ .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator") \ .option("hoodie.datasource.write.partitionpath.field", "year:SIMPLE, month:SIMPLE, day:SIMPLE") \ .option("hoodie.datasource.hive_sync.enable", True) \ .option("hoodie.datasource.hive_sync.support_timestamp", True) \ .option("hoodie.datasource.hive_sync.database", database_athena) \ .option("hoodie.datasource.hive_sync.table", f"{data_source}_{table_name}") \ .option("hoodie.datasource.hive_sync.partition_fields", "year, month, day") \ .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \ .option("hoodie.consistency.check.enabled", True) \ .option("hoodie.parquet.max.file.size", 1073741824) \ .option("hoodie.parquet.small.file.limit", 629145600) \ .option("hoodie.parquet.compression.codec", "snappy") \ 
.option("hoodie.copyonwrite.insert.split.size", 50) \ .option("hoodie.copyonwrite.record.size.estimate", 1024) \ .mode("overwrite") \ .save(path_s3)` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon edited a comment on issue #2123: Timestamp not parsed correctly on Athena
l-jhon edited a comment on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837229546 @jackye1995 Is it necessary to change something to make it work? Because I still have a problem. My code: `spark = SparkSession \ .builder \ .appName("data_ingestion_incremental_dev") \ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ .config("spark.sql.hive.convertMetastoreParquet", "false") \ .config("spark.sql.session.timeZone", "America/Sao_Paulo") \ .config("spark.hadoop.mapred.output.committer.class","com.appsflyer.spark.DirectOutputCommitter") \ .config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") \ .config("spark.dynamicAllocation.enabled", "true") \ .enableHiveSupport() \ .getOrCreate()` table.write \ .format("org.apache.hudi") \ .option("hoodie.table.name", f"{data_source}_{table_name}") \ .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \ .option("hoodie.datasource.write.operation", "INSERT") \ .option("hoodie.datasource.write.recordkey.field", pk_column) \ .option("hoodie.datasource.write.precombine.field", incremental_column) \ .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator") \ .option("hoodie.datasource.write.partitionpath.field", "year:SIMPLE, month:SIMPLE, day:SIMPLE") \ .option("hoodie.datasource.hive_sync.enable", True) \ .option("hoodie.datasource.hive_sync.support_timestamp", True) \ .option("hoodie.datasource.hive_sync.database", database_athena) \ .option("hoodie.datasource.hive_sync.table", f"{data_source}_{table_name}") \ .option("hoodie.datasource.hive_sync.partition_fields", "year, month, day") \ .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \ .option("hoodie.consistency.check.enabled", True) \ .option("hoodie.parquet.max.file.size", 1073741824) \ .option("hoodie.parquet.small.file.limit", 629145600) \ .option("hoodie.parquet.compression.codec", "snappy") 
\ .option("hoodie.copyonwrite.insert.split.size", 50) \ .option("hoodie.copyonwrite.record.size.estimate", 1024) \ .mode("overwrite") \ .save(path_s3) ` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] l-jhon commented on issue #2123: Timestamp not parsed correctly on Athena
l-jhon commented on issue #2123: URL: https://github.com/apache/hudi/issues/2123#issuecomment-837229546 @jackye1995 Is it necessary to change something to make it work? Because I still have a problem. My code: `spark = SparkSession \ .builder \ .appName("data_ingestion_incremental_dev") \ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ .config("spark.sql.hive.convertMetastoreParquet", "false") \ .config("spark.sql.session.timeZone", "America/Sao_Paulo") \ .config("spark.hadoop.mapred.output.committer.class","com.appsflyer.spark.DirectOutputCommitter") \ .config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") \ .config("spark.dynamicAllocation.enabled", "true") \ .enableHiveSupport() \ .getOrCreate() table.write \ .format("org.apache.hudi") \ .option("hoodie.table.name", f"{data_source}_{table_name}") \ .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \ .option("hoodie.datasource.write.operation", "INSERT") \ .option("hoodie.datasource.write.recordkey.field", pk_column) \ .option("hoodie.datasource.write.precombine.field", incremental_column) \ .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator") \ .option("hoodie.datasource.write.partitionpath.field", "year:SIMPLE, month:SIMPLE, day:SIMPLE") \ .option("hoodie.datasource.hive_sync.enable", True) \ .option("hoodie.datasource.hive_sync.support_timestamp", True) \ .option("hoodie.datasource.hive_sync.database", database_athena) \ .option("hoodie.datasource.hive_sync.table", f"{data_source}_{table_name}") \ .option("hoodie.datasource.hive_sync.partition_fields", "year, month, day") \ .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \ .option("hoodie.consistency.check.enabled", True) \ .option("hoodie.parquet.max.file.size", 1073741824) \ .option("hoodie.parquet.small.file.limit", 629145600) \ .option("hoodie.parquet.compression.codec", "snappy") \ 
.option("hoodie.copyonwrite.insert.split.size", 50) \ .option("hoodie.copyonwrite.record.size.estimate", 1024) \ .mode("overwrite") \ .save(path_s3) ` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-1138) Re-implement marker files via timeline server
[ https://issues.apache.org/jira/browse/HUDI-1138?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17342017#comment-17342017 ] Vinoth Chandar commented on HUDI-1138: -- [~guoyihua] is picking this up and running with that. > Re-implement marker files via timeline server > - > > Key: HUDI-1138 > URL: https://issues.apache.org/jira/browse/HUDI-1138 > Project: Apache Hudi > Issue Type: Improvement > Components: Writer Core >Affects Versions: 0.9.0 >Reporter: Vinoth Chandar >Assignee: Ethan Guo >Priority: Blocker > Fix For: 0.9.0 > > > Even as you can argue that RFC-15/consolidated metadata, removes the need for > deleting partial files written due to spark task failures/stage retries. It > will still leave extra files inside the table (and users will pay for it > every month) and we need the marker mechanism to be able to delete these > partial files. > Here we explore if we can improve the current marker file mechanism, that > creates one marker file per data file written, by > Delegating the createMarker() call to the driver/timeline server, and have it > create marker metadata into a single file handle, that is flushed for > durability guarantees > > P.S: I was tempted to think Spark listener mechanism can help us deal with > failed tasks, but it has no guarantees. the writer job could die without > deleting a partial file. i.e it can improve things, but cant provide > guarantees -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Assigned] (HUDI-1138) Re-implement marker files via timeline server
[ https://issues.apache.org/jira/browse/HUDI-1138?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Vinoth Chandar reassigned HUDI-1138: Assignee: Ethan Guo > Re-implement marker files via timeline server > - > > Key: HUDI-1138 > URL: https://issues.apache.org/jira/browse/HUDI-1138 > Project: Apache Hudi > Issue Type: Improvement > Components: Writer Core >Affects Versions: 0.9.0 >Reporter: Vinoth Chandar >Assignee: Ethan Guo >Priority: Blocker > Fix For: 0.9.0 > > > Even as you can argue that RFC-15/consolidated metadata, removes the need for > deleting partial files written due to spark task failures/stage retries. It > will still leave extra files inside the table (and users will pay for it > every month) and we need the marker mechanism to be able to delete these > partial files. > Here we explore if we can improve the current marker file mechanism, that > creates one marker file per data file written, by > Delegating the createMarker() call to the driver/timeline server, and have it > create marker metadata into a single file handle, that is flushed for > durability guarantees > > P.S: I was tempted to think Spark listener mechanism can help us deal with > failed tasks, but it has no guarantees. the writer job could die without > deleting a partial file. i.e it can improve things, but cant provide > guarantees -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Updated] (HUDI-1138) Re-implement marker files via timeline server
[ https://issues.apache.org/jira/browse/HUDI-1138?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Vinoth Chandar updated HUDI-1138: - Status: Open (was: New) > Re-implement marker files via timeline server > - > > Key: HUDI-1138 > URL: https://issues.apache.org/jira/browse/HUDI-1138 > Project: Apache Hudi > Issue Type: Improvement > Components: Writer Core >Affects Versions: 0.9.0 >Reporter: Vinoth Chandar >Priority: Blocker > Fix For: 0.9.0 > > > Even as you can argue that RFC-15/consolidated metadata, removes the need for > deleting partial files written due to spark task failures/stage retries. It > will still leave extra files inside the table (and users will pay for it > every month) and we need the marker mechanism to be able to delete these > partial files. > Here we explore if we can improve the current marker file mechanism, that > creates one marker file per data file written, by > Delegating the createMarker() call to the driver/timeline server, and have it > create marker metadata into a single file handle, that is flushed for > durability guarantees > > P.S: I was tempted to think Spark listener mechanism can help us deal with > failed tasks, but it has no guarantees. the writer job could die without > deleting a partial file. i.e it can improve things, but cant provide > guarantees -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] nsivabalan commented on a change in pull request #2932: [WIP] Fixing schema evolution in sparksql_writer and deltastreamer
nsivabalan commented on a change in pull request #2932: URL: https://github.com/apache/hudi/pull/2932#discussion_r629447410 ## File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java ## @@ -374,11 +375,12 @@ public void refreshTimeline() throws IOException { transformed .map(r -> (SchemaProvider) new DelegatingSchemaProvider(props, jssc, dataAndCheckpoint.getSchemaProvider(), -UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc))) +UtilHelpers.createLatestSchemaProvider(r.schema(), jssc, fs, cfg.targetBasePath))) .orElse(dataAndCheckpoint.getSchemaProvider()); avroRDDOptional = transformed .map(t -> HoodieSparkUtils.createRdd( -t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()); +t, schemaProvider.getTargetSchema(), HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()); Review comment: existing HoodieSparkUtils.createRdd() won't work if schema is evolved schema compared to record's schema. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
codecov-commenter edited a comment on pull request #2930: URL: https://github.com/apache/hudi/pull/2930#issuecomment-835670270 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a change in pull request #2927: [HUDI-1129][WIP] Adding support to ingest records with old schema after table's schema is evolved
nsivabalan commented on a change in pull request #2927: URL: https://github.com/apache/hudi/pull/2927#discussion_r629421999 ## File path: hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala ## @@ -139,6 +139,7 @@ object AvroConversionHelper { val length = struct.fields.length val converters = new Array[AnyRef => AnyRef](length) val avroFieldIndexes = new Array[Int](length) + //val avroFieldNames = new Array[String](length) Review comment: I am not making this change in this diff as I am not sure in general, AvroConversionHelper and AvroConversionUtils are capable of handling schema evolution where new columns are added in the middle. Need to do some more investigation to validate this. But the tests are succeeding w/o this fix and hence leaving it this way for now. ## File path: hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala ## @@ -111,6 +112,34 @@ object HoodieSparkUtils { } } + def createRddWithLatestSchema(df: DataFrame, latestSchema: Schema, structName: String, recordNamespace: String): RDD[GenericRecord] = { +val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace) +// if schema generated from df.schema is same as latest schema, no special handling is required. +if(TableSchemaResolver.isSchemaEquals(avroSchema, latestSchema)) { + createRdd(df, avroSchema, structName, recordNamespace) +} else { // if not, it means that table schema got evolved, but this batch of records were generated with an older + // schema. 
+ createRddWithLatestSchema(df, avroSchema, latestSchema, structName, recordNamespace) +} + } + + def createRddWithLatestSchema(df: DataFrame, avroSchema: Schema, latestSchema: Schema, structName: String, recordNamespace: String) + : RDD[GenericRecord] = { +// Use the Avro schema to derive the StructType which has the correct nullability information +val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] +val encoder = RowEncoder.apply(dataType).resolveAndBind() +val deserializer = HoodieSparkUtils.createRowSerDe(encoder) +val latestDataType = SchemaConverters.toSqlType(latestSchema).dataType.asInstanceOf[StructType] Review comment: this path is different from existing createRdd(). in this method, we first deserialize DF with old schema, and then use latestSchema to convert to avro in line 137. where as in createRdd(), same schema is used in both places. We can try to fold both these into single method. open to discussion. ## File path: hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java ## @@ -353,6 +360,89 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema)); } + /** + * Get latest schema either from incoming schema or table schema. + * @param incomingSchema incoming batch's schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted to add namespace. false otherwise. + * @param converterFn converter function to be called over table schema. + * @return the latest schema. 
+ */ + public Schema getLatestSchema(Schema incomingSchema, boolean convertTableSchemaToAddNamespace, + Function1 converterFn) { +Schema latestSchema = incomingSchema; +try { + if (isTimelineNonEmpty()) { +Schema tableSchema = getTableAvroSchemaWithoutMetadataFields(); +if (convertTableSchemaToAddNamespace) { + tableSchema = converterFn.apply(tableSchema); +} +if (incomingSchema.getFields().size() < tableSchema.getFields().size() && isSchemaSubset(tableSchema, incomingSchema)) { Review comment: We can't really introduce schema comparators, as two schemas are not comparable in the way numbers or strings are. For example, if two schemas have the same number of columns but differ in the data type of one column, do we return -1 or 1? So, I am resorting to using isSchemaSubset(). Let me know if that makes sense. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
codecov-commenter edited a comment on pull request #2930: URL: https://github.com/apache/hudi/pull/2930#issuecomment-835670270 # [Codecov](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=h1_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) Report > Merging [#2930](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (aaf35bf) into [master](https://codecov.io/gh/apache/hudi/commit/bfbf993cbe3f1e3fab93095f4342ed17423efab5?el=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (bfbf993) will **increase** coverage by `14.75%`. > The diff coverage is `n/a`. [![Impacted file tree graph](https://codecov.io/gh/apache/hudi/pull/2930/graphs/tree.svg?width=650=150=pr=VTTXabwbs2_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation)](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) ```diff @@ Coverage Diff @@ ## master#2930 +/- ## = + Coverage 54.77% 69.53% +14.75% + Complexity 3796 374 -3422 = Files 481 54 -427 Lines 23284 2002-21282 Branches 2478 237 -2241 = - Hits 12753 1392-11361 + Misses 9385 478 -8907 + Partials 1146 132 -1014 ``` | Flag | Coverage Δ | Complexity Δ | | |---|---|---|---| | hudicli | `?` | `?` | | | hudiclient | `?` | `?` | | | hudicommon | `?` | `?` | | | hudiflink | `?` | `?` | | | hudihadoopmr | `?` | `?` | | | hudisparkdatasource | `?` | `?` | | | hudisync | `?` | `?` | | | huditimelineservice | `?` | `?` | | | hudiutilities | `69.53% <ø> (-0.05%)` | `374.00 <ø> (-1.00)` | | Flags with carried forward coverage won't be shown. 
[Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#carryforward-flags-in-the-pull-request-comment) to find out more. | [Impacted Files](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) | Coverage Δ | Complexity Δ | | |---|---|---|---| | [...apache/hudi/utilities/deltastreamer/DeltaSync.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL2RlbHRhc3RyZWFtZXIvRGVsdGFTeW5jLmphdmE=) | `71.08% <0.00%> (-0.35%)` | `55.00% <0.00%> (-1.00%)` | | | [.../apache/hudi/common/util/DefaultSizeEstimator.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL3V0aWwvRGVmYXVsdFNpemVFc3RpbWF0b3IuamF2YQ==) | | | | | [...apache/hudi/common/model/HoodieRecordLocation.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL21vZGVsL0hvb2RpZVJlY29yZExvY2F0aW9uLmphdmE=) | | | | | [.../org/apache/hudi/hadoop/utils/HoodieHiveUtils.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1oYWRvb3AtbXIvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvaGFkb29wL3V0aWxzL0hvb2RpZUhpdmVVdGlscy5qYXZh) | | | | | 
[.../apache/hudi/common/bloom/BloomFilterTypeCode.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL2Jsb29tL0Jsb29tRmlsdGVyVHlwZUNvZGUuamF2YQ==) | | | | | [...va/org/apache/hudi/cli/commands/CleansCommand.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jbGkvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY2xpL2NvbW1hbmRzL0NsZWFuc0NvbW1hbmQuamF2YQ==) | | | | |
[GitHub] [hudi] vinothchandar merged pull request #2714: [HUDI-1707] Reduces log level for too verbose messages from info to debug level.
vinothchandar merged pull request #2714: URL: https://github.com/apache/hudi/pull/2714 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated (511ac48 -> 8a48d16)
This is an automated email from the ASF dual-hosted git repository. vinoth pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 511ac48 [MINOR] optimize FilePathUtils (#2931) add 8a48d16 [HUDI-1707] Reduces log level for too verbose messages from info to debug level. (#2714) No new revisions were added by this update. Summary of changes: .../table/timeline/HoodieActiveTimeline.java | 3 +-- .../table/view/AbstractTableFileSystemView.java| 4 ++-- .../table/view/HoodieTableFileSystemView.java | 2 +- .../org/apache/hudi/hive/util/HiveSchemaUtil.java | 9 +++-- .../hudi/timeline/service/RequestHandler.java | 6 +++--- .../hudi/utilities/deltastreamer/DeltaSync.java| 4 +++- .../deltastreamer/HoodieDeltaStreamer.java | 22 +- .../transform/SqlQueryBasedTransformer.java| 2 +- .../TestHoodieMultiTableDeltaStreamer.java | 5 + 9 files changed, 44 insertions(+), 13 deletions(-)
[GitHub] [hudi] t0il3ts0ap opened a new issue #2934: [SUPPORT] Parquet file does not exist when trying to read hudi table incrementally
t0il3ts0ap opened a new issue #2934: URL: https://github.com/apache/hudi/issues/2934 **Describe the problem you faced** My aim is to read an existing hudi table (COW) using deltastreamer, do some transformations and write it to another ( fresh ) table. I am using deltastreamer so as check-pointing can be automated. Relevant hudi configs used for deltastreamer ``` --hoodie-conf hoodie.parquet.compression.codec=snappy --table-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.HoodieIncrSource --hoodie-conf hoodie.deltastreamer.source.hoodieincr.path=s3://poc-bucket/raw-data/customer_service/credit_analysis_data --hoodie-conf hoodie.deltastreamer.source.hoodieincr.partition.extractor.class=org.apache.hudi.hive.NonPartitionedExtractor --hoodie-conf hoodie.deltastreamer.source.hoodieincr.partition.fields='' --hoodie-conf hoodie.deltastreamer.source.hoodieincr.num_instants=1 --enable-sync --checkpoint 0 ``` The first run of deltastreamer failed with ``` Exception in thread "main" org.apache.spark.sql.AnalysisException: Path does not exist: s3://poc-bucket/raw-data/customer_service/credit_analysis_data/default/c67b4ac1-4597-4896-81c5-dc70b2f62892-1_0-23-13659_20210508062123.parquet; at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:764) at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245) at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242) at scala.collection.immutable.List.flatMap(List.scala:355) at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:751) at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:580) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:405) at 
org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297) at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286) at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755) at org.apache.hudi.IncrementalRelation.buildScan(IncrementalRelation.scala:151) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:313) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63) at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78) at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162) at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162) at scala.collection.Iterator.foreach(Iterator.scala:941) at scala.collection.Iterator.foreach$(Iterator.scala:941) ``` The original table is couple of months old. At any moment I find commits spanning over last 3 days in its .hoodie directory. Surprisingly, the parquet file mentioned in most of commits does not exist. I am able to obtain same error when trying to run hudi incremental query in spark-shell. * Hudi version : 0.7.0 * Spark version : 3.0.2 with scala 2.12 * Storage (HDFS/S3/GCS..) :S3 * Running on Docker? (yes/no) : no -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2933: [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should a…
codecov-commenter edited a comment on pull request #2933: URL: https://github.com/apache/hudi/pull/2933#issuecomment-836676096 # [Codecov](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=h1_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) Report > Merging [#2933](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (66379e3) into [master](https://codecov.io/gh/apache/hudi/commit/c1b331bcffaffde7661b75a9b846b117e9df63dc?el=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (c1b331b) will **increase** coverage by `0.67%`. > The diff coverage is `n/a`. [![Impacted file tree graph](https://codecov.io/gh/apache/hudi/pull/2933/graphs/tree.svg?width=650=150=pr=VTTXabwbs2_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation)](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) ```diff @@ Coverage Diff @@ ## master#2933 +/- ## + Coverage 54.07% 54.75% +0.67% - Complexity 3757 3797 +40 Files 481 481 Lines 2329123291 Branches 2480 2480 + Hits 1259512753 +158 + Misses 9556 9389 -167 - Partials 1140 1149 +9 ``` | Flag | Coverage Δ | Complexity Δ | | |---|---|---|---| | hudicli | `39.53% <ø> (ø)` | `220.00 <ø> (ø)` | | | hudiclient | `∅ <ø> (∅)` | `0.00 <ø> (ø)` | | | hudicommon | `50.38% <ø> (ø)` | `1975.00 <ø> (ø)` | | | hudiflink | `62.88% <ø> (ø)` | `525.00 <ø> (ø)` | | | hudihadoopmr | `50.93% <ø> (ø)` | `259.00 <ø> (ø)` | | | hudisparkdatasource | `73.33% <ø> (ø)` | `237.00 <ø> (ø)` | | | hudisync | `46.52% <ø> (ø)` | `144.00 <ø> (ø)` | | | huditimelineservice | `64.36% <ø> (ø)` | `62.00 <ø> (ø)` | | | hudiutilities | `69.58% <ø> (+7.89%)` | `375.00 <ø> (+40.00)` | | Flags with carried forward 
coverage won't be shown. [Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#carryforward-flags-in-the-pull-request-comment) to find out more. | [Impacted Files](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) | Coverage Δ | Complexity Δ | | |---|---|---|---| | [...apache/hudi/utilities/deltastreamer/DeltaSync.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL2RlbHRhc3RyZWFtZXIvRGVsdGFTeW5jLmphdmE=) | `71.42% <0.00%> (+0.34%)` | `56.00% <0.00%> (+1.00%)` | | | [...in/java/org/apache/hudi/utilities/UtilHelpers.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL1V0aWxIZWxwZXJzLmphdmE=) | `64.53% <0.00%> (+23.83%)` | `33.00% <0.00%> (+6.00%)` | | | [...he/hudi/utilities/transform/AWSDmsTransformer.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3RyYW5zZm9ybS9BV1NEbXNUcmFuc2Zvcm1lci5qYXZh) | `66.66% <0.00%> (+66.66%)` | `2.00% <0.00%> (+2.00%)` | | | 
[...hudi/utilities/schema/JdbcbasedSchemaProvider.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3NjaGVtYS9KZGJjYmFzZWRTY2hlbWFQcm92aWRlci5qYXZh) | `72.22% <0.00%> (+72.22%)` | `2.00% <0.00%> (+2.00%)` | | | [.../apache/hudi/utilities/HoodieSnapshotExporter.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL0hvb2RpZVNuYXBzaG90RXhwb3J0ZXIuamF2YQ==) | `88.79% <0.00%> (+83.62%)` | `28.00% <0.00%>
[GitHub] [hudi] garyli1019 merged pull request #2931: [MINOR] optimize FilePathUtils
garyli1019 merged pull request #2931: URL: https://github.com/apache/hudi/pull/2931 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated: [MINOR] optimize FilePathUtils (#2931)
This is an automated email from the ASF dual-hosted git repository. garyli pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 511ac48 [MINOR] optimize FilePathUtils (#2931) 511ac48 is described below commit 511ac4881d6bf3385fd98946594aae16088047f1 Author: hiscat <46845236+mylanpan...@users.noreply.github.com> AuthorDate: Mon May 10 21:47:56 2021 +0800 [MINOR] optimize FilePathUtils (#2931) --- .../org/apache/hudi/table/format/FilePathUtils.java | 21 +++-- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java index 4dfe990..0623eb9 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java @@ -132,21 +132,6 @@ public class FilePathUtils { } /** - * Generates partition values from path. - * - * @param currPath Partition file path - * @param hivePartition Whether the partition path is with Hive style - * @param partitionKeys Partition keys - * @return Sequential partition specs. - */ - public static List extractPartitionValues( - Path currPath, - boolean hivePartition, - String[] partitionKeys) { -return new ArrayList<>(extractPartitionKeyValues(currPath, hivePartition, partitionKeys).values()); - } - - /** * Generates partition key value mapping from path. 
* * @param currPath Partition file path @@ -265,7 +250,7 @@ public class FilePathUtils { return; } -if (fileStatus.isDir() && !isHiddenFile(fileStatus)) { +if (fileStatus.isDirectory() && !isHiddenFile(fileStatus)) { for (FileStatus stat : fs.listStatus(fileStatus.getPath())) { listStatusRecursively(fs, stat, level + 1, expectLevel, results); } @@ -275,7 +260,7 @@ public class FilePathUtils { private static boolean isHiddenFile(FileStatus fileStatus) { String name = fileStatus.getPath().getName(); // the log files is hidden file -return name.startsWith("_") || name.startsWith(".") && !name.contains(".log."); +return name.startsWith("_") || (name.startsWith(".") && !name.contains(".log.")); } /** @@ -393,7 +378,7 @@ public class FilePathUtils { */ public static org.apache.flink.core.fs.Path[] toFlinkPaths(Path[] paths) { return Arrays.stream(paths) -.map(p -> toFlinkPath(p)) +.map(FilePathUtils::toFlinkPath) .toArray(org.apache.flink.core.fs.Path[]::new); }
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2933: [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should a…
codecov-commenter edited a comment on pull request #2933: URL: https://github.com/apache/hudi/pull/2933#issuecomment-836676096 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] MyLanPangzi commented on a change in pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
MyLanPangzi commented on a change in pull request #2930: URL: https://github.com/apache/hudi/pull/2930#discussion_r629365106 ## File path: hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java ## @@ -297,4 +343,4 @@ public boolean isTemporary() { return false; } } -} +} Review comment: done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] Akshay2Agarwal commented on issue #2913: [SUPPORT] Hudi + Hive Metastore Sync
Akshay2Agarwal commented on issue #2913: URL: https://github.com/apache/hudi/issues/2913#issuecomment-836704935 Closing the ticket. As I read through the code, I realized Hudi is integrated with Hive2 and queryable in Presto as an external table, not as a managed table. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] Akshay2Agarwal closed issue #2913: [SUPPORT] Hudi + Hive Metastore Sync
Akshay2Agarwal closed issue #2913: URL: https://github.com/apache/hudi/issues/2913 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter commented on pull request #2933: [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should a…
codecov-commenter commented on pull request #2933: URL: https://github.com/apache/hudi/pull/2933#issuecomment-836676096 # [Codecov](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=h1_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) Report > Merging [#2933](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (66379e3) into [master](https://codecov.io/gh/apache/hudi/commit/c1b331bcffaffde7661b75a9b846b117e9df63dc?el=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (c1b331b) will **decrease** coverage by `44.73%`. > The diff coverage is `n/a`. [![Impacted file tree graph](https://codecov.io/gh/apache/hudi/pull/2933/graphs/tree.svg?width=650=150=pr=VTTXabwbs2_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation)](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) ```diff @@ Coverage Diff @@ ## master #2933 +/- ## - Coverage 54.07% 9.34% -44.74% + Complexity 3757 48 -3709 Files 481 54 -427 Lines 232912002-21289 Branches 2480 237 -2243 - Hits 12595 187-12408 + Misses 95561802 -7754 + Partials 1140 13 -1127 ``` | Flag | Coverage Δ | Complexity Δ | | |---|---|---|---| | hudicli | `?` | `?` | | | hudiclient | `?` | `?` | | | hudicommon | `?` | `?` | | | hudiflink | `?` | `?` | | | hudihadoopmr | `?` | `?` | | | hudisparkdatasource | `?` | `?` | | | hudisync | `?` | `?` | | | huditimelineservice | `?` | `?` | | | hudiutilities | `9.34% <ø> (-52.35%)` | `48.00 <ø> (-287.00)` | | Flags with carried forward coverage won't be shown. 
[Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#carryforward-flags-in-the-pull-request-comment) to find out more. | [Impacted Files](https://codecov.io/gh/apache/hudi/pull/2933?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) | Coverage Δ | Complexity Δ | | |---|---|---|---| | [...va/org/apache/hudi/utilities/IdentitySplitter.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL0lkZW50aXR5U3BsaXR0ZXIuamF2YQ==) | `0.00% <0.00%> (-100.00%)` | `0.00% <0.00%> (-2.00%)` | | | [...va/org/apache/hudi/utilities/schema/SchemaSet.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3NjaGVtYS9TY2hlbWFTZXQuamF2YQ==) | `0.00% <0.00%> (-100.00%)` | `0.00% <0.00%> (-3.00%)` | | | [...a/org/apache/hudi/utilities/sources/RowSource.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3NvdXJjZXMvUm93U291cmNlLmphdmE=) | `0.00% <0.00%> (-100.00%)` | `0.00% <0.00%> (-4.00%)` | | | [.../org/apache/hudi/utilities/sources/AvroSource.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3NvdXJjZXMvQXZyb1NvdXJjZS5qYXZh) | `0.00% <0.00%> 
(-100.00%)` | `0.00% <0.00%> (-1.00%)` | | | [.../org/apache/hudi/utilities/sources/JsonSource.java](https://codecov.io/gh/apache/hudi/pull/2933/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL3NvdXJjZXMvSnNvblNvdXJjZS5qYXZh) | `0.00% <0.00%> (-100.00%)` | `0.00% <0.00%> (-1.00%)` | | |
[GitHub] [hudi] garyli1019 commented on a change in pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
garyli1019 commented on a change in pull request #2930: URL: https://github.com/apache/hudi/pull/2930#discussion_r629312247 ## File path: hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java ## @@ -297,4 +343,4 @@ public boolean isTemporary() { return false; } } -} +} Review comment: the new line check the last line of the file, should be like this ``` } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-1890) FlinkCreateHandle and FlinkAppendHandle canWrite should always return true
[ https://issues.apache.org/jira/browse/HUDI-1890?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-1890: - Labels: pull-request-available (was: ) > FlinkCreateHandle and FlinkAppendHandle canWrite should always return true > -- > > Key: HUDI-1890 > URL: https://issues.apache.org/jira/browse/HUDI-1890 > Project: Apache Hudi > Issue Type: Bug > Components: Flink Integration >Reporter: Danny Chen >Assignee: Danny Chen >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > The method {{canWrite}} should always return true because they can already > write based on file size, e.g. the {{BucketAssigner}}. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] danny0405 opened a new pull request #2933: [HUDI-1890] FlinkCreateHandle and FlinkAppendHandle canWrite should a…
danny0405 opened a new pull request #2933: URL: https://github.com/apache/hudi/pull/2933 …lways return true The method #canWrite should always return true because they can already write based on file size, e.g. the BucketAssigner. ## *Tips* - *Thank you very much for contributing to Apache Hudi.* - *Please review https://hudi.apache.org/contributing.html before opening a pull request.* ## What is the purpose of the pull request *(For example: This pull request adds quick-start document.)* ## Brief change log *(for example:)* - *Modify AnnotationLocation checkstyle rule in checkstyle.xml* ## Verify this pull request *(Please pick either of the following options)* This pull request is a trivial rework / code cleanup without any test coverage. *(or)* This pull request is already covered by existing tests, such as *(please describe tests)*. (or) This change added tests and can be verified as follows: *(example:)* - *Added integration tests for end-to-end.* - *Added HoodieClientWriteTest to verify the change.* - *Manually verified the change by running a job locally.* ## Committer checklist - [ ] Has a corresponding JIRA in PR title & commit - [ ] Commit message is descriptive of the change - [ ] CI is green - [ ] Necessary doc changes done or have another open PR - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2931: [MINOR] optimize FilePathUtils
codecov-commenter edited a comment on pull request #2931: URL: https://github.com/apache/hudi/pull/2931#issuecomment-835768056 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
codecov-commenter edited a comment on pull request #2930: URL: https://github.com/apache/hudi/pull/2930#issuecomment-835670270 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2931: [MINOR] optimize FilePathUtils
codecov-commenter edited a comment on pull request #2931: URL: https://github.com/apache/hudi/pull/2931#issuecomment-835768056 # [Codecov](https://codecov.io/gh/apache/hudi/pull/2931?src=pr=h1_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) Report > Merging [#2931](https://codecov.io/gh/apache/hudi/pull/2931?src=pr=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (3131b53) into [master](https://codecov.io/gh/apache/hudi/commit/bfbf993cbe3f1e3fab93095f4342ed17423efab5?el=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (bfbf993) will **increase** coverage by `14.80%`. > The diff coverage is `n/a`. [![Impacted file tree graph](https://codecov.io/gh/apache/hudi/pull/2931/graphs/tree.svg?width=650=150=pr=VTTXabwbs2_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation)](https://codecov.io/gh/apache/hudi/pull/2931?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) ```diff @@ Coverage Diff @@ ## master#2931 +/- ## = + Coverage 54.77% 69.58% +14.80% + Complexity 3796 375 -3421 = Files 481 54 -427 Lines 23284 2002-21282 Branches 2478 237 -2241 = - Hits 12753 1393-11360 + Misses 9385 478 -8907 + Partials 1146 131 -1015 ``` | Flag | Coverage Δ | Complexity Δ | | |---|---|---|---| | hudicli | `?` | `?` | | | hudiclient | `?` | `?` | | | hudicommon | `?` | `?` | | | hudiflink | `?` | `?` | | | hudihadoopmr | `?` | `?` | | | hudisparkdatasource | `?` | `?` | | | hudisync | `?` | `?` | | | huditimelineservice | `?` | `?` | | | hudiutilities | `69.58% <ø> (ø)` | `375.00 <ø> (ø)` | | Flags with carried forward coverage won't be shown. 
[Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#carryforward-flags-in-the-pull-request-comment) to find out more. | [Impacted Files](https://codecov.io/gh/apache/hudi/pull/2931?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) | Coverage Δ | Complexity Δ | | |---|---|---|---| | [...n/java/org/apache/hudi/hadoop/InputSplitUtils.java](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1oYWRvb3AtbXIvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvaGFkb29wL0lucHV0U3BsaXRVdGlscy5qYXZh) | | | | | [.../apache/hudi/common/model/CompactionOperation.java](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL21vZGVsL0NvbXBhY3Rpb25PcGVyYXRpb24uamF2YQ==) | | | | | [...apache/hudi/common/fs/HoodieWrapperFileSystem.java](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL2ZzL0hvb2RpZVdyYXBwZXJGaWxlU3lzdGVtLmphdmE=) | | | | | [...del/OverwriteNonDefaultsWithLatestAvroPayload.java](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL21vZGVsL092ZXJ3cml0ZU5vbkRlZmF1bHRzV2l0aExhdGVzdEF2cm9QYXlsb2FkLmphdmE=) | | | | | 
[...in/scala/org/apache/hudi/HoodieEmptyRelation.scala](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1zcGFyay1kYXRhc291cmNlL2h1ZGktc3Bhcmsvc3JjL21haW4vc2NhbGEvb3JnL2FwYWNoZS9odWRpL0hvb2RpZUVtcHR5UmVsYXRpb24uc2NhbGE=) | | | | | [...org/apache/hudi/hadoop/HoodieHFileInputFormat.java](https://codecov.io/gh/apache/hudi/pull/2931/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1oYWRvb3AtbXIvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvaGFkb29wL0hvb2RpZUhGaWxlSW5wdXRGb3JtYXQuamF2YQ==) | | | | |
[GitHub] [hudi] codecov-commenter edited a comment on pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
codecov-commenter edited a comment on pull request #2930: URL: https://github.com/apache/hudi/pull/2930#issuecomment-835670270 # [Codecov](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=h1_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) Report > Merging [#2930](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (fe73515) into [master](https://codecov.io/gh/apache/hudi/commit/bfbf993cbe3f1e3fab93095f4342ed17423efab5?el=desc_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) (bfbf993) will **increase** coverage by `14.75%`. > The diff coverage is `n/a`. [![Impacted file tree graph](https://codecov.io/gh/apache/hudi/pull/2930/graphs/tree.svg?width=650=150=pr=VTTXabwbs2_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation)](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) ```diff @@ Coverage Diff @@ ## master#2930 +/- ## = + Coverage 54.77% 69.53% +14.75% + Complexity 3796 374 -3422 = Files 481 54 -427 Lines 23284 2002-21282 Branches 2478 237 -2241 = - Hits 12753 1392-11361 + Misses 9385 478 -8907 + Partials 1146 132 -1014 ``` | Flag | Coverage Δ | Complexity Δ | | |---|---|---|---| | hudicli | `?` | `?` | | | hudiclient | `?` | `?` | | | hudicommon | `?` | `?` | | | hudiflink | `?` | `?` | | | hudihadoopmr | `?` | `?` | | | hudisparkdatasource | `?` | `?` | | | hudisync | `?` | `?` | | | huditimelineservice | `?` | `?` | | | hudiutilities | `69.53% <ø> (-0.05%)` | `374.00 <ø> (-1.00)` | | Flags with carried forward coverage won't be shown. 
[Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#carryforward-flags-in-the-pull-request-comment) to find out more. | [Impacted Files](https://codecov.io/gh/apache/hudi/pull/2930?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation) | Coverage Δ | Complexity Δ | | |---|---|---|---| | [...apache/hudi/utilities/deltastreamer/DeltaSync.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS11dGlsaXRpZXMvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvdXRpbGl0aWVzL2RlbHRhc3RyZWFtZXIvRGVsdGFTeW5jLmphdmE=) | `71.08% <0.00%> (-0.35%)` | `55.00% <0.00%> (-1.00%)` | | | [...rg/apache/hudi/metadata/MetadataPartitionType.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvbWV0YWRhdGEvTWV0YWRhdGFQYXJ0aXRpb25UeXBlLmphdmE=) | | | | | [...org/apache/hudi/hadoop/BootstrapBaseFileSplit.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1oYWRvb3AtbXIvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvaGFkb29wL0Jvb3RzdHJhcEJhc2VGaWxlU3BsaXQuamF2YQ==) | | | | | [...in/scala/org/apache/hudi/IncrementalRelation.scala](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1zcGFyay1kYXRhc291cmNlL2h1ZGktc3Bhcmsvc3JjL21haW4vc2NhbGEvb3JnL2FwYWNoZS9odWRpL0luY3JlbWVudGFsUmVsYXRpb24uc2NhbGE=) | | | | | 
[...he/hudi/hadoop/SafeParquetRecordReaderWrapper.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1oYWRvb3AtbXIvc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvaGFkb29wL1NhZmVQYXJxdWV0UmVjb3JkUmVhZGVyV3JhcHBlci5qYXZh) | | | | | [...pache/hudi/common/fs/SizeAwareDataInputStream.java](https://codecov.io/gh/apache/hudi/pull/2930/diff?src=pr=tree_medium=referral_source=github_content=comment_campaign=pr+comments_term=The+Apache+Software+Foundation#diff-aHVkaS1jb21tb24vc3JjL21haW4vamF2YS9vcmcvYXBhY2hlL2h1ZGkvY29tbW9uL2ZzL1NpemVBd2FyZURhdGFJbnB1dFN0cmVhbS5qYXZh) | | | | |
[jira] [Created] (HUDI-1890) FlinkCreateHandle and FlinkAppendHandle canWrite should always return true
Danny Chen created HUDI-1890: Summary: FlinkCreateHandle and FlinkAppendHandle canWrite should always return true Key: HUDI-1890 URL: https://issues.apache.org/jira/browse/HUDI-1890 Project: Apache Hudi Issue Type: Bug Components: Flink Integration Reporter: Danny Chen Assignee: Danny Chen Fix For: 0.9.0 The method {{canWrite}} should always return true because they can already write based on file size, e.g. the {{BucketAssigner}}. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [hudi] MyLanPangzi commented on a change in pull request #2931: [MINOR] optimize FilePathUtils
MyLanPangzi commented on a change in pull request #2931: URL: https://github.com/apache/hudi/pull/2931#discussion_r629240721 ## File path: hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java ## @@ -260,12 +245,12 @@ private static void listStatusRecursively( int level, int expectLevel, List results) throws IOException { -if (expectLevel == level && !isHiddenFile(fileStatus)) { +if (expectLevel == level && isHiddenFile(fileStatus)) { Review comment: Oh, sorry, I forgot to revert this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] MyLanPangzi commented on a change in pull request #2930: [HUDI-1818] Validate and check required option for HoodieTable (Azure…
MyLanPangzi commented on a change in pull request #2930: URL: https://github.com/apache/hudi/pull/2930#discussion_r629237005 ## File path: hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java ## @@ -98,6 +102,33 @@ public String factoryIdentifier() { // Utilities // - + /** Validate required options. e.g record key and pre combine key. + * + * @param conf The table options + * @param schema The table schema + */ + private void validateRequiredOptions(Configuration conf, TableSchema schema) { Review comment: Good, renamed the method. The partitioned table needn't be validated; it has a default partition value. We just verify the Hoodie required fields. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] n3nash opened a new pull request #2932: [WIP] Fixing schema evolution in sparksql_writer and deltastreamer
n3nash opened a new pull request #2932: URL: https://github.com/apache/hudi/pull/2932 ## *Tips* - *Thank you very much for contributing to Apache Hudi.* - *Please review https://hudi.apache.org/contributing.html before opening a pull request.* ## What is the purpose of the pull request *(For example: This pull request adds quick-start document.)* ## Brief change log *(for example:)* - *Modify AnnotationLocation checkstyle rule in checkstyle.xml* ## Verify this pull request *(Please pick either of the following options)* This pull request is a trivial rework / code cleanup without any test coverage. *(or)* This pull request is already covered by existing tests, such as *(please describe tests)*. (or) This change added tests and can be verified as follows: *(example:)* - *Added integration tests for end-to-end.* - *Added HoodieClientWriteTest to verify the change.* - *Manually verified the change by running a job locally.* ## Committer checklist - [ ] Has a corresponding JIRA in PR title & commit - [ ] Commit message is descriptive of the change - [ ] CI is green - [ ] Necessary doc changes done or have another open PR - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] lukemarles commented on pull request #2893: [HUDI-1371] Support metadata based listing for Spark DataSource and Spark SQL
lukemarles commented on pull request #2893: URL: https://github.com/apache/hudi/pull/2893#issuecomment-836301088 stop sending me shit Sent from my iPhone > On 10 May 2021, at 12:47 pm, pengzhiwei ***@***.***> wrote: > > > @pengzhiwei2018 commented on this pull request. > > In hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java: > > > @@ -141,6 +143,31 @@ protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataCon > .getAllFilesInPartition(partitionPath); >} > > + @Override > + public Map getAllFilesInPartitions(List partitionPaths) > + throws IOException { > +if (enabled) { > + Map partitionsFilesMap = new HashMap<>(); > + > + try { > +for (String partitionPath : partitionPaths) { > + partitionsFilesMap.put(partitionPath, fetchAllFilesInPartition(new Path(partitionPath))); > +} > + } catch (Exception e) { > +if (metadataConfig.enableFallback()) { > + LOG.error("Failed to retrieve files in partitions from metadata", e); > If enable the fallback here, an empty partitionsFilesMap will return if there is an Exception happened, is it right? > > In hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java: > > > @@ -105,6 +106,20 @@ public FileSystemBackedTableMetadata(HoodieEngineContext engineContext, Serializ > return partitionPaths; >} > > + @Override > + public Map getAllFilesInPartitions(List partitionPaths) > + throws IOException { > +int parallelism = Math.min(DEFAULT_LISTING_PARALLELISM, partitionPaths.size()); > If the partitionPaths is empty, the parallelism will be 0, there may be an Exception ("Positive number of partitions required") throw out for the sparkContext.parallelize(seq, parallelism), > > In hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala: > > > val properties = new Properties() > +// To support metadata listing via Spark SQL we allow users to pass the config via Hadoop Conf. 
Spark SQL does not > Should we get these configurations from the spark.sessionState.conf for spark? > > — > You are receiving this because you are subscribed to this thread. > Reply to this email directly, view it on GitHub, or unsubscribe. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Assigned] (HUDI-1889) Support partition path in a nested field in HoodieFileIndex
[ https://issues.apache.org/jira/browse/HUDI-1889?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo reassigned HUDI-1889: --- Assignee: Ethan Guo > Support partition path in a nested field in HoodieFileIndex > --- > > Key: HUDI-1889 > URL: https://issues.apache.org/jira/browse/HUDI-1889 > Project: Apache Hudi > Issue Type: Bug >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > > Partition path in a nested field is not supported in HoodieFileIndex. When > using a nested field for the partition path, the following exception is > thrown: > {code:java} > java.lang.IllegalArgumentException: Cannot find column: 'fare.currency' in > the > schema[StructField(_row_key,StringType,true),StructField(timestamp,LongType,true),StructField(name,StringType,true),StructField(fare,StructType(StructField(value,LongType,true), > StructField(currency,StringType,true)),true)] > at > org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) > at > org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) > at scala.collection.MapLike$class.getOrElse(MapLike.scala:128) > at scala.collection.AbstractMap.getOrElse(Map.scala:59) > at > org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:98) > at > org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:97) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) > at > scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) > at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) > at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) > at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) > at > org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties$lzycompute(HoodieFileIndex.scala:97) > at > 
org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties(HoodieFileIndex.scala:91) > at > org.apache.hudi.HoodieFileIndex.loadPartitionPathFiles(HoodieFileIndex.scala:245) > at org.apache.hudi.HoodieFileIndex.refresh0(HoodieFileIndex.scala:147) > at org.apache.hudi.HoodieFileIndex.(HoodieFileIndex.scala:116) > at > org.apache.hudi.TestHoodieRowWriting.testRowWriting(TestHoodieRowWriting.scala:103) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.junit.platform.commons.util.ReflectionUtils.invokeMethod(ReflectionUtils.java:688) > at > org.junit.jupiter.engine.execution.MethodInvocation.proceed(MethodInvocation.java:60) > at > org.junit.jupiter.engine.execution.InvocationInterceptorChain$ValidatingInvocation.proceed(InvocationInterceptorChain.java:131) > at > org.junit.jupiter.engine.extension.TimeoutExtension.intercept(TimeoutExtension.java:149) > at > org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestableMethod(TimeoutExtension.java:140) > at > org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestTemplateMethod(TimeoutExtension.java:92) > at > org.junit.jupiter.engine.execution.ExecutableInvoker$ReflectiveInterceptorCall.lambda$ofVoidMethod$0(ExecutableInvoker.java:115) > at > org.junit.jupiter.engine.execution.ExecutableInvoker.lambda$invoke$0(ExecutableInvoker.java:105) > at > org.junit.jupiter.engine.execution.InvocationInterceptorChain$InterceptedInvocation.proceed(InvocationInterceptorChain.java:106) > at > org.junit.jupiter.engine.execution.InvocationInterceptorChain.proceed(InvocationInterceptorChain.java:64) > at > org.junit.jupiter.engine.execution.InvocationInterceptorChain.chainAndInvoke(InvocationInterceptorChain.java:45) > at > 
org.junit.jupiter.engine.execution.InvocationInterceptorChain.invoke(InvocationInterceptorChain.java:37) > at > org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:104) > at > org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:98) > at > org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.lambda$invokeTestMethod$6(TestMethodTestDescriptor.java:212) > at > org.junit.platform.engine.support.hierarchical.ThrowableCollector.execute(ThrowableCollector.java:73) > at >
[jira] [Updated] (HUDI-1889) Support partition path in a nested field in HoodieFileIndex
[ https://issues.apache.org/jira/browse/HUDI-1889?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-1889: Description: Partition path in a nested field is not supported in HoodieFileIndex. When using a nested field for the partition path, the following exception is thrown: {code:java} java.lang.IllegalArgumentException: Cannot find column: 'fare.currency' in the schema[StructField(_row_key,StringType,true),StructField(timestamp,LongType,true),StructField(name,StringType,true),StructField(fare,StructType(StructField(value,LongType,true), StructField(currency,StringType,true)),true)] at org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) at org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) at scala.collection.MapLike$class.getOrElse(MapLike.scala:128) at scala.collection.AbstractMap.getOrElse(Map.scala:59) at org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:98) at org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:97) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties$lzycompute(HoodieFileIndex.scala:97) at org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties(HoodieFileIndex.scala:91) at org.apache.hudi.HoodieFileIndex.loadPartitionPathFiles(HoodieFileIndex.scala:245) at org.apache.hudi.HoodieFileIndex.refresh0(HoodieFileIndex.scala:147) at org.apache.hudi.HoodieFileIndex.(HoodieFileIndex.scala:116) at 
org.apache.hudi.TestHoodieRowWriting.testRowWriting(TestHoodieRowWriting.scala:103) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.junit.platform.commons.util.ReflectionUtils.invokeMethod(ReflectionUtils.java:688) at org.junit.jupiter.engine.execution.MethodInvocation.proceed(MethodInvocation.java:60) at org.junit.jupiter.engine.execution.InvocationInterceptorChain$ValidatingInvocation.proceed(InvocationInterceptorChain.java:131) at org.junit.jupiter.engine.extension.TimeoutExtension.intercept(TimeoutExtension.java:149) at org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestableMethod(TimeoutExtension.java:140) at org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestTemplateMethod(TimeoutExtension.java:92) at org.junit.jupiter.engine.execution.ExecutableInvoker$ReflectiveInterceptorCall.lambda$ofVoidMethod$0(ExecutableInvoker.java:115) at org.junit.jupiter.engine.execution.ExecutableInvoker.lambda$invoke$0(ExecutableInvoker.java:105) at org.junit.jupiter.engine.execution.InvocationInterceptorChain$InterceptedInvocation.proceed(InvocationInterceptorChain.java:106) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.proceed(InvocationInterceptorChain.java:64) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.chainAndInvoke(InvocationInterceptorChain.java:45) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.invoke(InvocationInterceptorChain.java:37) at org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:104) at org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:98) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.lambda$invokeTestMethod$6(TestMethodTestDescriptor.java:212) at 
org.junit.platform.engine.support.hierarchical.ThrowableCollector.execute(ThrowableCollector.java:73) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.invokeTestMethod(TestMethodTestDescriptor.java:208) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.execute(TestMethodTestDescriptor.java:137) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.execute(TestMethodTestDescriptor.java:71) at org.junit.platform.engine.support.hierarchical.NodeTestTask.lambda$executeRecursively$5(NodeTestTask.java:139) at
[jira] [Updated] (HUDI-1889) Support partition path in a nested field in HoodieFileIndex
[ https://issues.apache.org/jira/browse/HUDI-1889?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-1889: Description: When using a nested field for the partition path, the following exception is thrown: {code:java} java.lang.IllegalArgumentException: Cannot find column: 'fare.currency' in the schema[StructField(_row_key,StringType,true),StructField(timestamp,LongType,true),StructField(name,StringType,true),StructField(fare,StructType(StructField(value,LongType,true), StructField(currency,StringType,true)),true)] at org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) at org.apache.hudi.HoodieFileIndex$$anonfun$4$$anonfun$apply$1.apply(HoodieFileIndex.scala:98) at scala.collection.MapLike$class.getOrElse(MapLike.scala:128) at scala.collection.AbstractMap.getOrElse(Map.scala:59) at org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:98) at org.apache.hudi.HoodieFileIndex$$anonfun$4.apply(HoodieFileIndex.scala:97) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties$lzycompute(HoodieFileIndex.scala:97) at org.apache.hudi.HoodieFileIndex._partitionSchemaFromProperties(HoodieFileIndex.scala:91) at org.apache.hudi.HoodieFileIndex.loadPartitionPathFiles(HoodieFileIndex.scala:245) at org.apache.hudi.HoodieFileIndex.refresh0(HoodieFileIndex.scala:147) at org.apache.hudi.HoodieFileIndex.(HoodieFileIndex.scala:116) at org.apache.hudi.TestHoodieRowWriting.testRowWriting(TestHoodieRowWriting.scala:103) at 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.junit.platform.commons.util.ReflectionUtils.invokeMethod(ReflectionUtils.java:688) at org.junit.jupiter.engine.execution.MethodInvocation.proceed(MethodInvocation.java:60) at org.junit.jupiter.engine.execution.InvocationInterceptorChain$ValidatingInvocation.proceed(InvocationInterceptorChain.java:131) at org.junit.jupiter.engine.extension.TimeoutExtension.intercept(TimeoutExtension.java:149) at org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestableMethod(TimeoutExtension.java:140) at org.junit.jupiter.engine.extension.TimeoutExtension.interceptTestTemplateMethod(TimeoutExtension.java:92) at org.junit.jupiter.engine.execution.ExecutableInvoker$ReflectiveInterceptorCall.lambda$ofVoidMethod$0(ExecutableInvoker.java:115) at org.junit.jupiter.engine.execution.ExecutableInvoker.lambda$invoke$0(ExecutableInvoker.java:105) at org.junit.jupiter.engine.execution.InvocationInterceptorChain$InterceptedInvocation.proceed(InvocationInterceptorChain.java:106) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.proceed(InvocationInterceptorChain.java:64) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.chainAndInvoke(InvocationInterceptorChain.java:45) at org.junit.jupiter.engine.execution.InvocationInterceptorChain.invoke(InvocationInterceptorChain.java:37) at org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:104) at org.junit.jupiter.engine.execution.ExecutableInvoker.invoke(ExecutableInvoker.java:98) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.lambda$invokeTestMethod$6(TestMethodTestDescriptor.java:212) at 
org.junit.platform.engine.support.hierarchical.ThrowableCollector.execute(ThrowableCollector.java:73) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.invokeTestMethod(TestMethodTestDescriptor.java:208) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.execute(TestMethodTestDescriptor.java:137) at org.junit.jupiter.engine.descriptor.TestMethodTestDescriptor.execute(TestMethodTestDescriptor.java:71) at org.junit.platform.engine.support.hierarchical.NodeTestTask.lambda$executeRecursively$5(NodeTestTask.java:139) at org.junit.platform.engine.support.hierarchical.ThrowableCollector.execute(ThrowableCollector.java:73) at