This is an automated email from the ASF dual-hosted git repository. bhavanisudha pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
     new 89b33da0f8 [DOCS] Fix Reliable ingestion from AWS S3 blog for configs (#5986)
89b33da0f8 is described below

commit 89b33da0f8362efedb71765db93b0dac79383036
Author: Bhavani Sudha Saktheeswaran <2179254+bhasu...@users.noreply.github.com>
AuthorDate: Tue Jun 28 10:48:09 2022 -0700

    [DOCS] Fix Reliable ingestion from AWS S3 blog for configs (#5986)
---
 website/blog/2021-08-23-s3-events-source.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/website/blog/2021-08-23-s3-events-source.md b/website/blog/2021-08-23-s3-events-source.md
index e48b89cfd1..541a9c30f0 100644
--- a/website/blog/2021-08-23-s3-events-source.md
+++ b/website/blog/2021-08-23-s3-events-source.md
@@ -79,20 +79,21 @@ spark-submit \
 --hoodie-conf hoodie.datasource.hive_sync.table=s3_meta_table \
 --hoodie-conf hoodie.datasource.hive_sync.partition_fields=bucket \
 --source-class org.apache.hudi.utilities.sources.S3EventsSource \
---hoodie-conf hoodie.deltastreamer.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url
+--hoodie-conf hoodie.deltastreamer.s3.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url
 --hoodie-conf hoodie.deltastreamer.s3.source.queue.region=us-west-2
 
-# To start S3EventsHoodieIncrSource
+# To start S3EventsHoodieIncrSource use following command along with ordering field, record key(s) and
+# partition field(s) from the source s3 data.
 spark-submit \
 --jars "/home/hadoop/hudi-utilities-bundle_2.11-0.9.0.jar,/usr/lib/spark/external/lib/spark-avro.jar,/home/hadoop/aws-java-sdk-sqs-1.12.22.jar" \
 --master yarn --deploy-mode client \
 --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer /home/hadoop/hudi-packages/hudi-utilities-bundle_2.11-0.9.0-SNAPSHOT.jar \
 --table-type COPY_ON_WRITE \
---source-ordering-field eventTime --target-base-path s3://bucket_name/path/for/s3_hudi_table \
+--source-ordering-field <ordering key from source data> --target-base-path s3://bucket_name/path/for/s3_hudi_table \
 --target-table s3_hudi_table --continuous --min-sync-interval-seconds 10 \
---hoodie-conf hoodie.datasource.write.recordkey.field="pull_request_id" \
+--hoodie-conf hoodie.datasource.write.recordkey.field="<record key from source data>" \
 --hoodie-conf hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator \
---hoodie-conf hoodie.datasource.write.partitionpath.field=s3.bucket.name --enable-hive-sync \
+--hoodie-conf hoodie.datasource.write.partitionpath.field=<partition key from source data> --enable-hive-sync \
 --hoodie-conf hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.MultiPartKeysValueExtractor \
 --hoodie-conf hoodie.datasource.write.hive_style_partitioning=true \
 --hoodie-conf hoodie.datasource.hive_sync.database=default \