pramodbiligiri commented on code in PR #6665: URL: https://github.com/apache/hudi/pull/6665#discussion_r979612265
########## hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/gcs/GcsIngestionConfig.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.helpers.gcs; + +/** + * Config keys and defaults for GCS Ingestion + */ +public class GcsIngestionConfig { + + /** + * The GCP Project Id where the Pubsub Subscription to ingest from resides. Needed to connect + * to the Pubsub subscription + */ + public static final String GOOGLE_PROJECT_ID = "hoodie.deltastreamer.source.gcs.project.id"; + + /** + * The GCP Pubsub subscription id for the GCS Notifications. Needed to connect to the Pubsub + * subscription. + */ + public static final String PUBSUB_SUBSCRIPTION_ID = "hoodie.deltastreamer.source.gcs.subscription.id"; + + /** + * How many messages to pull from Cloud Pubsub at a time. Also see {@link DEFAULT_BATCH_SIZE}. + */ + public static final String BATCH_SIZE_CONF = "hoodie.deltastreamer.source.gcs.batch.size"; + + /** + * Provide a reasonable setting for default batch size. + * If batch size is too big, two possible issues can happen: + * i) Acknowledgement takes too long (given that Hudi needs to commit first). 
That means Pubsub + * will keep delivering the same message since it wasn't acked in time. + * ii) The size of the request that acks outstanding messages may exceed the limit, + * which is 512KB as per Google's docs. See: https://cloud.google.com/pubsub/quotas#resource_limits + */ + public static final int DEFAULT_BATCH_SIZE = 10; + + // Size of inbound messages when pulling data, in bytes + public static final int DEFAULT_MAX_INBOUND_MESSAGE_SIZE = 20 * 1024 * 1024; // bytes + + /** + * Whether to acknowledge messages or not. Not acknowledging means Pubsub will keep redelivering the + * same messages. In Prod this should always be true. So this is mainly useful during dev and testing. + */ + public static final String ACK_MESSAGES = "hoodie.deltastreamer.source.gcs.ack"; + + /** + * Default value for {@link ACK_MESSAGES} + */ + public static final boolean ACK_MESSAGES_DEFAULT_VALUE = true; + + /** + * Check whether file exists before attempting to pull it + */ + public static final String ENABLE_EXISTS_CHECK = "hoodie.deltastreamer.source.gcsincr.check.file.exists"; Review Comment: I have made a note of this. It will also require changing the documentation, so I may get back to it later. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
