BjornPrime commented on code in PR #25965:
URL: https://github.com/apache/beam/pull/25965#discussion_r1224720792
##########
sdks/python/apache_beam/io/gcp/gcsio.py:
##########
@@ -296,160 +234,87 @@ def delete_batch(self, paths):
"""
if not paths:
return []
-
- paths = iter(paths)
+ if len(paths) > MAX_BATCH_OPERATION_SIZE:
+ raise TooManyRequests("Batch larger than %s", MAX_BATCH_OPERATION_SIZE)
result_statuses = []
- while True:
- paths_chunk = list(islice(paths, MAX_BATCH_OPERATION_SIZE))
- if not paths_chunk:
- return result_statuses
- batch_request = BatchApiRequest(
- batch_url=GCS_BATCH_ENDPOINT,
- retryable_codes=retry.SERVER_ERROR_OR_TIMEOUT_CODES,
- response_encoding='utf-8')
- for path in paths_chunk:
- bucket, object_path = parse_gcs_path(path)
- request = storage.StorageObjectsDeleteRequest(
- bucket=bucket, object=object_path)
- batch_request.Add(self.client.objects, 'Delete', request)
- api_calls = batch_request.Execute(self.client._http) # pylint:
disable=protected-access
- for i, api_call in enumerate(api_calls):
- path = paths_chunk[i]
+ with self.client.batch():
+ for path in paths:
+ bucket_name, blob_path = parse_gcs_path(path)
+ bucket = self.client.get_bucket(bucket_name)
+ blob = storage.Blob(blob_path, bucket)
exception = None
- if api_call.is_error:
- exception = api_call.exception
- # Return success when the file doesn't exist anymore for idempotency.
- if isinstance(exception, HttpError) and exception.status_code == 404:
+ try:
+ blob.delete()
Review Comment:
I considered that, but all the fields on the client library's Batch object are
marked as private, so I wasn't sure we could rely on them in the future.
Are the methods in GcsIO being accessed directly by anyone, or are they only
accessed through GCSFileSystem? If it's the former, I agree that maintaining
their return values is important, but if it's the latter, I don't feel like
that's as high a priority, especially if we can only do that by using
internal variables that aren't guaranteed to remain backwards compatible.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]