Abacn commented on code in PR #24317:
URL: https://github.com/apache/beam/pull/24317#discussion_r1029897186
##########
sdks/python/apache_beam/io/gcp/gcsio.py:
##########
@@ -582,32 +585,58 @@ def list_prefix(self, path, with_metadata=False):
If ``with_metadata`` is False: dict of file name -> size; if
``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
"""
+ file_info = {}
+ for file_metadata in self.list_files(path, with_metadata):
+ file_info[file_metadata[0]] = file_metadata[1]
+
+ return file_info
+
+ def list_files(self, path, with_metadata=False):
+ """Lists files matching the prefix.
+
+ Args:
+ path: GCS file path pattern in the form gs://<bucket>/[name].
+ with_metadata: Experimental. Specify whether returns file metadata.
+
+ Returns:
+ If ``with_metadata`` is False: generator of tuple(file name, size); if
+ ``with_metadata`` is True: generator of
+ tuple(file name, tuple(size, timestamp)).
+ """
bucket, prefix = parse_gcs_path(path, object_optional=True)
request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
- file_info = {}
+ file_info = set()
Review Comment:
The dict that was in list_prefix has been changed to a set, which is used
for deduplication, so the generated results are still deduplicated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]