Abacn commented on code in PR #17604:
URL: https://github.com/apache/beam/pull/17604#discussion_r881959947
##########
sdks/python/apache_beam/io/avroio.py:
##########
@@ -176,20 +181,70 @@ def __init__(
name and the value being the actual data. If False, it only returns
the data.
"""
- source_from_file = partial(
+ self._source_from_file = partial(
_create_avro_source, min_bundle_size=min_bundle_size)
- self._read_all_files = filebasedsource.ReadAllFiles(
+ self._desired_bundle_size = desired_bundle_size
+ self._min_bundle_size = min_bundle_size
+ self._with_filename = with_filename
+ self.label = label
+
+ def _set_read_all_files(self):
+ """Helper function to set _read_all_files PTransform in constructor."""
+ return filebasedsource.ReadAllFiles(
True,
CompressionTypes.AUTO,
- desired_bundle_size,
- min_bundle_size,
- source_from_file,
- with_filename)
-
- self.label = label
+ self._desired_bundle_size,
+ self._min_bundle_size,
+ self._source_from_file,
+ self._with_filename)
def expand(self, pvalue):
- return pvalue | self.label >> self._read_all_files
+ return pvalue | self.label >> self._set_read_all_files()
+
+
+class ReadAllFromAvroContinuously(ReadAllFromAvro):
+ """A ``PTransform`` for reading avro files in given file patterns.
+ This PTransform acts as a Source and produces continuously a ``PCollection``
+ of strings.
+
+ For more details, see ``ReadAllFromAvro`` for avro parsing settings;
+ see ``apache_beam.io.fileio.MatchContinuously`` for watching settings.
+
+ ReadAllFromAvroContinuously is experimental. No backwards-compatibility
+ guarantees. Due to the limitation on Reshuffle, current implementation does
+ not scale.
+ """
+ def __init__(self, file_pattern, label='ReadAllFilesContinuously', **kwargs):
+ """Initialize the ``ReadAllFromAvroContinuously`` transform.
+
+ Accepts args for constructor args of both ``ReadAllFromAvro`` and
+ ``apache_beam.io.fileio.MatchContinuously``.
+ """
+ kwargs_for_match = {
+ k: v
+ for (k, v) in kwargs.items()
+ if k in filebasedsource.ReadAllFilesContinuously.ARGS_FOR_MATCH
+ }
+ kwargs_for_read = {
+ k: v
+ for (k, v) in kwargs.items()
+ if k not in filebasedsource.ReadAllFilesContinuously.ARGS_FOR_MATCH
Review Comment:
Yeah, I agree it sounds weird. The consideration was to avoid re-declaring the
default arguments of both ReadAllFromAvro and ReadAllFilesContinuously. Another
choice is to pass in a MatchContinuously instance as a parameter to
ReadAllFromAvroContinuously, but that seems to be an anti-pattern for Python
pipeline syntax.
Since ReadAllFromAvroContinuously is merely a combination of
MatchContinuously and ReadAllFromAvro, I think we can avoid adding this new
PTransform to the API at all, and instead just add documentation for the use
case of combining these two transforms to implement the "read all from Avro
continuously" functionality.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]