[
https://issues.apache.org/jira/browse/BEAM-6522?focusedWorklogId=237809&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-237809
]
ASF GitHub Bot logged work on BEAM-6522:
----------------------------------------
Author: ASF GitHub Bot
Created on: 06/May/19 14:32
Start Date: 06/May/19 14:32
Worklog Time Spent: 10m
Work Description: tvalentyn commented on pull request #8130: [BEAM-6522]
Use fastavro as default Avro implementation for Avro source/sink in Python3
URL: https://github.com/apache/beam/pull/8130#discussion_r281211197
##########
File path: sdks/python/apache_beam/io/avroio.py
##########
@@ -146,6 +153,10 @@ def __init__(self, file_pattern=None, min_bundle_size=0,
validate=True,
use_fastavro (bool); when set, use the `fastavro` library for IO, which
is significantly faster, and will likely become the default
"""
+ if sys.version_info[0] >= 3 and not use_fastavro:
+ warnings.warn("Due to a known issue in avro-python3 package, it is "
+ "recommended to use fastavro with Beam Avro IO on "
Review comment:
Looks like an Avro release might happen soon [1]:
@fredo838, could you please check whether the currently-disabled-on-python3 test
passes if we install:
https://dist.apache.org/repos/dist/dev/avro/avro-1.9.0-rc2/py3/avro-python3-1.9.0.tar.gz
You can pass that file as --extra_package to a Dataflow job.
Thank you.
[1]
https://lists.apache.org/thread.html/ff724cff8fed12ebc8316e0dc1e876f0ca8080ab68b6165b7f109665@%3Cdev.avro.apache.org%3E
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 237809)
Time Spent: 7h (was: 6h 50m)
> Avro RecordSchema class is not picklable
> ----------------------------------------
>
> Key: BEAM-6522
> URL: https://issues.apache.org/jira/browse/BEAM-6522
> Project: Beam
> Issue Type: Sub-task
> Components: sdk-py-core
> Reporter: Robbe
> Assignee: Frederik Bode
> Priority: Major
> Time Spent: 7h
> Remaining Estimate: 0h
>
> The avroio module still has 4 failing tests. These are actually the same
> 2 tests, each run once for Avro and once for Fastavro.
> *apache_beam.io.avroio_test.TestAvro.test_sink_transform*
> *apache_beam.io.avroio_test.TestFastAvro.test_sink_transform*
> fail with:
> {code:java}
> Traceback (most recent call last):
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/io/avroio_test.py",
> line 432, in test_sink_transform
> | avroio.WriteToAvro(path, self.SCHEMA, use_fastavro=self.use_fastavro)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pvalue.py", line
> 112, in __or__
> return self.pipeline.apply(ptransform, self)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pipeline.py", line
> 515, in apply
> pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 193, in apply
> return m(transform, input, options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 199, in apply_PTransform
> return transform.expand(input)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/io/avroio.py", line
> 528, in expand
> return pcoll | beam.io.iobase.Write(self._sink)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pvalue.py", line
> 112, in __or__
> return self.pipeline.apply(ptransform, self)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pipeline.py", line
> 515, in apply
> pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 193, in apply
> return m(transform, input, options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 199, in apply_PTransform
> return transform.expand(input)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/io/iobase.py", line
> 960, in expand
> return pcoll | WriteImpl(self.sink)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pvalue.py", line
> 112, in __or__
> return self.pipeline.apply(ptransform, self)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/pipeline.py", line
> 515, in apply
> pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 193, in apply
> return m(transform, input, options)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/runners/runner.py",
> line 199, in apply_PTransform
> return transform.expand(input)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/io/iobase.py", line
> 979, in expand
> lambda _, sink: sink.initialize_write(), self.sink)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/transforms/core.py",
> line 1103, in Map
> pardo = FlatMap(wrapper, *args, **kwargs)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/transforms/core.py",
> line 1054, in FlatMap
> pardo = ParDo(CallableWrapperDoFn(fn), *args, **kwargs)
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/transforms/core.py",
> line 864, in __init__
> super(ParDo, self).__init__(fn, *args, **kwargs)
> File
> "/home/robbe/workspace/beam/sdks/python/apache_beam/transforms/ptransform.py",
> line 646, in __init__
> self.args = pickler.loads(pickler.dumps(self.args))
> File
> "/home/robbe/workspace/beam/sdks/python/apache_beam/internal/pickler.py",
> line 247, in loads
> return dill.loads(s)
> File
> "/home/robbe/workspace/beam/sdks/python/.eggs/dill-0.2.9-py3.5.egg/dill/_dill.py",
> line 317, in loads
> return load(file, ignore)
> File
> "/home/robbe/workspace/beam/sdks/python/.eggs/dill-0.2.9-py3.5.egg/dill/_dill.py",
> line 305, in load
> obj = pik.load()
> File
> "/home/robbe/workspace/beam/sdks/python/target/.tox/py3/lib/python3.5/site-packages/avro/schema.py",
> line 173, in __setitem__
> % (key, value, self))
> Exception: Attempting to map key 'favorite_color' to value <avro.schema.Field
> object at 0x7f8f72d0d0b8> in ImmutableDict {}
> {code}
>
> *apache_beam.io.avroio_test.TestAvro.test_split_points*
> *apache_beam.io.avroio_test.TestFastAvro.test_split_points*
> fail with:
>
> {code:java}
> Traceback (most recent call last):
> File "/home/robbe/workspace/beam/sdks/python/apache_beam/io/avroio_test.py",
> line 308, in test_split_points
> self.assertEquals(split_points_report[-10:], [(2, 1)] * 10)
> AssertionError: Lists differ: [(10, 1), (10, 1), (10, 1), (10, 1), (10, 1[42
> chars], 1)] != [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2[32 chars], 1)]
> First differing element 0:
> (10, 1)
> (2, 1)
> + [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1),
> (2, 1)]
> - [(10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1),
> - (10, 1)]
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)