Chris Rytting created BEAM-11098:
------------------------------------
Summary: Running Apache Beam to distribute the cleaning of a
dataset in Google Cloud Dataflow
Key: BEAM-11098
URL: https://issues.apache.org/jira/browse/BEAM-11098
Project: Beam
Issue Type: Bug
Components: sdk-py-core
Affects Versions: 2.24.0
Environment: Ubuntu 18.04
Python 3.6
Reporter: Chris Rytting
Trying to download C4 via [these
instructions](https://github.com/google-research/text-to-text-transfer-transformer#c4)
and 3 hours into my job I get the error below. I can't find any help on Google
for this error.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/dataflow_worker/batchworker.py",
line 649, in do_work
work_executor.execute()
File "/usr/local/lib/python3.6/site-packages/dataflow_worker/executor.py",
line 179, in execute
op.start()
File "dataflow_worker/shuffle_operations.py", line 63, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 64, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 79, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 80, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 84, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "apache_beam/runners/worker/operations.py", line 332, in
apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "dataflow_worker/shuffle_operations.py", line 261, in
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "dataflow_worker/shuffle_operations.py", line 268, in
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "apache_beam/runners/worker/operations.py", line 332, in
apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in
apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in
apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1215, in
apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1279, in
apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1213, in
apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 569, in
apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1371, in
apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 195, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in
apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in
apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1215, in
apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1294, in
apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.6/site-packages/future/utils/__init__.py", line
446, in raise_with_traceback
raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 1213, in
apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 570, in
apache_beam.runners.common.SimpleInvoker.invoke_process
File
"/mnt/pccfs/backed_up/crytting/persuasion/createc4/lib/python3.6/site-packages/apache_beam/transforms/core.py",
line 815, in <lambda>
self.process = lambda element: fn(element)
TypeError: clean_page() got an unexpected keyword argument 'badwords_regex'
[while running 'clean_pages']
--
This message was sent by Atlassian Jira
(v8.3.4#803005)