Chris Rytting created BEAM-11098:
------------------------------------

             Summary: Running Apache Beam to distribute the cleaning of a 
dataset in Google Cloud Dataflow
                 Key: BEAM-11098
                 URL: https://issues.apache.org/jira/browse/BEAM-11098
             Project: Beam
          Issue Type: Bug
          Components: sdk-py-core
    Affects Versions: 2.24.0
         Environment: Ubuntu 18.04
Python 3.6
            Reporter: Chris Rytting


I am trying to download C4 via [these 
instructions](https://github.com/google-research/text-to-text-transfer-transformer#c4),
 and 3 hours into my job I get the error below. I can't find any help on Google 
for this error.

 

Traceback (most recent call last):
 File "/usr/local/lib/python3.6/site-packages/dataflow_worker/batchworker.py", 
line 649, in do_work
 work_executor.execute()
 File "/usr/local/lib/python3.6/site-packages/dataflow_worker/executor.py", 
line 179, in execute
 op.start()
 File "dataflow_worker/shuffle_operations.py", line 63, in 
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
 File "dataflow_worker/shuffle_operations.py", line 64, in 
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
 File "dataflow_worker/shuffle_operations.py", line 79, in 
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
 File "dataflow_worker/shuffle_operations.py", line 80, in 
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
 File "dataflow_worker/shuffle_operations.py", line 84, in 
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
 File "apache_beam/runners/worker/operations.py", line 332, in 
apache_beam.runners.worker.operations.Operation.output
 File "apache_beam/runners/worker/operations.py", line 195, in 
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
 File "dataflow_worker/shuffle_operations.py", line 261, in 
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
 File "dataflow_worker/shuffle_operations.py", line 268, in 
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
 File "apache_beam/runners/worker/operations.py", line 332, in 
apache_beam.runners.worker.operations.Operation.output
 File "apache_beam/runners/worker/operations.py", line 195, in 
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
 File "apache_beam/runners/worker/operations.py", line 670, in 
apache_beam.runners.worker.operations.DoOperation.process
 File "apache_beam/runners/worker/operations.py", line 671, in 
apache_beam.runners.worker.operations.DoOperation.process
 File "apache_beam/runners/common.py", line 1215, in 
apache_beam.runners.common.DoFnRunner.process
 File "apache_beam/runners/common.py", line 1279, in 
apache_beam.runners.common.DoFnRunner._reraise_augmented
 File "apache_beam/runners/common.py", line 1213, in 
apache_beam.runners.common.DoFnRunner.process
 File "apache_beam/runners/common.py", line 569, in 
apache_beam.runners.common.SimpleInvoker.invoke_process
 File "apache_beam/runners/common.py", line 1371, in 
apache_beam.runners.common._OutputProcessor.process_outputs
 File "apache_beam/runners/worker/operations.py", line 195, in 
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
 File "apache_beam/runners/worker/operations.py", line 670, in 
apache_beam.runners.worker.operations.DoOperation.process
 File "apache_beam/runners/worker/operations.py", line 671, in 
apache_beam.runners.worker.operations.DoOperation.process
 File "apache_beam/runners/common.py", line 1215, in 
apache_beam.runners.common.DoFnRunner.process
 File "apache_beam/runners/common.py", line 1294, in 
apache_beam.runners.common.DoFnRunner._reraise_augmented
 File "/usr/local/lib/python3.6/site-packages/future/utils/__init__.py", line 
446, in raise_with_traceback
 raise exc.with_traceback(traceback)
 File "apache_beam/runners/common.py", line 1213, in 
apache_beam.runners.common.DoFnRunner.process
 File "apache_beam/runners/common.py", line 570, in 
apache_beam.runners.common.SimpleInvoker.invoke_process
 File 
"/mnt/pccfs/backed_up/crytting/persuasion/createc4/lib/python3.6/site-packages/apache_beam/transforms/core.py",
 line 815, in <lambda>
 self.process = lambda element: fn(element)
TypeError: clean_page() got an unexpected keyword argument 'badwords_regex' 
[while running 'clean_pages']



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to