[ https://issues.apache.org/jira/browse/BEAM-6179?focusedWorklogId=175562&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-175562 ]
ASF GitHub Bot logged work on BEAM-6179: ---------------------------------------- Author: ASF GitHub Bot Created on: 14/Dec/18 19:36 Start Date: 14/Dec/18 19:36 Worklog Time Spent: 10m Work Description: angoenka closed pull request #7280: [BEAM-6179] Fixing bundle estimation when all xs are same URL: https://github.com/apache/beam/pull/7280 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/python/apache_beam/transforms/util.py b/sdks/python/apache_beam/transforms/util.py index 38839f57842d..046be0b417c2 100644 --- a/sdks/python/apache_beam/transforms/util.py +++ b/sdks/python/apache_beam/transforms/util.py @@ -281,6 +281,9 @@ def linear_regression_no_numpy(xs, ys): n = float(len(xs)) xbar = sum(xs) / n ybar = sum(ys) / n + if [xs[0]] * len(xs) == xs: + # Simply use the mean if all values in xs are same. + return 0, ybar/xbar b = (sum([(x - xbar) * (y - ybar) for x, y in zip(xs, ys)]) / sum([(x - xbar)**2 for x in xs])) a = ybar - b * xbar @@ -291,13 +294,16 @@ def linear_regression_numpy(xs, ys): # pylint: disable=wrong-import-order, wrong-import-position import numpy as np from numpy import sum + n = len(xs) + if [xs[0]] * n == xs: + # If all values of xs are same then fallback to linear_regression_no_numpy + return _BatchSizeEstimator.linear_regression_no_numpy(xs, ys) xs = np.asarray(xs, dtype=float) ys = np.asarray(ys, dtype=float) # First do a simple least squares fit for y = a + bx over all points. 
b, a = np.polyfit(xs, ys, 1) - n = len(xs) if n < 10: return a, b else: diff --git a/sdks/python/apache_beam/transforms/util_test.py b/sdks/python/apache_beam/transforms/util_test.py index e592f938e175..f0296c0a0f12 100644 --- a/sdks/python/apache_beam/transforms/util_test.py +++ b/sdks/python/apache_beam/transforms/util_test.py @@ -18,6 +18,7 @@ """Unit tests for the transform.util classes.""" from __future__ import absolute_import +from __future__ import division import logging import random @@ -160,6 +161,13 @@ def _run_regression_test(self, linear_regression_fn, test_outliers): self.assertAlmostEqual(a, 5, delta=0.01) self.assertAlmostEqual(b, 7, delta=0.01) + # Test repeated xs + xs = [1 + random.random()] * 100 + ys = [7 * x + 5 + 0.01 * random.random() for x in xs] + a, b = linear_regression_fn(xs, ys) + self.assertAlmostEqual(a, 0, delta=0.01) + self.assertAlmostEqual(b, sum(ys)/(len(ys) * xs[0]), delta=0.01) + if test_outliers: xs = [1 + random.random() for _ in range(100)] ys = [2*x + 1 for x in xs] ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 175562) Time Spent: 2h (was: 1h 50m) > Batch size estimation failing > ----------------------------- > > Key: BEAM-6179 > URL: https://issues.apache.org/jira/browse/BEAM-6179 > Project: Beam > Issue Type: Bug > Components: runner-flink, sdk-py-harness > Reporter: Ankur Goenka > Assignee: Ankur Goenka > Priority: Major > Time Spent: 2h > Remaining Estimate: 0h > > Batch size estimation is failing on flink when running 13MB input pipeline > with error > ValueError: On entry to DLASCL parameter number 4 had an illegal value > java.util.concurrent.ExecutionException: java.lang.RuntimeException: Error > received from SDK harness for instruction 48: Traceback (most recent call > last): > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", > line 135, in _execute > response = task() > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", > line 170, in <lambda> > self._execute(lambda: worker.do_instruction(work), work) > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", > line 221, in do_instruction > request.instruction_id) > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", > line 237, in process_bundle > bundle_processor.process_bundle(instruction_id) > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/bundle_processor.py", > line 480, in process_bundle > ].process_encoded(data.data) > File > "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/bundle_processor.py", > line 125, in process_encoded > self.output(decoded_value) > File "apache_beam/runners/worker/operations.py", line 182, in > apache_beam.runners.worker.operations.Operation.output > def output(self, windowed_value, output_index=0): > File 
"apache_beam/runners/worker/operations.py", line 183, in > apache_beam.runners.worker.operations.Operation.output > cython.cast(Receiver, > self.receivers[output_index]).receive(windowed_value) > File "apache_beam/runners/worker/operations.py", line 89, in > apache_beam.runners.worker.operations.ConsumerSet.receive > cython.cast(Operation, consumer).process(windowed_value) > File "apache_beam/runners/worker/operations.py", line 497, in > apache_beam.runners.worker.operations.DoOperation.process > with self.scoped_process_state: > File "apache_beam/runners/worker/operations.py", line 498, in > apache_beam.runners.worker.operations.DoOperation.process > self.dofn_receiver.receive(o) > File "apache_beam/runners/common.py", line 680, in > apache_beam.runners.common.DoFnRunner.receive > self.process(windowed_value) > File "apache_beam/runners/common.py", line 686, in > apache_beam.runners.common.DoFnRunner.process > self._reraise_augmented(exn) > File "apache_beam/runners/common.py", line 709, in > apache_beam.runners.common.DoFnRunner._reraise_augmented > raise > File "apache_beam/runners/common.py", line 684, in > apache_beam.runners.common.DoFnRunner.process > self.do_fn_invoker.invoke_process(windowed_value) > File "apache_beam/runners/common.py", line 420, in > apache_beam.runners.common.SimpleInvoker.invoke_process > output_processor.process_outputs( > File "apache_beam/runners/common.py", line 794, in > apache_beam.runners.common._OutputProcessor.process_outputs > self.main_receivers.receive(windowed_value) > File "apache_beam/runners/worker/operations.py", line 89, in > apache_beam.runners.worker.operations.ConsumerSet.receive > cython.cast(Operation, consumer).process(windowed_value) > File "apache_beam/runners/worker/operations.py", line 497, in > apache_beam.runners.worker.operations.DoOperation.process > with self.scoped_process_state: > File "apache_beam/runners/worker/operations.py", line 498, in > apache_beam.runners.worker.operations.DoOperation.process 
> self.dofn_receiver.receive(o) > File "apache_beam/runners/common.py", line 680, in > apache_beam.runners.common.DoFnRunner.receive > self.process(windowed_value) > File "apache_beam/runners/common.py", line 686, in > apache_beam.runners.common.DoFnRunner.process > self._reraise_augmented(exn) > File "apache_beam/runners/common.py", line 709, in > apache_beam.runners.common.DoFnRunner._reraise_augmented > raise > File "apache_beam/runners/common.py", line 684, in > apache_beam.runners.common.DoFnRunner.process > self.do_fn_invoker.invoke_process(windowed_value) > File "apache_beam/runners/common.py", line 420, in > apache_beam.runners.common.SimpleInvoker.invoke_process > output_processor.process_outputs( > File "apache_beam/runners/common.py", line 794, in > apache_beam.runners.common._OutputProcessor.process_outputs > self.main_receivers.receive(windowed_value) > File "apache_beam/runners/worker/operations.py", line 89, in > apache_beam.runners.worker.operations.ConsumerSet.receive > cython.cast(Operation, consumer).process(windowed_value) > File "apache_beam/runners/worker/operations.py", line 497, in > apache_beam.runners.worker.operations.DoOperation.process > with self.scoped_process_state: > File "apache_beam/runners/worker/operations.py", line 498, in > apache_beam.runners.worker.operations.DoOperation.process > self.dofn_receiver.receive(o) > File "apache_beam/runners/common.py", line 680, in > apache_beam.runners.common.DoFnRunner.receive > self.process(windowed_value) > File "apache_beam/runners/common.py", line 686, in > apache_beam.runners.common.DoFnRunner.process > self._reraise_augmented(exn) > File "apache_beam/runners/common.py", line 709, in > apache_beam.runners.common.DoFnRunner._reraise_augmented > raise > File "apache_beam/runners/common.py", line 684, in > apache_beam.runners.common.DoFnRunner.process > self.do_fn_invoker.invoke_process(windowed_value) > File "apache_beam/runners/common.py", line 420, in > 
apache_beam.runners.common.SimpleInvoker.invoke_process > output_processor.process_outputs( > File "apache_beam/runners/common.py", line 794, in > apache_beam.runners.common._OutputProcessor.process_outputs > self.main_receivers.receive(windowed_value) > File "apache_beam/runners/worker/operations.py", line 89, in > apache_beam.runners.worker.operations.ConsumerSet.receive > cython.cast(Operation, consumer).process(windowed_value) > File "apache_beam/runners/worker/operations.py", line 497, in > apache_beam.runners.worker.operations.DoOperation.process > with self.scoped_process_state: > File "apache_beam/runners/worker/operations.py", line 498, in > apache_beam.runners.worker.operations.DoOperation.process > self.dofn_receiver.receive(o) > File "apache_beam/runners/common.py", line 680, in > apache_beam.runners.common.DoFnRunner.receive > self.process(windowed_value) > File "apache_beam/runners/common.py", line 686, in > apache_beam.runners.common.DoFnRunner.process > self._reraise_augmented(exn) > File "apache_beam/runners/common.py", line 709, in > apache_beam.runners.common.DoFnRunner._reraise_augmented > raise > File "apache_beam/runners/common.py", line 684, in > apache_beam.runners.common.DoFnRunner.process > self.do_fn_invoker.invoke_process(windowed_value) > File "apache_beam/runners/common.py", line 420, in > apache_beam.runners.common.SimpleInvoker.invoke_process > output_processor.process_outputs( > File "apache_beam/runners/common.py", line 794, in > apache_beam.runners.common._OutputProcessor.process_outputs > self.main_receivers.receive(windowed_value) > File "apache_beam/runners/worker/operations.py", line 89, in > apache_beam.runners.worker.operations.ConsumerSet.receive > cython.cast(Operation, consumer).process(windowed_value) > File "apache_beam/runners/worker/operations.py", line 497, in > apache_beam.runners.worker.operations.DoOperation.process > with self.scoped_process_state: > File "apache_beam/runners/worker/operations.py", line 498, in > 
apache_beam.runners.worker.operations.DoOperation.process > self.dofn_receiver.receive(o) > File "apache_beam/runners/common.py", line 680, in > apache_beam.runners.common.DoFnRunner.receive > self.process(windowed_value) > File "apache_beam/runners/common.py", line 686, in > apache_beam.runners.common.DoFnRunner.process > self._reraise_augmented(exn) > File "apache_beam/runners/common.py", line 724, in > apache_beam.runners.common.DoFnRunner._reraise_augmented > raise_with_traceback(new_exn) > File "apache_beam/runners/common.py", line 684, in > apache_beam.runners.common.DoFnRunner.process > self.do_fn_invoker.invoke_process(windowed_value) > File "apache_beam/runners/common.py", line 420, in > apache_beam.runners.common.SimpleInvoker.invoke_process > output_processor.process_outputs( > File "apache_beam/runners/common.py", line 770, in > apache_beam.runners.common._OutputProcessor.process_outputs > for result in results: > File > "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line > 398, in process > self._batch_size = self._batch_size_estimator.next_batch_size() > File > "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line > 351, in next_batch_size > a, b = self.linear_regression(xs, ys) > File > "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line > 321, in linear_regression_numpy > b, a = np.polyfit(xs, ys, 1, w=weight) > File "/usr/local/lib/python2.7/site-packages/numpy/lib/polynomial.py", line > 585, in polyfit > c, resids, rank, s = lstsq(lhs, rhs, rcond) > File "/usr/local/lib/python2.7/site-packages/numpy/linalg/linalg.py", line > 1957, in lstsq > 0, work, lwork, iwork, 0) > ValueError: On entry to DLASCL parameter number 4 had an illegal value [while > running > 'Analyze/RunPhase[0]/BatchAnalyzerInputs/BatchElements/ParDo(_GlobalWindowsBatchingDoFn)'] > at > java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) > at > 
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895) > at org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57) > at > org.apache.beam.runners.fnexecution.control.SdkHarnessClient$ActiveBundle.close(SdkHarnessClient.java:263) > at > org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageFunction.$closeResource(FlinkExecutableStageFunction.java:188) > at > org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageFunction.mapPartition(FlinkExecutableStageFunction.java:188) > at > org.apache.flink.runtime.operators.MapPartitionDriver.run(MapPartitionDriver.java:103) > at org.apache.flink.runtime.operators.BatchTask.run(BatchTask.java:503) > at > org.apache.flink.runtime.operators.BatchTask.invoke(BatchTask.java:368) > at org.apache.flink.runtime.taskmanager.Task.run(Task.java:712) > at java.lang.Thread.run(Thread.java:748) > -- This message was sent by Atlassian JIRA (v7.6.3#76005)