[ 
https://issues.apache.org/jira/browse/BEAM-6179?focusedWorklogId=175562&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-175562
 ]

ASF GitHub Bot logged work on BEAM-6179:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 14/Dec/18 19:36
            Start Date: 14/Dec/18 19:36
    Worklog Time Spent: 10m 
      Work Description: angoenka closed pull request #7280: [BEAM-6179] Fixing 
bundle estimation when all xs are same
URL: https://github.com/apache/beam/pull/7280
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/sdks/python/apache_beam/transforms/util.py 
b/sdks/python/apache_beam/transforms/util.py
index 38839f57842d..046be0b417c2 100644
--- a/sdks/python/apache_beam/transforms/util.py
+++ b/sdks/python/apache_beam/transforms/util.py
@@ -281,6 +281,9 @@ def linear_regression_no_numpy(xs, ys):
     n = float(len(xs))
     xbar = sum(xs) / n
     ybar = sum(ys) / n
+    if [xs[0]] * len(xs) == xs:
+      # Simply use the mean if all values in xs are same.
+      return 0, ybar/xbar
     b = (sum([(x - xbar) * (y - ybar) for x, y in zip(xs, ys)])
          / sum([(x - xbar)**2 for x in xs]))
     a = ybar - b * xbar
@@ -291,13 +294,16 @@ def linear_regression_numpy(xs, ys):
     # pylint: disable=wrong-import-order, wrong-import-position
     import numpy as np
     from numpy import sum
+    n = len(xs)
+    if [xs[0]] * n == xs:
+      # If all values of xs are same then fallback to 
linear_regression_no_numpy
+      return _BatchSizeEstimator.linear_regression_no_numpy(xs, ys)
     xs = np.asarray(xs, dtype=float)
     ys = np.asarray(ys, dtype=float)
 
     # First do a simple least squares fit for y = a + bx over all points.
     b, a = np.polyfit(xs, ys, 1)
 
-    n = len(xs)
     if n < 10:
       return a, b
     else:
diff --git a/sdks/python/apache_beam/transforms/util_test.py 
b/sdks/python/apache_beam/transforms/util_test.py
index e592f938e175..f0296c0a0f12 100644
--- a/sdks/python/apache_beam/transforms/util_test.py
+++ b/sdks/python/apache_beam/transforms/util_test.py
@@ -18,6 +18,7 @@
 """Unit tests for the transform.util classes."""
 
 from __future__ import absolute_import
+from __future__ import division
 
 import logging
 import random
@@ -160,6 +161,13 @@ def _run_regression_test(self, linear_regression_fn, 
test_outliers):
     self.assertAlmostEqual(a, 5, delta=0.01)
     self.assertAlmostEqual(b, 7, delta=0.01)
 
+    # Test repeated xs
+    xs = [1 + random.random()] * 100
+    ys = [7 * x + 5 + 0.01 * random.random() for x in xs]
+    a, b = linear_regression_fn(xs, ys)
+    self.assertAlmostEqual(a, 0, delta=0.01)
+    self.assertAlmostEqual(b, sum(ys)/(len(ys) * xs[0]), delta=0.01)
+
     if test_outliers:
       xs = [1 + random.random() for _ in range(100)]
       ys = [2*x + 1 for x in xs]


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 175562)
    Time Spent: 2h  (was: 1h 50m)

> Batch size estimation failing
> -----------------------------
>
>                 Key: BEAM-6179
>                 URL: https://issues.apache.org/jira/browse/BEAM-6179
>             Project: Beam
>          Issue Type: Bug
>          Components: runner-flink, sdk-py-harness
>            Reporter: Ankur Goenka
>            Assignee: Ankur Goenka
>            Priority: Major
>          Time Spent: 2h
>  Remaining Estimate: 0h
>
> Batch size estimation is failing on flink when running 13MB input pipeline 
> with error
> ValueError: On entry to DLASCL parameter number 4 had an illegal value 
> java.util.concurrent.ExecutionException: java.lang.RuntimeException: Error 
> received from SDK harness for instruction 48: Traceback (most recent call 
> last):
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py",
>  line 135, in _execute
>     response = task()
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py",
>  line 170, in <lambda>
>     self._execute(lambda: worker.do_instruction(work), work)
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py",
>  line 221, in do_instruction
>     request.instruction_id)
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py",
>  line 237, in process_bundle
>     bundle_processor.process_bundle(instruction_id)
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/bundle_processor.py",
>  line 480, in process_bundle
>     ].process_encoded(data.data)
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/runners/worker/bundle_processor.py",
>  line 125, in process_encoded
>     self.output(decoded_value)
>   File "apache_beam/runners/worker/operations.py", line 182, in 
> apache_beam.runners.worker.operations.Operation.output
>     def output(self, windowed_value, output_index=0):
>   File "apache_beam/runners/worker/operations.py", line 183, in 
> apache_beam.runners.worker.operations.Operation.output
>     cython.cast(Receiver, 
> self.receivers[output_index]).receive(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 89, in 
> apache_beam.runners.worker.operations.ConsumerSet.receive
>     cython.cast(Operation, consumer).process(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 497, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     with self.scoped_process_state:
>   File "apache_beam/runners/worker/operations.py", line 498, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     self.dofn_receiver.receive(o)
>   File "apache_beam/runners/common.py", line 680, in 
> apache_beam.runners.common.DoFnRunner.receive
>     self.process(windowed_value)
>   File "apache_beam/runners/common.py", line 686, in 
> apache_beam.runners.common.DoFnRunner.process
>     self._reraise_augmented(exn)
>   File "apache_beam/runners/common.py", line 709, in 
> apache_beam.runners.common.DoFnRunner._reraise_augmented
>     raise
>   File "apache_beam/runners/common.py", line 684, in 
> apache_beam.runners.common.DoFnRunner.process
>     self.do_fn_invoker.invoke_process(windowed_value)
>   File "apache_beam/runners/common.py", line 420, in 
> apache_beam.runners.common.SimpleInvoker.invoke_process
>     output_processor.process_outputs(
>   File "apache_beam/runners/common.py", line 794, in 
> apache_beam.runners.common._OutputProcessor.process_outputs
>     self.main_receivers.receive(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 89, in 
> apache_beam.runners.worker.operations.ConsumerSet.receive
>     cython.cast(Operation, consumer).process(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 497, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     with self.scoped_process_state:
>   File "apache_beam/runners/worker/operations.py", line 498, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     self.dofn_receiver.receive(o)
>   File "apache_beam/runners/common.py", line 680, in 
> apache_beam.runners.common.DoFnRunner.receive
>     self.process(windowed_value)
>   File "apache_beam/runners/common.py", line 686, in 
> apache_beam.runners.common.DoFnRunner.process
>     self._reraise_augmented(exn)
>   File "apache_beam/runners/common.py", line 709, in 
> apache_beam.runners.common.DoFnRunner._reraise_augmented
>     raise
>   File "apache_beam/runners/common.py", line 684, in 
> apache_beam.runners.common.DoFnRunner.process
>     self.do_fn_invoker.invoke_process(windowed_value)
>   File "apache_beam/runners/common.py", line 420, in 
> apache_beam.runners.common.SimpleInvoker.invoke_process
>     output_processor.process_outputs(
>   File "apache_beam/runners/common.py", line 794, in 
> apache_beam.runners.common._OutputProcessor.process_outputs
>     self.main_receivers.receive(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 89, in 
> apache_beam.runners.worker.operations.ConsumerSet.receive
>     cython.cast(Operation, consumer).process(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 497, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     with self.scoped_process_state:
>   File "apache_beam/runners/worker/operations.py", line 498, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     self.dofn_receiver.receive(o)
>   File "apache_beam/runners/common.py", line 680, in 
> apache_beam.runners.common.DoFnRunner.receive
>     self.process(windowed_value)
>   File "apache_beam/runners/common.py", line 686, in 
> apache_beam.runners.common.DoFnRunner.process
>     self._reraise_augmented(exn)
>   File "apache_beam/runners/common.py", line 709, in 
> apache_beam.runners.common.DoFnRunner._reraise_augmented
>     raise
>   File "apache_beam/runners/common.py", line 684, in 
> apache_beam.runners.common.DoFnRunner.process
>     self.do_fn_invoker.invoke_process(windowed_value)
>   File "apache_beam/runners/common.py", line 420, in 
> apache_beam.runners.common.SimpleInvoker.invoke_process
>     output_processor.process_outputs(
>   File "apache_beam/runners/common.py", line 794, in 
> apache_beam.runners.common._OutputProcessor.process_outputs
>     self.main_receivers.receive(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 89, in 
> apache_beam.runners.worker.operations.ConsumerSet.receive
>     cython.cast(Operation, consumer).process(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 497, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     with self.scoped_process_state:
>   File "apache_beam/runners/worker/operations.py", line 498, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     self.dofn_receiver.receive(o)
>   File "apache_beam/runners/common.py", line 680, in 
> apache_beam.runners.common.DoFnRunner.receive
>     self.process(windowed_value)
>   File "apache_beam/runners/common.py", line 686, in 
> apache_beam.runners.common.DoFnRunner.process
>     self._reraise_augmented(exn)
>   File "apache_beam/runners/common.py", line 709, in 
> apache_beam.runners.common.DoFnRunner._reraise_augmented
>     raise
>   File "apache_beam/runners/common.py", line 684, in 
> apache_beam.runners.common.DoFnRunner.process
>     self.do_fn_invoker.invoke_process(windowed_value)
>   File "apache_beam/runners/common.py", line 420, in 
> apache_beam.runners.common.SimpleInvoker.invoke_process
>     output_processor.process_outputs(
>   File "apache_beam/runners/common.py", line 794, in 
> apache_beam.runners.common._OutputProcessor.process_outputs
>     self.main_receivers.receive(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 89, in 
> apache_beam.runners.worker.operations.ConsumerSet.receive
>     cython.cast(Operation, consumer).process(windowed_value)
>   File "apache_beam/runners/worker/operations.py", line 497, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     with self.scoped_process_state:
>   File "apache_beam/runners/worker/operations.py", line 498, in 
> apache_beam.runners.worker.operations.DoOperation.process
>     self.dofn_receiver.receive(o)
>   File "apache_beam/runners/common.py", line 680, in 
> apache_beam.runners.common.DoFnRunner.receive
>     self.process(windowed_value)
>   File "apache_beam/runners/common.py", line 686, in 
> apache_beam.runners.common.DoFnRunner.process
>     self._reraise_augmented(exn)
>   File "apache_beam/runners/common.py", line 724, in 
> apache_beam.runners.common.DoFnRunner._reraise_augmented
>     raise_with_traceback(new_exn)
>   File "apache_beam/runners/common.py", line 684, in 
> apache_beam.runners.common.DoFnRunner.process
>     self.do_fn_invoker.invoke_process(windowed_value)
>   File "apache_beam/runners/common.py", line 420, in 
> apache_beam.runners.common.SimpleInvoker.invoke_process
>     output_processor.process_outputs(
>   File "apache_beam/runners/common.py", line 770, in 
> apache_beam.runners.common._OutputProcessor.process_outputs
>     for result in results:
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line 
> 398, in process
>     self._batch_size = self._batch_size_estimator.next_batch_size()
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line 
> 351, in next_batch_size
>     a, b = self.linear_regression(xs, ys)
>   File 
> "/usr/local/lib/python2.7/site-packages/apache_beam/transforms/util.py", line 
> 321, in linear_regression_numpy
>     b, a = np.polyfit(xs, ys, 1, w=weight)
>   File "/usr/local/lib/python2.7/site-packages/numpy/lib/polynomial.py", line 
> 585, in polyfit
>     c, resids, rank, s = lstsq(lhs, rhs, rcond)
>   File "/usr/local/lib/python2.7/site-packages/numpy/linalg/linalg.py", line 
> 1957, in lstsq
>     0, work, lwork, iwork, 0)
> ValueError: On entry to DLASCL parameter number 4 had an illegal value [while 
> running 
> 'Analyze/RunPhase[0]/BatchAnalyzerInputs/BatchElements/ParDo(_GlobalWindowsBatchingDoFn)']
>       at 
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
>       at 
> java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895)
>       at org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
>       at 
> org.apache.beam.runners.fnexecution.control.SdkHarnessClient$ActiveBundle.close(SdkHarnessClient.java:263)
>       at 
> org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageFunction.$closeResource(FlinkExecutableStageFunction.java:188)
>       at 
> org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageFunction.mapPartition(FlinkExecutableStageFunction.java:188)
>       at 
> org.apache.flink.runtime.operators.MapPartitionDriver.run(MapPartitionDriver.java:103)
>       at org.apache.flink.runtime.operators.BatchTask.run(BatchTask.java:503)
>       at 
> org.apache.flink.runtime.operators.BatchTask.invoke(BatchTask.java:368)
>       at org.apache.flink.runtime.taskmanager.Task.run(Task.java:712)
>       at java.lang.Thread.run(Thread.java:748)
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to