Repository: beam
Updated Branches:
  refs/heads/master 650ee8ea6 -> 3961ce46c


Do not depend on message id in DataflowRunner

This field is deprecated and is causing messages to be repeated. Hash
each message instead to avoid printing duplicate messages.


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/bcde998f
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/bcde998f
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/bcde998f

Branch: refs/heads/master
Commit: bcde998f9fff72d59d74107db317de7b20a9f003
Parents: 650ee8e
Author: Ahmet Altay <[email protected]>
Authored: Tue Apr 25 18:42:03 2017 -0700
Committer: Ahmet Altay <[email protected]>
Committed: Wed Apr 26 18:06:03 2017 -0700

----------------------------------------------------------------------
 .../apache_beam/runners/dataflow/dataflow_runner.py    | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/bcde998f/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
index 4534895..05f6833 100644
--- a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
+++ b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
@@ -76,7 +76,7 @@ class DataflowRunner(PipelineRunner):
   def poll_for_job_completion(runner, result):
     """Polls for the specified job to finish running (successfully or not)."""
     last_message_time = None
-    last_message_id = None
+    last_message_hash = None
 
     last_error_rank = float('-inf')
     last_error_msg = None
@@ -126,19 +126,20 @@ class DataflowRunner(PipelineRunner):
         messages, page_token = runner.dataflow_client.list_messages(
             job_id, page_token=page_token, start_time=last_message_time)
         for m in messages:
-          if last_message_id is not None and m.id == last_message_id:
+          message = '%s: %s: %s' % (m.time, m.messageImportance, m.messageText)
+          m_hash = hash(message)
+
+          if last_message_hash is not None and m_hash == last_message_hash:
             # Skip the first message if it is the last message we got in the
             # previous round. This can happen because we use the
             # last_message_time as a parameter of the query for new messages.
             continue
           last_message_time = m.time
-          last_message_id = m.id
+          last_message_hash = m_hash
           # Skip empty messages.
           if m.messageImportance is None:
             continue
-          logging.info(
-              '%s: %s: %s: %s', m.id, m.time, m.messageImportance,
-              m.messageText)
+          logging.info(message)
           if str(m.messageImportance) == 'JOB_MESSAGE_ERROR':
             if rank_error(m.messageText) >= last_error_rank:
               last_error_rank = rank_error(m.messageText)

Reply via email to