This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit fff0718

charlesccychen authored and silviulica committed
Report work item exceptions in the streaming worker
----Release Notes----
[]
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=118238822
1 parent a1ca465 commit fff0718

5 files changed: 176 additions & 23 deletions


google/cloud/dataflow/internal/windmill_pb2.py

Lines changed: 1 addition & 0 deletions
@@ -2262,6 +2262,7 @@
 _WATERMARKHOLD.fields_by_name['timestamps'].has_options = True
 _WATERMARKHOLD.fields_by_name['timestamps']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))
 import abc
+import six
 from grpc.beta import implementations as beta_implementations
 from grpc.framework.common import cardinality
 from grpc.framework.interfaces.face import utilities as face_utilities

google/cloud/dataflow/internal/windmill_service_pb2.py

Lines changed: 3 additions & 4 deletions
@@ -46,13 +46,13 @@
 DESCRIPTOR.has_options = True
 DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n5com.google.cloud.dataflow.sdk.runners.worker.windmill'))
 import abc
+import six
 from grpc.beta import implementations as beta_implementations
 from grpc.framework.common import cardinality
 from grpc.framework.interfaces.face import utilities as face_utilities
 
-class BetaCloudWindmillServiceV1Alpha1Servicer(object):
+class BetaCloudWindmillServiceV1Alpha1Servicer(six.with_metaclass(abc.ABCMeta, object)):
   """<fill me in later!>"""
-  __metaclass__ = abc.ABCMeta
   @abc.abstractmethod
   def GetWork(self, request, context):
     raise NotImplementedError()
@@ -69,9 +69,8 @@ def GetConfig(self, request, context):
   def ReportStats(self, request, context):
     raise NotImplementedError()
 
-class BetaCloudWindmillServiceV1Alpha1Stub(object):
+class BetaCloudWindmillServiceV1Alpha1Stub(six.with_metaclass(abc.ABCMeta, object)):
   """The interface to which stubs will conform."""
-  __metaclass__ = abc.ABCMeta
   @abc.abstractmethod
   def GetWork(self, request, timeout):
     raise NotImplementedError()
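Editor's note: the diff above replaces the Python-2-only __metaclass__ attribute with six.with_metaclass, so the generated servicer and stub classes keep abstract-method enforcement under both Python 2 and Python 3. A minimal sketch of the two spellings (the class names OldExampleServicer and NewExampleServicer are illustrative, not from the generated code):

import abc

import six


# Python 2 only: the metaclass is attached via a class attribute and is
# silently ignored on Python 3.
class OldExampleServicer(object):
  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def GetWork(self, request, context):
    raise NotImplementedError()


# Portable: six.with_metaclass builds a temporary base class so the same
# declaration applies the ABCMeta metaclass on Python 2 and Python 3.
class NewExampleServicer(six.with_metaclass(abc.ABCMeta, object)):

  @abc.abstractmethod
  def GetWork(self, request, context):
    raise NotImplementedError()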

google/cloud/dataflow/worker/streamingworker.py

Lines changed: 61 additions & 18 deletions
@@ -18,6 +18,7 @@
 
 import logging
 import random
+import sys
 import time
 import traceback
 
@@ -76,9 +77,13 @@ class StreamingWorker(object):
 
   # Maximum size of the result of a GetWork request.
   MAX_GET_WORK_FETCH_BYTES = 64 << 20  # 64m
+
   # Maximum number of items to return in a GetWork request.
   MAX_GET_WORK_ITEMS = 100
 
+  # Delay to use before retrying work items locally, in seconds.
+  RETRY_LOCALLY_DELAY = 10.0
+
   def __init__(self, properties):
     self.project_id = properties['project_id']
     self.job_id = properties['job_id']
@@ -142,24 +147,62 @@ def dispatch_loop(self):
         backoff_seconds = min(1.0, backoff_seconds * 2)
 
       for computation_work in work_response.work:
-        computation_id = computation_work.computation_id
-        input_data_watermark = windmillio.windmill_to_harness_timestamp(
-            computation_work.input_data_watermark)
-        if computation_id not in self.instruction_map:
-          self.get_config(computation_id)
-        map_task_proto = self.instruction_map[computation_id]
-        for work_item in computation_work.work:
-          try:
-            self.process(computation_id, map_task_proto, input_data_watermark,
-                         work_item)
-          except:
-            logging.error(
-                'Exception while processing work item for computation %r: '
-                '%s, %s', computation_id, work_item, traceback.format_exc())
-            raise
-
-  def process(self, computation_id, map_task_proto, input_data_watermark,
-              work_item):
+        self.process_computation(computation_work)
+
+  def process_computation(self, computation_work):
+    computation_id = computation_work.computation_id
+    input_data_watermark = windmillio.windmill_to_harness_timestamp(
+        computation_work.input_data_watermark)
+    if computation_id not in self.instruction_map:
+      self.get_config(computation_id)
+    map_task_proto = self.instruction_map[computation_id]
+    for work_item in computation_work.work:
+      retry_locally = True
+      while retry_locally:
+        try:
+          self.process_work_item(computation_id, map_task_proto,
+                                 input_data_watermark, work_item)
+          break
+        except:  # pylint: disable=bare-except
+          logging.error(
+              'Exception while processing work item for computation %r: '
+              '%s, %s', computation_id, work_item, traceback.format_exc())
+
+          # Send exception details to Windmill, retry locally if possible.
+          retry_locally = self.report_failure(computation_id, work_item,
+                                              sys.exc_info())
+
+          # TODO(ccy): handle token expiration in retry logic.
+          # TODO(ccy): handle out-of-memory error in retry logic.
+          if retry_locally:
+            logging.error('Execution of work in computation %s for key %r '
                          'failed; will retry locally.', computation_id,
                          work_item.key)
+            time.sleep(StreamingWorker.RETRY_LOCALLY_DELAY)
+          else:
+            logging.error('Execution of work in computation %s for key %r '
                          'failed; Windmill indicated to not retry '
                          'locally.', computation_id, work_item.key)
+
+  def report_failure(self, computation_id, work_item, exc_info):
+    """Send exception details to Windmill; returns whether to retry locally."""
+    exc_type, exc_value, exc_traceback = exc_info
+    messages = list(line.strip() for line in
+                    (traceback.format_exception_only(exc_type,
+                                                     exc_value) +
+                     traceback.format_tb(exc_traceback)))
+    wm_exception = windmill_pb2.Exception(stack_frames=messages)
+    report_stats_request = windmill_pb2.ReportStatsRequest(
+        computation_id=computation_id,
+        key=work_item.key,
+        sharding_key=work_item.sharding_key,
+        work_token=work_item.work_token,
+        exceptions=[wm_exception])
+    response = self.windmill.ReportStats(report_stats_request)
+    return not response.failed
+
+  def process_work_item(self, computation_id, map_task_proto,
+                        input_data_watermark, work_item):
     """Process a work item."""
     workitem_commit_request = windmill_pb2.WorkItemCommitRequest(
         key=work_item.key,
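Editor's note: report_failure above flattens the caught exception into a list of strings for the stack_frames field of windmill_pb2.Exception. A standalone sketch of just that formatting step, using only the standard traceback module (the helper name format_stack_frames is hypothetical; the protobuf and RPC calls are omitted):

import sys
import traceback


def format_stack_frames(exc_info):
  # Mirrors the formatting in report_failure: first the one-line exception
  # summary, then one formatted string per traceback frame.
  exc_type, exc_value, exc_traceback = exc_info
  return [line.strip() for line in
          traceback.format_exception_only(exc_type, exc_value) +
          traceback.format_tb(exc_traceback)]


try:
  {}['missing']
except KeyError:
  frames = format_stack_frames(sys.exc_info())
  # frames[0] is the summary, e.g. "KeyError: 'missing'"; the remaining
  # entries describe the frame where the lookup failed.
  print(frames)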
google/cloud/dataflow/worker/streamingworker_test.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the streaming worker.
+
+These tests check that the streaming worker harness runs properly, with mocked
+interactions with Windmill.
+"""
+
+import logging
+import unittest
+
+
+import mock
+
+from google.cloud.dataflow.internal import windmill_pb2
+from google.cloud.dataflow.worker.streamingworker import StreamingWorker
+
+
+class StreamingWorkerTest(unittest.TestCase):
+
+  @mock.patch(
+      'google.cloud.dataflow.worker.streamingworker.WindmillClient')
+  def _get_worker(self, *unused_mocks):
+    fake_properties = {
+        'project_id': 'fake_project',
+        'job_id': 'fake_job',
+        'worker_id': 'fake_worker',
+        'windmill.host': 'fake_host',
+        'windmill.grpc_port': '12345',
+    }
+    return StreamingWorker(fake_properties)
+
+  def _get_worker_and_single_computation(self):
+    worker = self._get_worker()
+    computation_work = windmill_pb2.ComputationWorkItems(
+        computation_id='A1',
+        work=[windmill_pb2.WorkItem(
+            key='k',
+            work_token=12345)])
+    worker.instruction_map['A1'] = mock.Mock()
+    return worker, computation_work
+
+  @mock.patch('google.cloud.dataflow.worker.streamingworker.StreamingWorker.'
+              'process_work_item')
+  def test_successful_work_item(self, *unused_mocks):
+    worker, computation_work = self._get_worker_and_single_computation()
+    worker.process_computation(computation_work)
+    self.assertEqual(0, len(worker.windmill.ReportStats.call_args_list))
+    self.assertEqual(1, len(worker.process_work_item.call_args_list))
+
+  @mock.patch('google.cloud.dataflow.worker.streamingworker.StreamingWorker.'
+              'process_work_item')
+  @mock.patch('logging.error')
+  def test_failed_work_item(self, *unused_mocks):
+    worker, computation_work = self._get_worker_and_single_computation()
+    worker.windmill.ReportStats.return_value = (
+        windmill_pb2.ReportStatsResponse(failed=True))
+    worker.process_work_item.side_effect = Exception
+
+    worker.process_computation(computation_work)
+
+    # Verify number of attempts and that failed work was reported.
+    self.assertEqual(1, len(worker.windmill.ReportStats.call_args_list))
+    self.assertEqual(1, len(worker.process_work_item.call_args_list))
+    logging.error.assert_called_with(
+        'Execution of work in computation %s for key %r failed; Windmill '
+        'indicated to not retry locally.', u'A1', 'k')
+
+  @mock.patch('google.cloud.dataflow.worker.streamingworker.StreamingWorker.'
+              'process_work_item')
+  @mock.patch('logging.error')
+  @mock.patch('time.sleep')
+  def test_retrying_failed_work_item(self, *unused_mocks):
+    worker, computation_work = self._get_worker_and_single_computation()
+    retries = 5
+    worker.windmill.ReportStats.side_effect = (
+        [windmill_pb2.ReportStatsResponse(failed=False)] * retries)
+    worker.process_work_item.side_effect = (
+        [Exception] * retries + [None])
+
+    worker.process_computation(computation_work)
+
+    # Verify number of attempts and that failed work was reported the correct
+    # number of times.
+    self.assertEqual(retries, len(worker.windmill.ReportStats.call_args_list))
+    self.assertEqual(retries + 1, len(worker.process_work_item.call_args_list))
+    logging.error.assert_called_with(
+        'Execution of work in computation %s for key %r failed; will retry '
+        'locally.', u'A1', 'k')
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  unittest.main()
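Editor's note: the retrying test above drives the failure path with mock's list-valued side_effect, where each call to the mocked method consumes the next entry, raising it if it is an exception and returning it otherwise. A small standalone illustration of that behavior, using the same mock package the test imports (fake_call is an illustrative name, not part of the test):

import mock


# Two calls raise, the third returns None -- the same pattern the test uses
# to make process_work_item fail a fixed number of times and then succeed.
fake_call = mock.Mock(side_effect=[Exception, Exception, None])

for _ in range(2):
  try:
    fake_call()
  except Exception:  # pylint: disable=broad-except
    pass

assert fake_call() is None
assert fake_call.call_count == 3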

setup.cfg

Lines changed: 6 additions & 1 deletion
@@ -5,9 +5,14 @@ verbosity=2
 # Exclude some unit tests because they define command line options. Nose runs
 # tests by loading the corresponding modules in the same process and the
 # side-effect of defining command line options makes other tests fail.
+#
 # TODO(silviuc): Find a way to run the remaining tests excluded here.
+#
 # The following tests are excluded because they try to load the Cython-based
 # fast_coders module which is not available when running unit tests:
 # fast_coders_test, typecoders_test, workitem_test, and executor_test.
-exclude=examples|bigquery_test|ptransform_test|fast_coders_test|typecoders_test|workitem_test|executor_test
+#
+# The streamingworker_test test is excluded because it depends on protobuf and
+# gRPC.
+exclude=examples|bigquery_test|ptransform_test|fast_coders_test|typecoders_test|workitem_test|executor_test|streamingworker_test

0 commit comments
