|
18 | 18 |
|
19 | 19 | import logging |
20 | 20 | import random |
| 21 | +import sys |
21 | 22 | import time |
22 | 23 | import traceback |
23 | 24 |
|
@@ -76,9 +77,13 @@ class StreamingWorker(object): |
76 | 77 |
|
77 | 78 | # Maximum size of the result of a GetWork request. |
78 | 79 | MAX_GET_WORK_FETCH_BYTES = 64 << 20 # 64m |
| 80 | + |
79 | 81 | # Maximum number of items to return in a GetWork request. |
80 | 82 | MAX_GET_WORK_ITEMS = 100 |
81 | 83 |
|
| 84 | + # Delay to use before retrying work items locally, in seconds. |
| 85 | + RETRY_LOCALLY_DELAY = 10.0 |
| 86 | + |
82 | 87 | def __init__(self, properties): |
83 | 88 | self.project_id = properties['project_id'] |
84 | 89 | self.job_id = properties['job_id'] |
@@ -142,24 +147,62 @@ def dispatch_loop(self): |
142 | 147 | backoff_seconds = min(1.0, backoff_seconds * 2) |
143 | 148 |
|
144 | 149 | for computation_work in work_response.work: |
145 | | - computation_id = computation_work.computation_id |
146 | | - input_data_watermark = windmillio.windmill_to_harness_timestamp( |
147 | | - computation_work.input_data_watermark) |
148 | | - if computation_id not in self.instruction_map: |
149 | | - self.get_config(computation_id) |
150 | | - map_task_proto = self.instruction_map[computation_id] |
151 | | - for work_item in computation_work.work: |
152 | | - try: |
153 | | - self.process(computation_id, map_task_proto, input_data_watermark, |
154 | | - work_item) |
155 | | - except: |
156 | | - logging.error( |
157 | | - 'Exception while processing work item for computation %r: ' |
158 | | - '%s, %s', computation_id, work_item, traceback.format_exc()) |
159 | | - raise |
160 | | - |
161 | | - def process(self, computation_id, map_task_proto, input_data_watermark, |
162 | | - work_item): |
| 150 | + self.process_computation(computation_work) |
| 151 | + |
def process_computation(self, computation_work):
    """Process every work item of one computation, retrying failures locally.

    Looks up the map task for the computation (calling ``get_config`` first
    when the computation is not yet in ``self.instruction_map``) and runs each
    work item through ``process_work_item``. When a work item raises, the
    failure is logged and passed to ``report_failure``; the item is retried
    after ``RETRY_LOCALLY_DELAY`` seconds for as long as ``report_failure``
    returns True, otherwise processing moves on to the next work item.

    Args:
      computation_work: object exposing ``computation_id``,
        ``input_data_watermark`` and an iterable ``work`` of work items.
    """
    computation_id = computation_work.computation_id
    input_data_watermark = windmillio.windmill_to_harness_timestamp(
        computation_work.input_data_watermark)
    if computation_id not in self.instruction_map:
        self.get_config(computation_id)
    map_task_proto = self.instruction_map[computation_id]

    for work_item in computation_work.work:
        while True:
            try:
                self.process_work_item(computation_id, map_task_proto,
                                       input_data_watermark, work_item)
                break
            # Catch Exception rather than everything: KeyboardInterrupt and
            # SystemExit must propagate so the worker can be stopped instead
            # of being reported as a work item failure and retried.
            except Exception:  # pylint: disable=broad-except
                logging.error(
                    'Exception while processing work item for computation %r: '
                    '%s, %s', computation_id, work_item,
                    traceback.format_exc())

                # Send exception details to Windmill, retry locally if
                # possible.
                retry_locally = self.report_failure(computation_id, work_item,
                                                    sys.exc_info())

                # TODO(ccy): handle token expiration in retry logic.
                # TODO(ccy): handle out-of-memory error in retry logic.
                if not retry_locally:
                    logging.error('Execution of work in computation %s for '
                                  'key %r failed; Windmill indicated to not '
                                  'retry locally.', computation_id,
                                  work_item.key)
                    break

                logging.error('Execution of work in computation %s for key %r '
                              'failed; will retry locally.', computation_id,
                              work_item.key)
                time.sleep(self.RETRY_LOCALLY_DELAY)
| 186 | + |
def report_failure(self, computation_id, work_item, exc_info):
    """Send exception details to Windmill; returns whether to retry locally.

    Args:
      computation_id: id of the computation the failed work item belongs to.
      work_item: the failed work item; its ``key``, ``sharding_key`` and
        ``work_token`` are copied into the report.
      exc_info: ``(type, value, traceback)`` triple as returned by
        ``sys.exc_info()``.

    Returns:
      True when the ReportStats response does not have ``failed`` set, i.e.
      the work item should be retried locally; False otherwise.
    """
    error_type, error_value, error_tb = exc_info

    # Exception summary line(s) first, followed by the formatted stack
    # entries, each with surrounding whitespace removed.
    formatted_entries = (
        traceback.format_exception_only(error_type, error_value) +
        traceback.format_tb(error_tb))
    stack_frames = [entry.strip() for entry in formatted_entries]

    request = windmill_pb2.ReportStatsRequest(
        computation_id=computation_id,
        key=work_item.key,
        sharding_key=work_item.sharding_key,
        work_token=work_item.work_token,
        exceptions=[windmill_pb2.Exception(stack_frames=stack_frames)])
    return not self.windmill.ReportStats(request).failed
| 203 | + |
| 204 | + def process_work_item(self, computation_id, map_task_proto, |
| 205 | + input_data_watermark, work_item): |
163 | 206 | """Process a work item.""" |
164 | 207 | workitem_commit_request = windmill_pb2.WorkItemCommitRequest( |
165 | 208 | key=work_item.key, |
|
0 commit comments