|
43 | 43 | from google.cloud.dataflow.worker import maptask |
44 | 44 | from google.cloud.dataflow.worker import opcounters |
45 | 45 | from google.cloud.dataflow.worker import shuffle |
| 46 | +from google.cloud.dataflow.worker import sideinputs |
46 | 47 |
|
47 | 48 |
|
48 | 49 | class ReceiverSet(object): |
@@ -201,14 +202,6 @@ def start(self): |
201 | 202 | windowed_value = GlobalWindows.WindowedValue(value) |
202 | 203 | self.output(windowed_value) |
203 | 204 |
|
204 | | - def side_read_all(self, singleton=False): |
205 | | - # TODO(mairbek): Should we return WindowedValue here? |
206 | | - with self.spec.source.reader() as reader: |
207 | | - for value in reader: |
208 | | - yield value |
209 | | - if singleton: |
210 | | - return |
211 | | - |
212 | 205 | def request_dynamic_split(self, dynamic_split_request): |
213 | 206 | if self._reader is not None: |
214 | 207 | return self._reader.request_dynamic_split(dynamic_split_request) |
@@ -423,36 +416,46 @@ def _read_side_inputs(self, tags_and_types): |
423 | 416 | # specification. This can happen for instance if the source has been |
424 | 417 | # sharded into several files. |
425 | 418 | for side_tag, view_class, view_options in tags_and_types: |
426 | | - # Note that currently, the implementation of Iterable and List views |
427 | | - # are identical. This may change in the future once we allow very large |
428 | | - # side input collections. |
429 | | - is_singleton = view_class == pvalue.SingletonPCollectionView |
| 419 | + sources = [] |
430 | 420 | # Using the side_tag in the lambda below will trigger a pylint warning. |
431 | 421 | # However in this case it is fine because the lambda is used right away |
432 | 422 | # while the variable has the value assigned by the current iteration of |
433 | 423 | # the for loop. |
434 | 424 | # pylint: disable=cell-var-from-loop |
435 | | - results = [] |
436 | 425 | for si in itertools.ifilter( |
437 | 426 | lambda o: o.tag == side_tag, self.spec.side_inputs): |
438 | | - if isinstance(si, maptask.WorkerSideInputSource): |
439 | | - op = ReadOperation(si, self.counter_factory) |
440 | | - else: |
| 427 | + if not isinstance(si, maptask.WorkerSideInputSource): |
441 | 428 | raise NotImplementedError('Unknown side input type: %r' % si) |
442 | | - for v in op.side_read_all(singleton=is_singleton): |
443 | | - results.append(v) |
444 | | - if is_singleton: |
445 | | - break |
446 | | - if is_singleton: |
| 429 | + sources.append(si.source) |
| 430 | + iterator_fn = sideinputs.get_iterator_fn_for_sources(sources) |
| 431 | + |
| 432 | + if view_class == pvalue.SingletonPCollectionView: |
447 | 433 | has_default, default = view_options |
448 | | - if results: |
449 | | - yield results[0] |
| 434 | + has_result = False |
| 435 | + result = None |
| 436 | + for v in iterator_fn(): |
| 437 | + has_result = True |
| 438 | + result = v |
| 439 | + break |
| 440 | + if has_result: |
| 441 | + yield result |
450 | 442 | elif has_default: |
451 | 443 | yield default |
452 | 444 | else: |
453 | 445 | yield EmptySideInput() |
| 446 | + elif view_class == pvalue.IterablePCollectionView: |
| 447 | + yield sideinputs.EmulatedIterable(iterator_fn) |
| 448 | + elif view_class == pvalue.ListPCollectionView: |
| 449 | + # TODO(ccy): this is not yet suitable for lists that do not fit in |
| 450 | + # memory on a single machine. |
| 451 | + yield list(iterator_fn()) |
| 452 | + elif view_class == pvalue.DictPCollectionView: |
| 453 | + # TODO(ccy): this is not yet suitable for dictionaries that do not fit |
| 454 | + # in memory on a single machine. |
| 455 | + yield dict(iterator_fn()) |
454 | 456 | else: |
455 | | - yield results |
| 457 | + raise NotImplementedError('Unknown PCollectionView type: %s' % |
| 458 | + view_class) |
456 | 459 |
|
457 | 460 | def start(self): |
458 | 461 | super(DoOperation, self).start() |
|
0 commit comments