Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit 5cfe9a5

Browse files
gildea authored and aaltay committed
New class ReceiverSet in the worker
A ReceiverSet holds all the per-receiver information an Operation has. The motivation for introducing it was to have a place to hold the coder for an output (to be introduced later), and it is useful even before that addition as a way to combine receiving operations and the OperationCounter.

----Release Notes----
[]
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=117833057
1 parent c6ec8bb commit 5cfe9a5

4 files changed

Lines changed: 70 additions & 58 deletions

File tree

google/cloud/dataflow/runners/common.pxd

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,9 @@ cdef class DoFnRunner(object):
2020
cdef object window_fn
2121
cdef object context
2222
cdef object tagged_receivers
23-
cdef object tagged_counters
2423
cdef object logger
2524
cdef object step_name
2625

27-
cdef list main_receivers
28-
cdef object main_counters
26+
cdef object main_receivers
2927

3028
cpdef _process_outputs(self, element, results)

google/cloud/dataflow/runners/common.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ def __init__(self,
4848
windowing,
4949
context,
5050
tagged_receivers,
51-
tagged_counters,
5251
logger=None,
5352
step_name=None):
5453
if not args and not kwargs:
@@ -70,13 +69,11 @@ def finish_bundle(self, context):
7069
self.window_fn = windowing.windowfn
7170
self.context = context
7271
self.tagged_receivers = tagged_receivers
73-
self.tagged_counters = tagged_counters
7472
self.logger = logger or FakeLogger()
7573
self.step_name = step_name
7674

7775
# Optimize for the common case.
7876
self.main_receivers = tagged_receivers[None]
79-
self.main_counters = tagged_counters[None]
8077

8178
def start(self):
8279
self.context.set_element(None)
@@ -150,15 +147,10 @@ def _process_outputs(self, element, results):
150147
else:
151148
windowed_value = WindowedValue(
152149
result, element.timestamp, element.windows)
153-
# TODO(robertwb): Should the counters be on the context?
154150
if tag is None:
155-
self.main_counters.update(windowed_value)
156-
for receiver in self.main_receivers:
157-
receiver.process(windowed_value)
151+
self.main_receivers.output(windowed_value)
158152
else:
159-
self.tagged_counters[tag].update(windowed_value)
160-
for receiver in self.tagged_receivers[tag]:
161-
receiver.process(windowed_value)
153+
self.tagged_receivers[tag].output(windowed_value)
162154

163155
class NoContext(WindowFn.AssignContext):
164156
"""An uninspectable WindowFn.AssignContext."""

google/cloud/dataflow/runners/direct_runner.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,19 +114,15 @@ def get_side_input_value(si):
114114
transform.dofn = OutputCheckWrapperDoFn(
115115
transform.dofn, transform_node.full_label)
116116

117-
class NoOpCounters(object):
118-
def update(self, element):
119-
pass
120-
121-
class RecordingReciever(object):
117+
class RecordingReceiverSet(object):
122118
def __init__(self, tag):
123119
self.tag = tag
124-
def process(self, element):
120+
def output(self, element):
125121
results[self.tag].append(element)
126122

127-
class TaggedRecievers(dict):
123+
class TaggedReceivers(dict):
128124
def __missing__(self, key):
129-
return [RecordingReciever(key)]
125+
return RecordingReceiverSet(key)
130126

131127
results = collections.defaultdict(list)
132128
# Some tags may be empty.
@@ -135,8 +131,7 @@ def __missing__(self, key):
135131

136132
runner = DoFnRunner(transform.dofn, transform.args, transform.kwargs,
137133
side_inputs, transform_node.inputs[0].windowing,
138-
context, TaggedRecievers(),
139-
collections.defaultdict(NoOpCounters),
134+
context, TaggedReceivers(),
140135
step_name=transform_node.full_label)
141136
runner.start()
142137
for v in self._cache.get_pvalue(transform_node.inputs[0]):

google/cloud/dataflow/worker/executor.py

Lines changed: 62 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,45 @@
4242
from google.cloud.dataflow.worker import shuffle
4343

4444

45+
class ReceiverSet(object):
46+
"""A ReceiverSet represents a graph edge between two Operation nodes.
47+
48+
The ReceiverSet object collects information from the output of the
49+
Operation at one end of its edge and the input of the Operation at
50+
the other end.
51+
ReceiverSets are attached to the outputting Operation.
52+
"""
53+
54+
def __init__(self, output_index=0):
55+
self.receivers = []
56+
self.opcounter = None
57+
self.output_index = output_index
58+
59+
def start(self, step_name):
60+
self.opcounter = opcounters.OperationCounters(step_name, self.output_index)
61+
62+
def add_receiver(self, receiving_operation):
63+
self.receivers.append(receiving_operation)
64+
65+
def output(self, windowed_value):
66+
self.update_counters(windowed_value)
67+
for receiver in self.receivers:
68+
receiver.process(windowed_value)
69+
70+
def update_counters(self, windowed_value):
71+
if self.opcounter:
72+
self.opcounter.update(windowed_value)
73+
74+
def itercounters(self):
75+
if self.opcounter:
76+
for counter in self.opcounter:
77+
yield counter
78+
79+
def __str__(self):
80+
return '[%s]' % ' '.join([r.str_internal(is_recursive=True)
81+
for r in self.receivers])
82+
83+
4584
class Operation(object):
4685
"""An operation representing the live version of a work item specification.
4786
@@ -58,21 +97,22 @@ def __init__(self, spec):
5897
spec: A maptask.Worker* instance.
5998
"""
6099
self.spec = spec
61-
self.receivers = [[]]
62-
# Initially we have no counters. Initializing this here makes it
63-
# safe to call itercounters() at any time, even if start() has
64-
# not been called yet.
65-
self.counters = []
100+
# Create the ReceiverSet for the default output.
101+
# We need this in several cases:
102+
# A. There may be no receiver explicitly created for an output:
103+
# 1. ParDo without anything following it, executed for side effect.
104+
# 2. Partition, which generates a default output that isn't used.
105+
# B. Write operations want opcounters, even though they have no outputs.
106+
self.receivers = [ReceiverSet()]
66107

67108
def start(self):
68109
"""Start operation."""
69-
# If the operation has receivers, create one counter set per receiver.
70-
self.counters = [opcounters.OperationCounters(self.step_name, output_index)
71-
for output_index in range(len(self.receivers))]
110+
for receiver in self.receivers:
111+
receiver.start(self.step_name)
72112

73113
def itercounters(self):
74-
for opcounter in self.counters:
75-
for counter in opcounter:
114+
for receiver in self.receivers:
115+
for counter in receiver.itercounters():
76116
yield counter
77117

78118
def finish(self):
@@ -84,15 +124,13 @@ def process(self, o):
84124
pass
85125

86126
def output(self, windowed_value, output_index=0):
87-
self.counters[output_index].update(windowed_value)
88-
for receiver in self.receivers[output_index]:
89-
receiver.process(windowed_value)
127+
self.receivers[output_index].output(windowed_value)
90128

91129
def add_receiver(self, operation, output_index=0):
92130
"""Adds a receiver operation for the specified output."""
93131
while len(self.receivers) <= output_index:
94-
self.receivers.append([])
95-
self.receivers[output_index].append(operation)
132+
self.receivers.append(ReceiverSet(len(self.receivers)))
133+
self.receivers[output_index].add_receiver(operation)
96134

97135
def __str__(self):
98136
"""Generates a useful string for this object.
@@ -127,9 +165,7 @@ def str_internal(self, is_recursive=False):
127165

128166
if not is_recursive and getattr(self, 'receivers', []):
129167
printable_fields.append('receivers=[%s]' % ', '.join([
130-
rop.str_internal(is_recursive=True)
131-
for oplist in self.receivers
132-
for rop in oplist]))
168+
str(receiver) for receiver in self.receivers]))
133169

134170
return '<%s %s>' % (printable_name, ', '.join(printable_fields))
135171

@@ -206,7 +242,7 @@ def finish(self):
206242
def process(self, o):
207243
logging.debug('Processing [%s] in %s', o, self)
208244
assert isinstance(o, WindowedValue)
209-
self.counters[0].update(o)
245+
self.receivers[0].update_counters(o)
210246
if self.use_windowed_value:
211247
self.writer.Write(o)
212248
else:
@@ -223,7 +259,7 @@ def __init__(self, spec):
223259
def process(self, o):
224260
logging.debug('Processing [%s] in %s', o, self)
225261
assert isinstance(o, WindowedValue)
226-
self.counters[0].update(o)
262+
self.receivers[0].update_counters(o)
227263
self.spec.output_buffer.append(o.value)
228264

229265

@@ -312,7 +348,7 @@ def finish(self):
312348
def process(self, o):
313349
logging.debug('Processing [%s] in %s', o, self)
314350
assert isinstance(o, WindowedValue)
315-
self.counters[0].update(o)
351+
self.receivers[0].update_counters(o)
316352
# We typically write into shuffle key/value pairs. This is the reason why
317353
# the else branch below expects the value attribute of the WindowedValue
318354
# argument to be a KV pair. However the service may write to shuffle in
@@ -409,7 +445,6 @@ def start(self):
409445
# by the DoFn function to the appropriate receivers. The main output is
410446
# tagged with None and is associated with its corresponding index.
411447
tagged_receivers = {}
412-
tagged_counters = {}
413448
output_tag_prefix = PropertyNames.OUT + '_'
414449
for index, tag in enumerate(self.spec.output_tags):
415450
if tag == PropertyNames.OUT:
@@ -418,19 +453,11 @@ def start(self):
418453
original_tag = tag[len(output_tag_prefix):]
419454
else:
420455
raise ValueError('Unexpected output name for operation: %s' % tag)
421-
# There may be no receiver for this output, in which case the
422-
# lookup will create one, and this value will be processed
423-
# for any side effect. This is desirable. There are two (known)
424-
# cases where there is no receiver for an output:
425-
# 1. ParDo without anything following it, executed for side effect.
426-
# 2. Partition (shows up here in the worker as Flatten), which
427-
# generates a default output that isn't used.
428456
tagged_receivers[original_tag] = self.receivers[index]
429-
tagged_counters[original_tag] = self.counters[index]
430457

431458
self.dofn_runner = common.DoFnRunner(
432459
fn, args, kwargs, self._read_side_inputs(tags_and_types),
433-
window_fn, self.context, tagged_receivers, tagged_counters,
460+
window_fn, self.context, tagged_receivers,
434461
logger, self.step_name)
435462

436463
self.dofn_runner.start()
@@ -773,12 +800,12 @@ def execute(self, map_task, test_shuffle_source=None, test_shuffle_sink=None):
773800

774801
# Add receiver operations to the appropriate producers.
775802
if hasattr(op.spec, 'input'):
776-
producer, index = op.spec.input
777-
self._ops[producer].add_receiver(op, index)
803+
producer, output_index = op.spec.input
804+
self._ops[producer].add_receiver(op, output_index)
778805
# Flatten has 'inputs', not 'input'
779806
if hasattr(op.spec, 'inputs'):
780-
for producer, index in op.spec.inputs:
781-
self._ops[producer].add_receiver(op, index)
807+
for producer, output_index in op.spec.inputs:
808+
self._ops[producer].add_receiver(op, output_index)
782809

783810
# Inject the step names into the operations.
784811
# This is used for logging and assigning names to counters.

0 commit comments

Comments
 (0)