Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit 5f104d4

Browse files
robertwb and silviulica
authored and committed
Enable support for all supported counter types
Also implement optimized int64/double sum/min/max/mean and any/all CombineFns. ----Release Notes---- [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=121067639
1 parent 0a52fae commit 5f104d4

12 files changed

Lines changed: 524 additions & 164 deletions

File tree

google/cloud/dataflow/examples/wordcount.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,8 @@
2525

2626
empty_line_aggregator = df.Aggregator('emptyLines')
2727
average_word_size_aggregator = df.Aggregator('averageWordLength',
28-
df.combiners.Mean())
28+
df.combiners.MeanCombineFn(),
29+
float)
2930

3031

3132
class WordExtractingDoFn(df.DoFn):
@@ -47,7 +48,7 @@ def process(self, context):
4748
context.aggregate_to(empty_line_aggregator, 1)
4849
words = re.findall(r'[A-Za-z\']+', text_line)
4950
for w in words:
50-
context.aggregate_to(average_word_size_aggregator, float(len(w)))
51+
context.aggregate_to(average_word_size_aggregator, len(w))
5152
return words
5253

5354

google/cloud/dataflow/internal/apiclient.py

Lines changed: 45 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
from google.cloud.dataflow.internal.auth import get_service_credentials
2727
from google.cloud.dataflow.internal.json_value import to_json_value
2828
from google.cloud.dataflow.io import iobase
29+
from google.cloud.dataflow.transforms import cy_combiners
2930
from google.cloud.dataflow.utils import dependency
3031
from google.cloud.dataflow.utils import retry
3132
from google.cloud.dataflow.utils.names import PropertyNames
@@ -44,7 +45,7 @@
4445
STORAGE_API_SERVICE = 'storage.googleapis.com'
4546

4647

47-
def append_counter(status_object, counter, tentative=False):
48+
def append_counter(status_object, counter, tentative):
4849
"""Appends a counter to the status.
4950
5051
Args:
@@ -55,22 +56,23 @@ def append_counter(status_object, counter, tentative=False):
5556
logging.debug('Appending counter%s %s',
5657
' (tentative)' if tentative else '',
5758
counter)
59+
kind, setter = metric_translations[counter.combine_fn.__class__]
5860
append_metric(
59-
status_object, counter.name, counter.total,
60-
counter.elements if counter.aggregation_kind == counter.MEAN else None,
61-
tentative=tentative)
61+
status_object, counter.name, kind, counter.accumulator,
62+
setter, tentative=tentative)
6263

6364

64-
def append_metric(status_object, metric_name, value1, value2=None,
65+
def append_metric(status_object, metric_name, kind, value, setter=None,
6566
step=None, output_user_name=None, tentative=False,
6667
worker_id=None, cumulative=True):
6768
"""Creates and adds a MetricUpdate field to the passed-in protobuf.
6869
6970
Args:
7071
status_object: a work_item_status to which to add this metric
7172
metric_name: a string naming this metric
72-
value1: scalar for a Sum or mean_sum for a Mean
73-
value2: mean_count for a Mean aggregation (do not provide for a Sum).
73+
kind: dataflow counter kind (e.g. 'sum')
74+
value: accumulator value to encode
75+
setter: if not None, a lambda to use to update metric_update with value
7476
step: the name of the associated step
7577
output_user_name: the user-visible name to use
7678
tentative: whether this should be labeled as a tentative metric
@@ -103,19 +105,13 @@ def append_to_context(key, value):
103105
append_to_context('workerId', worker_id)
104106
if cumulative and is_counter:
105107
metric_update.cumulative = cumulative
106-
if value2 is None:
107-
if is_counter:
108-
# Counters are distinguished by having a kind; metrics do not.
109-
metric_update.kind = 'Sum'
110-
metric_update.scalar = to_json_value(value1, with_type=True)
111-
elif value2 > 0:
112-
metric_update.kind = 'Mean'
113-
metric_update.meanSum = to_json_value(value1, with_type=True)
114-
metric_update.meanCount = to_json_value(value2, with_type=True)
108+
if is_counter:
109+
# Counters are distinguished by having a kind; metrics do not.
110+
metric_update.kind = kind
111+
if setter:
112+
setter(value, metric_update)
115113
else:
116-
# A denominator of 0 will raise an error in the service.
117-
# What it means is we have nothing to report yet, so don't.
118-
pass
114+
metric_update.scalar = to_json_value(value, with_type=True)
119115
logging.debug('Appending metric_update: %s', metric_update)
120116
status_object.metricUpdates.append(metric_update)
121117

@@ -840,3 +836,33 @@ def cloud_position_to_reader_position(cloud_position):
840836
def approximate_progress_to_dynamic_split_request(approximate_progress):
841837
return iobase.DynamicSplitRequest(cloud_progress_to_reader_progress(
842838
approximate_progress))
839+
840+
841+
def set_scalar(accumulator, metric_update):
842+
metric_update.scalar = to_json_value(accumulator.value, with_type=True)
843+
844+
845+
def set_mean(accumulator, metric_update):
846+
if accumulator.count:
847+
metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
848+
metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
849+
else:
850+
# A denominator of 0 will raise an error in the service.
851+
# What it means is we have nothing to report yet, so don't.
852+
metric_update.kind = None
853+
854+
855+
# To enable a counter on the service, add it to this dictionary.
856+
metric_translations = {
857+
cy_combiners.CountCombineFn: ('sum', set_scalar),
858+
cy_combiners.SumInt64Fn: ('sum', set_scalar),
859+
cy_combiners.MinInt64Fn: ('min', set_scalar),
860+
cy_combiners.MaxInt64Fn: ('max', set_scalar),
861+
cy_combiners.MeanInt64Fn: ('mean', set_mean),
862+
cy_combiners.SumFloatFn: ('sum', set_scalar),
863+
cy_combiners.MinFloatFn: ('min', set_scalar),
864+
cy_combiners.MaxFloatFn: ('max', set_scalar),
865+
cy_combiners.MeanFloatFn: ('mean', set_mean),
866+
cy_combiners.AllCombineFn: ('and', set_scalar),
867+
cy_combiners.AnyCombineFn: ('or', set_scalar),
868+
}

google/cloud/dataflow/transforms/aggregator.py

Lines changed: 18 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,7 @@ def process(self, context):
3838

3939
from __future__ import absolute_import
4040

41-
from google.cloud.dataflow.transforms import combiners
42-
from google.cloud.dataflow.utils.counters import Counter
41+
from google.cloud.dataflow.transforms import core
4342

4443

4544
class Aggregator(object):
@@ -49,14 +48,12 @@ class Aggregator(object):
4948
combine_fn: how to combine values input to the aggregation.
5049
It must be one of these arithmetic functions:
5150
52-
- Python's built-in sum
53-
- Python's built-in min
54-
- Python's built-in max
55-
- df.Mean()
51+
- Python's built-in sum, min, max, any, and all.
52+
- df.combiners.MeanCombineFn()
5653
57-
The default is sum.
54+
The default is sum of 64-bit ints.
5855
59-
type: describes the numeric type that will be accepted as input
56+
type: describes the type that will be accepted as input
6057
for aggregation; by default types appropriate to the combine_fn
6158
are accepted.
6259
@@ -67,13 +64,16 @@ class Aggregator(object):
6764
complex_counter = df.Aggregator('other-counter', df.Mean(), float)
6865
"""
6966

70-
def __init__(self,
71-
name,
72-
combine_fn=sum,
73-
input_type=None): # inferred from combine_fn
67+
def __init__(self, name, combine_fn=sum, input_type=int):
68+
combine_fn = core.CombineFn.maybe_from_callable(combine_fn).for_input_type(
69+
input_type)
70+
if not _is_supported_kind(combine_fn):
71+
raise ValueError(
72+
'combine_fn %r (class %r) '
73+
'does not map to a supported aggregation kind'
74+
% (combine_fn, combine_fn.__class__))
7475
self.name = name
7576
self.combine_fn = combine_fn
76-
self.aggregation_kind = self._aggregator_counter_kind(combine_fn)
7777
self.input_type = input_type
7878

7979
def __str__(self):
@@ -98,30 +98,8 @@ def get_name(thing):
9898
combine_call = ' %s%s' % (combine_fn_str, input_arg)
9999
return 'Aggregator %s%s' % (self.name, combine_call)
100100

101-
@staticmethod
102-
def _aggregator_counter_kind(combine_fn):
103-
"""Returns the counter aggregation kind for the combine_fn passed in.
104-
105-
Args:
106-
combine_fn: The combining function used in an Aggregator.
107-
108-
Returns:
109-
The aggregation_kind (to use in a Counter) that matches combine_fn.
110-
111-
Raises:
112-
ValueError if the combine_fn doesn't map to any supported
113-
aggregation kind.
114-
"""
115-
# We don't have combiner types that implement AND or OR.
116-
combine_kind_map = {sum: Counter.SUM, max: Counter.MAX, min: Counter.MIN,
117-
combiners.Mean: Counter.MEAN}
118-
try:
119-
return combine_kind_map[combine_fn]
120-
except KeyError:
121-
try:
122-
return combine_kind_map[combine_fn.__class__]
123-
except KeyError:
124-
raise ValueError(
125-
'combine_fn %r (class %r) '
126-
'does not map to a supported aggregation kind'
127-
% (combine_fn, combine_fn.__class__))
101+
102+
def _is_supported_kind(combine_fn):
103+
# pylint: disable=g-import-not-at-top
104+
from google.cloud.dataflow.internal.apiclient import metric_translations
105+
return combine_fn.__class__ in metric_translations

google/cloud/dataflow/transforms/aggregator_test.py

Lines changed: 38 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616

1717
import unittest
1818

19+
import google.cloud.dataflow as df
1920
from google.cloud.dataflow.transforms import combiners
2021
from google.cloud.dataflow.transforms.aggregator import Aggregator
2122

@@ -24,16 +25,48 @@ class AggregatorTest(unittest.TestCase):
2425

2526
def test_str(self):
2627
basic = Aggregator('a-name')
27-
self.assertEqual('<Aggregator a-name>', str(basic))
28+
self.assertEqual('<Aggregator a-name SumInt64Fn(int)>', str(basic))
2829

2930
for_max = Aggregator('max-name', max)
30-
self.assertEqual('<Aggregator max-name max>', str(for_max))
31+
self.assertEqual('<Aggregator max-name MaxInt64Fn(int)>', str(for_max))
3132

3233
for_float = Aggregator('f-name', sum, float)
33-
self.assertEqual('<Aggregator f-name sum(float)>', str(for_float))
34+
self.assertEqual('<Aggregator f-name SumFloatFn(float)>', str(for_float))
3435

35-
for_mean = Aggregator('m-name', combiners.Mean(), float)
36-
self.assertEqual('<Aggregator m-name Mean(float)>', str(for_mean))
36+
for_mean = Aggregator('m-name', combiners.MeanCombineFn(), float)
37+
self.assertEqual('<Aggregator m-name MeanFloatFn(float)>', str(for_mean))
38+
39+
def test_aggregation(self):
40+
41+
mean = combiners.MeanCombineFn()
42+
mean.__name__ = 'mean'
43+
counter_types = [
44+
(sum, int, 6),
45+
(min, int, 0),
46+
(max, int, 3),
47+
(mean, int, 1),
48+
(sum, float, 6.0),
49+
(min, float, 0.0),
50+
(max, float, 3.0),
51+
(mean, float, 1.5),
52+
(any, int, True),
53+
(all, float, False),
54+
]
55+
aggeregators = [Aggregator('%s_%s' % (f.__name__, t.__name__), f, t)
56+
for f, t, _ in counter_types]
57+
58+
class UpdateAggregators(df.DoFn):
59+
def process(self, context):
60+
for a in aggeregators:
61+
context.aggregate_to(a, context.element)
62+
63+
p = df.Pipeline('DirectPipelineRunner')
64+
p | df.Create([0, 1, 2, 3]) | df.ParDo(UpdateAggregators())
65+
res = p.run()
66+
for (_, _, expected), a in zip(counter_types, aggeregators):
67+
actual = res.aggregated_values(a).values()[0]
68+
self.assertEqual(expected, actual)
69+
self.assertEqual(type(expected), type(actual))
3770

3871

3972
if __name__ == '__main__':

google/cloud/dataflow/transforms/combiners.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
import random
2222

2323
from google.cloud.dataflow.transforms import core
24+
from google.cloud.dataflow.transforms import cy_combiners
2425
from google.cloud.dataflow.transforms import ptransform
2526
from google.cloud.dataflow.typehints import Any
2627
from google.cloud.dataflow.typehints import Dict
@@ -81,6 +82,14 @@ def extract_output(self, (sum_, count)):
8182
return float('NaN')
8283
return sum_ / float(count)
8384

85+
def for_input_type(self, input_type):
86+
if input_type is int:
87+
return cy_combiners.MeanInt64Fn()
88+
elif input_type is float:
89+
return cy_combiners.MeanFloatFn()
90+
else:
91+
return self
92+
8493

8594
class Count(object):
8695
"""Combiners for counting elements."""

google/cloud/dataflow/transforms/core.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -341,6 +341,16 @@ def apply(self, elements, *args, **kwargs):
341341
*args, **kwargs),
342342
*args, **kwargs)
343343

344+
def for_input_type(self, input_type):
345+
"""Returns a specialized implementation of self, if it exists.
346+
347+
Otherwise, returns self.
348+
349+
Args:
350+
input_type: the type of input elements.
351+
"""
352+
return self
353+
344354
@staticmethod
345355
def from_callable(fn):
346356
return CallableWrapperCombineFn(fn)
@@ -431,6 +441,24 @@ def default_type_hints(self):
431441
hints.set_input_types(*input_args, **input_kwargs)
432442
return hints
433443

444+
def for_input_type(self, input_type):
445+
# Avoid circular imports.
446+
from google.cloud.dataflow.transforms import cy_combiners
447+
if self._fn is any:
448+
return cy_combiners.AnyCombineFn()
449+
elif self._fn is all:
450+
return cy_combiners.AllCombineFn()
451+
else:
452+
known_types = {
453+
(sum, int): cy_combiners.SumInt64Fn(),
454+
(min, int): cy_combiners.MinInt64Fn(),
455+
(max, int): cy_combiners.MaxInt64Fn(),
456+
(sum, float): cy_combiners.SumFloatFn(),
457+
(min, float): cy_combiners.MinFloatFn(),
458+
(max, float): cy_combiners.MaxFloatFn(),
459+
}
460+
return known_types.get((self._fn, input_type), self)
461+
434462

435463
class PartitionFn(WithTypeHints):
436464
"""A function object used by a Partition transform.

0 commit comments

Comments (0)