Add reference counting for consumers of AppliedPTransform outputs

silviulica · silviulica · commit 30aac61854d6 · 2016-03-31T17:43:47.000-07:00
This is used by DirectPipelineRunner to delete cached values aggressively after all their respective consumers have used them. Without such a feature the runner can get into out-of-memory situations. ----Release Notes---- Improve memory footprint for DirectPipelineRunner. [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=118410157
diff --git a/google/cloud/dataflow/pipeline.py b/google/cloud/dataflow/pipeline.py
@@ -39,6 +39,7 @@
 
 from __future__ import absolute_import
 
+import collections
 import logging
 import os
 import shutil
@@ -292,6 +293,7 @@ def apply(self, transform, pvalueish=None):
                            'output type-hint was found for the '
                            'PTransform %s' % ptransform_name)
 
+    child.update_input_refcounts()
     self.transforms_stack.pop()
     return pvalueish_result
 
@@ -357,6 +359,26 @@ def __init__(self, parent, transform, full_label, inputs):
     self.outputs = []
     self.parts = []
 
+    # Per tag refcount dictionary for PValues for which this node is a
+    # root producer.
+    self.refcounts = collections.defaultdict(int)
+
+  def update_input_refcounts(self):
+    """Increment refcounts for all transforms providing inputs."""
+
+    def real_producer(pv):
+      real = pv.producer
+      while real.parts:
+        real = real.parts[-1]
+      return real
+
+    if not self.is_composite():
+      for main_input in self.inputs:
+        if not isinstance(main_input, pvalue.PBegin):
+          real_producer(main_input).refcounts[main_input.tag] += 1
+      for side_input in self.side_inputs:
+        real_producer(side_input.pvalue).refcounts[side_input.pvalue.tag] += 1
+
   def add_output(self, output):
     assert (isinstance(output, pvalue.PValue) or
             isinstance(output, pvalue.DoOutputsTuple))
diff --git a/google/cloud/dataflow/pipeline_test.py b/google/cloud/dataflow/pipeline_test.py
@@ -14,15 +14,21 @@
 
 """Unit tests for the Pipeline class."""
 
+import gc
+import logging
 import unittest
 
 from google.cloud.dataflow.io.iobase import Source
 from google.cloud.dataflow.pipeline import Pipeline
 from google.cloud.dataflow.pipeline import PipelineOptions
 from google.cloud.dataflow.pipeline import PipelineVisitor
+from google.cloud.dataflow.pvalue import AsIter
+from google.cloud.dataflow.pvalue import SideOutputValue
 from google.cloud.dataflow.runners import DirectPipelineRunner
+from google.cloud.dataflow.transforms import CombinePerKey
 from google.cloud.dataflow.transforms import Create
 from google.cloud.dataflow.transforms import FlatMap
+from google.cloud.dataflow.transforms import Flatten
 from google.cloud.dataflow.transforms import Map
 from google.cloud.dataflow.transforms import PTransform
 from google.cloud.dataflow.transforms import Read
@@ -194,6 +200,47 @@ def apply(self, pcoll):
         ['a-x', 'b-x', 'c-x'],
         sorted(['a', 'b', 'c'] | AddSuffix('-x')))
 
+  def test_cached_pvalues_are_refcounted(self):
+    """Test that cached PValues are refcounted and deleted.
+
+    The intermediary PValues computed by the workflow below contain
+    one million elements so if the refcounting does not work the number of
+    objects tracked by the garbage collector will increase by a few millions
+    by the time we execute the final Map checking the objects tracked.
+    Anything that is much larger than what we started with will fail the test.
+    """
+    def check_memory(value, count_threshold):
+      gc.collect()
+      objects_count = len(gc.get_objects())
+      if objects_count > count_threshold:
+        raise RuntimeError(
+            'PValues are not refcounted: %s, %s' % (
+                objects_count, count_threshold))
+      return value
+
+    def create_dupes(o, _):
+      yield o
+      yield SideOutputValue('side', o)
+
+    pipeline = Pipeline('DirectPipelineRunner')
+
+    gc.collect()
+    count_threshold = len(gc.get_objects()) + 10000
+    biglist = pipeline | Create('oom:create', ['x'] * 1000000)
+    dupes = (
+        biglist
+        | Map('oom:addone', lambda x: (x, 1))
+        | FlatMap('oom:dupes', create_dupes,
+                  AsIter(biglist)).with_outputs('side', main='main'))
+    result = (
+        (dupes.side, dupes.main, dupes.side)
+        | Flatten('oom:flatten')
+        | CombinePerKey('oom:combine', sum)
+        | Map('oom:check', check_memory, count_threshold))
+
+    assert_that(result, equal_to([('x', 3000000)]))
+    pipeline.run()
+
 
 class Bacon(PipelineOptions):
 
@@ -264,4 +311,5 @@ def test_dir(self):
 
 
 if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
   unittest.main()
diff --git a/google/cloud/dataflow/runners/runner.py b/google/cloud/dataflow/runners/runner.py
@@ -184,35 +184,40 @@ def _ensure_pvalue_has_real_producer(self, pvalue):
     composite transform we need to find the output of its rightmost transform
     part.
     """
-    if not hasattr(pvalue, 'read_producer'):
+    if not hasattr(pvalue, 'real_producer'):
       real_producer = pvalue.producer
       while real_producer.parts:
         real_producer = real_producer.parts[-1]
       pvalue.real_producer = real_producer
 
   def is_cached(self, pobj):
-    # Import here to avoid circular dependencies.
     from google.cloud.dataflow.pipeline import AppliedPTransform
     if isinstance(pobj, AppliedPTransform):
       transform = pobj
+      tag = None
     else:
       self._ensure_pvalue_has_real_producer(pobj)
       transform = pobj.real_producer
-    return (id(transform), None) in self._cache
+      tag = pobj.tag
+    return (id(transform), tag) in self._cache
 
   def cache_output(self, transform, tag_or_value, value=None):
     if value is None:
       value = tag_or_value
       tag = None
     else:
       tag = tag_or_value
-    self._cache[id(transform), tag] = value
+    self._cache[id(transform), tag] = [value, transform.refcounts[tag]]
 
   def get_pvalue(self, pvalue):
     """Gets the value associated with a PValue from the cache."""
     self._ensure_pvalue_has_real_producer(pvalue)
     try:
-      return self._cache[self.key(pvalue)]
+      value_with_refcount = self._cache[self.key(pvalue)]
+      value_with_refcount[1] -= 1
+      if value_with_refcount[1] <= 0:
+        self.clear_pvalue(pvalue)
+      return value_with_refcount[0]
     except KeyError:
       if (pvalue.tag is not None
           and (id(pvalue.real_producer), None) in self._cache):