Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit c586bac

Browse files
committed
Use shelve as a disk backed dictionary optionally in PValueCache
A new DirectRunner-based DiskCachedPipelineRunner is introduced. Shelve will automatically spill dictionary entries to disk, reducing the memory requirement. For small pipelines the performance impact is minimal, as shelve keeps an in-memory cache of recently used objects. For large pipelines that require multiple PCollection objects to be in the cache at the same time, it prevents OOMs; however, it incurs a performance cost for such pipelines because of disk IO. The memory requirement of this new runner is capped by the single ptransform in the pipeline that consumes the largest total input (input + side inputs in aggregate). ----Release Notes---- [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=123441597
1 parent f84b9d9 commit c586bac

4 files changed

Lines changed: 99 additions & 18 deletions

File tree

google/cloud/dataflow/pipeline_test.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from google.cloud.dataflow.pipeline import PipelineVisitor
2525
from google.cloud.dataflow.pvalue import AsIter
2626
from google.cloud.dataflow.pvalue import SideOutputValue
27-
from google.cloud.dataflow.runners import DirectPipelineRunner
2827
from google.cloud.dataflow.transforms import CombinePerKey
2928
from google.cloud.dataflow.transforms import Create
3029
from google.cloud.dataflow.transforms import FlatMap
@@ -62,6 +61,9 @@ def reader(self):
6261

6362
class PipelineTest(unittest.TestCase):
6463

64+
def setUp(self):
65+
self.runner_name = 'DirectPipelineRunner'
66+
6567
@staticmethod
6668
def custom_callable(pcoll):
6769
return pcoll | FlatMap('+1', lambda x: [x + 1])
@@ -92,7 +94,7 @@ def leave_composite_transform(self, transform_node):
9294
self.leave_composite.append(transform_node)
9395

9496
def test_create(self):
95-
pipeline = Pipeline('DirectPipelineRunner')
97+
pipeline = Pipeline(self.runner_name)
9698
pcoll = pipeline | Create('label1', [1, 2, 3])
9799
assert_that(pcoll, equal_to([1, 2, 3]))
98100

@@ -103,20 +105,19 @@ def test_create(self):
103105
pipeline.run()
104106

105107
def test_create_singleton_pcollection(self):
106-
pipeline = Pipeline(DirectPipelineRunner())
108+
pipeline = Pipeline(self.runner_name)
107109
pcoll = pipeline | Create('label', [[1, 2, 3]])
108110
assert_that(pcoll, equal_to([[1, 2, 3]]))
109111
pipeline.run()
110112

111113
def test_read(self):
112-
pipeline = Pipeline('DirectPipelineRunner')
114+
pipeline = Pipeline(self.runner_name)
113115
pcoll = pipeline | Read('read', FakeSource([1, 2, 3]))
114116
assert_that(pcoll, equal_to([1, 2, 3]))
115117
pipeline.run()
116118

117119
def test_visit_entire_graph(self):
118-
119-
pipeline = Pipeline(DirectPipelineRunner())
120+
pipeline = Pipeline(self.runner_name)
120121
pcoll1 = pipeline | Create('pcoll', [1, 2, 3])
121122
pcoll2 = pcoll1 | FlatMap('do1', lambda x: [x + 1])
122123
pcoll3 = pcoll2 | FlatMap('do2', lambda x: [x + 1])
@@ -135,14 +136,14 @@ def test_visit_entire_graph(self):
135136
self.assertEqual(visitor.leave_composite[0].transform, transform)
136137

137138
def test_apply_custom_transform(self):
138-
pipeline = Pipeline(DirectPipelineRunner())
139+
pipeline = Pipeline(self.runner_name)
139140
pcoll = pipeline | Create('pcoll', [1, 2, 3])
140141
result = pcoll | PipelineTest.CustomTransform()
141142
assert_that(result, equal_to([2, 3, 4]))
142143
pipeline.run()
143144

144145
def test_reuse_custom_transform_instance(self):
145-
pipeline = Pipeline(DirectPipelineRunner())
146+
pipeline = Pipeline(self.runner_name)
146147
pcoll1 = pipeline | Create('pcoll1', [1, 2, 3])
147148
pcoll2 = pipeline | Create('pcoll2', [4, 5, 6])
148149
transform = PipelineTest.CustomTransform()
@@ -157,7 +158,7 @@ def test_reuse_custom_transform_instance(self):
157158
'transform.clone("NEW LABEL").')
158159

159160
def test_reuse_cloned_custom_transform_instance(self):
160-
pipeline = Pipeline(DirectPipelineRunner())
161+
pipeline = Pipeline(self.runner_name)
161162
pcoll1 = pipeline | Create('pcoll1', [1, 2, 3])
162163
pcoll2 = pipeline | Create('pcoll2', [4, 5, 6])
163164
transform = PipelineTest.CustomTransform()
@@ -168,7 +169,7 @@ def test_reuse_cloned_custom_transform_instance(self):
168169
pipeline.run()
169170

170171
def test_apply_custom_callable(self):
171-
pipeline = Pipeline('DirectPipelineRunner')
172+
pipeline = Pipeline(self.runner_name)
172173
pcoll = pipeline | Create('pcoll', [1, 2, 3])
173174
result = pipeline.apply(PipelineTest.custom_callable, pcoll)
174175
assert_that(result, equal_to([2, 3, 4]))
@@ -249,6 +250,20 @@ def test_eager_pipeline(self):
249250
self.assertEqual([1, 4, 9], p | Create([1, 2, 3]) | Map(lambda x: x*x))
250251

251252

253+
class DiskCachedRunnerPipelineTest(PipelineTest):
254+
255+
def setUp(self):
256+
self.runner_name = 'DiskCachedPipelineRunner'
257+
258+
def test_cached_pvalues_are_refcounted(self):
259+
# Takes long with disk spilling.
260+
pass
261+
262+
def test_eager_pipeline(self):
263+
# Tests eager runner only
264+
pass
265+
266+
252267
class Bacon(PipelineOptions):
253268

254269
@classmethod

google/cloud/dataflow/runners/direct_runner.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ def __init__(self, cache=None):
7070
self.debug_counters = {}
7171
self.debug_counters['element_counts'] = collections.Counter()
7272

73+
@property
74+
def cache(self):
75+
return self._cache
76+
7377
def get_pvalue(self, pvalue):
7478
"""Gets the PValue's computed value from the runner's cache."""
7579
try:
@@ -285,3 +289,38 @@ def run_transform(self, transform):
285289
if transform not in self._seen_transforms:
286290
self._seen_transforms.add(transform)
287291
super(EagerPipelineRunner, self).run_transform(transform)
292+
293+
294+
class DiskCachedPipelineRunner(DirectPipelineRunner):
295+
"""A DirectPipelineRunner that uses a disk backed cache.
296+
297+
DiskCachedPipelineRunner uses a temporary disk backed cache for running
298+
pipelines. This allows for running pipelines that will require more memory
299+
than is available; however, this comes with a performance cost due to disk
300+
IO.
301+
302+
Memory requirement for DiskCachedPipelineRunner is approximately capped by the
303+
single transform in the pipeline that consumes and outputs the largest total
304+
collection (i.e. inputs, side-inputs and outputs in aggregate). In the extreme
305+
case where a transform uses all previous intermediate values as input,
306+
memory requirements for DiskCachedPipelineRunner will be the same as
307+
DirectPipelineRunner.
308+
"""
309+
310+
def __init__(self):
311+
self._null_cache = ()
312+
super(DiskCachedPipelineRunner, self).__init__(self._null_cache)
313+
314+
def run(self, pipeline):
315+
try:
316+
self._cache = PValueCache(use_disk_backed_cache=True)
317+
return super(DirectPipelineRunner, self).run(pipeline)
318+
finally:
319+
del self._cache
320+
self._cache = self._null_cache
321+
322+
@property
323+
def cache(self):
324+
raise NotImplementedError(
325+
'DiskCachedPipelineRunner does not keep cache outside the scope of its '
326+
'run method.')

google/cloud/dataflow/runners/runner.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
from __future__ import absolute_import
1818

1919
import logging
20+
import os
21+
import shelve
22+
import shutil
23+
import tempfile
2024

2125

2226
def create_runner(runner_name):
@@ -37,6 +41,10 @@ def create_runner(runner_name):
3741
if runner_name == 'DirectPipelineRunner':
3842
import google.cloud.dataflow.runners.direct_runner
3943
return google.cloud.dataflow.runners.direct_runner.DirectPipelineRunner()
44+
if runner_name == 'DiskCachedPipelineRunner':
45+
import google.cloud.dataflow.runners.direct_runner
46+
return google.cloud.dataflow.runners.direct_runner.DiskCachedPipelineRunner(
47+
)
4048
if runner_name == 'EagerPipelineRunner':
4149
import google.cloud.dataflow.runners.direct_runner
4250
return google.cloud.dataflow.runners.direct_runner.EagerPipelineRunner()
@@ -164,17 +172,32 @@ def run_transform(self, transform_node):
164172
class PValueCache(object):
165173
"""Local cache for arbitrary information computed for PValue objects."""
166174

167-
def __init__(self):
175+
def __init__(self, use_disk_backed_cache=False):
168176
# Cache of values computed while a runner executes a pipeline. This is a
169177
# dictionary of PValues and their computed values. Note that in principle
170178
# the runner could contain PValues from several pipelines without clashes
171179
# since a PValue is associated with one and only one pipeline. The keys of
172-
# the dictionary are PValue instance addresses obtained using id().
173-
self._cache = {}
180+
# the dictionary are tuple of PValue instance addresses obtained using id()
181+
# and tag names converted to strings.
182+
183+
self._use_disk_backed_cache = use_disk_backed_cache
184+
if use_disk_backed_cache:
185+
self._tempdir = tempfile.mkdtemp()
186+
self._cache = shelve.open(os.path.join(self._tempdir, 'shelve'))
187+
else:
188+
self._cache = {}
189+
190+
def __del__(self):
191+
if self._use_disk_backed_cache:
192+
self._cache.close()
193+
shutil.rmtree(self._tempdir)
174194

175195
def __len__(self):
176196
return len(self._cache)
177197

198+
def to_cache_key(self, transform, tag):
199+
return str((id(transform), tag))
200+
178201
def _ensure_pvalue_has_real_producer(self, pvalue):
179202
"""Ensure the passed-in PValue has the real_producer attribute.
180203
@@ -201,15 +224,16 @@ def is_cached(self, pobj):
201224
self._ensure_pvalue_has_real_producer(pobj)
202225
transform = pobj.real_producer
203226
tag = pobj.tag
204-
return (id(transform), tag) in self._cache
227+
return self.to_cache_key(transform, tag) in self._cache
205228

206229
def cache_output(self, transform, tag_or_value, value=None):
207230
if value is None:
208231
value = tag_or_value
209232
tag = None
210233
else:
211234
tag = tag_or_value
212-
self._cache[id(transform), tag] = [value, transform.refcounts[tag]]
235+
self._cache[
236+
self.to_cache_key(transform, tag)] = [value, transform.refcounts[tag]]
213237

214238
def get_pvalue(self, pvalue):
215239
"""Gets the value associated with a PValue from the cache."""
@@ -225,7 +249,7 @@ def get_pvalue(self, pvalue):
225249
return value_with_refcount[0]
226250
except KeyError:
227251
if (pvalue.tag is not None
228-
and (id(pvalue.real_producer), None) in self._cache):
252+
and self.to_cache_key(pvalue.real_producer, None) in self._cache):
229253
# This is an undeclared, empty side output of a DoFn executed
230254
# in the local runner before this side output referenced.
231255
return []
@@ -242,7 +266,7 @@ def clear_pvalue(self, pvalue):
242266

243267
def key(self, pobj):
244268
self._ensure_pvalue_has_real_producer(pobj)
245-
return id(pobj.real_producer), pobj.tag
269+
return self.to_cache_key(pobj.real_producer, pobj.tag)
246270

247271

248272
class PipelineState(object):

google/cloud/dataflow/transforms/ptransform.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,8 +412,11 @@ def __ror__(self, left):
412412
if deferred:
413413
return result
414414
else:
415+
# Get a reference to the runners internal cache, otherwise runner may
416+
# clean it after run.
417+
cache = p.runner.cache
415418
p.run()
416-
return _MaterializePValues(p.runner._cache).visit(result)
419+
return _MaterializePValues(cache).visit(result)
417420

418421
def _extract_input_pvalues(self, pvalueish):
419422
"""Extract all the pvalues contained in the input pvalueish.

0 commit comments

Comments
 (0)