Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit 668b7bb

Browse files
chamikaramj authored and gildea committed
Updates bigquery source/sink to use executing project by default.
This will be used if a project is not specified with the input/output table schema. Updates direct pipeline runner so that sinks have access to PipelineOptions object similar to sources. This fixes #1 ----Release Notes---- [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=116987206
1 parent 43e4942 commit 668b7bb

4 files changed

Lines changed: 36 additions & 7 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ you can write to.
355355
import google.cloud.dataflow as df
356356
input_table = 'clouddataflow-readonly:samples.weather_stations'
357357
project = 'YOUR-PROJECT'
358-
output_table = '%s:DATASET.TABLENAME' % project
358+
output_table = 'DATASET.TABLENAME'
359359
p = df.Pipeline(argv=['--project', project])
360360
(p
361361
| df.Read('read', df.io.BigQuerySource(input_table))
@@ -379,7 +379,7 @@ of using the whole table.
379379
```python
380380
import google.cloud.dataflow as df
381381
project = 'YOUR-PROJECT'
382-
output_table = '%s:DATASET.TABLENAME' % project
382+
output_table = 'DATASET.TABLENAME'
383383
input_query = 'SELECT month, COUNT(month) AS tornado_count ' \
384384
'FROM [clouddataflow-readonly:samples.weather_stations] ' \
385385
'WHERE tornado=true GROUP BY month'

google/cloud/dataflow/io/bigquery.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,8 +466,13 @@ def __init__(self, source, test_bigquery_client=None):
466466
# getting additional details.
467467
self.schema = None
468468
if self.source.query is None:
469+
# If table schema did not define a project we default to executing
470+
# project.
471+
project_id = self.source.table_reference.projectId
472+
if not project_id:
473+
project_id = self.executing_project
469474
self.query = 'SELECT * FROM [%s:%s.%s];' % (
470-
self.source.table_reference.projectId,
475+
project_id,
471476
self.source.table_reference.datasetId,
472477
self.source.table_reference.tableId)
473478
else:
@@ -505,6 +510,12 @@ def __init__(self, sink, test_bigquery_client=None, buffer_size=None):
505510
self.rows_buffer_flush_threshold = buffer_size or 1000
506511
# Figure out the project, dataset, and table used for the sink.
507512
self.project_id = self.sink.table_reference.projectId
513+
514+
# If table schema did not define a project we default to executing project.
515+
if self.project_id is None and hasattr(sink, 'pipeline_options'):
516+
self.project_id = (
517+
sink.pipeline_options.view_as(GoogleCloudOptions).project)
518+
508519
assert self.project_id is not None
509520

510521
self.dataset_id = self.sink.table_reference.datasetId

google/cloud/dataflow/io/bigquery_test.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from google.cloud.dataflow.internal.json_value import to_json_value
2525
from google.cloud.dataflow.io.bigquery import RowAsDictJsonCoder
2626
from google.cloud.dataflow.io.bigquery import TableRowJsonCoder
27+
from google.cloud.dataflow.utils.options import PipelineOptions
2728

2829
from apitools.base.py.exceptions import HttpError
2930
from apitools.clients import bigquery
@@ -270,6 +271,15 @@ def test_read_from_table_and_multiple_pages(self):
270271
# adjust our expectation below accordingly.
271272
self.assertEqual(actual_rows, expected_rows * 2)
272273

274+
def test_table_schema_without_project(self):
275+
# Reader should pick executing project by default.
276+
source = df.io.BigQuerySource(table='mydataset.mytable')
277+
options = PipelineOptions(flags=['--project', 'myproject'])
278+
source.pipeline_options = options
279+
reader = source.reader()
280+
self.assertEquals('SELECT * FROM [myproject:mydataset.mytable];',
281+
reader.query)
282+
273283

274284
class TestBigQueryWriter(unittest.TestCase):
275285

@@ -427,6 +437,13 @@ def test_rows_are_written(self):
427437
tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
428438
rows=expected_rows)))
429439

440+
def test_table_schema_without_project(self):
441+
# Writer should pick executing project by default.
442+
sink = df.io.BigQuerySink(table='mydataset.mytable')
443+
options = PipelineOptions(flags=['--project', 'myproject'])
444+
sink.pipeline_options = options
445+
writer = sink.writer()
446+
self.assertEquals('myproject', writer.project_id)
430447

431448
if __name__ == '__main__':
432449
logging.getLogger().setLevel(logging.INFO)

google/cloud/dataflow/runners/direct_runner.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,8 @@ def run_Flatten(self, transform_node):
189189

190190
@skip_if_cached
191191
def run_Read(self, transform_node):
192-
# TODO(chamikara) Implement a more generic way for passing PipelineOption
193-
# to sources when using DirectRunner.
192+
# TODO(chamikara) Implement a more generic way for passing PipelineOptions
193+
# to sources and sinks when using DirectRunner.
194194
source = transform_node.transform.source
195195
source.pipeline_options = transform_node.inputs[0].pipeline.options
196196
with source.reader() as reader:
@@ -199,7 +199,8 @@ def run_Read(self, transform_node):
199199

200200
@skip_if_cached
201201
def run__NativeWrite(self, transform_node):
202-
transform = transform_node.transform
203-
with transform.sink.writer() as writer:
202+
sink = transform_node.transform.sink
203+
sink.pipeline_options = transform_node.inputs[0].pipeline.options
204+
with sink.writer() as writer:
204205
for v in self._cache.get_pvalue(transform_node.inputs[0]):
205206
writer.Write(v.value)

0 commit comments

Comments (0)