Several fixes related to schema specified when creating a BigQuery sink.

chamikaramj · aaltay · commit b2816c6cdedc · 2016-03-24T13:49:40.000-07:00
(1) Fixes a bug that prevented repeated fields from working properly for DataflowRunner. (2) Updates documentation of 'schema' parameter. Removes text that says that schema can be specified as a dictionary since we do not actually support that. Clarifies the limitations of specifying a schema as a string and recommends using 'bigquery.TableSchema' if the schema is complicated. (3) Adds a cookbook example that demonstrates how to build a 'bigquery.TableSchema' object with nested and repeated tables and how to write to a table created using that schema. ----Release Notes---- Fixes a bug that required type of record fields to be specified in all caps when using DataflowRunner. [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=118003947
diff --git a/google/cloud/dataflow/examples/cookbook/bigquery_schema.py b/google/cloud/dataflow/examples/cookbook/bigquery_schema.py
@@ -0,0 +1,127 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A workflow that writes to a BigQuery table with nested and repeated fields.
+
+Demonstrates how to build a bigquery.TableSchema object with nested and repeated
+fields. Also, shows how to generate data to be written to a BigQuery table with
+nested and repeated fields.
+"""
+
+from __future__ import absolute_import
+
+import argparse
+import logging
+
+import google.cloud.dataflow as df
+
+
+def run(argv=None):
+  """Build and run a pipeline that writes nested/repeated rows to BigQuery.
+
+  Args:
+    argv: Command-line arguments. '--output' (required) names the destination
+      table; arguments the parser does not recognize are handed to
+      df.Pipeline.
+  """
+  arg_parser = argparse.ArgumentParser()
+  arg_parser.add_argument(
+      '--output',
+      required=True,
+      help=('Output BigQuery table for results specified as: '
+            'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
+  known_args, pipeline_args = arg_parser.parse_known_args(argv)
+
+  pipeline = df.Pipeline(argv=pipeline_args)
+
+  from apitools.clients import bigquery  # pylint: disable=g-import-not-at-top
+
+  def make_field(name, field_type, mode, subfields=()):
+    """Returns a TableFieldSchema with the given name, type, mode, children."""
+    field = bigquery.TableFieldSchema()
+    field.name = name
+    field.type = field_type
+    field.mode = mode
+    for subfield in subfields:
+      field.fields.append(subfield)
+    return field
+
+  # The schema is assembled field by field, in the order the columns should
+  # appear in the table.
+  table_schema = bigquery.TableSchema()
+  schema_fields = (
+      # Fields that use standard types.
+      make_field('kind', 'string', 'nullable'),
+      make_field('fullName', 'string', 'required'),
+      make_field('age', 'integer', 'nullable'),
+      make_field('gender', 'string', 'nullable'),
+      # A nested field: a record that carries its own sub-fields.
+      make_field(
+          'phoneNumber', 'record', 'nullable',
+          subfields=(
+              make_field('areaCode', 'integer', 'nullable'),
+              make_field('number', 'integer', 'nullable'),
+          )),
+      # A repeated field.
+      make_field('children', 'string', 'repeated'),
+  )
+  for schema_field in schema_fields:
+    table_schema.fields.append(schema_field)
+
+  def make_record(record_id):
+    """Returns one row (a dict matching table_schema) derived from record_id."""
+    numeric_id = int(record_id)
+    return {
+        'kind': 'kind' + record_id,
+        'fullName': 'fullName' + record_id,
+        'age': numeric_id * 10,
+        'gender': 'male',
+        'phoneNumber': {
+            'areaCode': numeric_id * 100,
+            'number': numeric_id * 100000,
+        },
+        'children': ['child' + record_id + suffix
+                     for suffix in ('1', '2', '3')],
+    }
+
+  # IDs -> rows -> BigQuery table, expressed as a single chain. The sink is
+  # built inline so transforms are constructed in the same order as they are
+  # applied.
+  # pylint: disable=expression-not-assigned
+  (pipeline
+   | df.Create('CreateIDs', ['1', '2', '3', '4', '5'])
+   | df.Map('CreateRecords', make_record)
+   | df.io.Write(
+       'write',
+       df.io.BigQuerySink(
+           known_args.output,
+           schema=table_schema,
+           create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
+           write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))
+
+  # Run the pipeline (all operations are deferred until run() is called).
+  pipeline.run()
+
+
+if __name__ == '__main__':
+  # Raise the root logger to INFO before starting the example workflow.
+  root_logger = logging.getLogger()
+  root_logger.setLevel(logging.INFO)
+  run()
diff --git a/google/cloud/dataflow/io/bigquery.py b/google/cloud/dataflow/io/bigquery.py
@@ -347,9 +347,13 @@ def __init__(self, table, dataset=None, project=None, schema=None,
         reference is specified entirely by the table argument.
       project: The ID of the project containing this table or null if the table
         reference is specified entirely by the table argument.
-      schema: A bigquery.TableSchema instance or a dictionary associating field
-        names with types. Possible types are: STRING, INTEGER, FLOAT, BOOLEAN,
-        TIMESTAMP, RECORD (e.g., {'month': 'STRING', 'count': 'INTEGER'}).
+      schema: The schema to be used if the BigQuery table to write has to be
+        created. This can be either specified as a 'bigquery.TableSchema' object
+        or a single string of the form 'field1:type1,field2:type2,field3:type3'
+        that defines a comma separated list of fields. Here 'type' should
+        specify the BigQuery type of the field. Single string based schemas do
+        not support nested fields, repeated fields, or specifying a BigQuery
+        mode for fields (mode will always be set to 'NULLABLE').
       create_disposition: A string describing what happens if the table does not
         exist. Possible values are:
         - BigQueryDisposition.CREATE_IF_NEEDED: create if does not exist.
@@ -417,7 +421,7 @@ def schema_list_as_object(schema_list):
           fs['description'] = f.description
         if f.mode is not None:
           fs['mode'] = f.mode
-        if f.type == 'RECORD':
+        if f.type.lower() == 'record':
           fs['fields'] = schema_list_as_object(f.fields)
         fields.append(fs)
       return fields