1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- """A minimalist word-counting workflow."""
15+ """A minimalist word-counting workflow that counts words in Shakespeare.
16+
17+ This is the first in a series of successively more detailed 'word count'
18+ examples.
19+
20+ Next, see the wordcount pipeline, then the wordcount_debugging pipeline, for
21+ more detailed examples that introduce additional concepts.
22+
23+ Concepts:
24+
25+ 1. Reading data from text files
26+ 2. Specifying 'inline' transforms
27+ 3. Counting a PCollection
28+ 4. Writing data to Cloud Storage as text files
29+
30+ To execute this pipeline locally, first edit the code to specify the output
31+ location. The output location can be a local file path or an output prefix
32+ on GCS. (Only update the output location marked with the first CHANGE comment.)
33+
34+ To execute this pipeline remotely, first edit the code to set your project ID,
35+ runner type, the staging location, the temp location, and the output location.
36+ The specified GCS bucket(s) must already exist. (Update all the places marked
37+ with a CHANGE comment.)
38+
39+ Then, run the pipeline as described in the README. It will be deployed and run
40+ using the Google Cloud Dataflow Service. No args are required to run the
41+ pipeline. You can see the results in your output bucket in the GCS browser.
42+ """
1643
1744from __future__ import absolute_import
1845
@@ -33,10 +60,28 @@ def run(argv=None):
3360 help = 'Input file to process.' )
3461 parser .add_argument ('--output' ,
3562 dest = 'output' ,
36- required = True ,
63+ # CHANGE 1/5: The Google Cloud Storage path is required
64+ # for outputting the results.
65+ default = 'gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX' ,
3766 help = 'Output file to write results to.' )
3867 known_args , pipeline_args = parser .parse_known_args (argv )
3968
69+ pipeline_args .extend ([
70+ # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
71+ # run your pipeline on the Google Cloud Dataflow Service.
72+ '--runner=DirectPipelineRunner' ,
73+     # CHANGE 3/5: Your project ID is required to run your pipeline on the
74+     # Google Cloud Dataflow Service.
75+ '--project=SET_YOUR_PROJECT_ID_HERE' ,
76+ # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
77+ # files.
78+ '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY' ,
79+ # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
80+ # files.
81+ '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY' ,
82+ '--job_name=your-wordcount-job' ,
83+ ])
84+
4085 p = df .Pipeline (argv = pipeline_args )
4186
4287 # Read the text file[pattern] into a PCollection.
0 commit comments