1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- """A minimalist word-counting workflow."""
15+ """A minimalist word-counting workflow that counts words in Shakespeare.
16+
17+ This is the first in a series of successively more detailed 'word count'
18+ examples.
19+
20+ Next, see the wordcount pipeline, then the wordcount_debugging pipeline, for
21+ more detailed examples that introduce additional concepts.
22+
23+ Concepts:
24+
25+ 1. Reading data from text files
26+ 2. Specifying 'inline' transforms
27+ 3. Counting a PCollection
28+ 4. Writing data to Cloud Storage as text files
29+
30+ To execute this pipeline locally, first edit the code to specify the output
31+ location. The output location can be a local file path or an output prefix
32+ on GCS. (Only update the output location marked with the first CHANGE comment.)
33+
34+ To execute this pipeline remotely, first edit the code to set your project ID,
35+ runner type, the staging location, the temp location, and the output location.
36+ The specified GCS bucket(s) must already exist. (Update all the places marked
37+ with a CHANGE comment.)
38+
39+ Then, run the pipeline as described in the README. It will be deployed and run
40+ using the Google Cloud Dataflow Service. No args are required to run the
41+ pipeline. You can see the results in your output bucket in the GCS browser.
42+ """
1643
1744from __future__ import absolute_import
1845
@@ -33,10 +60,28 @@ def run(argv=None):
3360 help = 'Input file to process.' )
3461 parser .add_argument ('--output' ,
3562 dest = 'output' ,
36- required = True ,
63+ # CHANGE 1/5: The Google Cloud Storage path is required
64+ # for outputting the results.
65+ default = 'gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX' ,
3766 help = 'Output file to write results to.' )
3867 known_args , pipeline_args = parser .parse_known_args (argv )
3968
69+ pipeline_args .extend ([
70+ # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
71+ # run your pipeline on the Google Cloud Dataflow Service.
72+ '--runner=DirectPipelineRunner' ,
73+     # CHANGE 3/5: Your project ID is required to run your pipeline on the
74+     # Google Cloud Dataflow Service.
75+ '--project=SET_YOUR_PROJECT_ID_HERE' ,
76+ # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
77+ # files.
78+ '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY' ,
79+ # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
80+ # files.
81+ '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY' ,
82+ '--job_name=your-wordcount-job' ,
83+ ])
84+
4085 p = df .Pipeline (argv = pipeline_args )
4186
4287 # Read the text file[pattern] into a PCollection.
0 commit comments