apache · aIbrahiim · Mar 12, 2026 · May 4, 2026 · May 6, 2026
diff --git a/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml b/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml
@@ -94,6 +94,7 @@ jobs:
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Table_Row_Inference_Batch.txt
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Table_Row_Inference_Stream.txt
+            ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt
       # The env variables are created and populated in the test-arguments-action as "<github.job>_test_arguments_<argument_file_paths_index>"
       - name: get current time
         run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV
@@ -214,3 +215,14 @@ jobs:
             -PpythonVersion=3.10 \
             -PloadTest.requirementsTxtFile=apache_beam/ml/inference/table_row_inference_requirements.txt \
             '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_10 }} --autoscaling_algorithm=THROUGHPUT_BASED --max_num_workers=20 --metrics_table=result_table_row_inference_stream --influx_measurement=result_table_row_inference_stream --mode=streaming --input_subscription=projects/apache-beam-testing/subscriptions/table_row_inference_benchmark --window_size_sec=60 --trigger_interval_sec=30 --timeout_ms=900000 --output_table=apache-beam-testing:beam_run_inference.result_table_row_inference_stream_outputs --job_name=benchmark-tests-table-row-inference-stream-${{env.NOW_UTC}}'
+      - name: run MLTransform Generate Vocab Batch
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        timeout-minutes: 180
+        with:
+          gradle-command: :sdks:python:apache_beam:testing:load_tests:run
+          arguments: |
+            -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.mltransform_generate_vocab_benchmark \
+            -Prunner=DataflowRunner \
+            -PpythonVersion=3.10 \
+            -PloadTest.requirementsTxtFile=apache_beam/ml/transforms/mltransform_tests_requirements.txt \
+            '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_11 }} --job_name=benchmark-tests-mltransform-generate-vocab-batch-${{env.NOW_UTC}}'
diff --git a/...ne-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt b/...ne-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+--project=apache-beam-testing
+--region=us-central1
+--runner=DataflowRunner
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--machine_type=n1-standard-4
+--disk_size_gb=100
+--num_workers=8
+--max_num_workers=16
+--autoscaling_algorithm=THROUGHPUT_BASED
+--worker_zone=us-central1-b
+--sdk_location=container
+--requirements_file=apache_beam/ml/transforms/mltransform_tests_requirements.txt
+--input_options={}
+--publish_to_big_query=true
+--metrics_dataset=beam_run_inference
+--metrics_table=mltransform_generate_vocab_batch
+--influx_measurement=mltransform_generate_vocab_batch
+--input_file=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
+--output_vocab=gs://temp-storage-for-perf-tests/mltransform/vocab_outputs/mltransform_generate_vocab_batch
+--columns=text
+--vocab_size=50000
+--min_frequency=1
+--lowercase=true
+--input_expand_factor=1
+
diff --git a/.test-infra/tools/refresh_looker_metrics.py b/.test-infra/tools/refresh_looker_metrics.py
@@ -44,7 +44,8 @@
     ("85", ["268", "269", "270", "271", "272"]),  # PyTorch Sentiment Batch DistilBERT base uncased
     ("86", ["284", "285", "286", "287", "288"]),  # VLLM Batch Gemma
     ("96", ["270", "304", "305", "353", "354"]),   # Table Row Inference Sklearn Batch
-    ("106", ["355", "356", "357", "358", "359"])   # Table Row Inference Sklearn Streaming
+    ("106", ["355", "356", "357", "358", "359"]),   # Table Row Inference Sklearn Streaming
+    ("107", ["360", "361", "362", "363", "364"]),  # MLTransform Generate Vocab Batch
 ]
 
 def get_look(id: str) -> models.Look:

diff --git a/sdks/python/apache_beam/examples/ml_transform/README.md b/sdks/python/apache_beam/examples/ml_transform/README.md
@@ -0,0 +1,116 @@
+<!--
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# MLTransform Examples
+
+This directory contains Apache Beam examples for MLTransform pipelines.
+
+## MLTransform - Generate Vocab (Batch only)
+
+`mltransform_generate_vocab.py` builds a vocabulary artifact from batch input
+rows using `MLTransform` + `ComputeAndApplyVocabulary`.
+
+### What it does
+
+1. Reads input rows from JSONL (`--input_file`) or BigQuery (`--input_table`).
+2. Extracts specified columns (`--columns`).
+3. Normalizes and combines text values (trims whitespace, optionally lowercases).
+4. Runs `ComputeAndApplyVocabulary` with top-k and min-frequency constraints
+   using space-delimited token splitting.
+5. Writes the vocabulary as one token per line.
+
+### Required arguments
+
+- `--output_vocab`
+- `--columns`
+- One of the following input sources:
+  - `--input_file`
+  - `--input_table`
+
+### Optional arguments
+
+- `--vocab_size` (default: `50000`)
+- `--min_frequency` (default: `1`)
+- `--lowercase` (default: `true`)
+- `--input_expand_factor` (default: `1`, useful for perf/load testing)
+
+### Local batch example
+
+```sh
+python -m apache_beam.examples.ml_transform.mltransform_generate_vocab \
+  --input_file=/tmp/input.jsonl \
+  --output_vocab=/tmp/vocab.txt \
+  --columns=text,category \
+  --vocab_size=5 \
+  --min_frequency=1 \
+  --lowercase=true \
+  --input_expand_factor=1 \
+  --runner=DirectRunner
+```
+
+### Input format
+
+JSONL input with object rows, for example:
+
+```json
+{"id":"1","text":"Beam beam ML pipeline"}
+{"id":"2","text":"Beam pipeline dataflow"}
+{"id":"3","text":"ML transform beam"}
+{"id":"4","text":"vocab vocab vocab test"}
+{"id":"5","text":"rare_token_once"}
+{"id":"6","text":""}
+{"id":"7","text":null}
+```
+
+The integration tests in `mltransform_generate_vocab_test.py` generate this
+sample data programmatically.
+
+### Output format
+
+One token per line.
+
+Tokens follow the vocabulary order produced by `ComputeAndApplyVocabulary`.
+
+Example output (format only):
+
+```txt
+beam
+ml
+```
+
+For the sample input shown above and this configuration:
+
+```sh
+--columns=text --min_frequency=2 --vocab_size=3
+```
+
+the expected output is:
+
+```txt
+beam
+vocab
+ml
+```
+
+### Additional test datasets
+
+Test data for the happy path and for null, empty, and missing column values is
+generated inline in `mltransform_generate_vocab_test.py`.
+
+### Performance testing pattern
+
+- Small local files: functional correctness and output-stability tests.
+- Large GCS files (or moderate file + `--input_expand_factor`): throughput/cost
+  benchmarking on Dataflow.
+