
Commit b97ee31

[yaml]: Phase2 High Usage Yaml examples (#35279)
* add infrastructure to run Phase 2 test cases
* yapf
* add input data
* add new examples
* change dataflow link
1 parent 6d0e00e commit b97ee31

File tree: 6 files changed, +400 −14 lines


sdks/python/apache_beam/yaml/examples/testing/examples_test.py

Lines changed: 109 additions & 10 deletions
@@ -17,6 +17,7 @@
 #
 # pytype: skip-file
 import glob
+import json
 import logging
 import os
 import random
@@ -124,11 +125,35 @@ def test_kafka_read(
       | beam.Map(lambda element: beam.Row(payload=element.encode('utf-8'))))


+@beam.ptransform.ptransform_fn
+def test_pubsub_read(
+    pbegin,
+    topic: Optional[str] = None,
+    subscription: Optional[str] = None,
+    format: Optional[str] = None,
+    schema: Optional[Any] = None,
+    attributes: Optional[List[str]] = None,
+    attributes_map: Optional[str] = None,
+    id_attribute: Optional[str] = None,
+    timestamp_attribute: Optional[str] = None):
+
+  pubsub_messages = input_data.pubsub_messages_data()
+
+  return (
+      pbegin
+      | beam.Create([json.loads(msg.data) for msg in pubsub_messages])
+      | beam.Map(lambda element: beam.Row(**element)))
+
+
 TEST_PROVIDERS = {
-    'TestEnrichment': test_enrichment, 'TestReadFromKafka': test_kafka_read
+    'TestEnrichment': test_enrichment,
+    'TestReadFromKafka': test_kafka_read,
+    'TestReadFromPubSub': test_pubsub_read
 }
-
-INPUT_TRANSFORM_TEST_PROVIDERS = ['TestReadFromKafka']
+"""
+Transforms not requiring inputs.
+"""
+INPUT_TRANSFORM_TEST_PROVIDERS = ['TestReadFromKafka', 'TestReadFromPubSub']


 def check_output(expected: List[str]):
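The new TestReadFromPubSub provider makes the streaming examples testable without a live Pub/Sub resource: the fake source decodes each fixture message's JSON payload and re-emits it as a schema'd Row. A standalone sketch of that decode path, assuming only apache_beam and the standard library (the payload bytes below mirror the new pubsub_messages_data() fixture):

import json

import apache_beam as beam

# Payloads shaped like the PubsubMessage fixtures added in input_data.py.
payloads = [b'{"label": "37a", "rank": 1}', b'{"label": "37b", "rank": 4}']

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([json.loads(data) for data in payloads])
      | beam.Map(lambda element: beam.Row(**element))
      | beam.Map(print))  # e.g. Row(label='37a', rank=1)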
@@ -186,6 +211,7 @@ def test_yaml_example(self):
           f"Missing '# Expected:' tag in example file '{pipeline_spec_file}'")
       for i, line in enumerate(expected):
         expected[i] = line.replace('# ', '').replace('\n', '')
+      expected = [line for line in expected if line]
       pipeline_spec = yaml.load(
           ''.join(lines), Loader=yaml_transform.SafeLineLoader)
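The added comprehension drops empty expectation lines, so a stray "# " comment at the bottom of an example no longer produces a spurious empty expected string. A minimal sketch of this parsing step on a made-up "# Expected:" block (the real harness locates the tag within the full file; the slicing here is simplified):

# Hypothetical tail of an example YAML file, as raw lines.
lines = [
    '# Expected:\n',
    "# Row(label='37a', rank=1)\n",
    '# \n',  # stray blank comment line
    "# Row(label='37b', rank=4)\n",
]

expected = lines[1:]  # everything after the '# Expected:' tag
for i, line in enumerate(expected):
  expected[i] = line.replace('# ', '').replace('\n', '')
expected = [line for line in expected if line]  # new: drop empty lines
print(expected)  # ["Row(label='37a', rank=1)", "Row(label='37b', rank=4)"]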

@@ -418,7 +444,11 @@ def _kafka_test_preprocessor(
     'test_kafka_yaml',
     'test_spanner_read_yaml',
     'test_spanner_write_yaml',
-    'test_enrich_spanner_with_bigquery_yaml'
+    'test_enrich_spanner_with_bigquery_yaml',
+    'test_pubsub_topic_to_bigquery_yaml',
+    'test_pubsub_subscription_to_bigquery_yaml',
+    'test_jdbc_to_bigquery_yaml',
+    'test_spanner_to_avro_yaml'
 ])
 def _io_write_test_preprocessor(
     test_spec: dict, expected: List[str], env: TestEnvironment):
@@ -527,8 +557,11 @@ def _iceberg_io_read_test_preprocessor(
   return test_spec


-@YamlExamplesTestSuite.register_test_preprocessor(
-    ['test_spanner_read_yaml', 'test_enrich_spanner_with_bigquery_yaml'])
+@YamlExamplesTestSuite.register_test_preprocessor([
+    'test_spanner_read_yaml',
+    'test_enrich_spanner_with_bigquery_yaml',
+    "test_spanner_to_avro_yaml"
+])
 def _spanner_io_read_test_preprocessor(
     test_spec: dict, expected: List[str], env: TestEnvironment):
   """
@@ -607,6 +640,71 @@ def _enrichment_test_preprocessor(
   return test_spec


+@YamlExamplesTestSuite.register_test_preprocessor([
+    'test_pubsub_topic_to_bigquery_yaml',
+    'test_pubsub_subscription_to_bigquery_yaml'
+])
+def _pubsub_io_read_test_preprocessor(
+    test_spec: dict, expected: List[str], env: TestEnvironment):
+  """
+  Preprocessor for tests that involve reading from Pub/Sub.
+
+  This preprocessor replaces any ReadFromPubSub transform with a Create
+  transform that reads from a predefined in-memory list of messages.
+  This allows the test to verify the pipeline's correctness without relying
+  on an active Pub/Sub subscription or topic.
+  """
+  if pipeline := test_spec.get('pipeline', None):
+    for transform in pipeline.get('transforms', []):
+      if transform.get('type', '') == 'ReadFromPubSub':
+        transform['type'] = 'TestReadFromPubSub'
+
+  return test_spec
+
+
+@YamlExamplesTestSuite.register_test_preprocessor([
+    'test_jdbc_to_bigquery_yaml',
+])
+def _jdbc_io_read_test_preprocessor(
+    test_spec: dict, expected: List[str], env: TestEnvironment):
+  """
+  Preprocessor for tests that involve reading from JDBC.
+
+  This preprocessor replaces any ReadFromJdbc transform with a Create
+  transform that reads from a predefined in-memory list of records.
+  This allows the test to verify the pipeline's correctness without
+  relying on an active JDBC connection.
+  """
+  if pipeline := test_spec.get('pipeline', None):
+    for transform in pipeline.get('transforms', []):
+      if transform.get('type', '').startswith('ReadFromJdbc'):
+        config = transform['config']
+        url = config['url']
+        database = url.split('/')[-1]
+        if (table := config.get('table', None)) is None:
+          table = config.get('query', '').split('FROM')[-1].strip()
+        transform['type'] = 'Create'
+        transform['config'] = {
+            k: v
+            for k, v in config.items() if k.startswith('__')
+        }
+        elements = INPUT_TABLES[("Jdbc", database, table)]
+        if config.get('query', None):
+          config['query'].replace('select ',
+                                  'SELECT ').replace(' from ', ' FROM ')
+          columns = set(
+              ''.join(config['query'].split('SELECT ')[1:]).split(
+                  ' FROM', maxsplit=1)[0].split(', '))
+          if columns != {'*'}:
+            elements = [{
+                column: element[column]
+                for column in element if column in columns
+            } for element in elements]
+        transform['config']['elements'] = elements
+
+  return test_spec
+
+
 INPUT_FILES = {'products.csv': input_data.products_csv()}
 INPUT_TABLES = {
     ('shipment-test', 'shipment', 'shipments'): input_data.
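The Pub/Sub rewrite above is just a type swap on the parsed YAML spec; the transform keeps its config and is then resolved through TEST_PROVIDERS to the fake test_pubsub_read source. A minimal sketch with a hypothetical one-transform pipeline spec:

# Hypothetical parsed spec; its shape mirrors the example YAML files.
test_spec = {
    'pipeline': {
        'transforms': [{
            'type': 'ReadFromPubSub',
            'config': {'topic': 'projects/p/topics/t', 'format': 'JSON'},
        }]
    }
}

# Same substitution the preprocessor performs above.
if pipeline := test_spec.get('pipeline', None):
  for transform in pipeline.get('transforms', []):
    if transform.get('type', '') == 'ReadFromPubSub':
      transform['type'] = 'TestReadFromPubSub'

print(test_spec['pipeline']['transforms'][0]['type'])  # TestReadFromPubSub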
@@ -616,16 +714,17 @@ def _enrichment_test_preprocessor(
     ('db', 'users', 'NY'): input_data.iceberg_dynamic_destinations_users_data(),
     ('BigTable', 'beam-test', 'bigtable-enrichment-test'): input_data.
     bigtable_data(),
-    ('BigQuery', 'ALL_TEST', 'customers'): input_data.bigquery_data()
+    ('BigQuery', 'ALL_TEST', 'customers'): input_data.bigquery_data(),
+    ('Jdbc', 'shipment', 'shipments'): input_data.jdbc_shipments_data()
 }
 YAML_DOCS_DIR = os.path.join(os.path.dirname(__file__))

 AggregationTest = YamlExamplesTestSuite(
     'AggregationExamplesTest',
     os.path.join(YAML_DOCS_DIR, '../transforms/aggregation/*.yaml')).run()
-BlueprintsTest = YamlExamplesTestSuite(
-    'BlueprintsExamplesTest',
-    os.path.join(YAML_DOCS_DIR, '../transforms/blueprints/*.yaml')).run()
+BlueprintTest = YamlExamplesTestSuite(
+    'BlueprintExamplesTest',
+    os.path.join(YAML_DOCS_DIR, '../transforms/blueprint/*.yaml')).run()
 ElementWiseTest = YamlExamplesTestSuite(
     'ElementwiseExamplesTest',
     os.path.join(YAML_DOCS_DIR, '../transforms/elementwise/*.yaml')).run()
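The new ('Jdbc', 'shipment', 'shipments') entry is what the JDBC preprocessor looks up. A small sketch, with illustrative values only, of how that key and the projected columns are derived from the jdbc-to-bigquery example's connection URL and query, mirroring the string handling in _jdbc_io_read_test_preprocessor:

# Values taken from the new jdbc-to-bigquery example below.
config = {
    'url': 'jdbc:mysql://my-host:3306/shipment',
    'query': 'SELECT * FROM shipments',
}

database = config['url'].split('/')[-1]             # 'shipment'
table = config['query'].split('FROM')[-1].strip()   # 'shipments'
key = ('Jdbc', database, table)                      # key into INPUT_TABLES

columns = set(
    ''.join(config['query'].split('SELECT ')[1:]).split(
        ' FROM', maxsplit=1)[0].split(', '))         # {'*'}

# {'*'} keeps every field of each jdbc_shipments_data() record; a query such
# as 'SELECT shipment_id, customer_id FROM shipments' would instead project
# each record down to those two keys.
print(key, columns)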

sdks/python/apache_beam/yaml/examples/testing/input_data.py

Lines changed: 67 additions & 4 deletions
@@ -16,10 +16,10 @@
 # limitations under the License.
 #

-"""
-This file contains the input data to be requested by the example tests, if
-needed.
-"""
+from apache_beam.io.gcp.pubsub import PubsubMessage
+
+# This file contains the input data to be requested by the example tests, if
+# needed.


 def text_data():
@@ -129,6 +129,57 @@ def spanner_shipments_data():
   }]


+def jdbc_shipments_data():
+  return [{
+      'shipment_id': 'S1',
+      'customer_id': 'C1',
+      'shipment_date': '2023-05-01',
+      'shipment_cost': 150.0,
+      'customer_name': 'Alice',
+      'customer_email': '[email protected]'
+  },
+  {
+      'shipment_id': 'S2',
+      'customer_id': 'C2',
+      'shipment_date': '2023-06-12',
+      'shipment_cost': 300.0,
+      'customer_name': 'Bob',
+      'customer_email': '[email protected]'
+  },
+  {
+      'shipment_id': 'S3',
+      'customer_id': 'C1',
+      'shipment_date': '2023-05-10',
+      'shipment_cost': 20.0,
+      'customer_name': 'Alice',
+      'customer_email': '[email protected]'
+  },
+  {
+      'shipment_id': 'S4',
+      'customer_id': 'C4',
+      'shipment_date': '2024-07-01',
+      'shipment_cost': 150.0,
+      'customer_name': 'Derek',
+      'customer_email': '[email protected]'
+  },
+  {
+      'shipment_id': 'S5',
+      'customer_id': 'C5',
+      'shipment_date': '2023-05-09',
+      'shipment_cost': 300.0,
+      'customer_name': 'Erin',
+      'customer_email': '[email protected]'
+  },
+  {
+      'shipment_id': 'S6',
+      'customer_id': 'C4',
+      'shipment_date': '2024-07-02',
+      'shipment_cost': 150.0,
+      'customer_name': 'Derek',
+      'customer_email': '[email protected]'
+  }]
+
+
 def bigtable_data():
   return [{
       'product_id': '1', 'product_name': 'pixel 5', 'product_stock': '2'
@@ -165,3 +216,15 @@ def bigquery_data():
       'customer_name': 'Claire',
       'customer_email': '[email protected]'
   }]
+
+
+def pubsub_messages_data():
+  """
+  Provides a list of PubsubMessage objects for testing.
+  """
+  return [
+      PubsubMessage(data=b"{\"label\": \"37a\", \"rank\": 1}", attributes={}),
+      PubsubMessage(data=b"{\"label\": \"37b\", \"rank\": 4}", attributes={}),
+      PubsubMessage(data=b"{\"label\": \"37c\", \"rank\": 3}", attributes={}),
+      PubsubMessage(data=b"{\"label\": \"37d\", \"rank\": 2}", attributes={}),
+  ]
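A quick sketch of how one of these fixtures is consumed at test time: test_pubsub_read in examples_test.py decodes the JSON payload of each PubsubMessage and turns it into a Row, which is what the "# Expected:" blocks in the Pub/Sub examples assert against. Assuming only apache_beam and the standard library:

import json

from apache_beam.io.gcp.pubsub import PubsubMessage

msg = PubsubMessage(data=b'{"label": "37a", "rank": 1}', attributes={})
record = json.loads(msg.data)  # {'label': '37a', 'rank': 1}
print(record['label'], record['rank'])  # 37a 1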
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is an example of a Beam YAML pipeline that reads records from a JDBC
# database and writes them to BigQuery. This matches the Dataflow Template
# located here - https://cloud.google.com/dataflow/docs/guides/templates/provided/jdbc-to-bigquery

pipeline:
  type: composite
  transforms:
    # Step 1: Read shipment data from the JDBC database
    - type: ReadFromJdbc
      name: ReadShipments
      config:
        url: "jdbc:mysql://my-host:3306/shipment"
        driver_class_name: "org.sqlite.JDBC"
        query: "SELECT * FROM shipments"
    # Step 2: Write successful records out to BigQuery
    - type: WriteToBigQuery
      name: WriteShipments
      input: ReadShipments
      config:
        table: "apache-beam-testing.yaml_test.shipments"
        create_disposition: "CREATE_NEVER"
        write_disposition: "WRITE_APPEND"
        error_handling:
          output: "deadLetterQueue"
        num_streams: 1
    # Step 3: Write messages that failed the BigQuery write to a dead letter queue JSON file
    - type: WriteToJson
      input: WriteShipments.deadLetterQueue
      config:
        path: "gs://my-bucket/yaml-123/writingToBigQueryErrors.json"


# Expected:
# Row(shipment_id='S1', customer_id='C1', shipment_date='2023-05-01', shipment_cost=150.0, customer_name='Alice', customer_email='[email protected]')
# Row(shipment_id='S2', customer_id='C2', shipment_date='2023-06-12', shipment_cost=300.0, customer_name='Bob', customer_email='[email protected]')
# Row(shipment_id='S3', customer_id='C1', shipment_date='2023-05-10', shipment_cost=20.0, customer_name='Alice', customer_email='[email protected]')
# Row(shipment_id='S4', customer_id='C4', shipment_date='2024-07-01', shipment_cost=150.0, customer_name='Derek', customer_email='[email protected]')
# Row(shipment_id='S5', customer_id='C5', shipment_date='2023-05-09', shipment_cost=300.0, customer_name='Erin', customer_email='[email protected]')
# Row(shipment_id='S6', customer_id='C4', shipment_date='2024-07-02', shipment_cost=150.0, customer_name='Derek', customer_email='[email protected]')
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is an example of a Beam YAML pipeline that reads messages from Pub/Sub
# and writes them to BigQuery. This matches the Dataflow Template located
# here - https://cloud.google.com/dataflow/docs/guides/templates/provided/pubsub-subscription-to-bigquery

pipeline:
  type: composite
  transforms:
    # Step 1: Reading messages from Pub/Sub subscription
    - type: ReadFromPubSub
      name: ReadMessages
      config:
        subscription: "projects/apache-beam-testing/subscription/my-subscription"
        format: JSON
        schema:
          type: object
          properties:
            data: {type: BYTES}
            attributes: {type: object}
    # Step 2: Write successful records out to BigQuery
    - type: WriteToBigQuery
      name: WriteMessages
      input: ReadMessages
      config:
        table: "apache-beam-testing.yaml_test.order_data"
        create_disposition: "CREATE_NEVER"
        write_disposition: "WRITE_APPEND"
        error_handling:
          output: "deadLetterQueue"
        num_streams: 1
    # Step 3: Write the failed messages to BQ to a JSON file
    - type: WriteToJson
      input: WriteMessages.deadLetterQueue
      config:
        path: "gs://my-bucket/yaml-123/writingToBigQueryErrors.json"

options:
  streaming: true


# Expected:
# Row(label='37a', rank=1)
# Row(label='37b', rank=4)
# Row(label='37c', rank=3)
# Row(label='37d', rank=2)
