# Cloud Dataproc API Examples

[![Open in Cloud Shell][shell_img]][shell_link]

[shell_img]: http://gstatic.com/cloudssh/images/open-btn.png
[shell_link]: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataproc/README.md

Sample command-line programs for interacting with the Cloud Dataproc API.

See [the tutorial on using the Dataproc API with the Python client
library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example)
for a walkthrough you can run to try out the Cloud Dataproc API sample code.

Note that while these samples interact with Dataproc via the API, the same functionality could also be accomplished using the Cloud Console or the gcloud CLI.

`list_clusters.py` is a simple command-line program to demonstrate connecting to the Cloud Dataproc API and listing the clusters in a region.

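For orientation, the core of that listing logic looks roughly like the sketch below, assuming the `google-cloud-dataproc` client library; the project and region values are placeholders, and the actual sample may differ in detail.

    from google.cloud import dataproc_v1

    project_id = "your-project-id"  # placeholder
    region = "us-central1"          # placeholder

    # Point the client at the regional Dataproc endpoint.
    client = dataproc_v1.ClusterControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )

    # List every cluster in the region and print its name and state.
    for cluster in client.list_clusters(
        request={"project_id": project_id, "region": region}
    ):
        print(cluster.cluster_name, cluster.status.state)
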
`submit_job_to_cluster.py` demonstrates how to create a cluster, submit the
`pyspark_sort.py` job, download the output from Google Cloud Storage, and print the result.

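The job-submission step itself boils down to a call like the following sketch, again assuming the `google-cloud-dataproc` client library; the cluster, bucket, and file names are placeholders, and the real script wires these up from its command-line flags.

    from google.cloud import dataproc_v1

    project_id = "your-project-id"      # placeholder
    region = "us-central1"              # placeholder
    cluster_name = "your-cluster-name"  # placeholder

    job_client = dataproc_v1.JobControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )

    # Describe a PySpark job that runs a script previously uploaded to GCS.
    job = {
        "placement": {"cluster_name": cluster_name},
        "pyspark_job": {
            "main_python_file_uri": "gs://your-staging-bucket/pyspark_sort.py"
        },
    }

    # Submit the job and block until it finishes.
    operation = job_client.submit_job_as_operation(
        request={"project_id": project_id, "region": region, "job": job}
    )
    finished_job = operation.result()
    print(finished_job.driver_output_resource_uri)
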
`single_job_workflow.py` uses the Cloud Dataproc InstantiateInlineWorkflowTemplate API to create an ephemeral cluster, run a job, then delete the cluster with one API request.

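As a rough sketch of what one such request looks like with the `google-cloud-dataproc` client library (the template below, including the job and cluster settings, is illustrative rather than a copy of the sample):

    from google.cloud import dataproc_v1

    project_id = "your-project-id"  # placeholder
    region = "us-central1"          # placeholder

    client = dataproc_v1.WorkflowTemplateServiceClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )

    # An inline workflow template: one managed (ephemeral) cluster plus one job.
    template = {
        "jobs": [
            {
                "step_id": "pyspark-sort",
                "pyspark_job": {
                    "main_python_file_uri": "gs://your-bucket/pyspark_sort.py"
                },
            }
        ],
        "placement": {
            "managed_cluster": {
                "cluster_name": "ephemeral-cluster",
                "config": {"gce_cluster_config": {"zone_uri": "us-central1-b"}},
            }
        },
    }

    operation = client.instantiate_inline_workflow_template(
        request={
            "parent": f"projects/{project_id}/regions/{region}",
            "template": template,
        }
    )
    # Cluster creation, the job run, and cluster deletion all happen here.
    operation.result()
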
`pyspark_sort.py_gcs` is the same as `pyspark_sort.py` but demonstrates reading from a GCS bucket.

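For reference, a PySpark job in the same spirit looks roughly like this (the GCS path is a placeholder; the actual sample may differ):

    import pyspark

    sc = pyspark.SparkContext()

    # Read text from a GCS bucket (Dataproc clusters include the GCS connector),
    # then sort and print the lines on the driver.
    rdd = sc.textFile("gs://your-bucket/input/")  # placeholder path
    print(sorted(rdd.collect()))
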
## Prerequisites to run locally:

* [pip](https://pypi.python.org/pypi/pip)

Go to the [Google Cloud Console](https://console.cloud.google.com).

Under API Manager, search for the Google Cloud Dataproc API and enable it.

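Alternatively, if you have the gcloud CLI installed and your project configured, the API can be enabled from the command line:

    gcloud services enable dataproc.googleapis.com
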
## Set Up Your Local Dev Environment

To install, run the following commands. If you want to use [virtualenv](https://virtualenv.readthedocs.org/en/latest/)
(recommended), run the commands within a virtualenv.

 * pip install -r requirements.txt

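For example, one way to do this inside a virtualenv on a Unix-like shell (the environment name `env` is arbitrary):

    virtualenv env
    source env/bin/activate
    pip install -r requirements.txt
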
## Authentication

Please see the [Google Cloud authentication guide](https://cloud.google.com/docs/authentication/).
The recommended approach for running these samples is to use a service account with a JSON key.

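For example, after downloading a service account key, you can point the client libraries at it via the standard application default credentials environment variable (the path below is a placeholder):

    export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-key.json
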
## Environment Variables

Set the following environment variables:

    GOOGLE_CLOUD_PROJECT=your-project-id
    REGION=us-central1  # or your region
    CLUSTER_NAME=your-cluster-name
    ZONE=us-central1-b

## Running the samples

To run `list_clusters.py`:

    python list_clusters.py $GOOGLE_CLOUD_PROJECT --region=$REGION

`submit_job_to_cluster.py` can create the Dataproc cluster or use an existing cluster. To create a cluster before running the code, you can use the [Cloud Console](https://console.cloud.google.com) or run:

    gcloud dataproc clusters create your-cluster-name

To run `submit_job_to_cluster.py`, first create a GCS bucket (used by Cloud Dataproc to stage files) from the Cloud Console or with gsutil:

    gsutil mb gs://<your-staging-bucket-name>

Next, set the following environment variables:

    BUCKET=your-staging-bucket
    CLUSTER=your-cluster-name

Then, if you want to use an existing cluster, run:

    python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET

Alternatively, to create a new cluster, which will be deleted at the end of the job, run:

    python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET --create_new_cluster

The script will set up a cluster, upload the PySpark file, submit the job, print the result, and then, if it created the cluster, delete it.

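If you are curious how the result gets printed: a finished job reports its driver output location via `driver_output_resource_uri`, which points into the staging bucket. A hedged sketch of fetching that output with the `google-cloud-storage` client (the URI handling here is illustrative and may differ from the actual script):

    from google.cloud import storage

    def download_driver_output(gs_uri):
        """Return the text of a job's driver output, given its gs:// URI.

        Dataproc may split the output across several numbered objects,
        so read everything under the URI's prefix.
        """
        bucket_name, prefix = gs_uri[len("gs://"):].split("/", 1)
        client = storage.Client()
        parts = client.list_blobs(bucket_name, prefix=prefix)
        return "".join(blob.download_as_bytes().decode("utf-8") for blob in parts)
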
Optionally, you can add the `--pyspark_file` argument to run a PySpark script of your own instead of the default `pyspark_sort.py` included with this sample.