
Commit 51c045d

feat!: add support for Virtual Dataproc cluster running on GKE cluster (#570)
- [ ] Regenerate this pull request now.

Committer: @Padmaar
PiperOrigin-RevId: 429111624
Source-Link: googleapis/googleapis@da999a2
Source-Link: googleapis/googleapis-gen@99c5b3e
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiOTljNWIzZTk4YmFhMWRlOTM3NzZhYTRiNWNkNGM3MzYxMzUzZTRmNiJ9

File tree

8 files changed: +3958 −516 lines changed

packages/google-cloud-dataproc/protos/google/cloud/dataproc/v1/batches.proto (+1 −2)
@@ -90,8 +90,7 @@ message CreateBatchRequest {
   // Optional. The ID to use for the batch, which will become the final component of
   // the batch's resource name.
   //
-  // This value must be 4-63 characters. Valid characters
-  // are /[a-z][0-9]-/.
+  // This value must be 4-63 characters. Valid characters are `/[a-z][0-9]-/`.
   string batch_id = 3 [(google.api.field_behavior) = OPTIONAL];

   // Optional. A unique ID used to identify the request. If the service

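For context, the `batch_id` constraint that this reworded comment documents is enforced at request time. Below is a minimal, hypothetical sketch of supplying a compliant ID through this package's `BatchControllerClient`; the project, region, and jar path are placeholder assumptions, not values from this commit.

```js
// Hypothetical usage sketch; project/region/jar values are placeholders.
const dataproc = require('@google-cloud/dataproc');

async function createBatch() {
  const client = new dataproc.v1.BatchControllerClient();
  // Must be 4-63 characters drawn from /[a-z][0-9]-/.
  const batchId = 'spark-pi-0001';
  const [operation] = await client.createBatch({
    parent: 'projects/my-project/locations/us-central1',
    batchId,
    batch: {
      sparkBatch: {
        mainClass: 'org.apache.spark.examples.SparkPi',
        jarFileUris: ['file:///usr/lib/spark/examples/jars/spark-examples.jar'],
      },
    },
  });
  // createBatch returns a long-running operation; wait for the batch resource.
  const [batch] = await operation.promise();
  console.log(`Created ${batch.name}`);
}

createBatch();
```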
packages/google-cloud-dataproc/protos/google/cloud/dataproc/v1/clusters.proto (+62 −26)
@@ -1,4 +1,4 @@
-// Copyright 2021 Google LLC
+// Copyright 2022 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -167,6 +167,15 @@ message Cluster {
   // when clusters are updated.
   ClusterConfig config = 3 [(google.api.field_behavior) = OPTIONAL];

+  // Optional. The virtual cluster config, used when creating a Dataproc cluster that
+  // does not directly control the underlying compute resources, for example,
+  // when creating a [Dataproc-on-GKE
+  // cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
+  // Note that Dataproc may set default values, and values may change when
+  // clusters are updated. Exactly one of config or virtualClusterConfig must be
+  // specified.
+  VirtualClusterConfig virtual_cluster_config = 10 [(google.api.field_behavior) = OPTIONAL];
+
   // Optional. The labels to associate with this cluster.
   // Label **keys** must contain 1 to 63 characters, and must conform to
   // [RFC 1035](https://www.ietf.org/rfc/rfc1035.txt).
@@ -275,33 +284,56 @@ message ClusterConfig {
   // Optional. Metastore configuration.
   MetastoreConfig metastore_config = 20 [(google.api.field_behavior) = OPTIONAL];
-
-  // Optional. BETA. The Kubernetes Engine config for Dataproc clusters deployed to
-  // Kubernetes. Setting this is considered mutually exclusive with Compute
-  // Engine-based options such as `gce_cluster_config`, `master_config`,
-  // `worker_config`, `secondary_worker_config`, and `autoscaling_config`.
-  GkeClusterConfig gke_cluster_config = 21 [(google.api.field_behavior) = OPTIONAL];
 }

-// The GKE config for this cluster.
-message GkeClusterConfig {
-  // A full, namespace-isolated deployment target for an existing GKE cluster.
-  message NamespacedGkeDeploymentTarget {
-    // Optional. The target GKE cluster to deploy to.
-    // Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
-    string target_gke_cluster = 1 [
-      (google.api.field_behavior) = OPTIONAL,
-      (google.api.resource_reference) = {
-        type: "container.googleapis.com/Cluster"
-      }
-    ];
-
-    // Optional. A namespace within the GKE cluster to deploy into.
-    string cluster_namespace = 2 [(google.api.field_behavior) = OPTIONAL];
+// Dataproc cluster config for a cluster that does not directly control the
+// underlying compute resources, such as a [Dataproc-on-GKE
+// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
+message VirtualClusterConfig {
+  // Optional. A Cloud Storage bucket used to stage job
+  // dependencies, config files, and job driver console output.
+  // If you do not specify a staging bucket, Cloud
+  // Dataproc will determine a Cloud Storage location (US,
+  // ASIA, or EU) for your cluster's staging bucket according to the
+  // Compute Engine zone where your cluster is deployed, and then create
+  // and manage this project-level, per-location bucket (see
+  // [Dataproc staging and temp
+  // buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
+  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
+  // a Cloud Storage bucket.**
+  string staging_bucket = 1 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. A Cloud Storage bucket used to store ephemeral cluster and jobs data,
+  // such as Spark and MapReduce history files.
+  // If you do not specify a temp bucket,
+  // Dataproc will determine a Cloud Storage location (US,
+  // ASIA, or EU) for your cluster's temp bucket according to the
+  // Compute Engine zone where your cluster is deployed, and then create
+  // and manage this project-level, per-location bucket. The default bucket has
+  // a TTL of 90 days, but you can use any TTL (or none) if you specify a
+  // bucket (see
+  // [Dataproc staging and temp
+  // buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
+  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
+  // a Cloud Storage bucket.**
+  string temp_bucket = 2 [(google.api.field_behavior) = OPTIONAL];
+
+  oneof infrastructure_config {
+    // Required. The configuration for running the Dataproc cluster on Kubernetes.
+    KubernetesClusterConfig kubernetes_cluster_config = 6 [(google.api.field_behavior) = REQUIRED];
   }

-  // Optional. A target for the deployment.
-  NamespacedGkeDeploymentTarget namespaced_gke_deployment_target = 1 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. Configuration of auxiliary services used by this cluster.
+  AuxiliaryServicesConfig auxiliary_services_config = 7 [(google.api.field_behavior) = OPTIONAL];
+}
+
+// Auxiliary services configuration for a Cluster.
+message AuxiliaryServicesConfig {
+  // Optional. The Hive Metastore configuration for this workload.
+  MetastoreConfig metastore_config = 1 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The Spark History Server configuration for the workload.
+  SparkHistoryServerConfig spark_history_server_config = 2 [(google.api.field_behavior) = OPTIONAL];
 }

 // Endpoint config for this cluster
// Endpoint config for this cluster
@@ -660,8 +692,8 @@ message DiskConfig {
   // Optional. Interface type of local SSDs (default is "scsi").
   // Valid values: "scsi" (Small Computer System Interface),
   // "nvme" (Non-Volatile Memory Express).
-  // See [SSD Interface
-  // types](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
+  // See [local SSD
+  // performance](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
   string local_ssd_interface = 4 [(google.api.field_behavior) = OPTIONAL];
 }

@@ -692,6 +724,10 @@ message ClusterStatus {
     CREATING = 1;

     // The cluster is currently running and healthy. It is ready for use.
+    //
+    // **Note:** The cluster state changes from "creating" to "running" status
+    // after the master node(s), first two primary worker nodes (and the last
+    // primary worker node if primary workers > 2) are running.
     RUNNING = 2;

     // The cluster encountered an error. It is not ready for use.

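Taken together, the `clusters.proto` changes remove the BETA `gke_cluster_config` field from `ClusterConfig` and introduce a top-level `virtual_cluster_config` on `Cluster`. The following is a hedged sketch of what a create call could look like through this package once the feature lands; the project, region, endpoint, cluster names, and component version are all placeholder assumptions, not values from this commit.

```js
// Hypothetical sketch of creating a Dataproc-on-GKE virtual cluster.
const dataproc = require('@google-cloud/dataproc');

async function createVirtualCluster() {
  const client = new dataproc.v1.ClusterControllerClient({
    apiEndpoint: 'us-central1-dataproc.googleapis.com', // regional endpoint
  });
  const [operation] = await client.createCluster({
    projectId: 'my-project',
    region: 'us-central1',
    cluster: {
      clusterName: 'my-virtual-cluster',
      // Exactly one of `config` or `virtualClusterConfig` may be set.
      virtualClusterConfig: {
        stagingBucket: 'my-staging-bucket', // bucket name, not a gs:// URI
        kubernetesClusterConfig: {
          kubernetesNamespace: 'dataproc', // created if it does not exist
          gkeClusterConfig: {
            gkeClusterTarget:
              'projects/my-project/locations/us-central1/clusters/my-gke-cluster',
          },
          kubernetesSoftwareConfig: {
            // Key must be a KubernetesComponent enum name; version is assumed.
            componentVersion: {SPARK: '3.1-dataproc-7'},
          },
        },
      },
    },
  });
  const [cluster] = await operation.promise();
  console.log(`Cluster created: ${cluster.clusterName}`);
}
```

Note the design: because `kubernetes_cluster_config` sits inside a `oneof infrastructure_config`, other runtimes could later be added to `VirtualClusterConfig` without changing this request shape.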
packages/google-cloud-dataproc/protos/google/cloud/dataproc/v1/shared.proto (+174 −1)
@@ -1,4 +1,4 @@
-// Copyright 2021 Google LLC
+// Copyright 2022 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -108,6 +108,179 @@ message RuntimeInfo {
   string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
 }

+// The cluster's GKE config.
+message GkeClusterConfig {
+  // Optional. A target GKE cluster to deploy to. It must be in the same project and
+  // region as the Dataproc cluster (the GKE cluster can be zonal or regional).
+  // Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
+  string gke_cluster_target = 2 [
+    (google.api.field_behavior) = OPTIONAL
+  ];
+
+  // Optional. GKE NodePools where workloads will be scheduled. At least one node pool
+  // must be assigned the 'default' role. Each role can be given to only a
+  // single NodePoolTarget. All NodePools must have the same location settings.
+  // If a nodePoolTarget is not specified, Dataproc constructs a default
+  // nodePoolTarget.
+  repeated GkeNodePoolTarget node_pool_target = 3 [(google.api.field_behavior) = OPTIONAL];
+}
+
+// The configuration for running the Dataproc cluster on Kubernetes.
+message KubernetesClusterConfig {
+  // Optional. A namespace within the Kubernetes cluster to deploy into. If this namespace
+  // does not exist, it is created. If it exists, Dataproc
+  // verifies that another Dataproc VirtualCluster is not installed
+  // into it. If not specified, the name of the Dataproc Cluster is used.
+  string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];
+
+  oneof config {
+    // Required. The configuration for running the Dataproc cluster on GKE.
+    GkeClusterConfig gke_cluster_config = 2 [(google.api.field_behavior) = REQUIRED];
+  }
+
+  // Optional. The software configuration for this Dataproc cluster running on Kubernetes.
+  KubernetesSoftwareConfig kubernetes_software_config = 3 [(google.api.field_behavior) = OPTIONAL];
+}
+
+// The software configuration for this Dataproc cluster running on Kubernetes.
+message KubernetesSoftwareConfig {
+  // The components that should be installed in this Dataproc cluster. The key
+  // must be a string from the KubernetesComponent enumeration. The value is
+  // the version of the software to be installed.
+  // At least one entry must be specified.
+  map<string, string> component_version = 1;
+
+  // The properties to set on daemon config files.
+  //
+  // Property keys are specified in `prefix:property` format, for example
+  // `spark:spark.kubernetes.container.image`. The following are supported
+  // prefixes and their mappings:
+  //
+  // * spark: `spark-defaults.conf`
+  //
+  // For more information, see [Cluster
+  // properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
+  map<string, string> properties = 2;
+}
+
+// GKE NodePools that Dataproc workloads run on.
+message GkeNodePoolTarget {
+  // `Role` specifies whose tasks will run on the NodePool. The roles can be
+  // specific to workloads. Exactly one GkeNodePoolTarget within the
+  // VirtualCluster must have the 'default' role, which is used to run all workloads
+  // that are not associated with a NodePool.
+  enum Role {
+    // Role is unspecified.
+    ROLE_UNSPECIFIED = 0;
+
+    // Any roles that are not directly assigned to a NodePool run on the
+    // `default` role's NodePool.
+    DEFAULT = 1;
+
+    // Run controllers and webhooks.
+    CONTROLLER = 2;
+
+    // Run spark driver.
+    SPARK_DRIVER = 3;
+
+    // Run spark executors.
+    SPARK_EXECUTOR = 4;
+  }
+
+  // Required. The target GKE NodePool.
+  // Format:
+  // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
+  string node_pool = 1 [
+    (google.api.field_behavior) = REQUIRED
+  ];
+
+  // Required. The types of role for a GKE NodePool.
+  repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];
+
+  // Optional. The configuration for the GKE NodePool.
+  //
+  // If specified, Dataproc attempts to create a NodePool with the
+  // specified shape. If one with the same name already exists, it is
+  // verified against all specified fields. If a field differs, the
+  // virtual cluster creation will fail.
+  //
+  // If omitted, any NodePool with the specified name is used. If a
+  // NodePool with the specified name does not exist, Dataproc creates a NodePool
+  // with default values.
+  GkeNodePoolConfig node_pool_config = 3 [(google.api.field_behavior) = OPTIONAL];
+}
+
+// The configuration of a GKE NodePool used by a [Dataproc-on-GKE
+// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
+message GkeNodePoolConfig {
+  // Parameters that describe cluster nodes.
+  message GkeNodeConfig {
+    // Optional. The name of a Compute Engine [machine
+    // type](https://cloud.google.com/compute/docs/machine-types).
+    string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];
+
+    // Optional. Whether the nodes are created as [preemptible VM
+    // instances](https://cloud.google.com/compute/docs/instances/preemptible).
+    bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];
+
+    // Optional. The number of local SSD disks to attach to the node, which is limited by
+    // the maximum number of disks allowable per zone (see [Adding Local
+    // SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
+    int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];
+
+    // Optional. A list of [hardware
+    // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
+    // each node.
+    repeated GkeNodePoolAcceleratorConfig accelerators = 11 [(google.api.field_behavior) = OPTIONAL];
+
+    // Optional. [Minimum CPU
+    // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
+    // to be used by this instance. The instance may be scheduled on the
+    // specified or a newer CPU platform. Specify the friendly names of CPU
+    // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
+    string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];
+  }
+
+  // A GkeNodePoolAcceleratorConfig represents a Hardware Accelerator request
+  // for a NodePool.
+  message GkeNodePoolAcceleratorConfig {
+    // The number of accelerator cards exposed to an instance.
+    int64 accelerator_count = 1;
+
+    // The accelerator type resource name (see [GPUs on Compute
+    // Engine](https://cloud.google.com/compute/docs/gpus)).
+    string accelerator_type = 2;
+  }
+
+  // GkeNodePoolAutoscaling contains information the cluster autoscaler needs to
+  // adjust the size of the node pool to the current cluster usage.
+  message GkeNodePoolAutoscalingConfig {
+    // The minimum number of nodes in the NodePool. Must be >= 0 and <=
+    // max_node_count.
+    int32 min_node_count = 2;
+
+    // The maximum number of nodes in the NodePool. Must be >= min_node_count.
+    // **Note:** Quota must be sufficient to scale up the cluster.
+    int32 max_node_count = 3;
+  }
+
+  // Optional. The node pool configuration.
+  GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The list of Compute Engine
+  // [zones](https://cloud.google.com/compute/docs/zones#available) where
+  // NodePool's nodes will be located.
+  //
+  // **Note:** Currently, only one zone may be specified.
+  //
+  // If a location is not specified during NodePool creation, Dataproc will
+  // choose a location.
+  repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The autoscaler configuration for this NodePool. The autoscaler is enabled
+  // only when a valid configuration is present.
+  GkeNodePoolAutoscalingConfig autoscaling = 4 [(google.api.field_behavior) = OPTIONAL];
+}
+
 // Cluster components that can be activated.
 enum Component {
   // Unspecified component. Specifying this will cause Cluster creation to fail.

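The `GkeNodePoolTarget` and `GkeNodePoolConfig` messages above surface as plain request objects in this client. A small, hypothetical sketch of a `gkeClusterConfig` value with an explicit `DEFAULT`-role node pool and autoscaling; all resource names and sizes are placeholder assumptions.

```js
// Hypothetical gkeClusterConfig request object; resource names are placeholders.
const gkeClusterConfig = {
  gkeClusterTarget:
    'projects/my-project/locations/us-central1/clusters/my-gke-cluster',
  nodePoolTarget: [
    {
      nodePool:
        'projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools/dp-default',
      // Exactly one target in the virtual cluster must carry the DEFAULT role.
      roles: ['DEFAULT'],
      nodePoolConfig: {
        config: {machineType: 'n1-standard-4', localSsdCount: 1},
        // Currently only a single zone may be listed.
        locations: ['us-central1-a'],
        // The autoscaler is enabled only when a valid configuration is present.
        autoscaling: {minNodeCount: 1, maxNodeCount: 5},
      },
    },
  ],
};
```

This object would be nested under `kubernetesClusterConfig.gkeClusterConfig` in the `createCluster` request shown for `clusters.proto` above.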