# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer


# The base model to fine-tune, from the Hugging Face Hub
model_name = os.getenv("BASE_MODEL_NAME", "google/gemma-2b")

# The instruction dataset to use
dataset_name = "b-mc2/sql-create-context"

# Name of the fine-tuned model
new_model = os.getenv("MODEL_NAME", "gemma-2b-sql")

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (rank of the adapter update matrices)
lora_r = int(os.getenv("LORA_R", "4"))

# Alpha parameter for LoRA scaling
lora_alpha = int(os.getenv("LORA_ALPHA", "8"))

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "/data/models/" + new_model

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True on an A100 or newer GPU)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = int(os.getenv("TRAIN_BATCH_SIZE", "1"))

# Batch size per GPU for evaluation
per_device_eval_batch_size = int(os.getenv("EVAL_BATCH_SIZE", "2"))

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", "1"))

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (a positive value overrides num_train_epochs; -1 disables the override)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to the learning rate)
warmup_ratio = 0.03

# Group sequences of similar length into the same batch
# Saves memory and speeds up training considerably
group_by_length = True

# Save a checkpoint every X update steps (0 disables intermediate checkpoints)
save_steps = 0

# Log every X update steps
logging_steps = int(os.getenv("LOGGING_STEPS", "50"))

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = int(os.getenv("MAX_SEQ_LENGTH", "512"))

# Pack multiple short examples into the same input sequence to increase efficiency
packing = False

# Load the entire model on the current GPU
device_map = {"": torch.cuda.current_device()}

# Number of training examples to sample from the dataset (-1 uses the full dataset)
limit = int(os.getenv("DATASET_LIMIT", "5000"))

dataset = load_dataset(dataset_name, split="train")
if limit != -1:
    dataset = dataset.shuffle(seed=42).select(range(limit))


def transform(data):
    """Format one dataset record into a single prompt string for supervised fine-tuning."""
    question = data["question"]
    context = data["context"]
    answer = data["answer"]
    template = "Question: {question}\nContext: {context}\nAnswer: {answer}"
    return {"text": template.format(question=question, context=context, answer=answer)}


transformed = dataset.map(transform)
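# For reference, a record formatted by `transform` looks roughly like the
# following (illustrative values, not necessarily an actual dataset row):
#
#   Question: How many heads of the departments are older than 56?
#   Context: CREATE TABLE head (age INTEGER)
#   Answer: SELECT COUNT(*) FROM head WHERE age > 56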

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
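# With this configuration, the base model weights are stored as 4-bit NF4 values
# and dequantized to the compute dtype (float16 here) on the fly during the
# forward pass; only the LoRA adapter weights added below are trained. Enabling
# use_nested_quant would also quantize the quantization constants themselves,
# saving roughly another 0.4 bits per parameter.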

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: consider setting bf16=True to accelerate training")
        print("=" * 80)

# Load base model
# model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    torch_dtype=torch.float16,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the Gemma tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)
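# q_proj and v_proj are the attention query and value projections, so LoRA
# adapters are injected only into those layers. Targeting additional modules
# (e.g. k_proj, o_proj, or the MLP projections) typically improves quality at
# the cost of more trainable parameters and memory.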

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)
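# The effective global batch size is per_device_train_batch_size *
# gradient_accumulation_steps * number of GPUs, so with the defaults above a
# single-GPU run applies one optimizer update per training example.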

trainer = SFTTrainer(
    model=model,
    train_dataset=transformed,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

trainer.train()
trainer.model.save_pretrained(new_model)
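# Note that trainer.model is a PEFT-wrapped model, so the call above saves only
# the LoRA adapter weights, not the full base model; the merge step below
# produces standalone weights that can be loaded without the peft library.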


# Reload the base model in FP16 and merge it with the LoRA adapter weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Save the merged model to disk
model.save_pretrained(output_dir)
# Save the tokenizer to disk
tokenizer.save_pretrained(output_dir)
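# Optional sanity check on the merged model: a minimal generation sketch, gated
# behind a RUN_SMOKE_TEST environment variable that is assumed here rather than
# part of the original workflow. The prompt is illustrative and mirrors the
# training template with the Answer left blank.
if os.getenv("RUN_SMOKE_TEST", "0") == "1":
    prompt = (
        "Question: How many users are registered in the users table?\n"
        "Context: CREATE TABLE users (id INTEGER, name VARCHAR)\n"
        "Answer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))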