Description
Terraform CLI and Terraform AWS Provider Version
Terraform v1.1.7
on linux_amd64
+ provider registry.terraform.io/hashicorp/aws v3.51.0
+ provider registry.terraform.io/infobloxopen/infoblox v1.1.1
Affected Resource(s)
- aws_instance
Terraform Configuration Files
// This Terraform plan demonstrates a bug where resource timeouts are ignored when
// AWS has "insufficient resource" errors, i.e. not enough hardware to fulfil
// the request. Not only will Terraform ignore the resource timeouts, but after an
// hour or so, when it finally does time out with errors, it will not always include
// any instances it does create in the Terraform inventory. So running "terraform destroy"
// will leave you with very expensive GPU instances still running after destroy.
// This demo sets a creation timeout of 10 minutes for 10 instances.
// AWS credentials, supplied by the caller (no defaults). Marked sensitive so
// that Terraform redacts the values wherever it would otherwise print them.
variable "access_key" {
  type      = string
  sensitive = true
}

variable "secret_key" {
  type      = string
  sensitive = true
}

// Region used by the AWS provider.
variable "region" {
  type    = string
  default = "us-east-1"
}

// Availability zone in which the demo subnet (and therefore the instances) is placed.
variable "az" {
  type    = string
  default = "us-east-1a"
}
// AWS does not have enough hardware in us-east-1a to deploy 10 "p4d.24xlarge" instances on-demand.
// Requesting 10 of them is what triggers the "insufficient resource" errors this demo relies on.
resource "aws_instance" "tf-bug-gpu" {
  count = 10

  ami           = "ami-09e67e426f25ce0d7"
  instance_type = "p4d.24xlarge"
  monitoring    = true

  // Reference the key pair by its name attribute rather than its id.
  key_name = aws_key_pair.tf-bug-key.key_name

  subnet_id              = aws_subnet.tf-bug-subnet.id
  vpc_security_group_ids = [aws_security_group.tf-bug-sg.id]

  root_block_device {
    volume_type           = "gp2"
    volume_size           = 60 // numeric argument; previously written as the string "60"
    delete_on_termination = true
  }

  // Resource-level connection defaults. No provisioner is declared on this
  // resource, so nothing in this block currently consumes these settings.
  connection {
    user  = "ubuntu"
    agent = false
  }

  tags = {
    Name = "Terminate this"
  }

  // Timeouts are ignored when AWS has "insufficient resource" errors.
  timeouts {
    create = "10m"
    delete = "10m"
  }
}
// AWS provider configuration; the region and both credentials come from the
// input variables declared in this file.
provider "aws" {
  region = var.region

  access_key = var.access_key
  secret_key = var.secret_key
}
// keys -- use your own public key here if you want to be able to ssh into any deployed VMs.
// Registers the public key below as an EC2 key pair named "tf-bug-key"; the
// aws_instance.tf-bug-gpu resource refers to this key pair through its key_name argument.
resource "aws_key_pair" "tf-bug-key" {
key_name = "tf-bug-key"
public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDRy5qukBQUFiZaO9RvidBgHVRnRCAKiRAM9EtnM1eTXvkuvf98fw+2F2B+1C3/C7Cb09ZBqknCtFo3yS4V+BiHHn4zgz/zHY9gOEyxLaIEb5rvnkw64Qwo0bLeuc849zMDxHQiixMHi4GKYIOqI8FSvhRJUsdp9tbwOE58UV6AK8AHp5XbwGl5WgHUbfXzyHMs6stzI8s3tFHHuu6XQL7LNpHjMm7b43UnkoPTdfl7/j6VesUxL64JsEbi8jPxWZ/KBaKv3tGa/fKk5NOkdSaE7ODCUxWxPQfmBR1Vc0MOXs29ibBxfNJMCChY5JgJaoeCLF08+MIF3fsmHnNI2+t+0cI/cZhrde8+AhTTrunWJLBdAB/XFgGhSCSTUmMfa3KbpPgD06tcIjFM+hXND9pIRGEF9ZlUikEXIGe9PvjbHahNddU04rAlKoTY5oxLM7cdGFgraHwozzNmYsOCZXDoD6Ym9vjykERqq/fKfpt+70SNuiINh+JnS5tNKKSzNlsPkIYyylKeuSAsIgMvLDuUyifHp00VSIE4/ST0aP2P+vFiPIuYBNaq+emAdPvZixDBYrileRKvThe3zzKNxC8fniVFckw17B0JjWG8Q6tYMGQva15f4+US6VCXMBCgkDQO73LVWsaP23G4JHMqqvBk+GuHoaY9eScovBbqiFUO3w== tf-bug"
}
// Terraform settings: pin the AWS provider to the 3.x series and store
// Terraform state in S3.
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 3.0"
    }
  }

  // Remote state location (bucket and object key).
  backend "s3" {
    bucket = "bf-tf-state"
    key    = "terraform/tf-bug"
    region = "us-east-1"
  }
}
// networking
// Dedicated VPC for the demo, with DNS support and DNS hostnames enabled.
resource "aws_vpc" "tf-bug-vpc" {
  cidr_block = "10.0.0.0/16"

  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "tf-bug-vpc"
  }
}
// Single subnet in the chosen availability zone. Its range is carved out of
// the VPC block by extending the prefix by 8 bits and taking network number 3
// (10.0.3.0/24 for the 10.0.0.0/16 VPC defined in this file).
resource "aws_subnet" "tf-bug-subnet" {
  vpc_id            = aws_vpc.tf-bug-vpc.id
  availability_zone = var.az
  cidr_block        = cidrsubnet(aws_vpc.tf-bug-vpc.cidr_block, 8, 3)

  tags = {
    Name = "tf-bug-subnet"
  }
}
// Internet gateway attached to the demo VPC; the route table's default route points at it.
resource "aws_internet_gateway" "tf-bug-ipgw" {
  vpc_id = aws_vpc.tf-bug-vpc.id

  tags = {
    Name = "tf-bug-ipgw"
  }
}
// Route table for the demo VPC with a single default route (0.0.0.0/0)
// through the internet gateway.
resource "aws_route_table" "tf-bug-route-table" {
  vpc_id = aws_vpc.tf-bug-vpc.id

  route {
    gateway_id = aws_internet_gateway.tf-bug-ipgw.id
    cidr_block = "0.0.0.0/0"
  }

  tags = {
    Name = "tf-bug-route-table"
  }
}
// Associates the demo subnet with the route table above.
resource "aws_route_table_association" "tf-bug-subnet-association" {
  route_table_id = aws_route_table.tf-bug-route-table.id
  subnet_id      = aws_subnet.tf-bug-subnet.id
}
// security
// Security group attached to the GPU instances.
resource "aws_security_group" "tf-bug-sg" {
  name   = "tf-bug-sg"
  vpc_id = aws_vpc.tf-bug-vpc.id

  // ssh from anywhere
  ingress {
    protocol    = "tcp"
    from_port   = 22
    to_port     = 22
    cidr_blocks = ["0.0.0.0/0"]
  }

  // Allow all traffic across the internal subnet. The range is read from the
  // subnet resource itself instead of repeating its cidrsubnet() expression,
  // so the rule cannot drift if the subnet's range is ever changed.
  ingress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = [aws_subnet.tf-bug-subnet.cidr_block]
  }

  // Allow all egress traffic (Terraform removes the default rule)
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}
Debug Output
After the 10m create timeout has passed, Terraform is still attempting to create the instances:
aws_instance.tf-bug-gpu[9]: Still creating... [22m10s elapsed]
aws_instance.tf-bug-gpu[5]: Still creating... [22m10s elapsed]
aws_instance.tf-bug-gpu[2]: Still creating... [22m10s elapsed]
aws_instance.tf-bug-gpu[2]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[4]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[6]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[7]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[1]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[3]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[0]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[5]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[8]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[9]: Still creating... [22m20s elapsed]
aws_instance.tf-bug-gpu[4]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[1]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[8]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[9]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[3]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[7]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[2]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[0]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[5]: Still creating... [22m30s elapsed]
aws_instance.tf-bug-gpu[6]: Still creating... [22m30s elapsed]
Panic Output
none
Expected Behavior
Based on the `create = "10m"` timeout setting, after 10 minutes Terraform should have given up, stopped, and saved the state of any resources created.
Actual Behavior
Terraform keeps running for an hour before finally exiting with an error and (sometimes) losing track of VMs that were created.
Steps to Reproduce
terraform init
terraform apply
Important Factoids
"p4d.24xlarge" is a GPU instance type with 8x NVIDIA A100 GPUs. AWS doesn't have enough hardware to start up 10 of these in us-east-1a (one of the few AZs where these instance types exist). If Terraform fails to save these to inventory, and you run a terraform destroy
afterwards, they're still running and AWS is still billing you $32/hour for them.
References
Somewhat related to #1496. In that case, the issue is that the error message isn't displayed correctly. My issue is that the error isn't handled correctly at all.