Skip to content

Commit 277a1cf

Browse files
authored
feat: ICE AZs temporarily when subnets in that AZ run out of available ips (#8199)
1 parent cd4c060 commit 277a1cf

File tree

4 files changed

+58
-1
lines changed

4 files changed

+58
-1
lines changed

pkg/cache/unavailableofferings.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,15 @@ type UnavailableOfferings struct {
3434
// key: <capacityType>:<instanceType>:<zone>, value: struct{}{}
3535
offeringCache *cache.Cache
3636
capacityTypeCache *cache.Cache
37+
azCache *cache.Cache
3738
SeqNum uint64
3839
}
3940

4041
func NewUnavailableOfferings() *UnavailableOfferings {
4142
uo := &UnavailableOfferings{
4243
offeringCache: cache.New(UnavailableOfferingsTTL, UnavailableOfferingsCleanupInterval),
4344
capacityTypeCache: cache.New(UnavailableOfferingsTTL, UnavailableOfferingsCleanupInterval),
45+
azCache: cache.New(UnavailableOfferingsTTL, UnavailableOfferingsCleanupInterval),
4446
SeqNum: 0,
4547
}
4648
uo.offeringCache.OnEvicted(func(_ string, _ interface{}) {
@@ -56,7 +58,8 @@ func NewUnavailableOfferings() *UnavailableOfferings {
5658
// IsUnavailable reports whether the given offering should be treated as unavailable.
// It looks up three caches: the offering cache under the key built from instanceType, zone and
// capacityType, the capacity-type cache under capacityType, and the AZ cache under zone.
// In this diff listing the return following the "59-" marker is the removed pre-change version;
// the return following "62+" is the current one and is true when any of the three lookups hits.
func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType, zone, capacityType string) bool {
5759
_, offeringFound := u.offeringCache.Get(u.key(instanceType, zone, capacityType))
5860
_, capacityTypeFound := u.capacityTypeCache.Get(capacityType)
59-
return offeringFound || capacityTypeFound
61+
_, azFound := u.azCache.Get(zone)
62+
return offeringFound || capacityTypeFound || azFound
6063
}
6164

6265
// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings
@@ -83,13 +86,18 @@ func (u *UnavailableOfferings) MarkCapacityTypeUnavailable(capacityType string)
8386
atomic.AddUint64(&u.SeqNum, 1)
8487
}
8588

89+
func (u *UnavailableOfferings) MarkAZUnavailable(zone string) {
90+
u.azCache.SetDefault(zone, struct{}{})
91+
}
92+
8693
// Delete removes the entry for the given instance type, zone and capacity type from the
// offering cache. The capacity-type and AZ caches are not touched.
func (u *UnavailableOfferings) Delete(instanceType ec2types.InstanceType, zone string, capacityType string) {
	offeringKey := u.key(instanceType, zone, capacityType)
	u.offeringCache.Delete(offeringKey)
}
8996

9097
// Flush empties the offering, capacity-type and AZ caches, in that order.
func (u *UnavailableOfferings) Flush() {
	for _, c := range []*cache.Cache{u.offeringCache, u.capacityTypeCache, u.azCache} {
		c.Flush()
	}
}
94102

95103
// key returns the cache key for all offerings in the cache

pkg/errors/errors.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const (
3030
UnauthorizedOperationErrorCode = "UnauthorizedOperation"
3131
RateLimitingErrorCode = "RequestLimitExceeded"
3232
ServiceLinkedRoleCreationNotPermittedErrorCode = "AuthFailure.ServiceLinkedRoleCreationNotPermitted"
33+
InsufficientFreeAddressesInSubnetErrorCode = "InsufficientFreeAddressesInSubnet"
3334
)
3435

3536
var (
@@ -159,6 +160,10 @@ func IsServiceLinkedRoleCreationNotPermitted(err ec2types.CreateFleetError) bool
159160
return *err.ErrorCode == ServiceLinkedRoleCreationNotPermittedErrorCode
160161
}
161162

163+
func IsInsufficientFreeAddressesInSubnet(err ec2types.CreateFleetError) bool {
164+
return *err.ErrorCode == InsufficientFreeAddressesInSubnetErrorCode
165+
}
166+
162167
// IsReservationCapacityExceeded returns true if the fleet error means there is no remaining capacity for the provided
163168
// capacity reservation.
164169
func IsReservationCapacityExceeded(err ec2types.CreateFleetError) bool {

pkg/providers/instance/instance.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,13 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache(
445445
nodeClaim *karpv1.NodeClaim,
446446
instanceTypes []*cloudprovider.InstanceType,
447447
) {
448+
for _, err := range errs {
449+
zone := lo.FromPtr(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
450+
if awserrors.IsInsufficientFreeAddressesInSubnet(err) && zone != "" {
451+
p.unavailableOfferings.MarkAZUnavailable(zone)
452+
}
453+
}
454+
448455
if capacityType != karpv1.CapacityTypeReserved {
449456
for _, err := range errs {
450457
if awserrors.IsUnfulfillableCapacity(err) {

pkg/providers/instance/suite_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,4 +351,41 @@ var _ = Describe("InstanceProvider", func() {
351351
retrievedIDs := sets.New[string](lo.Map(instances, func(i *instance.Instance, _ int) string { return i.ID })...)
352352
Expect(ids.Equal(retrievedIDs)).To(BeTrue())
353353
})
354+
It("should mark subnets as unavailable when they run out of IPs", func() {
355+
ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
356+
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
357+
awsEnv.EC2API.CreateFleetBehavior.Output.Set(&ec2.CreateFleetOutput{
358+
Errors: []ec2types.CreateFleetError{
359+
{
360+
ErrorCode: lo.ToPtr("InsufficientFreeAddressesInSubnet"),
361+
ErrorMessage: lo.ToPtr("There are insufficient free addresses in that subnet to run instance"),
362+
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
363+
Overrides: &ec2types.FleetLaunchTemplateOverrides{
364+
InstanceType: "m5.xlarge",
365+
AvailabilityZone: lo.ToPtr("test-zone-1a"),
366+
},
367+
},
368+
},
369+
},
370+
})
371+
instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
372+
Expect(err).ToNot(HaveOccurred())
373+
374+
// We expect to treat that error as an ICE
375+
instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
376+
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
377+
Expect(instance).To(BeNil())
378+
379+
// We should have set the zone used in the request as unavailable for all instance types
380+
for _, instance := range instanceTypes {
381+
Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable(ec2types.InstanceType(instance.Name), "test-zone-1a", "on-demand")).To(BeTrue())
382+
}
383+
// But we should not have set the other zones as unavailable
384+
zones := []string{"test-zone-1b", "test-zone-1c"}
385+
for _, zone := range zones {
386+
for _, instance := range instanceTypes {
387+
Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable(ec2types.InstanceType(instance.Name), zone, "on-demand")).To(BeFalse())
388+
}
389+
}
390+
})
354391
})

0 commit comments

Comments
 (0)