Skip to content

Commit dc91330

Browse files
authored
Merge pull request #7989 from loick111/feature/clusterapi-instances-status
ClusterAPI: Report machine phases to improve cluster-autoscaler decisions
2 parents c107f2b + 005a42b commit dc91330

File tree

2 files changed

+295
-2
lines changed

2 files changed

+295
-2
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

+75-2
Original file line numberDiff line numberDiff line change
@@ -253,9 +253,82 @@ func (ng *nodegroup) Nodes() ([]cloudprovider.Instance, error) {
253253
// must match the ID on the Node object itself.
254254
// https://github.com/kubernetes/autoscaler/blob/a973259f1852303ba38a3a61eeee8489cf4e1b13/cluster-autoscaler/clusterstate/clusterstate.go#L967-L985
255255
instances := make([]cloudprovider.Instance, len(providerIDs))
256-
for i := range providerIDs {
256+
for i, providerID := range providerIDs {
257+
providerIDNormalized := normalizedProviderID(providerID)
258+
259+
// Add instance Status to report instance state to cluster-autoscaler.
260+
// This helps cluster-autoscaler make better scaling decisions.
261+
//
262+
// Machine can be Failed for a variety of reasons, here we are looking for a specific errors:
263+
// - Failed to provision
264+
// - Failed to delete.
265+
// Other reasons for a machine to be in a Failed state are not forwarding Status to the autoscaler.
266+
var status *cloudprovider.InstanceStatus
267+
switch {
268+
case isFailedMachineProviderID(providerIDNormalized):
269+
klog.V(4).Infof("Machine failed in node group %s (%s)", ng.Id(), providerID)
270+
271+
machine, err := ng.machineController.findMachineByProviderID(providerIDNormalized)
272+
if err != nil {
273+
return nil, err
274+
}
275+
276+
if machine != nil {
277+
if !machine.GetDeletionTimestamp().IsZero() {
278+
klog.V(4).Infof("Machine failed in node group %s (%s) is being deleted", ng.Id(), providerID)
279+
status = &cloudprovider.InstanceStatus{
280+
State: cloudprovider.InstanceDeleting,
281+
ErrorInfo: &cloudprovider.InstanceErrorInfo{
282+
ErrorClass: cloudprovider.OtherErrorClass,
283+
ErrorCode: "DeletingFailed",
284+
ErrorMessage: "Machine deletion failed",
285+
},
286+
}
287+
} else {
288+
_, nodeFound, err := unstructured.NestedFieldCopy(machine.UnstructuredContent(), "status", "nodeRef")
289+
if err != nil {
290+
return nil, err
291+
}
292+
293+
// Machine failed without a nodeRef, this indicates that the machine failed to provision.
294+
// This is a special case where the machine is in a Failed state and the node is not created.
295+
// Machine controller will not reconcile this machine, so we need to report this to the autoscaler.
296+
if !nodeFound {
297+
klog.V(4).Infof("Machine failed in node group %s (%s) was being created", ng.Id(), providerID)
298+
status = &cloudprovider.InstanceStatus{
299+
State: cloudprovider.InstanceCreating,
300+
ErrorInfo: &cloudprovider.InstanceErrorInfo{
301+
ErrorClass: cloudprovider.OtherErrorClass,
302+
ErrorCode: "ProvisioningFailed",
303+
ErrorMessage: "Machine provisioning failed",
304+
},
305+
}
306+
}
307+
}
308+
}
309+
310+
case isPendingMachineProviderID(providerIDNormalized):
311+
klog.V(4).Infof("Machine pending in node group %s (%s)", ng.Id(), providerID)
312+
status = &cloudprovider.InstanceStatus{
313+
State: cloudprovider.InstanceCreating,
314+
}
315+
316+
case isDeletingMachineProviderID(providerIDNormalized):
317+
klog.V(4).Infof("Machine deleting in node group %s (%s)", ng.Id(), providerID)
318+
status = &cloudprovider.InstanceStatus{
319+
State: cloudprovider.InstanceDeleting,
320+
}
321+
322+
default:
323+
klog.V(4).Infof("Machine running in node group %s (%s)", ng.Id(), providerID)
324+
status = &cloudprovider.InstanceStatus{
325+
State: cloudprovider.InstanceRunning,
326+
}
327+
}
328+
257329
instances[i] = cloudprovider.Instance{
258-
Id: providerIDs[i],
330+
Id: providerID,
331+
Status: status,
259332
}
260333
}
261334

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

+220
Original file line numberDiff line numberDiff line change
@@ -1768,3 +1768,223 @@ func TestNodeGroupGetOptions(t *testing.T) {
17681768
})
17691769
}
17701770
}
1771+
1772+
func TestNodeGroupNodesInstancesStatus(t *testing.T) {
1773+
type testCase struct {
1774+
description string
1775+
nodeCount int
1776+
includePendingMachine bool
1777+
includeDeletingMachine bool
1778+
includeFailedMachineWithNodeRef bool
1779+
includeFailedMachineWithoutNodeRef bool
1780+
includeFailedMachineDeleting bool
1781+
}
1782+
1783+
testCases := []testCase{
1784+
{
1785+
description: "standard number of nodes",
1786+
nodeCount: 5,
1787+
},
1788+
{
1789+
description: "includes a machine in pending state",
1790+
nodeCount: 5,
1791+
includePendingMachine: true,
1792+
},
1793+
{
1794+
description: "includes a machine in deleting state",
1795+
nodeCount: 5,
1796+
includeDeletingMachine: true,
1797+
},
1798+
{
1799+
description: "includes a machine in failed state with nodeRef",
1800+
nodeCount: 5,
1801+
includeFailedMachineWithNodeRef: true,
1802+
},
1803+
{
1804+
description: "includes a machine in failed state without nodeRef",
1805+
nodeCount: 5,
1806+
includeFailedMachineWithoutNodeRef: true,
1807+
},
1808+
}
1809+
1810+
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {
1811+
controller, stop := mustCreateTestController(t, testConfig)
1812+
defer stop()
1813+
1814+
if tc.includePendingMachine {
1815+
if tc.nodeCount < 1 {
1816+
t.Fatal("test cannot pass, deleted machine requires at least 1 machine in machineset")
1817+
}
1818+
1819+
machine := testConfig.machines[0].DeepCopy()
1820+
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
1821+
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
1822+
1823+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1824+
t.Fatalf("unexpected error updating machine, got %v", err)
1825+
}
1826+
}
1827+
1828+
if tc.includeDeletingMachine {
1829+
if tc.nodeCount < 2 {
1830+
t.Fatal("test cannot pass, deleted machine requires at least 2 machine in machineset")
1831+
}
1832+
1833+
machine := testConfig.machines[1].DeepCopy()
1834+
timestamp := metav1.Now()
1835+
machine.SetDeletionTimestamp(&timestamp)
1836+
1837+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1838+
t.Fatalf("unexpected error updating machine, got %v", err)
1839+
}
1840+
}
1841+
1842+
if tc.includeFailedMachineWithNodeRef {
1843+
if tc.nodeCount < 3 {
1844+
t.Fatal("test cannot pass, deleted machine requires at least 3 machine in machineset")
1845+
}
1846+
1847+
machine := testConfig.machines[2].DeepCopy()
1848+
unstructured.SetNestedField(machine.Object, "node-1", "status", "nodeRef", "name")
1849+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1850+
1851+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1852+
t.Fatalf("unexpected error updating machine, got %v", err)
1853+
}
1854+
}
1855+
1856+
if tc.includeFailedMachineWithoutNodeRef {
1857+
if tc.nodeCount < 4 {
1858+
t.Fatal("test cannot pass, deleted machine requires at least 4 machine in machineset")
1859+
}
1860+
1861+
machine := testConfig.machines[3].DeepCopy()
1862+
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
1863+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1864+
1865+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1866+
t.Fatalf("unexpected error updating machine, got %v", err)
1867+
}
1868+
}
1869+
1870+
if tc.includeFailedMachineDeleting {
1871+
if tc.nodeCount < 5 {
1872+
t.Fatal("test cannot pass, deleted machine requires at least 5 machine in machineset")
1873+
}
1874+
1875+
machine := testConfig.machines[4].DeepCopy()
1876+
timestamp := metav1.Now()
1877+
machine.SetDeletionTimestamp(&timestamp)
1878+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
1879+
1880+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
1881+
t.Fatalf("unexpected error updating machine, got %v", err)
1882+
}
1883+
}
1884+
1885+
nodegroups, err := controller.nodeGroups()
1886+
if err != nil {
1887+
t.Fatalf("unexpected error: %v", err)
1888+
}
1889+
1890+
if l := len(nodegroups); l != 1 {
1891+
t.Fatalf("expected 1 nodegroup, got %d", l)
1892+
}
1893+
1894+
ng := nodegroups[0]
1895+
instances, err := ng.Nodes()
1896+
if err != nil {
1897+
t.Fatalf("unexpected error: %v", err)
1898+
}
1899+
1900+
expectedCount := tc.nodeCount
1901+
if len(instances) != expectedCount {
1902+
t.Errorf("expected %d nodes, got %d", expectedCount, len(instances))
1903+
}
1904+
1905+
// Sort instances by Id for stable comparison
1906+
sort.Slice(instances, func(i, j int) bool {
1907+
return instances[i].Id < instances[j].Id
1908+
})
1909+
1910+
for _, instance := range instances {
1911+
t.Logf("instance: %v", instance)
1912+
if tc.includePendingMachine && strings.HasPrefix(instance.Id, pendingMachinePrefix) {
1913+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceCreating {
1914+
t.Errorf("expected pending machine to have status %v, got %v", cloudprovider.InstanceCreating, instance.Status)
1915+
}
1916+
} else if tc.includeDeletingMachine && strings.HasPrefix(instance.Id, deletingMachinePrefix) {
1917+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceDeleting {
1918+
t.Errorf("expected deleting machine to have status %v, got %v", cloudprovider.InstanceDeleting, instance.Status)
1919+
}
1920+
} else if tc.includeFailedMachineWithNodeRef && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1921+
if instance.Status != nil {
1922+
t.Errorf("expected failed machine with nodeRef to not have status, got %v", instance.Status)
1923+
}
1924+
} else if tc.includeFailedMachineWithoutNodeRef && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1925+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceCreating {
1926+
t.Errorf("expected failed machine without nodeRef to have status %v, got %v", cloudprovider.InstanceCreating, instance.Status)
1927+
}
1928+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorClass != cloudprovider.OtherErrorClass {
1929+
t.Errorf("expected failed machine without nodeRef to have error class %v, got %v", cloudprovider.OtherErrorClass, instance.Status.ErrorInfo.ErrorClass)
1930+
}
1931+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorCode != "ProvisioningFailed" {
1932+
t.Errorf("expected failed machine without nodeRef to have error code %v, got %v", "ProvisioningFailed", instance.Status.ErrorInfo.ErrorCode)
1933+
}
1934+
} else if tc.includeFailedMachineDeleting && strings.HasPrefix(instance.Id, failedMachinePrefix) {
1935+
if instance.Status == nil || instance.Status.State != cloudprovider.InstanceDeleting {
1936+
t.Errorf("expected failed machine deleting to have status %v, got %v", cloudprovider.InstanceDeleting, instance.Status)
1937+
}
1938+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorClass != cloudprovider.OtherErrorClass {
1939+
t.Errorf("expected failed machine deleting to have error class %v, got %v", cloudprovider.OtherErrorClass, instance.Status.ErrorInfo.ErrorClass)
1940+
}
1941+
if instance.Status == nil || instance.Status.ErrorInfo.ErrorCode != "DeletingFailed" {
1942+
t.Errorf("expected failed machine deleting to have error code %v, got %v", "DeletingFailed", instance.Status.ErrorInfo.ErrorCode)
1943+
}
1944+
}
1945+
}
1946+
}
1947+
1948+
annotations := map[string]string{
1949+
nodeGroupMinSizeAnnotationKey: "1",
1950+
nodeGroupMaxSizeAnnotationKey: "10",
1951+
}
1952+
1953+
t.Run("MachineSet", func(t *testing.T) {
1954+
for _, tc := range testCases {
1955+
t.Run(tc.description, func(t *testing.T) {
1956+
test(
1957+
t,
1958+
&tc,
1959+
createMachineSetTestConfig(
1960+
RandomString(6),
1961+
RandomString(6),
1962+
RandomString(6),
1963+
tc.nodeCount,
1964+
annotations,
1965+
nil,
1966+
),
1967+
)
1968+
})
1969+
}
1970+
})
1971+
1972+
t.Run("MachineDeployment", func(t *testing.T) {
1973+
for _, tc := range testCases {
1974+
t.Run(tc.description, func(t *testing.T) {
1975+
test(
1976+
t,
1977+
&tc,
1978+
createMachineDeploymentTestConfig(
1979+
RandomString(6),
1980+
RandomString(6),
1981+
RandomString(6),
1982+
tc.nodeCount,
1983+
annotations,
1984+
nil,
1985+
),
1986+
)
1987+
})
1988+
}
1989+
})
1990+
}

0 commit comments

Comments
 (0)