Skip to content

Commit 0a55b13

Browse files
authored
Add resiliency mechanism to CPU and memory utilization checks (#6528)
* Add RetryingLinuxUtilizationParser * Switch approach to use retries in Provider class * Refactor and add test * Refactor
1 parent 78f99ee commit 0a55b13

File tree

2 files changed

+200
-7
lines changed

2 files changed

+200
-7
lines changed

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System;
5+
using System.Collections.Generic;
56
using System.Diagnostics.Metrics;
7+
using System.Linq;
8+
using System.Threading;
69
using Microsoft.Extensions.Logging;
710
using Microsoft.Extensions.Logging.Abstractions;
811
using Microsoft.Extensions.Options;
@@ -33,6 +36,10 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
3336
private readonly double _scaleRelativeToCpuRequest;
3437
private readonly double _scaleRelativeToCpuRequestForTrackerApi;
3538

39+
private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);
40+
private DateTimeOffset _lastFailure = DateTimeOffset.MinValue;
41+
private int _measurementsUnavailable;
42+
3643
private DateTimeOffset _refreshAfterCpu;
3744
private DateTimeOffset _refreshAfterMemory;
3845

@@ -94,18 +101,44 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
94101
// Initialize the counters
95102
_cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_100_percent_exceeded");
96103
_cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_110_percent_exceeded");
97-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationLimit(cpuLimit), unit: "1");
98-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuRequest, unit: "1");
104+
105+
_ = meter.CreateObservableGauge(
106+
ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
107+
() => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
108+
"1");
109+
110+
_ = meter.CreateObservableGauge(
111+
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
112+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationWithoutHostDelta() / cpuRequest),
113+
unit: "1");
99114
}
100115
else
101116
{
102-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
103-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
104-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
117+
_ = meter.CreateObservableGauge(
118+
name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
119+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuLimit),
120+
unit: "1");
121+
122+
_ = meter.CreateObservableGauge(
123+
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
124+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
125+
unit: "1");
126+
127+
_ = meter.CreateObservableGauge(
128+
name: ResourceUtilizationInstruments.ProcessCpuUtilization,
129+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
130+
unit: "1");
105131
}
106132

107-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1");
108-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1");
133+
_ = meter.CreateObservableGauge(
134+
name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization,
135+
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
136+
unit: "1");
137+
138+
_ = meter.CreateObservableGauge(
139+
name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
140+
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
141+
unit: "1");
109142

110143
// cpuRequest is a CPU request (aka guaranteed number of CPU units) for a pod; for a host it's 1 core
111144
// cpuLimit is a CPU limit (aka max CPU units available) for a pod or for a host.
@@ -288,4 +321,34 @@ public Snapshot GetSnapshot()
288321
userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleRelativeToCpuRequestForTrackerApi)),
289322
memoryUsageInBytes: memoryUsed);
290323
}
324+
325+
/// <summary>
/// Evaluates <paramref name="func"/> and returns its result as a single measurement,
/// pausing further attempts for <c>_retryInterval</c> after a file-access failure.
/// </summary>
/// <param name="func">Delegate that produces the value to report.</param>
/// <returns>
/// A one-element sequence holding the measured value, or an empty sequence when the
/// previous failure is still within the retry interval or when <paramref name="func"/>
/// throws one of the handled exceptions.
/// </returns>
/// <remarks>
/// Only <see cref="System.IO.FileNotFoundException"/>, <see cref="System.IO.DirectoryNotFoundException"/>
/// and <see cref="System.UnauthorizedAccessException"/> are handled; any other exception propagates to the caller.
/// </remarks>
private IEnumerable<Measurement<double>> GetMeasurementWithRetry(Func<double> func)
{
    // The clock is consulted only when a previous failure has been recorded.
    bool previouslyFailed = Volatile.Read(ref _measurementsUnavailable) == 1;
    if (previouslyFailed)
    {
        TimeSpan sinceLastFailure = _timeProvider.GetUtcNow() - _lastFailure;
        if (sinceLastFailure < _retryInterval)
        {
            return Enumerable.Empty<Measurement<double>>();
        }
    }

    double value;
    try
    {
        value = func();
    }
    catch (Exception ex) when (
        ex is System.IO.FileNotFoundException
            or System.IO.DirectoryNotFoundException
            or System.UnauthorizedAccessException)
    {
        // Record the failure time first, then raise the flag that makes later calls check it.
        _lastFailure = _timeProvider.GetUtcNow();
        _ = Interlocked.Exchange(ref _measurementsUnavailable, 1);

        return Enumerable.Empty<Measurement<double>>();
    }

    // A successful read clears the unavailable flag if it is still set.
    if (Volatile.Read(ref _measurementsUnavailable) == 1)
    {
        _ = Interlocked.Exchange(ref _measurementsUnavailable, 0);
    }

    return new[] { new Measurement<double>(value) };
}
291354
}

test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
using System.Threading.Tasks;
1010
using Microsoft.Extensions.Diagnostics.ResourceMonitoring.Test.Helpers;
1111
using Microsoft.Extensions.Logging.Testing;
12+
using Microsoft.Extensions.Time.Testing;
1213
using Microsoft.Shared.Instruments;
1314
using Microsoft.TestUtilities;
1415
using Moq;
@@ -272,4 +273,133 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
272273
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
273274
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization).value);
274275
}
276+
277+
// Verifies that a FileNotFoundException raised while reading memory usage suppresses the
// memory measurement, that no measurement is published one minute later, and that the
// measurement is published again once a further five minutes have elapsed.
[Fact]
public void Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers()
{
    var logger = new FakeLogger<LinuxUtilizationProvider>();
    var options = Options.Options.Create(new ResourceMonitoringOptions());
    using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers));
    var meterFactoryMock = new Mock<IMeterFactory>();
    meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
        .Returns(meter);

    // The first memory-usage read throws; every later read succeeds.
    var callCount = 0;
    var parserMock = new Mock<ILinuxUtilizationParser>();
    parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
    {
        callCount++;
        if (callCount <= 1)
        {
            throw new FileNotFoundException("Simulated failure to read file");
        }

        return 420UL;
    });
    parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(1000UL);
    parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
    parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);

    var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);
    var provider = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);

    using var listener = new MeterListener
    {
        InstrumentPublished = (instrument, meterListener) =>
        {
            if (ReferenceEquals(meter, instrument.Meter))
            {
                meterListener.EnableMeasurementEvents(instrument);
            }
        }
    };

    var samples = new List<(Instrument instrument, double value)>();
    listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
    {
        if (ReferenceEquals(meter, instrument.Meter))
        {
            samples.Add((instrument, value));
        }
    });

    listener.Start();

    // Initial observation: the read fails, so nothing is published.
    listener.RecordObservableInstruments();
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // One minute after the failure: still nothing is published.
    fakeTime.Advance(TimeSpan.FromMinutes(1));
    listener.RecordObservableInstruments();
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // Six minutes after the failure: the read is attempted again and succeeds.
    fakeTime.Advance(TimeSpan.FromMinutes(5));
    listener.RecordObservableInstruments();

    // Single (rather than SingleOrDefault) fails explicitly when the sample is missing.
    var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
    Assert.Equal(0.42, metric.value);

    parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(2));
}
342+
343+
// Verifies that an exception type the retry logic does not handle (InvalidOperationException)
// surfaces from RecordObservableInstruments as an AggregateException, and that the memory
// measurement is published on the next observation only one minute later.
[Fact]
public void Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads()
{
    var logger = new FakeLogger<LinuxUtilizationProvider>();
    var options = Options.Options.Create(new ResourceMonitoringOptions());
    using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads));
    var meterFactoryMock = new Mock<IMeterFactory>();
    meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
        .Returns(meter);

    // The first two memory-usage reads throw; every later read succeeds.
    var callCount = 0;
    var parserMock = new Mock<ILinuxUtilizationParser>();
    parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
    {
        callCount++;
        if (callCount <= 2)
        {
            throw new InvalidOperationException("Simulated unhandled exception");
        }

        return 1234UL;
    });
    parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(2000UL);
    parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
    parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);

    var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);
    var provider = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);

    using var listener = new MeterListener
    {
        InstrumentPublished = (instrument, meterListener) =>
        {
            if (ReferenceEquals(meter, instrument.Meter))
            {
                meterListener.EnableMeasurementEvents(instrument);
            }
        }
    };

    var samples = new List<(Instrument instrument, double value)>();
    listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
    {
        if (ReferenceEquals(meter, instrument.Meter))
        {
            samples.Add((instrument, value));
        }
    });

    listener.Start();

    // Initial observation: the unhandled exception propagates and nothing is published.
    Assert.Throws<AggregateException>(() => listener.RecordObservableInstruments());
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // One minute later the read succeeds and the measurement is published.
    fakeTime.Advance(TimeSpan.FromMinutes(1));
    listener.RecordObservableInstruments();

    // Single (rather than SingleOrDefault) fails explicitly when the sample is missing.
    var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
    Assert.Equal(1234.0 / 2000.0, metric.value, 0.01);

    parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(3));
}
275405
}

0 commit comments

Comments
 (0)