Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve CPU resource monitor for Linux systems #4888

Merged
merged 23 commits into from
Aug 14, 2024
Merged
Changes from 10 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
efcca1e
Replace free call on linux
vmapetr Jul 8, 2024
c66abe8
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Jul 10, 2024
1e59693
Minor fixes
vmapetr Jul 10, 2024
ea9dfae
Rework CPU info for linux
vmapetr Jul 10, 2024
cf1a22f
Fix typos
vmapetr Jul 10, 2024
ef9e60b
Fix issue with memory monitor
vmapetr Jul 11, 2024
f6ec823
Add comments
vmapetr Jul 11, 2024
e657974
Remove task run delegates
vmapetr Jul 12, 2024
5cda622
Fix for macos
vmapetr Jul 13, 2024
3e053ad
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Jul 15, 2024
ffb29f7
Expand comments for metrics methods
vmapetr Jul 15, 2024
ffb239b
Remove unnecessary cancellation exits
vmapetr Jul 15, 2024
621c110
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Jul 18, 2024
dbd552c
Add FF to disable resource monitor on debug runs
vmapetr Jul 18, 2024
f053b8d
Minor fix
vmapetr Jul 18, 2024
ca67e50
Fix warning
vmapetr Jul 18, 2024
dc2f5a2
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Jul 18, 2024
7a583be
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Jul 31, 2024
677ddba
Merge branch 'master' into v-mpetrov/us-2190905-improve-resource-monitor
vmapetr Aug 12, 2024
79022e0
Replace DisableResourceMonitorDebugOutput with EnableResourceMonitorD…
vmapetr Aug 12, 2024
f6e0041
Fix typo
vmapetr Aug 13, 2024
ea4e9b4
Update variable name
vmapetr Aug 13, 2024
9c0f83a
Fix typo
vmapetr Aug 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 106 additions & 137 deletions src/Agent.Worker/ResourceMetricsManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ private void PublishTelemetry(string message, string taskId)
#endregion

#region MetricMethods
private async Task GetCpuInfoAsync()
private async Task GetCpuInfoAsync(CancellationToken cancellationToken)
{
if (_cpuInfo.Updated >= DateTime.Now - TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL))
{
Expand All @@ -120,13 +120,6 @@ private async Task GetCpuInfoAsync()

if (PlatformUtil.RunningOnWindows)
{
using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

await Task.Run(() =>
{
using var query = new ManagementObjectSearcher("SELECT PercentIdleTime FROM Win32_PerfFormattedData_PerfOS_Processor WHERE Name=\"_Total\"");
Expand All @@ -139,66 +132,55 @@ await Task.Run(() =>
_cpuInfo.Updated = DateTime.Now;
_cpuInfo.Usage = 100 - cpuInfoIdle;
}
}, linkedTokenSource.Token);
}, cancellationToken);
}

if (PlatformUtil.RunningOnLinux)
{
using var processInvoker = HostContext.CreateService<IProcessInvoker>();
List<float[]> samples = new();
int samplesCount = 10;

using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
// /proc/stat is updated in real time and reports CPU time counters accumulated over the whole system uptime,
// so we need to collect multiple samples and compare them to calculate the current CPU usage
for (int i = 0; i < samplesCount + 1; i++)
{
var processInvokerOutput = message.Data;

var cpuInfoNice = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[2]);
var cpuInfoIdle = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[4]);
var cpuInfoIOWait = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[5]);

lock (_cpuInfoLock)
string[] strings = await File.ReadAllLinesAsync("/proc/stat", cancellationToken);
if (cancellationToken.IsCancellationRequested)
{
_cpuInfo.Updated = DateTime.Now;
_cpuInfo.Usage = (double)(cpuInfoNice + cpuInfoIdle) * 100 / (cpuInfoNice + cpuInfoIdle + cpuInfoIOWait);
return;
}
};

processInvoker.ErrorDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
samples.Add(strings[0]
.Split(' ', StringSplitOptions.RemoveEmptyEntries)
.Skip(1)
.Select(float.Parse)
.ToArray());

await Task.Delay(100, cancellationToken);
}

// Compute the deltas of the idle and total CPU time counters between consecutive samples
// and average the results to get the current CPU utilization
double cpuUsage = 0.0;
for (int i = 1; i < samplesCount + 1; i++)
{
Trace.Error($"Error on receiving CPU info: {message.Data}");
};
double idle = samples[i][3] - samples[i - 1][3];
merlynomsft marked this conversation as resolved.
Show resolved Hide resolved
double total = samples[i].Sum() - samples[i - 1].Sum();

var filePath = "grep";
var arguments = "\"cpu \" /proc/stat";
await processInvoker.ExecuteAsync(
workingDirectory: string.Empty,
fileName: filePath,
arguments: arguments,
environment: null,
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: true,
cancellationToken: linkedTokenSource.Token);
}
cpuUsage += 1.0 - (idle / total);
}

lock (_cpuInfoLock)
{
_cpuInfo.Updated = DateTime.Now;
_cpuInfo.Usage = (cpuUsage / samplesCount) * 100;
}
}
if (PlatformUtil.RunningOnMacOS)
{
List<string> outputs = new List<string>();

using var processInvoker = HostContext.CreateService<IProcessInvoker>();

using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

List<string> outputs = new List<string>();
processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
{
outputs.Add(message.Data);
Expand All @@ -219,7 +201,12 @@ await processInvoker.ExecuteAsync(
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: true,
cancellationToken: linkedTokenSource.Token);
cancellationToken: cancellationToken);

if (cancellationToken.IsCancellationRequested)
{
return;
}

// Use second sample for more accurate calculation
var cpuInfoIdle = double.Parse(outputs[1].Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[6].Trim('%'));
Expand Down Expand Up @@ -251,7 +238,7 @@ private void GetDiskInfo()
}
}

private async Task GetMemoryInfoAsync()
private async Task GetMemoryInfoAsync(CancellationToken cancellationToken)
{
if (_memoryInfo.Updated >= DateTime.Now - TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL))
{
Expand All @@ -260,13 +247,6 @@ private async Task GetMemoryInfoAsync()

if (PlatformUtil.RunningOnWindows)
{
using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

await Task.Run(() =>
{
using var query = new ManagementObjectSearcher("SELECT FreePhysicalMemory, TotalVisibleMemorySize FROM CIM_OperatingSystem");
Expand All @@ -281,69 +261,27 @@ await Task.Run(() =>
_memoryInfo.TotalMemoryMB = totalMemory / 1024;
_memoryInfo.UsedMemoryMB = (totalMemory - freeMemory) / 1024;
}
}, linkedTokenSource.Token);
}, cancellationToken);
}

if (PlatformUtil.RunningOnLinux)
{
// Some compact Linux distributions like UBI may not have "free" utility installed, or it may have a custom output
// We don't want to break currently existing pipelines with ADO warnings
// so related errors thrown here will be sent to the trace or debug logs by caller methods

using var processInvoker = HostContext.CreateService<IProcessInvoker>();

using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
string[] memoryInfo = await File.ReadAllLinesAsync("/proc/meminfo", cancellationToken);
if (cancellationToken.IsCancellationRequested)
{
if (!message.Data.StartsWith("Mem:"))
{
return;
}

var processInvokerOutputString = message.Data;
var memoryInfoString = processInvokerOutputString.Split(" ", StringSplitOptions.RemoveEmptyEntries);

if (memoryInfoString.Length != 7)
{
throw new Exception("\"free\" utility has non-default output");
}

lock (_memoryInfoLock)
{
_memoryInfo.Updated = DateTime.Now;
_memoryInfo.TotalMemoryMB = long.Parse(memoryInfoString[1]);
_memoryInfo.UsedMemoryMB = long.Parse(memoryInfoString[2]);
}
};
return;
}

processInvoker.ErrorDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
{
Trace.Error($"Error on receiving memory info: {message.Data}");
};
// The available memory counter from /proc/meminfo accounts for free, cached, and buffer memory,
// so it reflects the memory usage more accurately than the free memory counter
int totalMemory = int.Parse(memoryInfo[0].Split(" ", StringSplitOptions.RemoveEmptyEntries)[1]);
int availableMemory = int.Parse(memoryInfo[2].Split(" ", StringSplitOptions.RemoveEmptyEntries)[1]);
merlynomsft marked this conversation as resolved.
Show resolved Hide resolved

try
{
var filePath = "free";
var arguments = "-m";
await processInvoker.ExecuteAsync(
workingDirectory: string.Empty,
fileName: filePath,
arguments: arguments,
environment: null,
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: true,
cancellationToken: linkedTokenSource.Token);
}
catch (Win32Exception ex)
lock (_memoryInfoLock)
{
throw new Exception($"\"free\" utility is unavailable. Exception: {ex.Message}");
_memoryInfo.Updated = DateTime.Now;
_memoryInfo.TotalMemoryMB = totalMemory / 1024;
_memoryInfo.UsedMemoryMB = (totalMemory - availableMemory) / 1024;
}
}

Expand All @@ -353,17 +291,9 @@ await processInvoker.ExecuteAsync(
// but unfortunately it returns values in pages and has no built-in arguments for custom output
// so we need to parse and cast the output manually

List<string> outputs = new List<string>();

using var processInvoker = HostContext.CreateService<IProcessInvoker>();

using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

List<string> outputs = new List<string>();
processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message)
{
outputs.Add(message.Data);
Expand All @@ -383,7 +313,12 @@ await processInvoker.ExecuteAsync(
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: true,
cancellationToken: linkedTokenSource.Token);
cancellationToken: cancellationToken);

if (cancellationToken.IsCancellationRequested)
{
return;
}

var pageSize = int.Parse(outputs[0].Split(" ", StringSplitOptions.RemoveEmptyEntries)[7]);

Expand All @@ -408,11 +343,11 @@ await processInvoker.ExecuteAsync(
#endregion

#region StringMethods
private async Task<string> GetCpuInfoStringAsync()
private async Task<string> GetCpuInfoStringAsync(CancellationToken cancellationToken)
{
try
{
await GetCpuInfoAsync();
await GetCpuInfoAsync(cancellationToken);

return StringUtil.Loc("ResourceMonitorCPUInfo", $"{_cpuInfo.Usage:0.00}");
}
Expand All @@ -428,21 +363,26 @@ private string GetDiskInfoString()
{
GetDiskInfo();

return StringUtil.Loc("ResourceMonitorDiskInfo", _diskInfo.VolumeRoot, $"{_diskInfo.FreeDiskSpaceMB:0.00}", $"{_diskInfo.TotalDiskSpaceMB:0.00}");
return StringUtil.Loc("ResourceMonitorDiskInfo",
_diskInfo.VolumeRoot,
$"{_diskInfo.FreeDiskSpaceMB:0.00}",
$"{_diskInfo.TotalDiskSpaceMB:0.00}");
}
catch (Exception ex)
{
return StringUtil.Loc("ResourceMonitorDiskInfoError", ex.Message);
}
}

private async Task<string> GetMemoryInfoStringAsync()
private async Task<string> GetMemoryInfoStringAsync(CancellationToken cancellationToken)
{
try
{
await GetMemoryInfoAsync();
await GetMemoryInfoAsync(cancellationToken);

return StringUtil.Loc("ResourceMonitorMemoryInfo", $"{_memoryInfo.UsedMemoryMB:0.00}", $"{_memoryInfo.TotalMemoryMB:0.00}");
return StringUtil.Loc("ResourceMonitorMemoryInfo",
$"{_memoryInfo.UsedMemoryMB:0.00}",
$"{_memoryInfo.TotalMemoryMB:0.00}");
}
catch (Exception ex)
{
Expand All @@ -456,7 +396,17 @@ public async Task RunDebugResourceMonitorAsync()
{
while (!_context.CancellationToken.IsCancellationRequested)
{
_context.Debug(StringUtil.Loc("ResourceMonitorAgentEnvironmentResource", GetDiskInfoString(), await GetMemoryInfoStringAsync(), await GetCpuInfoStringAsync()));
using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

_context.Debug(StringUtil.Loc("ResourceMonitorAgentEnvironmentResource",
GetDiskInfoString(),
await GetMemoryInfoStringAsync(linkedTokenSource.Token),
await GetCpuInfoStringAsync(linkedTokenSource.Token)));

await Task.Delay(ACTIVE_MODE_INTERVAL, _context.CancellationToken);
}
Expand All @@ -475,7 +425,10 @@ public async Task RunDiskSpaceUtilizationMonitorAsync()

if (freeDiskSpacePercentage <= AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD)
{
_context.Warning(StringUtil.Loc("ResourceMonitorFreeDiskSpaceIsLowerThanThreshold", _diskInfo.VolumeRoot, AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD, $"{usedDiskSpacePercentage:0.00}"));
_context.Warning(StringUtil.Loc("ResourceMonitorFreeDiskSpaceIsLowerThanThreshold",
_diskInfo.VolumeRoot,
AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD,
$"{usedDiskSpacePercentage:0.00}"));

break;
}
Expand All @@ -495,15 +448,24 @@ public async Task RunMemoryUtilizationMonitorAsync()
{
while (!_context.CancellationToken.IsCancellationRequested)
{
using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

try
{
await GetMemoryInfoAsync();
await GetMemoryInfoAsync(linkedTokenSource.Token);

var usedMemoryPercentage = Math.Round(((_memoryInfo.UsedMemoryMB / (double)_memoryInfo.TotalMemoryMB) * 100.0), 2);

if (100.0 - usedMemoryPercentage <= AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD)
{
_context.Warning(StringUtil.Loc("ResourceMonitorMemorySpaceIsLowerThanThreshold", AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD, $"{usedMemoryPercentage:0.00}"));
_context.Warning(StringUtil.Loc("ResourceMonitorMemorySpaceIsLowerThanThreshold",
AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD,
$"{usedMemoryPercentage:0.00}"));

break;
}
Expand All @@ -523,9 +485,16 @@ public async Task RunCpuUtilizationMonitorAsync(string taskId)
{
while (!_context.CancellationToken.IsCancellationRequested)
{
using var timeoutTokenSource = new CancellationTokenSource();
timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL));

using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
_context.CancellationToken,
timeoutTokenSource.Token);

try
{
await GetCpuInfoAsync();
await GetCpuInfoAsync(linkedTokenSource.Token);

if (_cpuInfo.Usage >= CPU_UTILIZATION_PERCENTAGE_THRESHOLD)
{
Expand Down
Loading