diff --git a/.github/workflows/odd-memory-usage-test.yaml b/.github/workflows/odd-memory-usage-test.yaml
new file mode 100644
index 00000000000..c8a3477889d
--- /dev/null
+++ b/.github/workflows/odd-memory-usage-test.yaml
@@ -0,0 +1,43 @@
+name: 'ODD Memory Usage Test'
+
+on:
+  schedule:
+    - cron: '30 12 * * *'
+  workflow_dispatch:
+
+jobs:
+  analyze-memory:
+    name: 'ODD Memory Usage Test'
+    runs-on: 'ubuntu-latest'
+    timeout-minutes: 15
+
+    steps:
+      - name: Checkout repository
+        uses: 'actions/checkout@v4'
+
+      - name: Setup Node.js
+        uses: 'actions/setup-node@v3'
+        with:
+          node-version: '18.19.0'
+          cache: 'yarn'
+
+      - name: Install dependencies
+        run: |
+          yarn install
+
+      - name: Run memory analysis
+        env:
+          MIXPANEL_INGEST_USER: ${{ secrets.MIXPANEL_INGEST_USER }}
+          MIXPANEL_INGEST_SECRET: ${{ secrets.MIXPANEL_INGEST_SECRET }}
+          MIXPANEL_PROJECT_ID: ${{ secrets.OT_APP_MIXPANEL_ID }}
+        run: |
+          OUTPUT=$(node ./scripts/resource-monitor/perform-memory-analysis "$MIXPANEL_INGEST_USER" "$MIXPANEL_INGEST_SECRET" "$MIXPANEL_PROJECT_ID")
+
+          echo "::group::ODD Memory Usage Results"
+          echo "$OUTPUT"
+          echo "::endgroup::"
+
+          echo "## ODD Memory Usage Results" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+          echo "$OUTPUT" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/scripts/resource-monitor/lib/analysis.js b/scripts/resource-monitor/lib/analysis.js
new file mode 100644
index 00000000000..9aaefc46ff5
--- /dev/null
+++ b/scripts/resource-monitor/lib/analysis.js
@@ -0,0 +1,84 @@
+// Analysis is based on Pearson's correlation coefficient with a one-tailed significance test.
+
+const { MINIMUM_VALID_SAMPLE_SIZE } = require('./constants')
+
+/**
+ * @description Compute the Pearson correlation coefficient between two equal-length arrays.
+ * @param x An array of numbers.
+ * @param y An array of numbers.
+ * @return {number} The Pearson correlation coefficient.
+ */
+function calculatePearsonCorrelation(x, y) {
+  const n = x.length
+  let sum_x = 0
+  let sum_y = 0
+  let sum_xy = 0
+  let sum_x2 = 0
+  let sum_y2 = 0
+
+  for (let i = 0; i < n; i++) {
+    sum_x += x[i]
+    sum_y += y[i]
+    sum_xy += x[i] * y[i]
+    sum_x2 += x[i] * x[i]
+    sum_y2 += y[i] * y[i]
+  }
+
+  const numerator = n * sum_xy - sum_x * sum_y
+  const denominator = Math.sqrt(
+    (n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)
+  )
+
+  return denominator === 0 ? 0 : numerator / denominator
+}
+
+/**
+ * @description Calculate the p-value using a t-distribution approximation for a one-tailed test.
+ * If there are too few samples, assume no correlation.
+ * For positive correlations only.
+ * @param correlation The Pearson correlation coefficient.
+ * @param sampleSize The total number of samples.
+ * @return {number} The p-value.
+ */
+function calculatePValueOneTailed(correlation, sampleSize) {
+  if (sampleSize < MINIMUM_VALID_SAMPLE_SIZE) {
+    return 1
+  }
+
+  // The t-statistic
+  const t =
+    correlation * Math.sqrt((sampleSize - 2) / (1 - correlation * correlation))
+
+  // Approximate p-value using the t-distribution (one-tailed test)
+  const degreesOfFreedom = sampleSize - 2
+  return 1 - tDistributionCDF(t, degreesOfFreedom)
+}
+
+// t-distribution CDF approximation
+function tDistributionCDF(t, df) {
+  const x = df / (df + t * t)
+  return 1 - 0.5 * Math.pow(x, df / 2)
+}
+
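+/**
+ * @description Map a correlation result to a human-readable summary string.
+ * @param result Object with `correlation` and `isSignificant` fields.
+ * @return {string} The interpretation of the correlation.
+ */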
+function interpretResults(result) {
+  if (!result.isSignificant) {
+    return 'No significant correlation found'
+  }
+
+  const strength = Math.abs(result.correlation)
+  const direction = result.correlation > 0 ? 'positive' : 'negative'
+
+  if (strength > 0.7) {
+    return `Strong ${direction} correlation (>0.7)`
+  } else if (strength > 0.3) {
+    return `Moderate ${direction} correlation (>0.3 and <=0.7)`
+  } else {
+    return `Weak ${direction} correlation (<=0.3)`
+  }
+}
+
+module.exports = {
+  calculatePearsonCorrelation,
+  calculatePValueOneTailed,
+  interpretResults,
+}
diff --git a/scripts/resource-monitor/lib/constants.js b/scripts/resource-monitor/lib/constants.js
new file mode 100644
index 00000000000..6a109caf56f
--- /dev/null
+++ b/scripts/resource-monitor/lib/constants.js
@@ -0,0 +1,41 @@
+/**
+ * @description Several processes we care about execute with a lot of unique sub-args determined at
+ * runtime. These processes are aggregated using a regex pattern.
+ */
+const AGGREGATED_PROCESSES = [
+  {
+    pattern: /^\/opt\/opentrons-app\/opentrons --type=renderer/,
+    key: 'app-renderer-processes',
+  },
+  {
+    pattern: /^\/opt\/opentrons-app\/opentrons --type=zygote/,
+    key: 'app-zygote-processes',
+  },
+  {
+    pattern: /^python3 -m uvicorn/,
+    key: 'robot-server-uvicorn-processes',
+  },
+  {
+    pattern: /^\/opt\/opentrons-app\/opentrons --type=utility/,
+    key: 'app-utility-processes',
+  },
+]
+
+/**
+ * @description Generally don't include any variation of these external processes in analysis.
+ */
+const BLACKLISTED_PROCESSES = [/^nmcli/, /^\/usr\/bin\/python3/]
+
+/**
+ * @description For Pearson's, it's generally recommended to use a sample size of at least n=30.
+ */
+const MINIMUM_VALID_SAMPLE_SIZE = 30
+
+const P_VALUE_SIGNIFICANCE_THRESHOLD = 0.05
+
+module.exports = {
+  AGGREGATED_PROCESSES,
+  BLACKLISTED_PROCESSES,
+  MINIMUM_VALID_SAMPLE_SIZE,
+  P_VALUE_SIGNIFICANCE_THRESHOLD,
+}
diff --git a/scripts/resource-monitor/lib/helpers/date.js b/scripts/resource-monitor/lib/helpers/date.js
new file mode 100644
index 00000000000..fa0bda62d51
--- /dev/null
+++ b/scripts/resource-monitor/lib/helpers/date.js
@@ -0,0 +1,22 @@
+/**
+ * @description Get ISO date strings for the past month, ending yesterday.
+ */
+function getISODatesForPastMonth() {
+  const now = new Date()
+  // Don't use today's data, because the Mixpanel API seemingly doesn't use UTC timestamps, and
+  // it's easy to fail a request depending on the time of day it's made.
+  const yesterday = new Date(now.setDate(now.getDate() - 1))
+  const formatDate = date => date.toISOString().split('T')[0]
+
+  const monthAgo = new Date(yesterday)
+  monthAgo.setMonth(yesterday.getMonth() - 1)
+
+  return {
+    from: formatDate(monthAgo),
+    to: formatDate(yesterday),
+  }
+}
+
+module.exports = {
+  getISODatesForPastMonth,
+}
diff --git a/scripts/resource-monitor/lib/helpers/index.js b/scripts/resource-monitor/lib/helpers/index.js
new file mode 100644
index 00000000000..71991e2d09f
--- /dev/null
+++ b/scripts/resource-monitor/lib/helpers/index.js
@@ -0,0 +1,5 @@
+module.exports = {
+  ...require('./date'),
+  ...require('./manifest'),
+  ...require('./mixpanel'),
+}
diff --git a/scripts/resource-monitor/lib/helpers/manifest.js b/scripts/resource-monitor/lib/helpers/manifest.js
new file mode 100644
index 00000000000..3c91e9d7192
--- /dev/null
+++ b/scripts/resource-monitor/lib/helpers/manifest.js
@@ -0,0 +1,52 @@
+const fetch = require('node-fetch')
+
+const APP_MANIFEST = 'https://builds.opentrons.com/app/releases.json'
+
+async function downloadAppManifest() {
+  const response = await fetch(APP_MANIFEST)
+  return await response.json()
+}
+
+/**
+ * @description Get the most recent app version that is not revoked.
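+ * Assumes the keys of `manifest.production` are ordered oldest to newest, so the last
+ * non-revoked key is the most recent release.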
+ * @param manifest The app manifest
+ */
+function latestValidVersionFromManifest(manifest) {
+  const versions = Object.keys(manifest.production)
+  const latestValidVersion = versions.findLast(
+    version => manifest.production[version].revoked === false
+  )
+
+  if (latestValidVersion != null) {
+    return latestValidVersion
+  } else {
+    throw new Error('No valid versions found')
+  }
+}
+
+/**
+ * @description Get the `count` most recent non-revoked versions that precede `latestVersion`.
+ * @param manifest The app manifest
+ * @param latestVersion The latest valid version
+ * @param count Number of previous versions to return
+ * @returns {string[]} Array of version strings, ordered from newest to oldest
+ */
+function getPrevValidVersions(manifest, latestVersion, count) {
+  const versions = Object.keys(manifest.production)
+  const latestIndex = versions.indexOf(latestVersion)
+
+  if (latestIndex === -1) {
+    throw new Error('Latest version not found in manifest')
+  }
+
+  return versions
+    .slice(0, latestIndex)
+    .filter(version => !manifest.production[version].revoked)
+    .slice(-count)
+    .reverse()
+}
+module.exports = {
+  downloadAppManifest,
+  latestValidVersionFromManifest,
+  getPrevValidVersions,
+}
diff --git a/scripts/resource-monitor/lib/helpers/mixpanel.js b/scripts/resource-monitor/lib/helpers/mixpanel.js
new file mode 100644
index 00000000000..a518d1f4b40
--- /dev/null
+++ b/scripts/resource-monitor/lib/helpers/mixpanel.js
@@ -0,0 +1,65 @@
+const fetch = require('node-fetch')
+
+const MIXPANEL_URL = 'https://data.mixpanel.com/api/2.0/export'
+
+/**
+ * @description Base64 encode a username and password into a Basic auth credential string.
+ * @param uname Mixpanel service account username.
+ * @param pwd Mixpanel service account password.
+ * @return {string} The Base64-encoded credentials.
+ */
+function encodeCredentialsForMixpanel(uname, pwd) {
+  return Buffer.from(`${uname}:${pwd}`).toString('base64')
+}
+
+/**
+ * @description Clean up raw Mixpanel export data (newline-delimited JSON) into an array of event objects.
+ * @param data Mixpanel data
+ */
+function parseMixpanelData(data) {
+  const lines = data.split('\n').filter(line => line.trim())
+  return lines.map(line => JSON.parse(line))
+}
+
+/**
+ * @description Request resourceMonitorReport events from the Mixpanel export API.
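+ * @param uname Mixpanel service account username.
+ * @param pwd Mixpanel service account password.
+ * @param projectId Mixpanel project id.
+ * @param fromDate Start date (YYYY-MM-DD) of the export window.
+ * @param toDate End date (YYYY-MM-DD) of the export window.
+ * @param where Mixpanel segmentation expression used to filter events.
+ * @return {string} The raw, newline-delimited JSON response body.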
+ */
+async function getMixpanelResourceMonitorDataFor({
+  uname,
+  pwd,
+  projectId,
+  fromDate,
+  toDate,
+  where,
+}) {
+  const params = new URLSearchParams({
+    project_id: projectId,
+    from_date: fromDate,
+    to_date: toDate,
+    event: '["resourceMonitorReport"]',
+    where,
+  })
+
+  const options = {
+    method: 'GET',
+    headers: {
+      'Accept-Encoding': 'gzip',
+      accept: 'text/plain',
+      authorization: `Basic ${encodeCredentialsForMixpanel(uname, pwd)}`,
+    },
+  }
+
+  const response = await fetch(`${MIXPANEL_URL}?${params}`, options)
+  const text = await response.text()
+  if (!response.ok) {
+    throw new Error(
+      `Mixpanel request failed: ${response.status}, ${response.statusText}, ${text}`
+    )
+  }
+  return text
+}
+
+module.exports = {
+  getMixpanelResourceMonitorDataFor,
+  parseMixpanelData,
+}
diff --git a/scripts/resource-monitor/perform-memory-analysis/analyzeMemoryTrends.js b/scripts/resource-monitor/perform-memory-analysis/analyzeMemoryTrends.js
new file mode 100644
index 00000000000..c70914e1e3e
--- /dev/null
+++ b/scripts/resource-monitor/perform-memory-analysis/analyzeMemoryTrends.js
@@ -0,0 +1,296 @@
+const {
+  parseMixpanelData,
+  getISODatesForPastMonth,
+  getMixpanelResourceMonitorDataFor,
+  downloadAppManifest,
+  getPrevValidVersions,
+  latestValidVersionFromManifest,
+} = require('../lib/helpers')
+const {
+  calculatePearsonCorrelation,
+  calculatePValueOneTailed,
+  interpretResults,
+} = require('../lib/analysis')
+const {
+  AGGREGATED_PROCESSES,
+  BLACKLISTED_PROCESSES,
+  MINIMUM_VALID_SAMPLE_SIZE,
+  P_VALUE_SIGNIFICANCE_THRESHOLD,
+} = require('../lib/constants')
+
+const UPTIME_BUCKETS = [
+  { min: 0, max: 20, label: '0-20hrs' },
+  { min: 20, max: 40, label: '20-40hrs' },
+  { min: 40, max: 60, label: '40-60hrs' },
+  { min: 60, max: 80, label: '60-80hrs' },
+  { min: 80, max: 120, label: '80-120hrs' },
+  { min: 120, max: 240, label: '120-240hrs' },
+  { min: 240, max: Infinity, label: '240+hrs' },
+]
+
+/**
+ * @description Calculate average memory usage for measurements within a specific uptime range.
+ * @param measurements Array of measurements with uptime and a memory metric
+ * @param minHours Minimum hours (inclusive)
+ * @param maxHours Maximum hours (exclusive)
+ * @param memoryMetric The field to average ('memRssMb' or 'systemAvailMemMb')
+ * @returns {number | null} Average memory usage, or null if there are too few measurements in range
+ */
+function calculateAverageMemoryForRange(
+  measurements,
+  minHours,
+  maxHours,
+  memoryMetric = 'memRssMb'
+) {
+  const inRange = measurements.filter(
+    m => m.uptime >= minHours && m.uptime < maxHours
+  )
+
+  if (inRange.length === 0 || inRange.length < MINIMUM_VALID_SAMPLE_SIZE) {
+    return null
+  }
+
+  const sum = inRange.reduce((acc, m) => acc + m[memoryMetric], 0)
+  return sum / inRange.length
+}
+
+/**
+ * @description Calculate memory usage averages across all defined uptime ranges.
+ * @param measurements Array of measurements with uptime and the memory metric
+ * @param memoryMetric The field to average ('memRssMb' or 'systemAvailMemMb')
+ * @returns {Object} Contains averages for each range
+ */
+function calculateRangeAverages(measurements, memoryMetric = 'memRssMb') {
+  const averages = {}
+  UPTIME_BUCKETS.forEach(range => {
+    const avg = calculateAverageMemoryForRange(
+      measurements,
+      range.min,
+      range.max,
+      memoryMetric
+    )
+    averages[range.label] =
+      avg !== null ? avg.toFixed(2) : 'N/A - Not enough data available.'
+  })
+  return averages
+}
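+
+// The object returned by calculateRangeAverages is keyed by uptime bucket label, e.g.:
+//   { '0-20hrs': '123.45', '20-40hrs': 'N/A - Not enough data available.', ... }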
+ }) + return averages +} + +/** + * @description Filter the Mixpanel data for the data relevant for memory analysis, aggregating data for certain processes + * and ignoring data for blacklisted processes. + * @param data Mixpanel data. + * @return A tuple of memory data by process and general ODD system memory. + */ +function processMixpanelData(data) { + const processByName = new Map() + const systemMemory = [] + + data.forEach(entry => { + const { + systemUptimeHrs, + systemAvailMemMb, + processesDetails, + } = entry.properties + const uptime = parseFloat(systemUptimeHrs) + + // Validate uptime before adding any measurements + if (isNaN(uptime)) { + return + } + + // Ensure system mem is a valid number before adding it. + const availMemMb = parseFloat(systemAvailMemMb) + if (!isNaN(availMemMb)) { + systemMemory.push({ + uptime, + systemAvailMemMb: availMemMb, + }) + } + + processesDetails.forEach(process => { + const isBlacklisted = BLACKLISTED_PROCESSES.some(pattern => + pattern.test(process.name) + ) + + if (!isBlacklisted) { + let processKey = process.name + // Certain processes are aggregated. + for (const { pattern, key } of AGGREGATED_PROCESSES) { + if (pattern.test(process.name)) { + processKey = key + break + } + } + + const memRssMb = parseFloat(process.memRssMb) + if (!isNaN(memRssMb)) { + if (!processByName.has(processKey)) { + processByName.set(processKey, []) + } + processByName.get(processKey).push({ + memRssMb, + uptime, + }) + } + } + }) + }) + + return [processByName, systemMemory] +} + +/** + * @description Group data by process name and calculate correlation and range averages + * @param data See `analyzeMemoryTrends` + */ +function analyzeProcessMemoryTrends(data) { + // Create a map containing relevant data for each process + const [processByName, systemMemory] = processMixpanelData(data) + + // Filter out any process that has less than the minimum sample size + for (const [processName, measurements] of processByName.entries()) { + if (measurements.length < MINIMUM_VALID_SAMPLE_SIZE) { + processByName.delete(processName) + } + } + + // Calculate correlation coefficient and range averages for each process + const results = new Map() + processByName.forEach((measurements, processName) => { + const correlation = calculatePearsonCorrelation( + measurements.map(m => m.uptime), + measurements.map(m => m.memRssMb) + ) + + const pValue = calculatePValueOneTailed(correlation, measurements.length) + const rangeAverages = calculateRangeAverages(measurements, 'memRssMb') + + results.set(processName, { + correlation, + sampleSize: measurements.length, + isSignificant: pValue < P_VALUE_SIGNIFICANCE_THRESHOLD, + rangeAverages, + }) + }) + + // Calculate system memory metrics + const systemMemoryCorrelation = calculatePearsonCorrelation( + systemMemory.map(m => m.systemAvailMemMb), + systemMemory.map(m => m.uptime) + ) + const systemMemoryPValue = calculatePValueOneTailed( + systemMemoryCorrelation, + systemMemory.length + ) + results.set('odd-available-memory', { + correlation: systemMemoryCorrelation, + sampleSize: systemMemory.length, + isSignificant: systemMemoryPValue < P_VALUE_SIGNIFICANCE_THRESHOLD, + rangeAverages: calculateRangeAverages(systemMemory, 'systemAvailMemMb'), + }) + + // Filter out any process with a negative correlation + for (const [processName, memResults] of results.entries()) { + if (memResults.correlation < 0 && processName !== 'odd-available-memory') { + results.delete(processName) + } + } + + return results +} + +/** + * @description Post-process 
+/**
+ * @description Post-process Mixpanel data, returning statistical summaries per process.
+ * @param mixpanelData Each entry is expected to contain a top-level 'properties' field with relevant subfields.
+ */
+function analyzeMemoryTrends(mixpanelData) {
+  const parsedData = parseMixpanelData(mixpanelData)
+  const results = analyzeProcessMemoryTrends(parsedData)
+
+  const analysis = {}
+  results.forEach((result, processName) => {
+    analysis[processName] = {
+      correlation: result.correlation.toFixed(4),
+      sampleSize: result.sampleSize,
+      interpretation: interpretResults(result),
+      averageMemoryMbByUptime: result.rangeAverages,
+    }
+  })
+
+  return analysis
+}
+
+/**
+ * @description Build the 'where' segmentation expression used to filter Mixpanel data by app version and mode.
+ */
+function buildWhere(version) {
+  return `properties["appVersion"]=="${version}" and properties["appMode"]=="ODD"`
+}
+
+/**
+ * @description Analyze memory trends across multiple versions
+ * @param {number} previousVersionCount Number of previous versions to analyze
+ * @param {string} uname Mixpanel service account username.
+ * @param {string} pwd Mixpanel service account password.
+ * @param {string} projectId Mixpanel project id.
+ */
+async function analyzeMemoryTrendsAcrossVersions({
+  previousVersionCount,
+  uname,
+  pwd,
+  projectId,
+}) {
+  const manifest = await downloadAppManifest()
+  const latestValidVersion = latestValidVersionFromManifest(manifest)
+  const prevValidVersions = getPrevValidVersions(
+    manifest,
+    latestValidVersion,
+    previousVersionCount
+  )
+  const analysisPeriod = getISODatesForPastMonth()
+
+  // Populate backup messaging if there's no data available for a specific version
+  const noDataAvailableStr = 'N/A - No data available'
+  const results = {
+    [latestValidVersion]: noDataAvailableStr,
+  }
+  prevValidVersions.forEach(version => {
+    results[version] = noDataAvailableStr
+  })
+
+  // Analyze latest version
+  const currentVersionData = await getMixpanelResourceMonitorDataFor({
+    version: latestValidVersion,
+    uname,
+    pwd,
+    projectId,
+    fromDate: analysisPeriod.from,
+    toDate: analysisPeriod.to,
+    where: buildWhere(latestValidVersion),
+  })
+
+  if (currentVersionData) {
+    results[latestValidVersion] = analyzeMemoryTrends(currentVersionData)
+  }
+
+  // Analyze previous versions
+  for (const version of prevValidVersions) {
+    const versionData = await getMixpanelResourceMonitorDataFor({
+      version,
+      uname,
+      pwd,
+      projectId,
+      fromDate: analysisPeriod.from,
+      toDate: analysisPeriod.to,
+      where: buildWhere(version),
+    })
+
+    if (versionData) {
+      results[version] = analyzeMemoryTrends(versionData)
+    }
+  }
+
+  return results
+}
+
+module.exports = { analyzeMemoryTrendsAcrossVersions }
diff --git a/scripts/resource-monitor/perform-memory-analysis/index.js b/scripts/resource-monitor/perform-memory-analysis/index.js
new file mode 100644
index 00000000000..09578e5db5a
--- /dev/null
+++ b/scripts/resource-monitor/perform-memory-analysis/index.js
@@ -0,0 +1,44 @@
+// A script to analyze memory usage data for the ODD and select ODD processes using Pearson's correlation coefficient.
+// Processes with a negative correlation are filtered out. Select processes are aggregated and others are blacklisted
+// from analysis.
+// NOTE: While averages are reported in "buckets", Pearson's does not compare buckets when calculating a correlation.
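+// Results are logged to stdout per app version; versions with no Mixpanel data report 'N/A - No data available'.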
+'use strict'
+
+const assert = require('assert')
+
+const { analyzeMemoryTrendsAcrossVersions } = require('./analyzeMemoryTrends')
+
+const NUM_ANALYZED_PREV_VERSIONS = 2
+const USAGE =
+  '\nUsage:\n node ./scripts/resource-monitor/perform-memory-analysis <uname> <pwd> <projectId>'
+
+async function main() {
+  try {
+    const [uname, pwd, projectId] = process.argv
+      .filter(a => !a.startsWith('-'))
+      .slice(2)
+      .map(s => s.trim())
+
+    assert(uname && pwd && projectId, USAGE)
+
+    const memoryAnalysis = await analyzeMemoryTrendsAcrossVersions({
+      previousVersionCount: NUM_ANALYZED_PREV_VERSIONS,
+      uname,
+      pwd,
+      projectId,
+    })
+
+    console.log(
+      '\nODD Available Memory and Processes with Increasing Memory Trend by Version (Rolling 1 Month Analysis Window):'
+    )
+    Object.entries(memoryAnalysis).forEach(([version, analysis]) => {
+      console.log(`\n${version}:`, JSON.stringify(analysis, null, 2))
+    })
+  } catch (error) {
+    console.error('Error during analysis:', error)
+    // Surface the failure to the caller (e.g., CI) with a non-zero exit code.
+    process.exitCode = 1
+  }
+}
+
+if (require.main === module) {
+  main()
+}
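To sanity-check the statistics helpers locally, a sketch like the one below can be saved at the repository root and run with `node check-analysis.js` (the file name and the synthetic data are illustrative; only `calculatePearsonCorrelation` and `calculatePValueOneTailed` come from `lib/analysis.js` above). Because `calculatePValueOneTailed` pins the p-value to 1 below `MINIMUM_VALID_SAMPLE_SIZE` (30) samples, the example uses 40 points.

```js
// check-analysis.js: a quick, illustrative local sanity check for the statistics helpers.
const {
  calculatePearsonCorrelation,
  calculatePValueOneTailed,
} = require('./scripts/resource-monitor/lib/analysis')

// Synthetic measurements: RSS that grows roughly linearly with uptime, plus a little noise.
const uptimeHrs = Array.from({ length: 40 }, (_, i) => i)
const memRssMb = uptimeHrs.map(h => 100 + 2 * h + Math.random())

const r = calculatePearsonCorrelation(uptimeHrs, memRssMb)
const p = calculatePValueOneTailed(r, uptimeHrs.length)

// Expect a correlation near 1 and a p-value near 0 for this strongly increasing trend.
console.log(`correlation=${r.toFixed(4)} pValue=${p.toFixed(6)}`)
```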