Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide in the FWJR the CPU and wallclock time for CMSSW subprocesses #11665

Merged
merged 2 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/python/WMCore/FwkJobReport/Report.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,6 +1323,21 @@ def setAcquisitionProcessing(self, acquisitionEra, processingVer, processingStr=

return

def updateSubprocessInfo(self, sysTime, userTime, startTime, endTime):
"""
Add process information to WMCore FJR
vkuznet marked this conversation as resolved.
Show resolved Hide resolved
:param sysTime: systtem time reported by subprocess job
:param userTime: user time reported by subprocess job
:param startTime: start time of subprocess (in seconds)
:param endTime: endtime time of subprocess (in seconds)
"""
self.report.section_('WMCMSSWSubprocess')
self.report.WMCMSSWSubprocess.startTime = startTime
self.report.WMCMSSWSubprocess.endTime = endTime
self.report.WMCMSSWSubprocess.wallClockTime = endTime - startTime
self.report.WMCMSSWSubprocess.userTime = userTime
self.report.WMCMSSWSubprocess.sysTime = sysTime

def setValidStatus(self, validStatus):
"""
_setValidStatus_
Expand Down
31 changes: 24 additions & 7 deletions src/python/WMCore/WMSpec/Steps/Executors/CMSSW.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import socket
import subprocess
import sys
import time
from resource import getrusage, RUSAGE_SELF, RUSAGE_CHILDREN

from Utils.PythonVersion import PY3
from Utils.Utilities import encodeUnicodeToBytesConditional
Expand Down Expand Up @@ -243,9 +245,6 @@ def execute(self, emulator=None):
# spawn this new process
# the script looks for:
# <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
# open the output files
stdoutHandle = open(self.step.output.stdout, 'w')
stderrHandle = open(self.step.output.stderr, 'w')
args = ['/bin/bash',
configPath,
scramSetup,
Expand Down Expand Up @@ -273,7 +272,24 @@ def execute(self, emulator=None):

os.environ.update(envOverride)

returnCode = subprocess.call(args, stdout=stdoutHandle, stderr=stderrHandle)
# take snapshot of user and sys time before we call subprocess
startTime = time.time()
userTime0 = getrusage(RUSAGE_CHILDREN).ru_utime
sysTime0 = getrusage(RUSAGE_CHILDREN).ru_stime
returnCode = 1 # by default we assume process may fail
with open(self.step.output.stdout, 'w') as stdoutHandle, open(self.step.output.stderr, 'w') as stderrHandle:
proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
childPid, exitStatus, resource = os.wait4(proc.pid, os.P_PID)
stdout, stderr = proc.communicate()
stdoutHandle.write(str(stdout))
stderrHandle.write(str(stderr))
returnCode = proc.returncode

# calculate user and sys time of subprocess by substructing relevant
# parts take before we spawn the subprocess
userTime = getrusage(RUSAGE_CHILDREN).ru_utime - userTime0
sysTime = getrusage(RUSAGE_CHILDREN).ru_stime - sysTime0
endTime = time.time()
returnMessage = None

# Return PYTHONPATH to its original value, as this
Expand All @@ -300,10 +316,11 @@ def execute(self, emulator=None):
else:
self._setStatus(returnCode, returnMessage)

stdoutHandle.close()
stderrHandle.close()

try:
# update FJR report info with subprocess info
self.report.updateSubprocessInfo(sysTime, userTime, startTime, endTime)

# parse job report XML
self.report.parse(jobReportXML, stepName=self.stepName)
except Exception as ex:
msg = WM_JOB_ERROR_CODES[50115]
Expand Down
20 changes: 20 additions & 0 deletions test/python/WMCore_t/FwkJobReport_t/Report_t.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,26 @@ def testTaskJobID(self):

return

def test_SubprocessInfo(self):
"""
_SubprocessInfo_

Check CMSSW subprocess metrics
"""
report = Report("cmsRun1")
startTime = 0
endTime = 1
userTime = 1
sysTime = 1
report.updateSubprocessInfo(sysTime, userTime, startTime, endTime)
subinfo = report.retrieveStep("cmsRun1").WMCMSSWSubprocess
sdict = subinfo.dictionary_()
self.assertEqual(sdict['startTime'], startTime)
self.assertEqual(sdict['endTime'], endTime)
self.assertEqual(sdict['wallClockTime'], endTime-startTime)
self.assertEqual(sdict['userTime'], userTime)
self.assertEqual(sdict['sysTime'], sysTime)

def test_PerformanceReport(self):
"""
_PerformanceReport_
Expand Down
32 changes: 32 additions & 0 deletions test/python/WMCore_t/WMSpec_t/Steps_t/Executors_t/CMSSW_t.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
import inspect
import os
import shutil
import subprocess
import sys
import unittest
from resource import getrusage, RUSAGE_SELF, RUSAGE_CHILDREN

import WMCore_t.WMSpec_t.Steps_t as ModuleLocator

Expand Down Expand Up @@ -232,6 +234,36 @@ def testD_ExecuteNoOutput(self):
os.chdir(self.oldCwd)
return

def testSubprocessTime(self):
"""
Test logic of calculating subprocess user/sys times
"""
# take snapshot of user and sys time before we call subprocess
userTime0 = getrusage(RUSAGE_CHILDREN).ru_utime
sysTime0 = getrusage(RUSAGE_CHILDREN).ru_stime

# call subprocess command
args = ["sleep", "1"]
proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
childPid, exitStatus, resource = os.wait4(proc.pid, os.P_PID)
stdout, stderr = proc.communicate()
returnCode = proc.returncode
self.assertTrue(stdout == b'')
self.assertTrue(stderr == b'')
self.assertTrue(childPid >= 0)
self.assertTrue(exitStatus == 0)
self.assertTrue(returnCode == 0)
self.assertTrue(resource.ru_utime > 0)
self.assertTrue(resource.ru_stime > 0)

# calculate user and sys time of subprocess by substructing relevant
# parts take before we spawn the subprocess
userTime1 = getrusage(RUSAGE_CHILDREN).ru_utime
sysTime1 = getrusage(RUSAGE_CHILDREN).ru_stime

self.assertTrue(userTime1 > userTime0)
self.assertTrue(sysTime1 > sysTime0)


if __name__ == '__main__':
unittest.main()