Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "rda_python_miscs"
version = "2.0.0"
version = "2.0.1"
authors = [
{ name="Zaihua Ji", email="[email protected]" },
]
Expand Down
51 changes: 4 additions & 47 deletions src/rda_python_miscs/rdakill.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self):
'r': 0, # 1 - reserved for exclusive, working with -s PEND only
'u': None, # login user name
's': None, # batch status to kill
'q': None # batch partition/queue for SLURM/PBS, rda for default
'q': None # batch partition/queue for PBS, rda for default
}

# function to read parameters
Expand Down Expand Up @@ -71,15 +71,7 @@ def start_actions(self):
killloc = 1
if self.RDAKILL['h']:
self.local_host_action(self.RDAKILL['h'], "kill processes", self.PGLOG['HOSTNAME'], self.LGEREX)
if not self.pgcmp(self.RDAKILL['h'], self.PGLOG['SLMNAME'], 1):
if not (self.RDAKILL['p'] or self.RDAKILL['s']):
self.pglog("Provide Batch ID or Job Status to kill SLURM jobs", self.LGEREX)
if self.RDAKILL['p']:
self.dakill_slurm_batch(self.RDAKILL['p'])
else:
self.rdakill_slurm_status(self.RDAKILL['s'], self.RDAKILL['q'], self.RDAKILL['u'])
killloc = 0
elif not self.pgcmp(self.RDAKILL['h'], self.PGLOG['PBSNAME'], 1):
if not self.pgcmp(self.RDAKILL['h'], self.PGLOG['PBSNAME'], 1):
if not (self.RDAKILL['p'] or self.RDAKILL['s']):
self.pglog("Provide Batch ID or Job Status to kill PBS jobs", self.LGEREX)
if self.RDAKILL['p']:
Expand Down Expand Up @@ -127,7 +119,6 @@ def rdakill_processes(self, pid, ppid, aname = None, uname = None, level = 0):
buf += "on " + self.RDAKILL['h']
else:
buf += "locally"
if self.PGLOG['CURBID']: buf += "; add Option '-h SLURM' if SLURM batch ID provided"
self.pglog(buf, self.LOGWRN)

# a local child process
Expand All @@ -139,41 +130,7 @@ def kill_local_child(self, pid, uid, line):
elif self.check_process(pid):
return self.pglog("Error Kill: {}\n{}".format(line, self.PGLOG['SYSERR']), self.LOGWRN)
if not self.check_process(pid): self.pglog("Quit: " + line, self.LOGWRN)

# cancel a single SLURM batch job identified by its batch ID
def rdakill_slurm_batch(self, bid):
    """Cancel the SLURM batch job ``bid`` with ``scancel``.

    The job is first looked up through ``check_slurm_status``.  When a
    status record comes back, an ``scancel`` command is built through
    ``get_local_command`` (passing the record's ``USER`` value, presumably
    the job owner -- confirm against that helper) and executed with
    ``pgsystem``.  A truthy result is recorded through
    ``record_dscheck_interrupt``.  When no record is found an error is
    logged instead.

    Returns whatever ``pgsystem`` returned, or 0 when the job was not
    found.  If the result is falsy and ``PGLOG['SYSERR']`` is set, that
    message is logged with ``LGEREX``.
    """
    killed = 0
    jobinfo = self.check_slurm_status(bid, self.LOGWRN)
    if not jobinfo:
        self.pglog("{}: cannot find SLURM batch ID".format(bid), self.LOGERR)
    else:
        scancel = self.get_local_command("scancel {}".format(bid), jobinfo['USER'])
        killed = self.pgsystem(scancel, self.LOGWRN, 6)
        if killed:
            self.record_dscheck_interrupt(bid, self.PGLOG['SLMNAME'])
    # surface any captured system error only when nothing was cancelled
    if not killed and self.PGLOG['SYSERR']:
        self.pglog(self.PGLOG['SYSERR'], self.LGEREX)
    return killed

# cancel every SLURM batch job that is currently in a given state
def rdakill_slurm_status(self, stat, part, uname):
    """Cancel all SLURM jobs whose state equals ``stat``.

    stat  -- job state string compared against the ``STATE`` column
    part  -- partition name passed to ``sacct -r``; falls back to 'rda'
    uname -- user name for ``sacct -u``; when empty, ``-a`` is used instead

    Jobs are listed through ``get_slurm_multiple`` and each match is handed
    to ``rdakill_slurm_batch``.  A one-line summary is logged with
    ``LOGWRN``; nothing is returned.
    """
    part = part or 'rda'
    who = "u " + uname if uname else 'a'
    bcmd = "sacct -o jobid,user,state -r {} -{}".format(part, who)
    jobs = self.get_slurm_multiple(bcmd)
    jobids = jobs['JOBID'] if jobs else []

    matched = 0
    killed = 0
    for idx, jobid in enumerate(jobids):
        if jobs['STATE'][idx] != stat:
            continue
        matched += 1
        killed += self.rdakill_slurm_batch(jobid)

    if matched > 0:
        plural = 's' if matched > 1 else ''
        summary = "{} of {} SLURM '{}' job{} Killed".format(killed, matched, stat, plural)
    else:
        summary = "No SLURM '{}' job found to kill".format(stat)
    summary += " in Partition '{}'".format(part)
    if uname:
        summary += " for " + uname
    self.pglog(summary, self.LOGWRN)


# kill a pbs batch job
def rdakill_pbs_batch(self, bid):
ret = 0
Expand All @@ -188,7 +145,7 @@ def rdakill_pbs_batch(self, bid):
self.pglog("{}: cannot find PBS batch ID".format(bid), self.LOGERR)
if not ret and self.PGLOG['SYSERR']: self.pglog(self.PGLOG['SYSERR'], self.LGEREX)
return ret

# kill PBS batch jobs for given status
def rdakill_pbs_status(self, stat, queue, uname):
if not queue: queue = 'rda'
Expand Down
8 changes: 4 additions & 4 deletions src/rda_python_miscs/rdakill.usg
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

Kill one or multiple processes and their children for a given local process ID or
other process information; kill one or multiple SLURM/PBS batch jobs for give batch
Job ID or Status. For killing SLURM/PBS batch jobs, you must login to cheyenne/caser
other process information; kill one or multiple PBS batch jobs for a given batch
Job ID or Status. For killing PBS batch jobs, you must log in to the casper
login nodes.

Usage: rdakill [-h HostName] [-p ProcessID] [-P ParentProcessID] \
Expand All @@ -10,13 +10,13 @@
- Option -a, application name of the process;

- Option -h, hostname the process is on. Omit it for local process,
but it is mandatory if the process id is a SLURM/PBS bactch id.
but it is mandatory if the process id is a PBS batch id.

- Option -p, the process id or batch job id to be stopped.

- Option -P, the parent process id;

- Option -q, the SLURM Partition or PBS queue name. It defaults to 'rda';
- Option -q, the PBS queue name. It defaults to 'rda';

- Option -s, the Batch Job Status; this is mandatory if batch id is not provided;

Expand Down
42 changes: 3 additions & 39 deletions src/rda_python_miscs/rdaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,7 @@ def start_actions(self):
chkloc = 1
if self.RDAPS['h']:
self.local_host_action(self.RDAPS['h'], "check processes", self.PGLOG['HOSTNAME'], self.LGEREX)
if not self.pgcmp(self.RDAPS['h'], self.PGLOG['SLMNAME'], 1):
self.slurm_snapshot()
chkloc = 0
elif not self.pgcmp(self.RDAPS['h'], self.PGLOG['PBSNAME'], 1):
if not self.pgcmp(self.RDAPS['h'], self.PGLOG['PBSNAME'], 1):
self.pbs_snapshot()
chkloc = 0
if chkloc: self.process_snapshot()
Expand Down Expand Up @@ -96,40 +93,7 @@ def process_snapshot(self):
if self.RDAPS['P'] and self.RDAPS['P'] != ppid: continue
if self.RDAPS['a'] and aname.find(self.RDAPS['a']) < 0: continue
self.pglog(re.sub(r' +', ' ', line), self.LOGWRN)

# get a snapshot of a SLURM batch process status
def slurm_snapshot(self):
    """Log the current SLURM queue listing obtained from ``squeue -l``.

    Options read from ``self.RDAPS``:
      'u' -- user name, added as ``-u <user>``
      'p' -- batch job id, added as ``-j <id>``; when absent the listing is
             limited to the 'rda' partition with ``-p rda``
      'a' -- when set, rows whose third column is non-empty and differs
             from this value are skipped

    Every line after the ``JOBID`` header line is logged with ``LOGWRN``
    after its fourth column (the user name, per the original comment) is
    moved to the front.  Rows with fewer than four columns are skipped.

    When ``squeue`` yields no output, ``PGLOG['SYSERR']`` is logged with
    ``LGEREX`` unless it contains 'Invalid job id specified'.
    """
    qopts = ''
    if self.RDAPS['u']:
        qopts += " -u " + self.RDAPS['u']
    if self.RDAPS['p']:
        qopts += " -j {}".format(self.RDAPS['p'])
    else:
        # append rather than assign, so a -u filter built above is kept
        qopts += " -p rda"

    buf = self.pgsystem("squeue -l" + qopts, self.LOGWRN, 272)
    if not buf:
        syserr = self.PGLOG['SYSERR']
        if syserr and syserr.find('Invalid job id specified') < 0:
            self.pglog(syserr, self.LGEREX)
        return

    lines = re.split(r'\n', buf)
    if len(lines) < 3:
        return

    inheader = True   # stays True until the JOBID column header has been seen
    for line in lines:
        if not line:
            continue
        if inheader:
            if re.match(r'^\s*JOBID\s', line):
                inheader = False
            continue
        vals = re.split(r'\s+', self.pgtrim(line))
        if len(vals) < 4:
            # not a full job row; indexing below would raise IndexError
            continue
        if self.RDAPS['a'] and vals[2] and self.RDAPS['a'] != vals[2]:
            continue
        # move user name to front
        vals.insert(0, vals.pop(3))
        self.pglog(' '.join(vals), self.LOGWRN)


# get a snapshot of a PBS batch process status
def pbs_snapshot(self):
qopts = ''
Expand All @@ -138,7 +102,7 @@ def pbs_snapshot(self):
if self.RDAPS['p']:
if qopts: qopts += ' '
qopts += str(self.RDAPS['p'])
if not qopts: qopts = 'rda'
if not qopts: qopts = 'gdex'
stat = self.get_pbs_info(qopts, 1, self.LOGWRN)
if not stat:
if self.PGLOG['SYSERR']: self.pglog(self.PGLOG['SYSERR'], self.LGEREX)
Expand Down
2 changes: 1 addition & 1 deletion src/rda_python_miscs/rdaps.usg
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
- Option -a, application name of the process;

- Option -h, hostname the process is on; omit it for local process,
but it is mandatory if the process id is a SLURM/PBS bactch id.
but it is mandatory if the process id is a PBS batch id.

- Option -p, the local process or batch job id to be checked;

Expand Down