-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsize_sort.py
executable file
·104 lines (79 loc) · 3.37 KB
/
size_sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
import pandas as pd
import sys
from os.path import isdir, dirname, abspath, join as pjoin
import os
from subprocess import Popen, PIPE
def usage():
    """Print command-line help for this script and terminate the process.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print('''Summarize directories by size (largest first), with owner and last modified date.
Usage:
# for all subdirectories in dir
./size_sort.py infoFile.csv outSummary.csv `ls -d /abs/path/of/parent/dir/*/`
# for all files in dir
./size_sort.py infoFile.csv outSummary.csv `ls -d /abs/path/of/parent/dir/*`''')
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is not available when the
    # interpreter runs without it (e.g. `python -S`).
    sys.exit()
def _local_owner(path):
    """Return the numeric uid that owns *path*, or None if it cannot be stat'ed."""
    try:
        return os.stat(path).st_uid
    except OSError as err:
        print(f'warning: cannot stat {path}: {err}', file=sys.stderr)
        return None


def _remote_owner(path):
    """Return the login name that owns *path* on the remote host, or None.

    Runs ``ls -lad`` over ssh and takes the third whitespace-separated field,
    e.g. ``ll598`` in:
    drwxrws---. 72 ll598 BWH-PNL-G 12288 Apr 7 10:26 /data/pnl/home/
    """
    import shlex

    # The path is quoted for the remote shell and the command is passed as an
    # argv list, so no local shell ever interprets the directory name.
    remote_cmd = f'ls -lad {shlex.quote(path)}'
    # NOTE(review): the ssh destination literal below looks redacted/garbled;
    # confirm the real user@host before relying on remote mode.
    with Popen(['ssh', '[email protected]', remote_cmd], stdout=PIPE) as proc:
        stdout = proc.communicate()[0]
    fields = stdout.decode('utf-8').split()
    if len(fields) < 3:
        print(f'warning: could not determine remote owner of {path}',
              file=sys.stderr)
        return None
    return fields[2]


def _owner_display_name(people, key):
    """Return the name to report for *key*, or '' when it cannot be resolved.

    *people* is the user table indexed by the lookup key. The ``fname`` column
    is preferred; when it is empty the ``user`` column is used instead.
    """
    try:
        record = people.loc[key]
        name = record['fname']
        if pd.isna(name):
            # if name does not exist, populate user ID
            name = record['user']
    except (KeyError, TypeError, ValueError):
        # KeyError: unknown key or missing column. ValueError: the key matched
        # several rows, so there is no single owner to report.
        return ''
    return name


def main():
    """Write a size-sorted summary of the directories given on the command line.

    Reads ``sys.argv`` as ``infoFile.csv outSummary.csv DIR [DIR ...]``. For
    every DIR that equals a `` Directory`` entry of infoFile plus a trailing
    ``/``, one row with the rounded `` SizeG``, the owner name and the
    `` Last Modified`` value is collected. The rows are sorted by size in
    descending order, written to outSummary and printed.

    Owner names come from ``user_name.csv`` located next to this script.
    When the second DIR contains ``/data/`` the directories are treated as
    remote and ownership is queried over ssh; otherwise ``os.stat`` is used.
    """
    # Too few arguments used to raise IndexError on sys.argv[2]; show help instead.
    if len(sys.argv) < 3 or sys.argv[1] in ('-h', '--help'):
        usage()

    infoFile = sys.argv[1]
    outSummary = sys.argv[2]
    dirs = sys.argv[3:]

    # sanity check
    if not (infoFile.endswith('.csv') and outSummary.endswith('.csv')):
        print('infoFile and outSummary must be .csv files')
        usage()

    if len(dirs) < 2:
        print('provide a list of directories, not the parent directory only, may be you forgot an */ at the end?')
        usage()

    # remote folders are recognised by '/data/' in the second directory argument
    remote = '/data/' in dirs[1]

    # read list of people ever been at PNL
    people = pd.read_csv(pjoin(dirname(abspath(__file__)), 'user_name.csv'))
    # Local folders are looked up by numeric uid, remote ones by login name.
    # drop=False keeps the key column readable, so the fname -> user fallback
    # also works when the table is indexed by 'user'.
    people = people.set_index('user' if remote else 'uid', drop=False)

    df = pd.read_csv(infoFile)

    # Map "<directory>/" to its (size, last modified) pair once instead of
    # rescanning the whole table for every argument; on duplicate directory
    # entries the last one wins, as with the previous linear scan.
    # NOTE(review): arguments without a trailing '/' never match here, so the
    # "all files in dir" form from the usage text yields no rows - confirm intent.
    listed = {
        name + '/': (size, modified)
        for name, size, modified in zip(
            df[' Directory'], df[' SizeG'], df[' Last Modified'])
    }

    rows = []
    for path in dirs:
        if path not in listed:
            continue
        size, modified = listed[path]
        # obtain its ownership info
        key = _remote_owner(path) if remote else _local_owner(path)
        rows.append([
            path,
            round(size, ndigits=2),
            _owner_display_name(people, key),
            modified,
        ])

    df_parent = pd.DataFrame(
        rows, columns=['Directory', 'SizeG', 'Owner', 'Last Modified'])
    df_parent.sort_values(by=['SizeG'], ascending=False, inplace=True)
    df_parent.set_index('Directory', inplace=True)
    df_parent.to_csv(outSummary)
    print(df_parent)
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
'''
./size_sort.py _data/logdirsizes/rfanfs_pnl-zorro-dirsizes-3-20190506.csv Collaborators.csv `ls -d /rfanfs/pnl-zorro/Collaborators/*/`
./size_sort.py _data/logdirsizes/rfanfs_pnl-zorro-dirsizes-3-20190506.csv projects.csv `ls -d /rfanfs/pnl-zorro/projects/*/`
./size_sort.py _data/logdirsizes/rfanfs_pnl-zorro-dirsizes-3-20190506.csv home.csv `ls -d /rfanfs/pnl-zorro/home/*/`
./size_sort.py _data/logdirsizes/data_pnl-dirsizes-3-20190615.csv data_pnl.csv `ssh eris1n2.research.partners.org "ls -d /data/pnl/*/"`
'''