-
Notifications
You must be signed in to change notification settings - Fork 31
/
archivefiles.py
executable file
·306 lines (228 loc) · 10.8 KB
/
archivefiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# archivefiles.py January 2018
#
# allows you to read files from the file service and save them to a directory on the file system.
# Optionally, the tool will also delete files from the file service in order to free up space.
# For example,
#
# ./archivefiles.py -n log -d 6 -p /job -fp /tmp
#
# Blog: https://blogs.sas.com/content/sgf/2019/04/04/where-are-my-viya-files/
#
# Change History
#
# 27JAN2019 Comments added
# 20SEP2019 Do not write out binary files
# 20SEP2019 Accept parent folder as a parameter
# 12FEB2020 Bug fix when no query is provided
# 20FEB2020 Fix for python 3 unicode is now str
#
#
# Copyright © 2018, SAS Institute Inc., Cary, NC, USA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the License); you may not use this file except in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing permissions and limitations under the License.
#
import argparse , datetime, os, time, json, sys
from sharedfunctions import callrestapi,printresult,getfolderid,getidsanduris,createdatefilter
from datetime import datetime as dt, timedelta as td
# Major version of the running interpreter; later code branches on it.
version = sys.version_info[0]

# Python 3 has no separate ``unicode`` type, so alias the name to ``str``
# to let the type checks further down work on both major versions.
if version >= 3:
    unicode = str
# Set up the command-line arguments accepted by this tool.
#   -n, -c, -p, -pf, -d, -m  describe which files to select
#   -fp                      directory under which the archive is written
#   -x / -xx                 delete after archiving / delete without archiving
#   --debug                  print extra diagnostic output
# NOTE(review): -c/--type is accepted here but is not read anywhere else in
# this script, so it currently has no effect on the files selected.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''DETAILS: Archive and optionally delete files stored in the Viya infrastructure data server.
NOTE: By default files in folders are not processed. You must use -pf to process files stored in folders.''',
    epilog='''WARNING: if you use -xx your files cannot be recovered.
WARNING: Binary files are deleted and not archived if you choose to delete.''')

# selection options
parser.add_argument("-n","--name", help="Name contains",default=None)
parser.add_argument("-c","--type", help="Content Type in.",default=None)
parser.add_argument("-p","--parent", help="ParentURI starts with.",default=None)
parser.add_argument("-pf","--parentfolder", help="Parent Folder Name.",default=None)
# days is kept as a string; the default '-1' is passed on unchanged
parser.add_argument("-d","--days", help="List files older than this number of days",default='-1')
parser.add_argument("-m","--modifiedby", help="Last modified id equals",default=None)

# output location
parser.add_argument("-fp","--path", help="Path of directory to store files",default='/tmp')

# deletion switches and diagnostics
parser.add_argument("-x","--delete", help="Delete Files after archiving from Viya",action='store_true')
parser.add_argument("-xx","--deletenoarch", help="Delete Files without Archiving from Viya",action='store_true')
parser.add_argument("--debug", action='store_true', help="Debug")
# Parse the command line and copy each option into the module-level name
# that the rest of the script reads.
args = parser.parse_args()

# file selection criteria
nameval = args.name
modby = args.modifiedby
daysolder = args.days
puri = args.parent
pfolder = args.parentfolder

# where to write the archive
path = args.path

# behaviour switches
dodelete = args.delete
deletenoarch = args.deletenoarch
debug = args.debug

# list of binary content types (a mix of full types and bare type prefixes)
binaryTypes = [
    'application/octet-stream',
    'audio',
    'image',
    'video',
    'application/msword',
    'application/gzip',
    'application/java-archive',
    'application/pdf',
    'application/rtf',
    'application/x-tar',
    'application/vnd.ms-excel',
]
# Validate option combinations before doing any work.

# -p (parent URI) and -pf (parent folder) are mutually exclusive ways of
# choosing which files to subset.
if puri is not None and pfolder is not None:
    print("ERROR: cannot use both -p parent and -pf parentfolder at the same time.")
    print("ERROR: Use -pf for folder parents and -p for service parents.")
    # non-zero status so a calling shell or scheduler can detect the usage error
    sys.exit(1)

# -xx is only honoured together with -x
if deletenoarch and not dodelete:
    print("ERROR: cannot choose -xx (--deletenoarch) without choosing -x (--delete).")
    sys.exit(1)
# Ask for confirmation before anything is deleted. Only an exact "Y" confirms.
if dodelete:
    # raw_input() was renamed to input() in Python 3. The conditional
    # expression only evaluates the name that exists on this interpreter.
    ask = input if version > 2 else raw_input

    if deletenoarch:
        # delete-without-archive: declining stops the whole run
        areyousure = ask("The files will be deleted. Do you want to continue? (Y)")
        if areyousure != 'Y':
            print("NOTE: you chose to not delete or archive.")
            sys.exit()
    else:
        # archive-then-delete: declining keeps the archive step and only
        # switches the deletion off
        areyousure = ask("The files will be archived. Do you also want to delete the files? (Y)")
        if areyousure != 'Y':
            dodelete = False
# Build the list of conditions used to select files.

# The age condition is always present: -d defaults to '-1', so a days value
# is passed to createdatefilter even when the option is not given.
datefilter = createdatefilter(
    olderoryounger='older',
    datevar='creationTimeStamp',
    days=daysolder,
)
filtercond = [datefilter]

# optional "name contains" condition
if nameval is not None:
    filtercond.append('contains($primary,name,"' + nameval + '")')

# optional "last modified by" condition
if modby is not None:
    filtercond.append("eq(modifiedBy,'" + modby + "')")

# all lookups below are GET requests; conditions are joined with commas
delimiter = ','
reqtype = 'get'
# Work out which files to process. There are three modes:
#   -p   files whose parentUri contains the given value
#   -pf  files that are members of the named folder
#   none files that are not stored in any folder
# Each mode only adds its own condition to filtercond; the query against
# /files/files is then issued once for all modes.
runquery = True

if puri is not None:
    print("NOTE: processing files with parent uri contains: "+puri)
    filtercond.append("contains(parentUri,'"+puri+"')")

elif pfolder is not None:
    print("NOTE: processing files with parentfolder equals: "+pfolder)
    folderid = getfolderid(pfolder)[0]

    # list the members of the folder
    reqval = "/folders/folders/"+folderid+"/members"
    files_in_folder = callrestapi(reqval, reqtype)

    # only URIs are available for the members, so take the last path
    # segment of each URI as the id to look up in the file service
    uris = getidsanduris(files_in_folder)['uris']
    idlist = [uri.rsplit('/')[-1] for uri in uris]

    if idlist:
        inclause = ', '.join("'" + memberid + "'" for memberid in idlist)
        filtercond.append("in(id,"+inclause+")")
    else:
        # a folder without members would otherwise produce the clause
        # in(id,) with no values, so skip the query and process nothing
        runquery = False

else:
    print("NOTE: processing files that are not stored in folders.")
    print("NOTE: files stored in folders are only processed with the -pf option.")

if runquery:
    # wrap the comma-delimited conditions in and(...) and run the query
    completefilter = 'and(' + delimiter.join(filtercond) + ')'
    reqval = "/files/files?filter=" + completefilter + "&limit=10000"
    files_result_json = callrestapi(reqval, reqtype)
    files = files_result_json['items']
else:
    files = []

if puri is None and pfolder is None:
    # post process so the list only holds items that do not exist in folders
    filesnotinfolders = []
    for candidate in files:
        # a None result is treated as "this file has no ancestor folder"
        # NOTE(review): presumably callrestapi returns None when the
        # ancestors lookup fails (stoponerror=0) - confirm in sharedfunctions
        afolder = callrestapi("/folders/ancestors?childUri=/files/files/"+candidate['id'], "get", stoponerror=0, noprint=1)
        if afolder is None:
            filesnotinfolders.append(candidate)
    files = filesnotinfolders
# Name the archive directory after the current timestamp, for example
# D20190404T153012. The format previously ended in "%MS", which produced a
# literal "S" and no seconds, so two runs in the same minute shared a directory.
newdirname = "D" + dt.today().strftime("%Y%m%dT%H%M%S")
archivepath = os.path.join(path, newdirname)

if debug:
    # show the last request that was issued while selecting files
    print(reqval)

# Only create the directory when files will be archived into it: there must
# be files to process and -xx (delete without archiving) must not be set.
if len(files) and not deletenoarch:
    if not os.path.isdir(archivepath):
        os.makedirs(archivepath)
# Archive and/or delete every selected file.

# names of the files that were written to the archive directory
passlist = []
# number of delete requests issued
filesdeleted = 0

# Content types that cannot currently be archived. A type is binary when it
# starts with one of the prefixes or exactly equals one of the listed types.
# Both are loop-invariant, so they are built once here.
binaryPrefixes = ('video', 'audio', 'image')
binaryTypes = ['application/octet-stream','application/gzip','application/java-archive','application/pdf','application/rtf','application/x-tar','application/vnd.ms-excel']

for fileitem in files:

    fileid = fileitem['id']
    contenttype = fileitem['contentType']
    filename = fileitem['name']

    if debug:
        print("NOTE: processing file "+filename+" of contentype "+contenttype)

    if deletenoarch:
        # -xx: delete without fetching or archiving the content
        reqtype = 'delete'
        reqval = "/files/files/"+fileid
        callrestapi(reqval, reqtype)
        filesdeleted = filesdeleted + 1
        continue

    # fetch the content of the file from the file service
    archivefile = os.path.join(archivepath, filename)
    reqtype = 'get'
    reqval = "/files/files/"+fileid+"/content"
    content = callrestapi(reqval, reqtype)

    binary = contenttype.startswith(binaryPrefixes) or contenttype in binaryTypes

    if binary:
        # binary files are never written to the archive
        print('NOTE: '+filename+' of content type ' +contenttype+' not supported for archive, but will be deleted if -x or -xx selected.')
    elif isinstance(content, dict):
        # content came back as a dict: write it out as indented JSON
        with open(archivefile, 'w') as fp:
            json.dump(content, fp, indent=4)
        passlist.append(filename)
    elif isinstance(content, (unicode, str)):
        # text content: Python 2 needs an explicit encode before writing
        with open(archivefile, 'w') as fp:
            if version < 3:
                fp.write(content.encode('utf8'))
            else:
                fp.write(content)
        passlist.append(filename)
    else:
        print('NOTE: '+filename+' content type not supported for archive.')

    # delete requested (-x): applies to every file, archived or not
    if dodelete:
        reqtype = 'delete'
        reqval = "/files/files/"+fileid
        callrestapi(reqval, reqtype)
        # count each deletion; this was previously reset to 1 every time
        filesdeleted = filesdeleted + 1
# Report the outcome of the run: first what was archived, then, when a
# delete was requested, what was deleted.
total_archived = len(passlist)

if total_archived:
    print('NOTE: '+str(total_archived) +' file(s) archived to the directory '+archivepath)
elif not deletenoarch:
    # nothing archived, and archiving was actually attempted
    print('NOTE: No files that can be archived were found.')

if dodelete or deletenoarch:
    if not filesdeleted:
        print('NOTE: No files deleted.')
    else:
        print('NOTE: '+str(filesdeleted)+' file(s) matching criteria were deleted.')