runEverything.py
#!/usr/bin/python
import os
import shlex
import threading
import subprocess

from crawler.settings import BASEDIR

CONVERT = os.path.join(BASEDIR, 'convert.py')
DATADIR = os.path.join(BASEDIR, 'resultado/')

formats = ['xml', 'json', 'jsonlines']

# Ask scrapy which spiders the project defines.
spiders = []
p = subprocess.Popen(['scrapy', 'list'], cwd=BASEDIR, stdout=subprocess.PIPE)
for line in p.stdout:
    spiders.append(line.rstrip())
p.wait()
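# `scrapy list` prints one spider name per line, so `spiders` now holds every
# spider registered in the project, e.g. (hypothetical names):
# ['spider_a', 'spider_b'].
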
def popenAndCall(onExit, meta, popenArgs, workDir):
    """
    Runs the given args in a subprocess.Popen, and then calls the function
    onExit when the subprocess completes.

    onExit is a callable object, and popenArgs is a list/tuple of args one
    would give to subprocess.Popen.
    """
    def runInThread(onExit, meta, popenArgs, workDir):
        proc = subprocess.Popen(popenArgs, cwd=workDir)
        proc.wait()
        onExit(meta)
        return
    thread = threading.Thread(target=runInThread,
                              args=(onExit, meta, popenArgs, workDir))
    thread.start()
    # returns immediately after the thread starts
    return thread
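
# A minimal usage sketch (hypothetical spider name; not executed by this
# script): start a crawl in the background and post-process its output when
# the subprocess exits:
#
#   t = popenAndCall(onExit, 'exemplo',
#                    shlex.split('scrapy crawl exemplo'), BASEDIR)
#   t.join()  # optional: block until the crawl and the callback have run
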
def onExit(meta):
    # The finished crawl left its output as *.jsonlines.incomplete;
    # renaming it marks the file as complete.
    src = '%sdados_%s.jsonlines.incomplete' % (DATADIR, meta)
    dst = '%sdados_%s.jsonlines' % (DATADIR, meta)
    os.rename(src, dst)
    for fmt in formats:
        if fmt != 'jsonlines':  # it's already in jsonlines
            # Convert the jsonlines file to the target format.
            src = '%sdados_%s.%s.incomplete' % (DATADIR, meta, fmt)
            dst = '%sdados_%s.%s' % (DATADIR, meta, fmt)
            outfile = open(src, 'w')
            p = subprocess.Popen(shlex.split(
                'python2.7 %s -i %sdados_%s.jsonlines -t %s'
                % (CONVERT, DATADIR, meta, fmt)), stdout=outfile)
            p.wait()
            outfile.flush()
            outfile.close()
            os.rename(src, dst)
        # Also publish a gzipped copy of every format.
        src = '%sdados_%s.%s.incomplete.gz' % (DATADIR, meta, fmt)
        dst = '%sdados_%s.%s.gz' % (DATADIR, meta, fmt)
        outfile = open(src, 'wb')
        p = subprocess.Popen(shlex.split(
            'gzip -c %sdados_%s.%s' % (DATADIR, meta, fmt)), stdout=outfile)
        p.wait()
        outfile.flush()
        outfile.close()
        os.rename(src, dst)
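
# Resulting files for a hypothetical spider 'exemplo' (names follow the
# patterns above, assuming DATADIR points at resultado/):
#   resultado/dados_exemplo.jsonlines       renamed from *.incomplete
#   resultado/dados_exemplo.xml / .json     produced by convert.py
#   resultado/dados_exemplo.<fmt>.gz        gzip -c of each format
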
# Launch one crawl per spider; each runs in its own thread, and onExit
# converts its output as soon as the crawl finishes.
for dado in spiders:
    popenAndCall(onExit, dado, shlex.split('scrapy crawl %s' % dado), BASEDIR)
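# Note: the worker threads are non-daemon, so the interpreter keeps running
# until every crawl and its onExit conversion has completed.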