Skip to content

Commit

Permalink
add an argument/helper script
Browse files Browse the repository at this point in the history
  • Loading branch information
dalgu90 committed Nov 18, 2015
1 parent 7b6b0c3 commit efadd63
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
14 changes: 8 additions & 6 deletions crawler/gom.gomtv.com/gom/spiders/gom.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ class GomSpider(Spider):

#custom_settings = {'CONCURRENT_REQUESTS':5}

def __init__(self, page=None):
def __init__(self, page=None, down_dir=None):
if page is not None:
SUB_LIST_START_PAGE, SUB_LIST_END_PAGE = [int(temp) for temp in page.split('-')]
self.start_urls = [SUB_LIST_URL % (i+1) for i in range(SUB_LIST_START_PAGE, SUB_LIST_END_PAGE + 1)]
if not os.path.exists(DOWN_SUB_DIR):
os.mkdir(DOWN_SUB_DIR)
self.start_page, self.end_page = [int(temp) for temp in page.split('-')]
self.start_urls = [SUB_LIST_URL % (i+1) for i in range(self.start_page, self.end_page + 1)]
if down_dir is not None:
self.down_dir = down_dir
if not os.path.exists(self.down_dir):
os.mkdir(self.down_dir)

def parse(self, response):
# Select subtitle article page URL
Expand All @@ -61,7 +63,7 @@ def save_subtitle(self, response):
smi_filename = temp.path[temp.path.rfind('/')+1:]
if smi_filename.endswith('.smi'):
smi_filename = query_dict['intSeq'][0] + '_' + query_dict['capSeq'][0] + '_' + urllib.unquote(smi_filename).decode('utf-8')
smi_filepath = os.path.join(DOWN_SUB_DIR, smi_filename)
smi_filepath = os.path.join(self.down_dir, smi_filename)
if not os.path.exists(smi_filepath):
with open(smi_filepath, "wb") as f:
f.write(response.body)
38 changes: 38 additions & 0 deletions crawler/gom.gomtv.com/split_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
import shutil
import argparse

def _parse_args(argv=None):
    """Parse and validate the command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``in_dir`` (default ``"subtitle"``),
        ``split`` (default ``4``) and ``out_prefix`` (default ``"subtitle_"``).

    Exits with an argparse usage error when ``--split`` is not a positive
    integer or when the input directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Split files in a dir into multiple dirs")
    parser.add_argument("--in_dir", default="subtitle", help="Input dir")
    parser.add_argument("--split", type=int, default=4, help="Split #")
    parser.add_argument("--out_prefix", default="subtitle_", help="Output dir prefix")
    args = parser.parse_args(argv)

    # A split count below 1 would create no output dirs and then divide by zero.
    if args.split < 1:
        parser.error("--split must be a positive integer")
    if not os.path.isdir(args.in_dir):
        parser.error("input dir not found: " + args.in_dir)
    return args


def main(argv=None):
    """Copy the files of one directory round-robin into several directories.

    The output directories are named ``<out_prefix>1`` .. ``<out_prefix>N``
    where N is the ``--split`` value. Only regular files directly inside the
    input directory are copied; sub-directories are not descended into.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The list of output directory names, in order.
    """
    args = _parse_args(argv)

    # Load all files
    print('Load all files')
    # Sorted so that repeated runs put each file into the same output dir.
    file_list = sorted(next(os.walk(args.in_dir))[2])

    # Generate out dirs
    print('Generate out dirs')
    out_dir_list = [args.out_prefix + str(i + 1) for i in range(args.split)]
    for dirname in out_dir_list:
        # Tolerate dirs left over from an earlier run instead of crashing.
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

    # Split files into $(split) lists
    print('Split files into ' + str(args.split) + ' lists')
    for i, filename in enumerate(file_list):
        shutil.copy(os.path.join(args.in_dir, filename),
                    os.path.join(out_dir_list[i % args.split], filename))

    print('done!')
    return out_dir_list


if __name__ == "__main__":
    main()

0 comments on commit efadd63

Please sign in to comment.