diff --git a/crawler/gom.gomtv.com/gom/spiders/gom.py b/crawler/gom.gomtv.com/gom/spiders/gom.py
index e630e1f..441813b 100644
--- a/crawler/gom.gomtv.com/gom/spiders/gom.py
+++ b/crawler/gom.gomtv.com/gom/spiders/gom.py
@@ -32,12 +32,14 @@ class GomSpider(Spider):
 
     #custom_settings = {'CONCURRENT_REQUESTS':5}
 
-    def __init__(self, page=None):
+    def __init__(self, page=None, down_dir=None):
         if page is not None:
-            SUB_LIST_START_PAGE, SUB_LIST_END_PAGE = [int(temp) for temp in page.split('-')]
-            self.start_urls = [SUB_LIST_URL % (i+1) for i in range(SUB_LIST_START_PAGE, SUB_LIST_END_PAGE + 1)]
-        if not os.path.exists(DOWN_SUB_DIR):
-            os.mkdir(DOWN_SUB_DIR)
+            self.start_page, self.end_page = [int(temp) for temp in page.split('-')]
+            self.start_urls = [SUB_LIST_URL % (i+1) for i in range(self.start_page, self.end_page + 1)]
+        # Fall back to DOWN_SUB_DIR so self.down_dir is always set for save_subtitle
+        self.down_dir = down_dir if down_dir is not None else DOWN_SUB_DIR
+        if not os.path.exists(self.down_dir):
+            os.mkdir(self.down_dir)
 
     def parse(self, response):
         # Select subtitle article page URL
@@ -61,7 +63,7 @@ def save_subtitle(self, response):
         smi_filename = temp.path[temp.path.rfind('/')+1:]
         if smi_filename.endswith('.smi'):
             smi_filename = query_dict['intSeq'][0] + '_' + query_dict['capSeq'][0] + '_' + urllib.unquote(smi_filename).decode('utf-8')
-            smi_filepath = os.path.join(DOWN_SUB_DIR, smi_filename)
+            smi_filepath = os.path.join(self.down_dir, smi_filename)
             if not os.path.exists(smi_filepath):
                 with open(smi_filepath, "wb") as f:
                     f.write(response.body)
diff --git a/crawler/gom.gomtv.com/split_files.py b/crawler/gom.gomtv.com/split_files.py
new file mode 100644
index 0000000..b83cc94
--- /dev/null
+++ b/crawler/gom.gomtv.com/split_files.py
@@ -0,0 +1,40 @@
+# Copy the files of one directory round-robin into several output directories.
+import os
+import shutil
+import argparse
+
+parser = argparse.ArgumentParser(description="Split files in a dir into multiple dirs")
+parser.add_argument("--in_dir", help="Input dir")
+parser.add_argument("--split", help="Split #")
+parser.add_argument("--out_prefix", help="Output dir prefix")
+args = parser.parse_args()
+
+# Each default below is kept unless the matching option was given
+in_dir = "subtitle"
+if args.in_dir is not None:
+    in_dir = args.in_dir
+split = 4
+if args.split is not None:
+    split = int(args.split)
+out_prefix = "subtitle_"
+if args.out_prefix is not None:
+    out_prefix = args.out_prefix
+
+# Load all files (only those directly inside in_dir; subdirectories are skipped)
+print 'Load all files'
+file_list = os.walk(in_dir).next()[2]
+
+# Generate out dirs named <out_prefix>1 .. <out_prefix><split>
+print 'Generate out dirs'
+out_dir_list = []
+for i in range(split):
+    dirname = out_prefix + str(i + 1)
+    out_dir_list.append(dirname)
+    os.mkdir(dirname)
+
+# Split files into $(split) lists: the i-th file goes to out dir number (i % split)
+print 'Split files into ' + str(split) + ' lists'
+for i, filename in enumerate(file_list):
+    shutil.copy(os.path.join(in_dir, filename), os.path.join(out_dir_list[i % split], filename))
+
+print 'done!'