Skip to content

Commit

Permalink
JSON language mark (ko -> kr)
Browse files Browse the repository at this point in the history
  • Loading branch information
unknown committed Nov 15, 2015
1 parent e27e9fd commit 1682ac1
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
2 changes: 1 addition & 1 deletion crawler/www.ted.com/merge_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
for i in range(len(ens)):
en = ens[i]['en']
ko = kos[i]['ko']
parcors.append({'en':en, 'ko':ko})
parcors.append({'en':en, 'kr':ko})

print("{} script files({} lectures) merged".format(synced_list_count * 2, synced_list_count))
print("{} sentences in parcors".format(len(parcors)))
Expand Down
8 changes: 5 additions & 3 deletions crawler/www.ted.com/ted/spiders/ted.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
from urlparse import urlparse, parse_qs
import json
import time
import os

LEC_LIST_URL = "http://www.ted.com/talks?language=ko&page=%s"
LEC_LIST_MAX = 54
LEC_URL = "http://www.ted.com/talks/%s?language=ko"
SCRIPT_EN_URL = "http://www.ted.com/talks/%s/transcript?language=en"
SCRIPT_KO_URL = "http://www.ted.com/talks/%s/transcript?language=ko"
SCRIPT_EN_PATH = "scripts/%s-en.json"
SCRIPT_KO_PATH = "scripts/%s-ko.json"
SCRIPT_EN_PATH = os.path.join("scripts","%s-en.json")
SCRIPT_KO_PATH = os.path.join("scripts","%s-ko.json")

class TedSpider(Spider):
name = "ted"
Expand All @@ -26,7 +27,8 @@ class TedSpider(Spider):

def __init__(self):
self.start_urls = [LEC_LIST_URL % (i+1) for i in range(LEC_LIST_MAX)]
#self.start_urls = [DIC_URL % 100000]
if not os.path.exists("scripts"):
os.mkdir("scripts")

def parse(self, response):
# Select lecture names
Expand Down

0 comments on commit 1682ac1

Please sign in to comment.