From b60f3438247f65c06e5f60031bf9c4698d476971 Mon Sep 17 00:00:00 2001
From: Rob Kelly
Date: Sun, 16 Aug 2015 08:17:04 +0400
Subject: [PATCH] Force bs4 to use lxml instead.

---
 scraping/new_mexico_tech_banweb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scraping/new_mexico_tech_banweb.py b/scraping/new_mexico_tech_banweb.py
index 0d650b9..3269a47 100755
--- a/scraping/new_mexico_tech_banweb.py
+++ b/scraping/new_mexico_tech_banweb.py
@@ -159,7 +159,7 @@ def getTerm(semester, subjects, parser):
         url = "http://banweb7.nmt.edu/pls/PROD/hwzkcrof.P_UncgSrchCrsOff?p_term="+t.getSemester()+"&p_subj="+subjectName.replace(" ", "%20")
         print url
         page = urllib2.urlopen(url)
-        soup = BeautifulSoup(page, "html.parser")
+        soup = BeautifulSoup(page, "lxml")
         trs = soup.findAll("tr")
         trs = trs[1:] #discard the retarded row that banweb is retarded about
         print_verbose("adding subject "+subjectName)
@@ -179,7 +179,7 @@ def main(parser):
 
     page = urllib2.urlopen(url)
 
-    soup = BeautifulSoup(page, "html.parser")
+    soup = BeautifulSoup(page, "lxml")
 
     path = ""
     if (type(parser.path) == type("")):
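
Note (not part of the patch): BeautifulSoup only uses the "lxml" backend when
the lxml package is installed; otherwise it raises bs4.FeatureNotFound. The
sketch below illustrates that parser selection. The sample markup and the
fallback are illustrative assumptions, not behavior of the scraper itself.

    # Minimal sketch of the bs4 parser selection this patch relies on.
    from bs4 import BeautifulSoup, FeatureNotFound

    html = "<table><tr><td>CSE 113</td></tr></table>"  # hypothetical sample markup

    try:
        # Prefer the faster, more lenient libxml2-based parser.
        soup = BeautifulSoup(html, "lxml")
    except FeatureNotFound:
        # Fall back to Python's built-in parser if lxml is not installed.
        soup = BeautifulSoup(html, "html.parser")

    rows = soup.findAll("tr")  # same call the scraper makes on the banweb tables
    print(rows[0].get_text())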