-
Notifications
You must be signed in to change notification settings - Fork 0
/
indeed_first_run2.py
57 lines (50 loc) · 3.01 KB
/
indeed_first_run2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
#import necessary libraries
import json
import os
import re
import urllib
import urllib2
# Search parameters sent with every API query.
location_criteria = "London"
distance_criteria = "50"
salary_criteria = "£55k-£150k"
# Indeed publisher ID sent as the `publisher` query parameter.
# NOTE(review): this credential is hard-coded in source; consider loading it
# from the environment or a config file instead.
publisher_id = "7317619313909945"
# Matches opening and closing bold tags (any case) so they can be removed
# from the text fields returned by the API.
regex = re.compile(r'</?b>', flags=re.IGNORECASE)


def split_phrases(text):
    """Return the non-blank lines of *text* as a list.

    Blank lines are dropped on purpose: an empty exclusion phrase is a
    substring of every job title and would exclude every result, and an
    empty job phrase would issue a search for an empty quoted string.
    """
    return [line for line in text.splitlines() if line.strip()]


def strip_bold(text):
    """Return *text* with every <b> and </b> tag removed."""
    return regex.sub("", text)


def is_excluded(job_title, exclusion_phrases):
    """Return True if any exclusion phrase occurs in *job_title*.

    The match is a plain, case-sensitive substring test.
    """
    return any(phrase in job_title for phrase in exclusion_phrases)


def build_query_url(job_phrase):
    """Return the API search URL for an exact-phrase search on *job_phrase*.

    The phrase is wrapped in double quotes and every user-supplied value is
    percent-encoded, so characters such as '&', '#', '+' or the non-ASCII
    pound sign in the salary range cannot corrupt the query string.
    """
    return (
        "http://api.indeed.com/ads/apisearch?publisher=" + publisher_id
        + "&v=2&format=json&q=" + urllib.quote_plus('"' + job_phrase + '"')
        + "&l=" + urllib.quote_plus(location_criteria)
        + "&fromage=21&radius=" + urllib.quote_plus(distance_criteria)
        + "&salary=" + urllib.quote_plus(salary_criteria)
        + "&limit=500&sort=date&co=gb&ip=188.221.151.22&useragent=Mozilla/%2F4.0%28Firefox%29"
    )


def fetch_results(job_phrase):
    """Query the API for *job_phrase* and return the parsed "results" list.

    Network errors from urllib2 and a missing "results" key propagate to
    the caller unchanged.
    """
    response = urllib2.urlopen(build_query_url(job_phrase))
    try:
        return json.loads(response.read())["results"]
    finally:
        # Always release the HTTP connection, even if parsing fails.
        response.close()


def render_job(item):
    """Return the HTML fragment for one API result.

    *item* must provide the "jobtitle", "company", "snippet" and "url"
    keys. Bold tags are stripped from each field; the values are otherwise
    inserted as-is (presumably the API already returns HTML-safe text --
    TODO confirm).
    """
    # NOTE(review): id="job" is repeated for every result, which is not
    # valid HTML; it is kept because mystyle.css may select on "#job".
    return ''.join([
        '<div id="job">\n',
        '<h1>' + strip_bold(item["jobtitle"]) + '</h1>' + "\n",
        '<h2>' + strip_bold(item["company"]) + '</h2>' + "\n",
        '<p>' + strip_bold(item["snippet"]) + '</p>' + "\n" + "<br /><br />",
        '<a href="' + strip_bold(item["url"]) + '">Link to full description</a>',
        "\n",
        '</div>\n',
    ])


def main():
    """Run one search per line of job_list.txt and write job_results.html.

    A result is skipped when its title contains any phrase from
    exclusion_list.txt, or when a result with an identical snippet has
    already been written. The finished page is then opened for review so
    further title keywords can be added to the exclusion list.
    """
    seen_snippets = set()
    with open('job_list.txt') as job_list_criteria, \
            open('job_results.html', 'wb') as job_list_results, \
            open('exclusion_list.txt', 'r') as exclusions:
        job_phrases = split_phrases(job_list_criteria.read())
        exclusion_phrases = split_phrases(exclusions.read())
        job_list_results.write('<html>\n<head><link rel="stylesheet" type="text/css" href="mystyle.css"></head>\n<body>\n')
        for job in job_phrases:
            for item in fetch_results(job):
                if is_excluded(item["jobtitle"], exclusion_phrases):
                    continue
                snippet = item["snippet"]
                if snippet in seen_snippets:
                    continue
                seen_snippets.add(snippet)
                job_list_results.write(render_job(item).encode('utf-8', 'replace'))
        job_list_results.write("</body>\n</html>")
    # The output file is closed (and therefore flushed) before it is opened.
    # NOTE(review): relies on an `open` command being on PATH (as on macOS).
    os.system('open job_results.html')


if __name__ == "__main__":
    main()