# jobscraper.py
# imports
import requests
from bs4 import BeautifulSoup
from datetime import date
import pandas as pd
import boto3
from botocore.client import Config
import openpyxl  # not called directly; pandas uses it as the engine for .xlsx output

# amazon s3 configurations
ACCESS_KEY_ID = ''        # fill in before deploying; never commit real credentials
ACCESS_SECRET_KEY = ''
BUCKET_NAME = 'jobs-csv-storage'
filename = 'job_list.xlsx'
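
# optional sketch (not in the original script): read the credentials from environment
# variables instead of hard-coding them; the names AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY are assumptions, and the empty literals above stay the fallback.
import os
ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', ACCESS_KEY_ID)
ACCESS_SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', ACCESS_SECRET_KEY)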

# function to generate url
def generate_url(keyword, location):
    url = 'https://in.indeed.com/jobs?q={}&l={}'.format(keyword, location)
    return url
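
# hedged variant (not in the original): URL-encode the query parameters so that
# multi-word searches such as "data engineer" form a valid query string;
# generate_url_encoded is a hypothetical helper name.
from urllib.parse import quote_plus

def generate_url_encoded(keyword, location):
    return 'https://in.indeed.com/jobs?q={}&l={}'.format(quote_plus(keyword), quote_plus(location))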

# function to generate html content (soup object)
def extract_data(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'}
    r = requests.get(url, headers=headers)  # headers must be passed by keyword, or they are treated as query params
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
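
# note (assumption, not in the original): Indeed may answer scripted clients with an
# error page; calling r.raise_for_status() before parsing would surface that failure
# instead of silently yielding an empty result set.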

# function to scrape data
def transform_data(soup, joblists):
    # these selectors match Indeed's result-card markup at the time this script was written
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
        location = item.find(class_='location').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except AttributeError:
            salary = ''  # not every listing includes a salary
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        posted_date = item.find('span', class_='date').text
        today_date = date.today().strftime("%d/%m/%Y")
        base_url = "https://in.indeed.com"  # links are relative to the Indian Indeed domain being scraped
        link = item.find('a').attrs['href']
        job = {
            'Title': title,
            'Company': company,
            'Location': location,
            # 'Rating': rating,
            'Salary': salary,
            'Summary': summary,
            "Today's date": today_date,
            'Posted': posted_date,
            'Link': base_url + link
        }
        joblists.append(job)
    return joblists

# function to get url of the next page
def get_next_page(soup):
    try:
        url = soup.find("a", {"aria-label": "Next"}).get("href")
        return "https://in.indeed.com" + url
    except AttributeError:
        return None

# function to save dataframe to xlsx
def save_to_xlsx(df):
    # the encoding= argument was removed in pandas 2.0; openpyxl writes UTF-8 by default
    df.to_excel(filename, columns=["Title", "Company", "Location", "Salary", "Today's date", "Posted", "Link"])

# function to upload xlsx to amazon s3 -> For heroku deployment
def to_aws():
    # connect to AWS
    s3 = boto3.resource(
        's3',
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=ACCESS_SECRET_KEY,
        config=Config(signature_version='s3v4')
    )
    # file upload; a context manager guarantees the file handle is closed afterwards
    with open(filename, 'rb') as data:
        s3.Bucket(BUCKET_NAME).put_object(Key=filename, Body=data, ACL='public-read')
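
# note: ACL='public-read' makes the uploaded workbook world-readable. A sketch of a
# time-limited alternative, assuming the same bucket and key (presigned_link is a
# hypothetical helper, not part of the original script):
def presigned_link(expires=3600):
    s3_client = boto3.client(
        's3',
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=ACCESS_SECRET_KEY,
        config=Config(signature_version='s3v4')
    )
    # returns a URL that grants read access to the object for `expires` seconds
    return s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': BUCKET_NAME, 'Key': filename},
        ExpiresIn=expires
    )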

# main function
def extract_job(keyword='', location=''):
    joblists = []
    if not keyword and not location:
        return 0
    url = generate_url(keyword, location)
    # scrape at most 4 result pages, stopping early when there is no "Next" link
    for _ in range(4):
        c = extract_data(url)
        joblists = transform_data(c, joblists)
        url = get_next_page(c)
        if not url:
            break
    df = pd.DataFrame(joblists)
    if len(df) > 0:
        save_to_xlsx(df)
        to_aws()
    return len(df)  # returns the number of jobs
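
# usage sketch (not in the original file): run the scraper from the command line;
# the keyword/location values below are placeholders.
if __name__ == '__main__':
    count = extract_job(keyword='python', location='bangalore')
    print('{} jobs scraped'.format(count))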