-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
83 lines (66 loc) · 3.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests
from bs4 import BeautifulSoup
import json
import html
# Base URL for the find-jobs section
base_url = 'https://smart-recruitments.com/find-jobs-all/'
# Last path segment of the listing page itself; links resolving to it are not jobs
LISTING_SLUG = 'find-jobs-all'
# Seconds to wait for any single HTTP response before giving up
REQUEST_TIMEOUT = 30
# Fixed opening of the feed document; job items and the closing tag follow it
FEED_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
<source>
<publisher>SmartRecruitments</publisher>
<publisherurl>https://www.smart-recruitments.com</publisherurl>'''


def _cdata(value):
    """Return *value* wrapped in a CDATA section.

    A literal ``]]>`` inside the text would terminate the section early and
    corrupt the document, so it is split across two adjacent sections.
    """
    return '<![CDATA[' + str(value).replace(']]>', ']]]]><![CDATA[>') + ']]>'


def _dig(data, *keys, default='undisclosed'):
    """Follow *keys* through nested dicts and return the value found.

    A list met along the way is entered through its first element. Returns
    *default* as soon as a key is missing or the current node is not a dict,
    instead of raising.
    """
    node = data
    for key in keys:
        if isinstance(node, list):
            node = node[0] if node else None
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node


def _unescape_strings(value):
    """Return *value* with HTML entities unescaped in every nested string."""
    if isinstance(value, str):
        return html.unescape(value)
    if isinstance(value, list):
        return [_unescape_strings(item) for item in value]
    if isinstance(value, dict):
        return {key: _unescape_strings(item) for key, item in value.items()}
    return value


def job_url_from_href(href):
    """Map a listing-page href to a job detail URL, or None if it is not a job.

    The job slug is the last non-empty path segment; query strings, fragments
    and trailing slashes are ignored. Links that resolve to the listing page
    itself yield None.
    """
    path = href.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    slug = path.rsplit('/', 1)[-1]
    if not slug or slug == LISTING_SLUG:
        return None
    return base_url + slug


def load_job_json(raw):
    """Parse the text of a JSON-LD script tag; return None if it is not JSON.

    Entities are unescaped before parsing first. When that fails, the raw text
    is parsed and the entities are unescaped in the resulting string values,
    because unescaping up front can itself break valid JSON (``&quot;`` inside
    a string value becomes a bare quote).
    """
    try:
        return json.loads(html.unescape(raw))
    except json.JSONDecodeError:
        pass
    try:
        return _unescape_strings(json.loads(raw))
    except json.JSONDecodeError:
        return None


def render_job(data, job_url):
    """Return the ``<job>`` element for one parsed job posting.

    *data* is the decoded JSON-LD dict and *job_url* the page it came from.
    Missing fields are rendered as ``undisclosed`` (``WFO`` for ``remote``).
    """
    address = ('jobLocation', 'address')
    fields = [
        ('title', _dig(data, 'title')),
        ('date', _dig(data, 'datePosted')),
        ('referencenumber', _dig(data, 'identifier', 'value')),
        ('url', job_url),
        ('company', _dig(data, 'hiringOrganization', 'name')),
        ('location', _dig(data, *address, 'addressLocality')),
        ('city', _dig(data, *address, 'addressLocality')),
        ('state', _dig(data, *address, 'addressRegion')),
        ('country', _dig(data, *address, 'addressCountry')),
        ('remote', _dig(data, 'jobLocationType', default='WFO')),
        ('postalcode', _dig(data, *address, 'postalCode')),
        ('description', _dig(data, 'description')),
        ('jobtype', _dig(data, 'employmentType')),
        ('category', _dig(data, 'industry', 'value')),
        ('salary', _dig(data, 'baseSalary', 'value', 'value')),
        # NOTE(review): this literal is not a valid address and looks like an
        # obfuscation placeholder - confirm the intended contact email.
        ('email', '[email protected]'),
    ]
    lines = ['<job>']
    lines.extend(f'<{tag}>{_cdata(value)}</{tag}>' for tag, value in fields)
    lines.append('</job>')
    return '\n'.join(lines)


def collect_job_urls(listing_html):
    """Return the unique job detail URLs linked from the listing page HTML."""
    soup = BeautifulSoup(listing_html, 'html.parser')
    candidates = (
        job_url_from_href(anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if '/find-jobs-all/' in anchor['href']
    )
    # dict.fromkeys de-duplicates while keeping page order, so the generated
    # feed is stable from one run to the next
    return list(dict.fromkeys(url for url in candidates if url))


def fetch_job_posting(job_url):
    """Download a job page and return its JSON-LD data as a dict, else None.

    Request failures and undecodable or unexpectedly shaped JSON are reported
    on stdout and skipped so that one bad page does not abort the whole feed.
    """
    try:
        response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Error fetching {job_url}: {exc}")
        return None
    job_soup = BeautifulSoup(response.content, 'html.parser')
    # Locate the <script> tag containing the JSON
    script_tag = job_soup.find('script', type='application/ld+json')
    if not (script_tag and script_tag.string):
        return None
    data = load_job_json(script_tag.string)
    if data is None:
        print(f"Error decoding JSON from {job_url}")
        return None
    if not isinstance(data, dict):
        print(f"Unexpected JSON-LD structure at {job_url}")
        return None
    return data


def build_feed():
    """Scrape the listing and every job page and return the feed as a string.

    Raises requests.RequestException if the listing page cannot be fetched;
    failing early keeps an existing feed.xml from being replaced by an empty
    feed.
    """
    response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    parts = [FEED_HEADER]
    for job_url in collect_job_urls(response.content):
        data = fetch_job_posting(job_url)
        if data is not None:
            parts.append(render_job(data, job_url))
    # Close the feed
    parts.append('</source>')
    return '\n'.join(parts)


def main():
    """Build the feed, save it to feed.xml as UTF-8, and echo it to stdout."""
    rss_feed = build_feed()
    with open('feed.xml', 'w', encoding='utf-8') as f:
        f.write(rss_feed)
    print(rss_feed)


if __name__ == '__main__':
    main()