xml from jobPosting Structured Data
import requests
from bs4 import BeautifulSoup
import json
# Base URL for the find-jobs section
base_url = 'https://smart-recruitments.framer.website/find-jobs/'
# Step 1: Load the main jobs page to find all job links
response = requests.get(base_url)
response.raise_for_status()
html_content = response.content
# Step 2: Parse the HTML with BeautifulSoup to find all job links
soup = BeautifulSoup(html_content, 'html.parser')
# Assuming job links are inside <a> tags with href pointing to the job details page
# (dict.fromkeys drops duplicate hrefs while preserving their order, so each posting
# appears only once in the feed)
job_links = list(dict.fromkeys(
    a['href'] for a in soup.find_all('a', href=True) if '/find-jobs/' in a['href']
))
# Prepare the base of the RSS feed
rss_feed = '''<?xml version="1.0" encoding="UTF-8" ?>
<source>
<publisher>SmartRecruitments</publisher>
<publisherurl>https://www.smart-recruitments.com</publisherurl>'''
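# The envelope above follows the common <source>/<job> XML job-feed layout (the
# style Indeed-type aggregators ingest). Each posting found below is appended as
# one <job> element, with values wrapped in CDATA so HTML in the descriptions
# survives intact.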
# Step 3: Iterate over each job link, fetch its content, and extract the JSON
for job_link in job_links:
    # Full URL for each job link (only the trailing slug is reused, so relative
    # and absolute hrefs are handled the same way)
    job_url = base_url + job_link.rstrip('/').split('/')[-1]
    # Fetch the job details page
    response = requests.get(job_url)
    response.raise_for_status()
    job_html_content = response.content
    # Parse the HTML with BeautifulSoup
    job_soup = BeautifulSoup(job_html_content, 'html.parser')
    # Locate the <script> tag containing the JobPosting JSON-LD
    script_tag = job_soup.find('script', type='application/ld+json')
    if script_tag:
        json_content = script_tag.string
        # Parse the JSON content
        data = json.loads(json_content)
        # Extract fields and append one <job> element to the feed
        rss_feed += f'''
<job>
<title><![CDATA[{data.get('title', 'No Title')}]]></title>
<date><![CDATA[{data.get('datePosted', 'No Date')}]]></date>
<referencenumber><![CDATA[{data.get('identifier', {}).get('value', 'No Reference Number')}]]></referencenumber>
<url><![CDATA[{job_url}]]></url>
<company><![CDATA[{data.get('hiringOrganization', {}).get('name', 'No Name Provided')}]]></company>
<city><![CDATA[{data.get('jobLocation', {}).get('address', {}).get('addressLocality', 'No City')}]]></city>
<state><![CDATA[{data.get('jobLocation', {}).get('address', {}).get('addressRegion', 'No State')}]]></state>
<country><![CDATA[{data.get('jobLocation', {}).get('address', {}).get('addressCountry', 'No Country')}]]></country>
<remote><![CDATA[{data.get('jobLocationType', 'No Remote Info')}]]></remote>
<postalcode><![CDATA[{data.get('jobLocation', {}).get('address', {}).get('postalCode', 'No Postal Code')}]]></postalcode>
<description><![CDATA[{data.get('description', 'No Description')}]]></description>
<jobtype><![CDATA[{data.get('employmentType', 'No Job Type')}]]></jobtype>
<category><![CDATA[{data.get('category', 'No Category')}]]></category>
</job>'''
# Close the RSS feed
rss_feed += '''
</source>'''
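# Optional sanity check (a sketch, not part of the original flow): parse the
# assembled feed so malformed XML is caught before it is written out. The feed
# string carries an encoding declaration, so it is encoded to bytes first; this
# also assumes no description contains a literal "]]>" sequence, which would
# terminate its CDATA section early.
import xml.etree.ElementTree as ET
try:
    ET.fromstring(rss_feed.encode('utf-8'))
except ET.ParseError as exc:
    print(f'Warning: generated feed is not well-formed XML: {exc}')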
# Save the combined feed to a file
with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(rss_feed)
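# For reference: the loop above assumes each job page embeds schema.org JobPosting
# structured data in a <script type="application/ld+json"> tag. A minimal,
# illustrative example of the shape the .get() chains expect (all values are
# hypothetical; "category" is not a standard JobPosting property and may be absent):
#
# {
#   "@context": "https://schema.org/",
#   "@type": "JobPosting",
#   "title": "Backend Engineer",
#   "datePosted": "2024-01-15",
#   "identifier": {"@type": "PropertyValue", "name": "SmartRecruitments", "value": "12345"},
#   "hiringOrganization": {"@type": "Organization", "name": "Example GmbH"},
#   "jobLocation": {
#     "@type": "Place",
#     "address": {
#       "@type": "PostalAddress",
#       "addressLocality": "Berlin",
#       "addressRegion": "BE",
#       "addressCountry": "DE",
#       "postalCode": "10115"
#     }
#   },
#   "jobLocationType": "TELECOMMUTE",
#   "employmentType": "FULL_TIME",
#   "description": "<p>Role description…</p>"
# }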