Merge pull request #234 from techeer-sv/BE/#206
BE/#206 Implement crawling for entry-level/intern job postings
baekhangyeol authored Oct 2, 2023
2 parents 655f715 + f9d341e commit 8848732
Showing 7 changed files with 307 additions and 0 deletions.
@@ -11,6 +11,7 @@
public class Job {
    @Id
    @Column(name = "job_id")
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    @Column(nullable = false)
13 changes: 13 additions & 0 deletions crawling_python/Dockerfile
@@ -0,0 +1,13 @@
FROM python:3.11

RUN python -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY . /app/
WORKDIR /app

RUN apt-get update && apt-get install -y chromium chromium-driver

RUN pip install --no-cache-dir -r requirements.txt

CMD ["python", "scheduler.py"]
131 changes: 131 additions & 0 deletions crawling_python/crawling_jobkorea.py
@@ -0,0 +1,131 @@
import re
import time
from datetime import datetime, timedelta

from global_utils import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv
from selenium.webdriver.support.wait import WebDriverWait

load_dotenv()

def crawling_job_data(driver, page_number, existing_contents):
    url = f'https://www.jobkorea.co.kr/Recruit/Joblist?menucode=local&localorder=1#anchorGICnt_{page_number}'
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # On the first page, open the duty/career filters and restrict the search
    # to entry-level web, app, front-end and back-end developer postings.
    if page_number == 1:
        duty_btn = driver.find_element(By.CSS_SELECTOR, 'p.btn_tit')
        duty_btn.click()

        dev_data_label = driver.find_element(By.CSS_SELECTOR, 'label[for="duty_step1_10031"]')
        dev_data_label.click()

        backend_dev = driver.find_element(By.XPATH, '//span[contains(text(), "백엔드개발자")]')
        backend_dev.click()

        frontend_dev = driver.find_element(By.XPATH, '//span[contains(text(), "프론트엔드개발자")]')
        frontend_dev.click()

        web_dev = driver.find_element(By.XPATH, '//span[contains(text(), "웹개발자")]')
        web_dev.click()

        app_dev = driver.find_element(By.XPATH, '//span[contains(text(), "앱개발자")]')
        app_dev.click()

        career_btn = driver.find_element(By.XPATH, '//p[contains(text(), "경력")]')
        career_btn.click()

        newbie_label = driver.find_element(By.XPATH, '//label[contains(@for, "career1") and .//span[text()="신입"]]')
        newbie_label.click()

        search_button = driver.find_element(By.ID, 'dev-btn-search')
        search_button.click()

    # Pagination happens via the URL fragment, so give the JS-driven list time to render.
    time.sleep(4)

    # If the result table never appears, treat it as the last page and stop.
    try:
        companies = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplCo')))
        contents = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplTit strong a.link')))
        dates = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span.date.dotum')))
        urls = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplTit strong a.link')))
    except Exception:
        return None

    data_list = []

    for i in range(len(companies)):
        company_name = companies[i].text.strip()
        content = contents[i].get_attribute("title")

        if not content:
            content = contents[i].text.strip()

        date_text = dates[i].text.strip()
        # Deadlines appear either as an explicit "~MM/DD(요일)" date or as relative
        # labels (closes today / tomorrow / the day after / rolling recruitment).
        date_match = re.search(r"~(\d{2}/\d{2})\((\w+)\)", date_text)

        if date_match:
            month_day, day_of_week = date_match.groups()
            current_year = datetime.now().year
            date_text = f"{current_year}-{month_day}"
            expiration_date = datetime.strptime(date_text, "%Y-%m/%d")
        elif "오늘마감" in date_text:
            expiration_date = datetime.now()
        elif "내일마감" in date_text:
            expiration_date = datetime.now() + timedelta(days=1)
        elif "모레마감" in date_text:
            expiration_date = datetime.now() + timedelta(days=2)
        elif "상시채용" in date_text:
            expiration_date = datetime.max
        else:
            expiration_date = None

        if expiration_date:
            expiration_date = expiration_date.strftime("%Y-%m-%d")
        else:
            expiration_date = ""

        url = urls[i].get_attribute("href")

        # Skip postings whose title is already stored in the database.
        if content not in existing_contents:
            data_list.append((company_name, content, expiration_date, url))

    return data_list

def main():
    driver = get_driver()
    page_number = 1

    db = get_database_connect()
    cursor = db.cursor()
    cursor.execute("SELECT title, expiration_date FROM job")
    existing_contents = {row[0]: row[1] for row in cursor.fetchall()}
    cursor.close()
    db.close()

    while True:
        job_data = crawling_job_data(driver, page_number, existing_contents.keys())
        if job_data is None:
            break

        db = get_database_connect()
        cursor = db.cursor()
        insert_query = "INSERT INTO job (company_name, title, expiration_date, url) VALUES (%s, %s, %s, %s)"
        cursor.executemany(insert_query, job_data)
        db.commit()

        # Drop rolling-recruitment rows whose titles no longer show up in the crawl.
        for title, expiration_date in existing_contents.items():
            if expiration_date == "9999-12-31 00:00:00.000000" and title not in [content[1] for content in job_data]:
                cursor.execute("DELETE FROM job WHERE title = %s", (title,))
                db.commit()

        cursor.close()
        db.close()

        page_number += 1

    driver.quit()

if __name__ == "__main__":
    main()
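The deadline handling above hinges on the "~MM/DD(요일)" pattern. A minimal standalone sketch of that parsing step, using a made-up sample string rather than live page text:

# Illustration only (not part of the commit); "~10/15(일)" is a hypothetical sample,
# real values come from the span.date.dotum elements crawled above.
import re
from datetime import datetime

sample = "~10/15(일)"
match = re.search(r"~(\d{2}/\d{2})\((\w+)\)", sample)
if match:
    month_day, _day_of_week = match.groups()
    deadline = datetime.strptime(f"{datetime.now().year}-{month_day}", "%Y-%m/%d")
    print(deadline.strftime("%Y-%m-%d"))  # e.g. 2023-10-15 when run in 2023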
89 changes: 89 additions & 0 deletions crawling_python/crawling_saramin.py
@@ -0,0 +1,89 @@
from global_utils import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta
from dotenv import load_dotenv
import re

load_dotenv()


def crawling_job_data(driver, page_number, existing_contents):
    url = f'https://www.saramin.co.kr/zf_user/jobs/public/list?exp_cd=1&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&cat_kewd=84%2C86%2C87%2C92&panel_type=domestic&search_optional_item=y&search_done=y&panel_count=y&preview=y&page={page_number}&isAjaxRequest=y'
    driver.get(url)

    wait = WebDriverWait(driver, 10)

    # If the listing elements never appear, treat it as the last page and stop.
    try:
        companies = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_item .col.company_nm a.str_tit, .list_item .col.company_nm span.str_tit')))
        contents = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.notification_info .job_tit .str_tit')))
        urls = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.notification_info .job_tit a.str_tit')))
        dates = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.support_info .support_detail .date')))
    except Exception:
        return None

    data_list = []

    for i in range(len(companies)):
        company_name = companies[i].text
        content = contents[i].text
        url = urls[i].get_attribute('href')
        date_text = dates[i].text

        # Deadlines appear either as a D-day counter ("D-7") or as an explicit "~MM.DD(요일)" date.
        match_d = re.search(r"D-(\d+)", date_text)

        match_date = re.search(r"~(\d+\.\d+)\((\w+)\)", date_text)

        if match_d:
            days_to_add = int(match_d.group(1))
            current_date = datetime.now()
            calculated_date = current_date + timedelta(days=days_to_add)
            expiration_date = calculated_date.strftime("%Y-%m-%d")
        elif match_date:
            month_day, day_of_week = match_date.groups()
            current_year = datetime.now().year
            date_text = f"{current_year}-{month_day}"
            expiration_date = datetime.strptime(date_text, "%Y-%m.%d").strftime("%Y-%m-%d")
        else:
            # No recognizable deadline; store an empty string instead of failing on an unset variable.
            expiration_date = ""

        # Skip postings whose title is already stored in the database.
        if content not in existing_contents:
            data_list.append((company_name, content, url, expiration_date))

    return data_list


def main():
    driver = get_driver()
    page_number = 1

    db = get_database_connect()
    cursor = db.cursor()
    cursor.execute("SELECT DISTINCT title FROM job")
    existing_contents = {row[0] for row in cursor.fetchall()}
    cursor.close()
    db.close()

    while True:
        job_data = crawling_job_data(driver, page_number, existing_contents)
        if job_data is None:
            break

        db = get_database_connect()
        cursor = db.cursor()
        insert_query = "INSERT INTO job (company_name, title, url, expiration_date) VALUES (%s, %s, %s, %s)"
        cursor.executemany(insert_query, job_data)
        db.commit()
        cursor.close()
        db.close()

        page_number += 1

    driver.quit()


if __name__ == "__main__":
    main()
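Saramin shows many deadlines as a D-day counter instead of a date. A small sketch of how the D-N branch above resolves to a concrete date ("D-7" is a hypothetical sample, not taken from the site):

# Illustration only (not part of the commit).
import re
from datetime import datetime, timedelta

sample = "D-7"  # hypothetical; real values come from the .support_detail .date elements
match_d = re.search(r"D-(\d+)", sample)
if match_d:
    expiration_date = (datetime.now() + timedelta(days=int(match_d.group(1)))).strftime("%Y-%m-%d")
    print(expiration_date)  # seven days from today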
34 changes: 34 additions & 0 deletions crawling_python/global_utils.py
@@ -0,0 +1,34 @@
import os
from selenium import webdriver
from mysql.connector import connect


def get_database_connect():
    # Connects to the "mysql" service; credentials come from environment variables.
    return connect(
        host="mysql",
        port=3306,
        user=os.getenv('DB_USERNAME'),
        password=os.getenv('DB_USER_PASSWORD'),
        database=os.getenv('DB_DATABASE')
    )


def get_driver():
    # chromium-driver installed in the Docker image lives at this path.
    path = '/usr/bin/chromedriver'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--log-level=DEBUG")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.92 Safari/537.36")

    service = webdriver.ChromeService(executable_path=path)
    driver = webdriver.Chrome(options=chrome_options, service=service)

    return driver
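get_database_connect() assumes DB_USERNAME, DB_USER_PASSWORD and DB_DATABASE are present in the environment (the crawler scripts load them via python-dotenv). A small optional pre-flight check, not part of this commit, that fails fast when they are missing:

# Sketch only: verify the variables that get_database_connect() reads before crawling.
import os
from dotenv import load_dotenv

load_dotenv()
missing = [key for key in ("DB_USERNAME", "DB_USER_PASSWORD", "DB_DATABASE") if not os.getenv(key)]
if missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")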
18 changes: 18 additions & 0 deletions crawling_python/requirements.txt
@@ -0,0 +1,18 @@
attrs==23.1.0
certifi==2023.7.22
h11==0.14.0
idna==3.4
mysql-connector-python==8.1.0
outcome==1.2.0
protobuf==4.21.12
PySocks==1.7.1
python-dotenv==1.0.0
pytz==2023.3.post1
schedule==1.2.0
selenium==4.13.0
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.2
trio-websocket==0.11.1
urllib3==2.0.5
wsproto==1.2.0
21 changes: 21 additions & 0 deletions crawling_python/scheduler.py
@@ -0,0 +1,21 @@
import pytz
import schedule
import subprocess
import time

kst = pytz.timezone('Asia/Seoul')


def crawl_jobkorea():
    subprocess.run(["python", "crawling_jobkorea.py"])


def crawl_saramin():
    subprocess.run(["python", "crawling_saramin.py"])


# Run each crawler every three days, a few minutes apart.
schedule.every(3).days.at("06:00").do(crawl_saramin)
schedule.every(3).days.at("06:03").do(crawl_jobkorea)

if __name__ == "__main__":
    while True:
        schedule.run_pending()
        time.sleep(60)  # avoid a busy-wait loop between schedule checks

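Note that kst is defined but never passed to the jobs, so the 06:00 runs are interpreted in the container's local clock. If KST scheduling is intended, one possible form is sketched below; this assumes the timezone-aware .at() overload available in recent schedule releases (pytz is already in requirements.txt) and is not part of this commit:

# Sketch only: pin the run time to Asia/Seoul instead of the container's local time.
#
#     schedule.every(3).days.at("06:00", kst).do(crawl_saramin)
#     schedule.every(3).days.at("06:03", kst).do(crawl_jobkorea)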