Be/#206 Implement crawling of entry-level/intern job postings
Showing 7 changed files with 307 additions and 0 deletions.
Dockerfile
@@ -0,0 +1,13 @@
FROM python:3.11

RUN python -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY . /app/
WORKDIR /app

RUN apt-get update && apt-get install -y chromium chromium-driver

RUN pip install --no-cache-dir -r requirements.txt

CMD ["python", "scheduler.py"]
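global_utils.py (further down) connects to a MySQL host literally named mysql, which implies the crawler and the database share a Docker network. A minimal docker-compose sketch of that layout; the service names, the .env file, and DB_ROOT_PASSWORD are assumptions, not part of this commit:

# docker-compose.yml (hypothetical; not included in this commit)
services:
  mysql:
    image: mysql:8.1
    environment:
      MYSQL_DATABASE: ${DB_DATABASE}
      MYSQL_USER: ${DB_USERNAME}
      MYSQL_PASSWORD: ${DB_USER_PASSWORD}
      MYSQL_ROOT_PASSWORD: ${DB_ROOT_PASSWORD}  # assumed extra variable
  crawler:
    build: .
    env_file: .env
    depends_on:
      - mysql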
crawling_jobkorea.py
@@ -0,0 +1,131 @@
import re
import time
from datetime import datetime, timedelta

from global_utils import *
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv
from selenium.webdriver.support.wait import WebDriverWait

load_dotenv()


def crawling_job_data(driver, page_number, existing_contents):
    url = f'https://www.jobkorea.co.kr/Recruit/Joblist?menucode=local&localorder=1#anchorGICnt_{page_number}'
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # On the first page only, apply the search filters: developer job
    # categories plus entry-level (신입) career status.
    if page_number == 1:
        duty_btn = driver.find_element(By.CSS_SELECTOR, 'p.btn_tit')
        duty_btn.click()

        dev_data_label = driver.find_element(By.CSS_SELECTOR, 'label[for="duty_step1_10031"]')
        dev_data_label.click()

        backend_dev = driver.find_element(By.XPATH, '//span[contains(text(), "백엔드개발자")]')  # backend developer
        backend_dev.click()

        frontend_dev = driver.find_element(By.XPATH, '//span[contains(text(), "프론트엔드개발자")]')  # frontend developer
        frontend_dev.click()

        web_dev = driver.find_element(By.XPATH, '//span[contains(text(), "웹개발자")]')  # web developer
        web_dev.click()

        app_dev = driver.find_element(By.XPATH, '//span[contains(text(), "앱개발자")]')  # app developer
        app_dev.click()

        career_btn = driver.find_element(By.XPATH, '//p[contains(text(), "경력")]')  # career filter
        career_btn.click()

        newbie_label = driver.find_element(By.XPATH, '//label[contains(@for, "career1") and .//span[text()="신입"]]')  # entry-level
        newbie_label.click()

        search_button = driver.find_element(By.ID, 'dev-btn-search')
        search_button.click()

    time.sleep(4)  # give the result list time to render after navigation

    try:
        companies = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplCo')))
        contents = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplTit strong a.link')))
        dates = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span.date.dotum')))
        urls = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.tplTit strong a.link')))
    except TimeoutException:
        # No listings found: signal the caller to stop paginating.
        return None

    data_list = []
    seen_titles = []

    for i in range(len(companies)):
        company_name = companies[i].text.strip()
        content = contents[i].get_attribute("title")

        if not content:
            content = contents[i].text.strip()

        date_text = dates[i].text.strip()
        # Explicit deadlines look like "~08/15(화)".
        date_match = re.search(r"~(\d{2}/\d{2})\((\w+)\)", date_text)

        if date_match:
            month_day, day_of_week = date_match.groups()
            current_year = datetime.now().year  # assumes the deadline falls in the current year
            date_text = f"{current_year}-{month_day}"
            expiration_date = datetime.strptime(date_text, "%Y-%m/%d")
        elif "오늘마감" in date_text:  # closes today
            expiration_date = datetime.now()
        elif "내일마감" in date_text:  # closes tomorrow
            expiration_date = datetime.now() + timedelta(days=1)
        elif "모레마감" in date_text:  # closes the day after tomorrow
            expiration_date = datetime.now() + timedelta(days=2)
        elif "상시채용" in date_text:  # rolling (always-open) posting
            expiration_date = datetime.max
        else:
            expiration_date = None

        if expiration_date:
            expiration_date = expiration_date.strftime("%Y-%m-%d")
        else:
            expiration_date = ""

        url = urls[i].get_attribute("href")

        seen_titles.append(content)
        if content not in existing_contents:
            data_list.append((company_name, content, expiration_date, url))

    return data_list, seen_titles


def main():
    driver = get_driver()
    page_number = 1

    db = get_database_connect()
    cursor = db.cursor()
    cursor.execute("SELECT title, expiration_date FROM job")
    existing_contents = {row[0]: row[1] for row in cursor.fetchall()}
    cursor.close()
    db.close()

    all_seen_titles = set()

    while True:
        result = crawling_job_data(driver, page_number, existing_contents.keys())
        if result is None:
            break
        job_data, seen_titles = result
        all_seen_titles.update(seen_titles)

        db = get_database_connect()
        cursor = db.cursor()
        insert_query = "INSERT INTO job (company_name, title, expiration_date, url) VALUES (%s, %s, %s, %s)"
        cursor.executemany(insert_query, job_data)
        db.commit()
        cursor.close()
        db.close()

        page_number += 1

    driver.quit()

    # Rolling postings (stored with datetime.max) that appeared on no crawled
    # page are treated as closed and removed. Deleting only after the full
    # crawl avoids dropping a posting that simply sits on a later page.
    db = get_database_connect()
    cursor = db.cursor()
    for title, expiration_date in existing_contents.items():
        if expiration_date == "9999-12-31 00:00:00.000000" and title not in all_seen_titles:
            cursor.execute("DELETE FROM job WHERE title = %s", (title,))
    db.commit()
    cursor.close()
    db.close()


if __name__ == "__main__":
    main()
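The deadline strings handled above come in several shapes. A standalone sketch of the same parsing branches, using illustrative sample strings rather than captured site output:

import re
from datetime import datetime, timedelta

samples = ["~08/15(화)", "오늘마감", "내일마감", "모레마감", "상시채용"]

for date_text in samples:
    date_match = re.search(r"~(\d{2}/\d{2})\((\w+)\)", date_text)
    if date_match:
        month_day, _ = date_match.groups()
        expiration = datetime.strptime(f"{datetime.now().year}-{month_day}", "%Y-%m/%d")
    elif "오늘마감" in date_text:
        expiration = datetime.now()
    elif "내일마감" in date_text:
        expiration = datetime.now() + timedelta(days=1)
    elif "모레마감" in date_text:
        expiration = datetime.now() + timedelta(days=2)
    else:  # 상시채용 (rolling)
        expiration = datetime.max
    print(date_text, "->", expiration.strftime("%Y-%m-%d"))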
crawling_saramin.py
@@ -0,0 +1,89 @@
import re
from datetime import datetime, timedelta

from global_utils import *
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv

load_dotenv()


def crawling_job_data(driver, page_number, existing_contents):
    # Saramin public job list, pre-filtered to entry-level (exp_cd=1) and
    # developer job categories (cat_kewd).
    url = f'https://www.saramin.co.kr/zf_user/jobs/public/list?exp_cd=1&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&cat_kewd=84%2C86%2C87%2C92&panel_type=domestic&search_optional_item=y&search_done=y&panel_count=y&preview=y&page={page_number}&isAjaxRequest=y'
    driver.get(url)

    wait = WebDriverWait(driver, 10)

    try:
        companies = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_item .col.company_nm a.str_tit, .list_item .col.company_nm span.str_tit')))
        contents = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.notification_info .job_tit .str_tit')))
        urls = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.notification_info .job_tit a.str_tit')))
        dates = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list_body .col.support_info .support_detail .date')))
    except TimeoutException:
        # No more listings: signal the caller to stop paginating.
        return None

    data_list = []

    for i in range(len(companies)):
        company_name = companies[i].text
        content = contents[i].text
        url = urls[i].get_attribute('href')
        date_text = dates[i].text

        # Deadlines appear either as a countdown ("D-7") or as an explicit
        # date ("~08.15(화)").
        match_d = re.search(r"D-(\d+)", date_text)
        match_date = re.search(r"~(\d+\.\d+)\((\w+)\)", date_text)

        if match_d:
            days_to_add = int(match_d.group(1))
            calculated_date = datetime.now() + timedelta(days=days_to_add)
            expiration_date = calculated_date.strftime("%Y-%m-%d")
        elif match_date:
            month_day, day_of_week = match_date.groups()
            current_year = datetime.now().year  # assumes the deadline falls in the current year
            expiration_date = datetime.strptime(f"{current_year}-{month_day}", "%Y-%m.%d").strftime("%Y-%m-%d")
        else:
            # Neither pattern matched (e.g. a rolling posting); store an empty
            # date rather than leaving expiration_date unbound.
            expiration_date = ""

        if content not in existing_contents:
            data_list.append((company_name, content, url, expiration_date))

    return data_list


def main():
    driver = get_driver()
    page_number = 1

    db = get_database_connect()
    cursor = db.cursor()
    cursor.execute("SELECT DISTINCT title FROM job")
    existing_contents = {row[0] for row in cursor.fetchall()}
    cursor.close()
    db.close()

    while True:
        job_data = crawling_job_data(driver, page_number, existing_contents)
        if job_data is None:
            break

        db = get_database_connect()
        cursor = db.cursor()
        insert_query = "INSERT INTO job (company_name, title, url, expiration_date) VALUES (%s, %s, %s, %s)"
        cursor.executemany(insert_query, job_data)
        db.commit()
        cursor.close()
        db.close()

        page_number += 1

    driver.quit()


if __name__ == "__main__":
    main()
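Both crawlers deduplicate in Python, against a title set loaded up front. An alternative sketch pushes deduplication into MySQL instead; it assumes a unique index on job.title, which this commit does not create:

from global_utils import get_database_connect


def insert_jobs(job_data):
    # Hypothetical variant: requires a unique index that is NOT part of this
    # commit, e.g. ALTER TABLE job ADD UNIQUE INDEX uq_job_title (title);
    db = get_database_connect()
    cursor = db.cursor()
    cursor.executemany(
        "INSERT IGNORE INTO job (company_name, title, url, expiration_date) "
        "VALUES (%s, %s, %s, %s)",
        job_data,
    )
    db.commit()  # duplicate titles are silently skipped by INSERT IGNORE
    cursor.close()
    db.close()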
global_utils.py
@@ -0,0 +1,34 @@
import os
from selenium import webdriver
from mysql.connector import connect


def get_database_connect():
    # "mysql" is the Docker network hostname of the database container.
    return connect(
        host="mysql",
        port=3306,
        user=os.getenv('DB_USERNAME'),
        password=os.getenv('DB_USER_PASSWORD'),
        database=os.getenv('DB_DATABASE')
    )


def get_driver():
    # Debian's chromium-driver package installs the binary at this path.
    path = '/usr/bin/chromedriver'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--log-level=DEBUG")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.92 Safari/537.36")

    service = webdriver.ChromeService(executable_path=path)
    driver = webdriver.Chrome(options=chrome_options, service=service)

    return driver
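Both crawlers create a driver with get_driver() and must remember to call driver.quit(). A small context-manager wrapper, sketched here as a possible addition rather than part of this commit, would guarantee cleanup even when a crawl raises:

from contextlib import contextmanager

from global_utils import get_driver


@contextmanager
def managed_driver():
    # Yield a configured Chrome driver and always release it afterwards.
    driver = get_driver()
    try:
        yield driver
    finally:
        driver.quit()


# Usage sketch:
# with managed_driver() as driver:
#     driver.get("https://www.jobkorea.co.kr")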
requirements.txt
@@ -0,0 +1,18 @@
attrs==23.1.0
certifi==2023.7.22
h11==0.14.0
idna==3.4
mysql-connector-python==8.1.0
outcome==1.2.0
protobuf==4.21.12
PySocks==1.7.1
python-dotenv==1.0.0
pytz==2023.3.post1
schedule==1.2.0
selenium==4.13.0
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.2
trio-websocket==0.11.1
urllib3==2.0.5
wsproto==1.2.0
scheduler.py
@@ -0,0 +1,21 @@
import subprocess
import time

import pytz
import schedule

kst = pytz.timezone('Asia/Seoul')


def crawl_jobkorea():
    subprocess.run(["python", "crawling_jobkorea.py"])


def crawl_saramin():
    subprocess.run(["python", "crawling_saramin.py"])


# Crawl every 3 days at 06:00 KST, the two jobs staggered a few minutes apart
# so they start sequentially (schedule 1.x accepts a timezone in at()).
schedule.every(3).days.at("06:00", kst).do(crawl_saramin)
schedule.every(3).days.at("06:03", kst).do(crawl_jobkorea)

if __name__ == "__main__":
    while True:
        schedule.run_pending()
        time.sleep(60)  # poll once a minute instead of busy-waiting
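As written, the first crawl only happens three days after the scheduler starts. If an immediate first run is wanted, one variant (an assumption, not part of this commit) fires every registered job once at startup:

# Hypothetical variant of the __main__ block: kick off all jobs once, then
# fall back to the normal polling loop.
if __name__ == "__main__":
    schedule.run_all(delay_seconds=10)  # run each registered job once, 10 s apart
    while True:
        schedule.run_pending()
        time.sleep(60)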