-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNews_Crawler.py
78 lines (58 loc) · 2.33 KB
/
News_Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
## Yahoo Finance
# Scrape TSLA news headlines from Yahoo Finance with a headless Chrome
# session. Collected titles accumulate in `title_list`, which the CNBC
# section below extends and the final section writes to CSV.
options = Options()
options.add_argument('--headless')
# Fixed flag syntax: Chrome expects '--window-size=800,600'; the original
# 'window-size = 800x600' form is not a recognized switch and was ignored.
options.add_argument('--window-size=800,600')
# Block image loading to speed up page fetches.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
url = "https://finance.yahoo.com/quote/TSLA/news"
title_list = []
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # NOTE(review): these class names embed generated Svelte hashes
    # (e.g. 'svelte-7rcxn') and will break whenever Yahoo redeploys —
    # confirm the selectors still match before relying on this.
    stream = soup.find("div", class_="filtered-stories x-large svelte-7rcxn rulesBetween infiniteScroll")
    items = stream.find_all("li", class_="stream-item svelte-7rcxn") if stream else []
    for item in items:
        # There are two different <h3> classes under the stream items
        # (per the original author's note); non-headline cards lack this
        # one, so find() returns None for them — skip those entries.
        heading = item.find("h3", class_="clamp svelte-13zydns")
        if heading is not None:
            title_list.append(heading.text)
finally:
    # Always release the browser process, even if fetching/parsing fails.
    driver.quit()
# CNBC
# Scrape CNBC search results for "tesla" and append the headlines to
# `title_list` (populated by the Yahoo Finance section above).
options = Options()
options.add_argument('--headless')
# Fixed flag syntax: Chrome expects '--window-size=800,600'; the original
# 'window-size = 800x600' form is not a recognized switch and was ignored.
options.add_argument('--window-size=800,600')
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
url = "https://www.cnbc.com/search/?query=tesla&qsearchterm=tesla"
try:
    driver.get(url)
    # Search results render client-side; wait up to 60 s for the container.
    wait = WebDriverWait(driver, 60)
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "SearchResults-searchResultsContainer")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    container = soup.find("div", class_="SearchResults-searchResultsContainer")
    inner = container.find("div", id="searchcontainer") if container else None
    results = inner.find_all("div") if inner else []
    for result in results:
        # Only actual result cards carry a title node; wrapper <div>s do
        # not, so guard each step of the chain instead of a bare except.
        title_node = result.find("div", class_="SearchResult-searchResultTitle")
        if title_node is None:
            continue
        link = title_node.find("a")
        span = link.find("span") if link else None
        if span is not None:
            title_list.append(span.text)
finally:
    # Always release the browser process, even if the wait times out.
    driver.quit()
# De-duplicate while preserving first-seen order (dict keys keep
# insertion order in Python 3.7+).
title_list = list(dict.fromkeys(title_list))
df = pd.DataFrame({
    "Date": datetime.now().strftime('%Y-%m-%d'),  # scalar, broadcast to every row
    "News_Title": title_list,
})
# Ensure the output directory exists so to_csv does not raise
# FileNotFoundError on the first run.
os.makedirs("News_History", exist_ok=True)
df.to_csv(f"News_History/News_Title_{datetime.now().strftime('%Y-%m-%d')}.csv")