# scrape.py
import os
import time
import uuid
from datetime import datetime

from flask import has_request_context
from flask import render_template
from flask import request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from db import client
# Handle to the `flask_database` attribute of the shared client imported from
# the project's `db` module (presumably a MongoDB database, since `insert_one`
# is called through it further down -- confirm against db.py).
db = client.flask_database
# Handle to `db.collection`.
# NOTE(review): scrape_twitter() writes through `db.collections` (plural), not
# through this name -- confirm which of the two is the intended target.
collection = db.collection
# Labels rendered in the trends panel that are headings/links, not topics.
_NON_TOPIC_LABELS = ("What’s happening", "Show more")


def _is_topic_name(text):
    """Return True if *text* from the trends panel looks like a topic name.

    Empty strings, the panel's own heading/link labels, lines containing
    "Trending" or "·", and anything containing a digit are rejected.  The
    digit rule is a heuristic: it also rejects genuine topics that happen to
    contain a digit.
    """
    if not text or text in _NON_TOPIC_LABELS:
        return False
    if any(char.isdigit() for char in text):
        return False
    return "Trending" not in text and "·" not in text


def _client_ip():
    """Return the caller's address, or None when no Flask request is active."""
    # Reading `request` outside a request context raises RuntimeError, which
    # would otherwise replace the error being reported by the handlers below.
    return request.remote_addr if has_request_context() else None


def _result(status, message, extra=None):
    """Build the response dict shared by every exit path of scrape_twitter."""
    result = {
        "status": status,
        "message": message,
        "timestamp": datetime.now(),
        "ip": _client_ip(),
    }
    if extra:
        result.update(extra)
    return result


def scrape_twitter():
    """Log in to X with headless Firefox, collect trending topics, store them.

    Credentials are taken from the ``TWITTER_USERNAME``, ``TWITTER_EMAIL`` and
    ``TWITTER_PASSWORD`` environment variables; when a variable is unset or
    empty the value that used to be hard-coded here is used instead.

    Returns:
        A dict with ``status`` ("success" or "error"), ``message``,
        ``timestamp`` and ``ip`` (None when called outside a Flask request).
        On success it also carries ``trending_topics``.  When only the
        database write failed, the scraped topics are returned under the key
        ``"heres the data tho"``.

    Raises:
        Whatever ``webdriver.Firefox`` raises if the browser cannot be
        started; failures after that point are reported through the returned
        dict instead of being raised.
    """
    # NOTE(security): the fallback literals are credentials committed to
    # source control.  Rotate them and delete the fallbacks once the
    # environment variables are configured.
    username = os.environ.get("TWITTER_USERNAME") or "KishoriCharles"
    # NOTE(review): this fallback looks like a redaction placeholder rather
    # than a real address -- supply TWITTER_EMAIL if the e-mail prompt appears.
    email = os.environ.get("TWITTER_EMAIL") or "[email protected]"
    password = os.environ.get("TWITTER_PASSWORD") or "Abhinav@2004"

    options = webdriver.FirefoxOptions()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get("https://x.com/i/flow/login")

        # Wait for the username field to be present before typing into it.
        username_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "text"))
        )
        username_input.click()
        username_input.send_keys(username)
        username_input.send_keys(Keys.RETURN)
        print("Username entered")

        try:
            # The login flow sometimes asks for the e-mail address as well.
            email_input = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.NAME, "text"))
            )
            email_input.send_keys(email)
            email_input.send_keys(Keys.RETURN)
            print("Email entered")
        except Exception as e:
            # Best effort: the prompt is optional, so carry on to the
            # password step, but leave a trace instead of failing silently.
            print(f"Email step skipped: {e}")

        password_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "password"))
        )
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)
        print("Password entered")

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        try:
            trending_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.CSS_SELECTOR,
                        "div[aria-label='Timeline: Trending now'] span[dir='ltr'], div[aria-label='Timeline: Trending now'] div[dir='ltr']",
                    )
                )
            )

            # Keep the first occurrence of each topic, in page order.
            trending_topics = []
            for element in trending_elements:
                try:
                    topic_name = element.text
                except Exception as e:
                    print(f"Error extracting topic: {e}")
                    continue
                if _is_topic_name(topic_name) and topic_name not in trending_topics:
                    trending_topics.append(topic_name)
            print("Filtered Trending Topics:", trending_topics)

            data = {
                "trending_topics": trending_topics,
                "timestamp": datetime.now(),
                "unique_id": str(uuid.uuid4()),
            }
            print(data)

            try:
                # NOTE(review): this writes to `db.collections`, not to the
                # module-level `collection` (`db.collection`).  Left unchanged
                # because other code may read from this target -- confirm.
                db.collections.insert_one(data)
            except Exception as e:
                print(f"Error saving data to database: {e}")
                return _result(
                    "error",
                    f"Error saving data to database: {str(e)}",
                    {"heres the data tho": trending_topics},
                )

            return _result(
                "success",
                "Login and trending topics fetched successfully and saved to database",
                {"trending_topics": trending_topics},
            )
        except Exception as e:
            print(f"Error fetching trending topics: {e}")
            return _result("error", f"Error fetching trending topics: {str(e)}")
    except Exception as e:
        print(f"Error: {e}")
        return _result("error", str(e))
    finally:
        # NOTE(review): pause kept from the original; its purpose is not
        # evident from this file.
        time.sleep(3)
        # Always shut the browser down, whichever path returned above.
        driver.quit()