Skip to content

Commit ab000eb

Browse files
authored
Merge pull request #172 from SurajSanap/main
[Error Solved] 'executable_path'
2 parents 570639b + c936616 commit ab000eb

File tree

1 file changed

+112
-86
lines changed

1 file changed

+112
-86
lines changed

Web_app/Scarper.py

Lines changed: 112 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,93 +1,119 @@
1-
from selenium.webdriver.common.by import By
2-
from selenium.webdriver.common.keys import Keys
3-
from selenium.webdriver.support.ui import WebDriverWait
4-
from selenium.webdriver.support import expected_conditions as EC
1+
import streamlit as st
2+
from selenium import webdriver
3+
from webdriver_manager.chrome import ChromeDriverManager
4+
from selenium.webdriver.chrome.service import Service
55
import time
66
import csv
77
import re
88
from bs4 import BeautifulSoup
9-
from selenium.webdriver.chrome.options import Options
10-
from selenium import webdriver
9+
import os
10+
from streamlit_lottie import st_lottie
11+
import json
12+
13+
# Load the Lottie movie animation shown at the top of the Streamlit page.
with open('Movie_Animated.json', encoding='utf-8') as anim_file:
    animation_data = json.load(anim_file)
# Positional args follow streamlit-lottie's signature:
# (data, speed, reverse, loop, quality, height, width).
st_lottie(animation_data, 1, True, True, "high", 150, -100)
16+
17+
# Function to scrape IMDb data and append the results to movies.csv.
def scrape_imdb_data():
    """Scrape title/year/rating/description for up to 300 result pages
    from IMDb's advanced search and append them to ``movies.csv``.

    Runs a headless Chrome via webdriver-manager (no hard-coded
    ``executable_path``, which Selenium 4 removed). Returns None; the
    only output is the CSV file on disk.
    """
    # BUGFIX: this commit deleted the top-of-file imports of By /
    # WebDriverWait / expected_conditions, but the code below still uses
    # them, which raised NameError at runtime. Import them locally so the
    # function is self-contained.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')  # Run Chrome in headless mode

    # webdriver-manager downloads a chromedriver matching the installed
    # Chrome and hands its path to Service.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)

    try:
        driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
        driver.set_script_timeout(10000)

        def load_more_results():
            """Click IMDb's 'See more' button; False means no more pages."""
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(2)  # let the newly loaded results render
                return True
            except Exception as e:
                # Deliberate best-effort: any failure (timeout, stale
                # element, missing button) means "stop paginating".
                print(f"Error: {e}")
                return False

        def save_to_csv(movies, filename='movies.csv'):
            """Append rows to the CSV, writing the header only once."""
            file_exists = os.path.isfile(filename)
            keys = movies[0].keys()
            with open(filename, 'a', newline='', encoding='utf-8') as output_file:
                dict_writer = csv.DictWriter(output_file, fieldnames=keys)
                if not file_exists:
                    dict_writer.writeheader()
                dict_writer.writerows(movies)

        all_movies = []
        cnt = 0
        while cnt < 300:  # hard safety cap on the number of pages fetched
            cnt += 1
            if not load_more_results():
                break

            # NOTE(review): this selector targets IMDb's OLD "advanced
            # search" markup (lister-item), while load_more_results()
            # targets the NEW ipc-* UI; the two are unlikely to coexist on
            # one page — verify against the live site.
            movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")

            for element in movie_elements:
                soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')

                # Each field falls back to "NA" when the markup is missing.
                # find() returns None -> AttributeError on .find/.text;
                # find_all()[1] -> IndexError. Narrowed from bare except so
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                try:
                    org_title = soup.find("h3", class_="lister-item-header").find("a").text
                    title = re.sub(r'\d+\.\s*', '', org_title)  # strip "12. " rank prefix
                except (AttributeError, IndexError):
                    title = "NA"

                try:
                    year = soup.find("span", class_="lister-item-year").text
                except (AttributeError, IndexError):
                    year = "NA"

                try:
                    rating = soup.find("div", class_="ratings-bar").find("strong").text
                except (AttributeError, IndexError):
                    rating = "NA"

                try:
                    description = soup.find_all("p", class_="text-muted")[1].text.strip()
                except (AttributeError, IndexError):
                    description = "NA"

                all_movies.append({
                    'title': title,
                    'type': "Tv-Series",
                    'year': year,
                    'rating': rating,
                    'description': description
                })

            # Flush each page's rows immediately so a crash mid-run keeps
            # everything scraped so far.
            if all_movies:
                save_to_csv(all_movies)
                all_movies = []
    finally:
        # BUGFIX: quit() was previously skipped if any exception escaped
        # the loop, leaking the headless Chrome process.
        driver.quit()
98+
99+
# Streamlit App
def main():
    """Streamlit entry point: button to trigger the scrape, then render
    the accumulated movies.csv (or an error if it does not exist yet)."""
    st.title("IMDb Scraper")

    if st.button("Scrape IMDb Data"):
        with st.spinner("Scraping IMDb data..."):
            scrape_imdb_data()
        st.success("Data scraped successfully!")

    # Show the CSV file content
    st.subheader("Scraped IMDb Data:")
    csv_path = 'movies.csv'
    if not os.path.exists(csv_path):
        st.error("CSV file not found.")
    else:
        with open(csv_path, 'r', encoding='utf-8') as fh:
            contents = fh.read()
        st.code(contents, language='csv')


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)