import csv
import json
import os
import re
import time

import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from streamlit_lottie import st_lottie
from webdriver_manager.chrome import ChromeDriverManager
# Load the intro Lottie animation from disk and render it at the top of the page.
with open('Movie_Animated.json', encoding='utf-8') as animation_file:
    lottie_animation = json.load(animation_file)
st_lottie(lottie_animation, 1, True, True, "high", 150, -100)
# Function to scrape IMDb data
def scrape_imdb_data():
    """Scrape title/year/rating/description for IMDb titles into ``movies.csv``.

    Drives a headless Chrome instance, repeatedly clicks IMDb's "see more"
    button (at most 300 times), parses each batch of result cards with
    BeautifulSoup, and appends the extracted rows to ``movies.csv`` after
    every batch (header written only when the file is first created).

    Returns:
        None. Output is the side effect of appending rows to ``movies.csv``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')  # Run Chrome in headless mode

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)

    try:
        driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
        driver.set_script_timeout(10000)

        def load_more_results():
            # Scroll IMDb's "see more" button into view and click it via JS.
            # Returns False once the button can no longer be found/clicked,
            # which is the loop's normal termination signal.
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(2)  # give the next batch of results time to render
                return True
            except Exception as e:
                print(f"Error: {e}")
                return False

        def save_to_csv(movies, filename='movies.csv'):
            # Append rows to the CSV; write the header only when creating the file.
            file_exists = os.path.isfile(filename)
            keys = movies[0].keys()
            with open(filename, 'a', newline='', encoding='utf-8') as output_file:
                dict_writer = csv.DictWriter(output_file, fieldnames=keys)
                if not file_exists:
                    dict_writer.writeheader()
                dict_writer.writerows(movies)

        all_movies = []
        cnt = 0
        while cnt < 300:
            cnt += 1
            if not load_more_results():
                break

            # NOTE(review): this selector targets IMDb's legacy "lister-item"
            # layout, while the "see more" button above targets the new
            # layout — confirm it actually matches on the live page.
            movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")

            for element in movie_elements:
                soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')

                # Each field falls back to "NA" when its node is missing;
                # a missed selector surfaces as AttributeError (None.find/.text)
                # or IndexError (find_all slice), so catch only those.
                try:
                    org_title = soup.find("h3", class_="lister-item-header").find("a").text
                    title = re.sub(r'\d+\.\s*', '', org_title)  # strip "12. " rank prefix
                except (AttributeError, IndexError):
                    title = "NA"

                try:
                    year = soup.find("span", class_="lister-item-year").text
                except (AttributeError, IndexError):
                    year = "NA"

                try:
                    rating = soup.find("div", class_="ratings-bar").find("strong").text
                except (AttributeError, IndexError):
                    rating = "NA"

                try:
                    description = soup.find_all("p", class_="text-muted")[1].text.strip()
                except (AttributeError, IndexError):
                    description = "NA"

                all_movies.append({
                    'title': title,
                    'type': "Tv-Series",
                    'year': year,
                    'rating': rating,
                    'description': description
                })

            # Persist each batch immediately so a crash mid-run loses at most
            # one batch, then reset the accumulator.
            if all_movies:
                save_to_csv(all_movies)
                all_movies = []
    finally:
        # Always release the browser, even if scraping raised — otherwise the
        # headless Chrome process leaks.
        driver.quit()
# Streamlit App
def main():
    """Render the Streamlit UI: a scrape trigger plus the current CSV contents."""
    st.title("IMDb Scraper")

    if st.button("Scrape IMDb Data"):
        with st.spinner("Scraping IMDb data..."):
            scrape_imdb_data()
        st.success("Data scraped successfully!")

    # Show the CSV file content
    st.subheader("Scraped IMDb Data:")
    csv_path = 'movies.csv'
    if not os.path.exists(csv_path):
        st.error("CSV file not found.")
    else:
        with open(csv_path, 'r', encoding='utf-8') as csv_file:
            st.code(csv_file.read(), language='csv')


if __name__ == "__main__":
    main()