-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathimdbData.py
84 lines (73 loc) · 2.7 KB
/
imdbData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import lxml
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
# Example IMDb title-search URL: the query string asks for count=100 results
# and restricts title_type to feature films and TV series.
url1 = "https://www.imdb.com/search/title?count=100&title_type=feature,tv_series&ref_=nv_wl_img_2"
class IMDB(object):
    """Scrape a search-results page into parallel per-field lists.

    The page at ``url`` is downloaded once in the constructor and parsed
    with BeautifulSoup (``lxml`` parser); the other methods only read the
    parsed tree stored in ``self.soup``.
    """

    def __init__(self, url):
        """Download ``url`` and parse the response body into ``self.soup``."""
        super(IMDB, self).__init__()
        page = get(url)
        self.soup = BeautifulSoup(page.content, 'lxml')

    def articleTitle(self):
        """Return the text of the ``<h1 class="header">`` without newlines."""
        return self.soup.find("h1", class_="header").text.replace("\n", "")

    def bodyContent(self):
        """Return every ``div.lister-item.mode-advanced`` inside ``#main``."""
        content = self.soup.find(id="main")
        return content.find_all("div", class_="lister-item mode-advanced")

    @staticmethod
    def _text_or_nan(node):
        """Return ``node.text``, or ``np.nan`` when the element is missing."""
        return np.nan if node is None else node.text

    @staticmethod
    def _split_genres(text):
        """Split a comma-separated genre string into trimmed genre names."""
        return [genre.strip() for genre in text.replace("\n", "").split(",")]

    @staticmethod
    def _parse_cast(text):
        """Split the cast line into ``(director, stars)``.

        ``text`` is the raw text of the cast paragraph: ``|``-separated
        sections, each optionally prefixed by a label such as
        ``Director:``, ``Directors:``, ``Star:`` or ``Stars:``.

        Returns:
            A tuple ``(director, stars)``. ``director`` is the text after
            the director label (a single string even when several names
            are listed) or ``np.nan`` when there is no director section.
            ``stars`` is a list of trimmed names, empty when there is no
            stars section.
        """
        director = np.nan
        stars = []
        for section in text.replace("\n", "").split("|"):
            section = section.strip()
            if not section:
                continue
            label, colon, names = section.partition(":")
            label = label.strip().lower()
            if colon and label in ("director", "directors"):
                director = names.strip()
                continue
            # Anything that is not a director section is treated as the
            # star list; drop the label only when it really is one.
            if colon and label in ("star", "stars"):
                section = names
            stars = [name.strip() for name in section.split(",")
                     if name.strip()]
        return director, stars

    def _parse_movie(self, movie):
        """Extract the eleven fields of one result item, in output order.

        Optional elements (runtime, genre, rating, metascore, cast, votes,
        gross) yield ``np.nan`` (or an empty star list) when absent.
        """
        header = movie.find("h3", class_="lister-item-header")
        title = header.find("a").text
        # The last <span> of the header holds the date in parentheses.
        date = re.sub(r"[()]", "", header.find_all("span")[-1].text)

        runtime_node = movie.find("span", class_="runtime")
        # Drop the last four characters -- presumably a " min" suffix.
        runtime = np.nan if runtime_node is None else runtime_node.text[:-4]

        genre_node = movie.find("span", class_="genre")
        if genre_node is None:
            genres = np.nan
        else:
            genres = self._split_genres(genre_node.text)

        rating = self._text_or_nan(movie.find("strong"))

        # Match on the "metascore" class alone: a multi-class string such
        # as "metascore unfavorable" only matches that exact class value
        # and would skip scores carrying any other modifier class.
        score_node = movie.find("span", class_="metascore")
        score = np.nan if score_node is None else score_node.text.rstrip()

        description = movie.find_all("p", class_="text-muted")[-1].text.lstrip()

        # class_="" selects the <p> that has no class attribute.
        cast_node = movie.find("p", class_="")
        if cast_node is None:
            director, stars = np.nan, []
        else:
            director, stars = self._parse_cast(cast_node.text)

        numbers = [span.text
                   for span in movie.find_all("span", attrs={"name": "nv"})]
        if len(numbers) == 2:
            votes, gross = numbers
        elif len(numbers) == 1:
            votes, gross = numbers[0], np.nan
        else:
            votes = gross = np.nan

        return (title, date, runtime, genres, rating, score, description,
                director, stars, votes, gross)

    def movieData(self):
        """Return the scraped fields as eleven parallel lists.

        The outer list is ordered: title, date, runtime, genre, rating,
        metascore, description, director, stars, votes, gross. Index ``i``
        of every inner list describes the ``i``-th item returned by
        ``bodyContent()``.
        """
        columns = [[] for _ in range(11)]
        for movie in self.bodyContent():
            for column, value in zip(columns, self._parse_movie(movie)):
                column.append(value)
        return columns