#Phil Tenteromano
#Web Scraping personal project
#12/30/2017
#Using Python 3.6
#Program fetches data from the HTML of the most recent IGN game reviews page
#Relevant data includes title, platform, score, scorePhrase, price, reviewDate (6 fields)
#Stores this data in a fresh database (:memory:)
#Also writes to a locally created csv file 'games.csv'
import requests #used to fetch url and HTTP request
from bs4 import BeautifulSoup #parsing module
import sqlite3 #used to create the database
from reviewClass import gameReview #self-made Class to store data into an object
import csv #used to write to csv file
#link to the website, use browser headers to connect
url = "http://www.ign.com/reviews/games" #target URL
headers = requests.utils.default_headers() #get default request headers, update accordingly
headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
r = requests.get(url, headers=headers) #Request and assign proper header to the request
print("Status Code: ",r.status_code) #Make sure connection code is 200: good connection
if r.status_code != 200: #proper connection is critical to functioning program
exit(2)
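#Alternatively, requests can raise on a bad response instead of the manual check
#(a sketch, not used here):
#r.raise_for_status() #raises requests.exceptions.HTTPError for 4xx/5xx codes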
#parse the website content
soup = BeautifulSoup(r.content, "lxml") #Using 'lxml' parser instead of the default 'html parser'
#styled = soup.prettify() #Can be used to print HTML in styled format
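#bs4 also supports CSS selectors; a roughly equivalent lookup for the review divs
#used further below would be (sketch only):
#reviewDivs = soup.select('div.itemList-item') #select() returns a list of matching Tags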
#connect to the sqlite database
dataB = sqlite3.connect(':memory:') #dynamically create a fresh database
c = dataB.cursor() #cursor used to execute SQL statements against the database
#primary database table for the gameReviews
c.execute("""CREATE TABLE gameReviews (
title text,
platform text,
score real,
price text,
review_date text
)""") #docstring allows for styled table creation
dataB.commit() #commit to database
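#Note: a ':memory:' database vanishes when the connection closes; to keep the
#results, a file-backed database could be swapped in (sketch, filename arbitrary):
#dataB = sqlite3.connect('games.db') #creates games.db on disk if it doesn't exist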
#Database functions for insert, retrieve, and delete game reviews
def insert_game(game_obj):
    score = float(game_obj.score) #insert the score as a float; the attribute stays a string for the CSV below
    with dataB:
        c.execute("""INSERT INTO gameReviews VALUES
                     (:title, :platform, :score, :price, :review_date)""",
                  {'title': game_obj.title, 'platform': game_obj.platform,
                   'score': score, 'price': game_obj.price,
                   'review_date': game_obj.revDate})
#returns the entire tuple set of game Reviews
def get_game_list():
c.execute("SELECT * FROM gameReviews")
return c.fetchall()
#returns the entire tuple when searching by title
def get_game_by_title(title):
c.execute("SELECT * FROM gameReviews WHERE title=:title", \
{'title': title})
return c.fetchall()
#returns the title and platform of the games with that score
def get_game_by_score(scoreSearch):
    scoreSearch = float(scoreSearch) #compare against the REAL score column as a float
c.execute("SELECT title, platform FROM gameReviews WHERE score=:score",
{'score':scoreSearch})
return c.fetchall()
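#A range query is a small extension of the same parameterized pattern
#(sketch, not called below):
#c.execute("SELECT title, platform FROM gameReviews WHERE score >= :min",
#          {'min': 9.0})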
#Delete all reviews matching a title
def remove_game(title):
with dataB:
c.execute("DELETE FROM gameReviews WHERE title=:title", \
{'title': title})
#Begin scraping now that the connection returned status 200
print("Let's see the reviews for the latest games from IGN.com:\n")
#find the primary div tags separating each game review
gameData = soup.find_all("div", {'class': 'clear itemList-item'}) #each review sits in one of these divs
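#an equivalent lookup with bs4's class_ keyword (sketch, assuming the same markup):
#gameData = soup.find_all("div", class_='clear itemList-item')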
filename = "games.csv" #also store the data on a file
f = open(filename, "w", newline='') #open file in write mode
#create top column names, use CSV module
headColumns = ['Title', 'Platform', 'Genre', 'Price', 'Review Date', 'Score'] #one heading per value written below
writer = csv.writer(f) #instantiate 'writer' object
writer.writerow(headColumns) #write the heading sections to the file
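#csv.DictWriter is an alternative that maps column names to values explicitly
#(sketch only, not used below):
#writer = csv.DictWriter(f, fieldnames=headColumns)
#writer.writeheader() #writes the fieldnames as the heading row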
#primary iterating for loop, retrieves, prints, writes, and stores data
for game in gameData: #go through each game review, find data accordingly
    currentGame = gameReview() #fresh object per game so optional fields (like price) don't carry over
    #title, platform, and genre are in their own tags, find and store
    currentGame.g_title(game.h3.a.text.strip()) #strip() removes leading/trailing whitespace
currentGame.g_platform(game.h3.span.text.strip())
currentGame.g_genre(game.find('span',{'class':"item-genre"}).text.strip())
    #Not all games list a price; IGN leaves a '%displayPrice%' placeholder when it's missing
    priceCheck = game.find('span', {'class': 'details'}).text
    if "%displayPrice%" not in priceCheck: #sentinel absent means a real price was listed
        currentGame.g_price(priceCheck) #store the price
    #a lambda is needed to match the exact tag (class == ['grid_3']); the date text
    #carries a stray '\n' at each end, so [1:-1] trims it
    currentGame.g_revDate(game.find(lambda tag: tag.name == 'div'
                                    and tag.get('class') == ['grid_3']).text[1:-1])
numScore = game.find('span',{'class':'scoreBox-score'}).text #store the score
    phraseScore = game.find('span',{'class':'scoreBox-scorePhrase'}).text #store the scorePhrase directly below it
currentGame.g_score(numScore, phraseScore) #combine these values into the g_score attribute
#begin insertion into the database, with complete currentGame object
insert_game(currentGame) #insert every iteration into the database
print(currentGame) #print complete currentGame as they come, used __str__() in class file
#begin writing to the CSV file
    writer.writerow([currentGame.title, currentGame.platform,
                     currentGame.genre, currentGame.price,
                     currentGame.revDate.replace(',', ' '), #strip commas so the date stays one CSV field
                     currentGame.score + ' ' + currentGame.scorePhrase])
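#the stored rows can also be post-processed in Python, e.g. sorting by score
#descending (a sketch; score is column index 2 in the table):
#topGames = sorted(get_game_list(), key=lambda row: row[2], reverse=True)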
#show use of database functions
print("\nFinding all instances of 'Rocket League'!\n")
print(get_game_by_title('Rocket League')) #get game by title
print("\nGetting game List!\n")
print(get_game_list()) #get entire list
print("\nFinding Games with score of '7'!\n")
print(get_game_by_score(9.5),'\n') #get games by score
remove_game('Rocket League') #delete all instances of 'Rocket League'
print("\nDeleting all instances of Rocket League. Searching again...'!\n")
print(get_game_by_title('Rocket League')) #shows up as empty list!
f.close() #close the csv file
dataB.close() #close the database
#done
print("\nEnd")