This repository has been archived by the owner on Dec 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
pokeScrap.0.2.py
232 lines (215 loc) · 8.07 KB
/
pokeScrap.0.2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 15 10:44:33 2021
@author: mat
"""
import requests, re, sys, getopt
import os.path
from bs4 import BeautifulSoup
from datetime import datetime
#-##############################-#
# ---------- ✖︎ TODO ✔︎ -----------#
# ✖︎ - Finish the PyDoc #
# ✖︎ - Make a GUI #
# ✖︎ - Manage Exceptions #
# ✖︎ - Do the Git Doc #
# ✖︎ - Add tools to track $ #
#-##############################-#
"""
PokeScraper is a scraping project with the objective to facilitate the use of CardMarket
for Pokemon when tracking prices of single cards.
Argument is either a link to a cardmarket page of a pokemon single card, or a file containing a bunch of https adresses.
Output currently is a cvs format in the terminal, but tends to be inside a file.
I will make it for a terminal use but will make a GUI for other users when I have the time.
Usage : python pokeScrap.O.1.py [link to cardmarket page of card] or [file containing links]
"""
class bcolors:
    """ANSI escape sequences for coloured/styled terminal output."""
    # Foreground colours
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Reset: restores the terminal's default rendering
    ENDC = '\033[0m'
"""
PokeScraper() is the main class
:param url: the url of the card to scrape
"""
def helpMe():
    """Print the usage line plus a legend for the CSV columns
    (CardMarket's condition codes and language codes)."""
    legend = (
        "-- Pokemon CardMarket Scraper --",
        "usage: pokeScrap.0.2.py -i <input file or link> -o <outputfile> -s <statFile(optional)>",
        "Precisions about the results :",
        " _____________________",
        "| minCondition |",
        "|_____________________|",
        "| None = Poor |",
        "| 6 = Played |",
        "| 5 = Light Played |",
        "| 4 = Good |",
        "| 3 = Excellent |",
        "| 2 = Near Mint |",
        "| 1 = Mint |",
        "|_____________________|",
        "| language |",
        "|_____________________|",
        "| None = None |",
        "| 1 = English |",
        "| 2 = French |",
        "| 3 = German |",
        "| 4 = Spanish |",
        "| 5 = Italian |",
        "| 6 = S-Chinese |",
        "| 7 = Japanese |",
        "| 8 = Portuguese |",
        "| 9 = Russian |",
        "| 10 = Korean |",
        "| 11 = T-Chinese |",
        "| 12 = Dutch |",
        "| 13 = Polish |",
        "| 14 = Czech |",
        "| 15 = Hungarian |",
        "|_____________________|",
    )
    # A single write; '\n'.join plus print's trailing newline
    # reproduces the original sequence of print() calls exactly.
    print("\n".join(legend))
class PokeScraper():
    """Scrape one CardMarket Pokemon single-card page.

    :param url: full CardMarket URL of the card; filter options
        (language, minCondition, ...) may be carried in its query string.
    """
    def __init__(self, url):
        # Only store the target; all network access happens in Main().
        self.url = url

    def paramScrap(self):
        """Extract the URL filter parameters into ``self.paramliste``.

        Requires ``self.params_ref`` (the raw query string, possibly '')
        to be set beforehand — Main() does this. Appends one value per
        tracked parameter, in the CSV column order, using the string
        'None' when the parameter is absent from the URL.
        """
        def singleParamScrap(parameter, paramString):
            """Append the value of one parameter to self.paramliste.

            :param parameter: index of the "key=value" token in
                ``self.params``, or -1 when the URL lacks it.
            :param paramString: parameter name as spelled in the URL
                (kept so the call sites below read naturally).
            """
            if parameter >= 0:
                token = self.params[parameter]
                value = token.split("=")[1]
                # Commas would break the CSV line; swap them for ';'.
                self.paramliste.append(str(value).replace(",", ";"))
            else:
                self.paramliste.append('None')
        # All "key=value" tokens of the query string ([''] when empty).
        self.params = self.params_ref.split("&")
        self.paramliste = []
        # Locate each exported parameter in the token list (-1 = absent).
        language = self.index_containing_substring(self.params, "language")
        sellerType = self.index_containing_substring(self.params, "sellerType")
        minCondition = self.index_containing_substring(self.params, "minCondition")
        isSigned = self.index_containing_substring(self.params, "isSigned")
        isFirstEd = self.index_containing_substring(self.params, "isFirstEd")
        isPlayset = self.index_containing_substring(self.params, "isPlayset")
        isAltered = self.index_containing_substring(self.params, "isAltered")
        # Emit the values in the CSV column order.
        singleParamScrap(language, "language")
        singleParamScrap(sellerType, "sellerType")
        singleParamScrap(minCondition, "minCondition")
        singleParamScrap(isSigned, "isSigned")
        singleParamScrap(isFirstEd, "isFirstEd")
        singleParamScrap(isPlayset, "isPlayset")
        singleParamScrap(isAltered, "isAltered")
        # No return needed: callers read self.paramliste.

    def index_containing_substring(self, the_list, substring):
        """Return the index of the first element of *the_list* that
        contains *substring*, or -1 when no element matches."""
        for i, s in enumerate(the_list):
            if substring in s:
                return i
        return -1

    def Main(self):
        """Download the card page and return one CSV row as a list.

        Returns ``[extension, number, name, min_price, price_trend,
        mean30d_price, <7 filter values>, url]``; prices are normalised
        to a dot decimal separator with the euro sign stripped.

        NOTE(review): the positional URL indices and the hard-coded
        CSS-class/slice selectors below mirror CardMarket's 2021 page
        layout; a layout change will surface as AttributeError or
        IndexError here.
        """
        splitted_URL = self.url.split("/")
        # These locals are unused downstream, but the index accesses
        # double as a sanity check that the URL has the expected depth.
        langage = splitted_URL[3]
        jeu = splitted_URL[4]
        extension_ref = splitted_URL[7]
        name_ref = splitted_URL[8]
        # Separate the card name from an optional query string.
        name_ref_split = name_ref.split("?")
        if len(name_ref_split) == 1:
            self.params_ref = ''
        else:
            self.params_ref = name_ref_split[1]
            name_ref = name_ref_split[0]
        page = requests.get(self.url)
        soup = BeautifulSoup(page.content, "html.parser")
        # Card name sits in the <h1> of the flex-grow-1 header div.
        name_uncut = soup.find_all("div", class_="flex-grow-1")
        name = re.search(r'><h1>(.*)<span', str(name_uncut)).group(1)
        extension_uncut = soup.find_all("a", class_="mb-2")
        extension = re.search(r'">(.*)</a', str(extension_uncut)).group(1)
        number = soup.find_all("dd", class_="d-none d-md-block col-6 col-xl-7")
        number = re.search(r'">(.*)<', str(number)).group(1)
        # The first 5 <dd> entries are not prices; keep the remainder.
        Prices_uncut = soup.find_all("dd", class_="col-6 col-xl-7")
        Prices_uncut = Prices_uncut[5:]
        # Raw-string pattern fixes the invalid '\d' escape sequence of
        # the original (DeprecationWarning, SyntaxWarning on 3.12+).
        allPrices = [re.search(r'>(\d.*€)<', str(item)).group(1)
                     for item in Prices_uncut]
        out = [extension, number, name]
        # Min price, price trend, 30-day mean: "1,23 €" -> "1.23 ".
        for price in allPrices[:3]:
            out.append(price.replace(",", ".").replace("€", ""))
        self.paramScrap()
        self.paramliste.append(self.url)
        return out + self.paramliste
def MultiPokeScrapURL(args):
    """Scrape every card URL listed in a file and write a CSV report.

    :param args: ``[input_path, output_path]`` or
        ``[input_path, output_path, stat_path]``. The input file holds
        one CardMarket URL per line; the output file receives a header,
        one CSV row per card, and a totals row; the optional stat file
        gets one timestamped totals line appended per run.

    Fixes over the original: the input/output/stat files are now closed
    deterministically via ``with`` (they previously leaked), and the
    dead local ``nLinesp1`` is removed.
    """
    with open(args[0], 'r') as fileIn:
        lines = fileIn.readlines()
    nLines = len(lines)
    minPrice = 0.0
    trendPrice = 0.0
    mean30Price = 0.0
    with open(args[1], 'w') as fileOut:
        print("extension,number,name,min_price,price_trend,mean30d_price,language,sellerType,minCondition,isSigned,isFirstEd,isPlayset,isAltered,url", file=fileOut)
        for iterator, line in enumerate(lines, start=1):
            # '\r' keeps the progress counter on a single terminal line.
            print("[{}/{}] scraping links... ".format(iterator, nLines), end="\r", flush=True)
            pkm = PokeScraper(line.strip()).Main()
            print(', '.join(pkm), file=fileOut)
            # Columns 3..5 of a row are min/trend/30-day-mean prices.
            minPrice += float(pkm[3])
            trendPrice += float(pkm[4])
            mean30Price += float(pkm[5])
        print("Total Min Price = {}\nTotal Trend Price = {}\nTotal Mean Price = {}".format(minPrice, trendPrice, mean30Price))
        print("Number of Cards:,{},Total Prices:,{},{},{},,,,,,,,".format(nLines, minPrice, trendPrice, mean30Price), file=fileOut)
    if len(args) == 3:
        # Append a timestamped totals line for long-term price tracking.
        with open(args[2], 'a') as fileStat:
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            print("{}, {}, {}, {}".format(now, minPrice, trendPrice, mean30Price), file=fileStat)
def main(argv):
    """Command-line entry point: parse options, then launch the run.

    Recognised options: ``-h`` (help), ``-i/--ifile`` (required input),
    ``-o/--ofile`` (output, defaults to ./pokeScraperOut.csv),
    ``-s/--stats`` (optional stat file). Exits with status 2 on bad or
    missing arguments.
    """
    # credit : https://www.tutorialspoint.com/python/python_command_line_arguments.htm
    in_path, out_path, stat_path = '', '', ''
    try:
        opts, _ = getopt.getopt(argv, "hi:o:s:", ["ifile=", "ofile=", "stats="])
    except getopt.GetoptError:
        print('usage: pokeScrap.0.2.py -i <input file or link> -o <outputfile> -s <statFile(optional)>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpMe()
            sys.exit()
        if opt in ("-i", "--ifile"):
            in_path = arg
        elif opt in ("-o", "--ofile"):
            out_path = arg
        elif opt in ("-s", "--stats"):
            stat_path = arg
    # Fall back to a default output location when none was given.
    out_path = out_path or './pokeScraperOut.csv'
    if not in_path:
        print('An input is needed !')
        print('usage: pokeScrap.0.2.py -i <input file or link> -o <outputfile> -s <statFile(optional)>')
        sys.exit(2)
    print('Input file is: ', in_path)
    print('Output file is: ', out_path)
    run_args = [in_path, out_path]
    if stat_path:
        run_args.append(stat_path)
    MultiPokeScrapURL(run_args)
if __name__ == "__main__":
main(sys.argv[1:])