-
Notifications
You must be signed in to change notification settings - Fork 5
/
get_products.py
171 lines (154 loc) · 5.1 KB
/
get_products.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from lxml import etree
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from pymongo import MongoClient
from datetime import datetime
import json
import os
from functions.getProxy import *
from functions.getUserAgent import *
# proxy = getProxy()  # proxy support is currently switched off
domain = 'shein.com'  # Only links whose host contains this value are kept
debug = False  # Set to True to limit to 1 page
db_mode = True  # True = MongoDB, False = JSON

# Load the category URLs to crawl, one per line. Lines are stripped and blank
# ones are dropped here: an empty URL would otherwise reach driver.get() and
# abort the whole run.
with open('shein_categories.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]
# MongoDB is only contacted when results are stored there. The host is taken
# from the MONGO_HOST environment variable and falls back to localhost.
if db_mode:
    mongo_host = os.environ.get('MONGO_HOST', 'localhost')
    client = MongoClient(f'mongodb://{mongo_host}:27017/')
    db = client['shein']
    collection = db['product_urls']
    # A unique index on 'url' makes the database itself reject duplicates.
    try:
        collection.create_index('url', unique=True)
    except Exception as e:
        # Best effort: keep running, but report the real cause. Re-creating
        # an identical index does not raise, so ending up here means
        # something else failed (for example the server is unreachable or
        # the collection already holds duplicate URLs).
        print('Error creating index: ' + str(e))
# Substrings that disqualify a link from being stored as a product URL.
# Matching is a plain, case-sensitive substring test (see included_in_string),
# which is why several entries are listed in two capitalisations.
# NOTE(review): short entries such as 'sale', 'track' or 'prime' also match
# inside longer words in a product URL - confirm that this is intended.
blacklistedWords = [
    # Non-HTTP link schemes
    'javascript:',
    'mailto:',
    'tel:',
    # Social media
    'facebook.com',
    'twitter.com',
    'instagram.com',
    'youtube.com',
    'pinterest.com',
    'tiktok.com',
    # Legal / informational pages
    'Copyright',
    'copyright',
    'Privacy',
    'privacy',
    'Terms',
    'terms',
    'Imprint',
    'imprint',
    # Marketing and service pages
    'bonus',
    'campaign',
    'campaigns',
    'sale',
    'refund',
    'track',
    'How-to',
    'how-to',
    # Category landing pages and other non-product sections
    'shein.com/women',
    'shein.com/other',
    'shein.com/Return-Policy',
    'shein.com/men',
    'shein.com/plussize',
    'shein.com/curve-plus-size',
    'promotion',
    'shein.com/home',
    'shein.com/cart',
    'contact',
    'About',
    'SUPPLY-CHAIN-TRANSPARENCY',
    'prime',
    'shein.com/kids',
    'shein.com/beauty',
    'shein.com/flashsale',
    'Shipping-Info',
    'coupon-a',
    '/user/auth/login',
    'daily-new',
    'New-in-Trends',
    'shein.com/style',
    'shein.com/member-image-list',
]
def included_in_string(string, word_list):
    """Tell whether at least one entry of word_list occurs inside string.

    The check is a case-sensitive substring test (``word in string``).

    Args:
        string: The text to search in.
        word_list: Iterable of candidate substrings.

    Returns:
        True as soon as one entry is found in string, otherwise False
        (including when word_list is empty).
    """
    return any(candidate in string for candidate in word_list)
# Optional proxy configuration (currently disabled):
# prox_options = {
#     'proxy': {
#         'http': proxy
#     }
# }

# Chrome runs headless, in incognito mode, with the user agent string
# returned by GET_UA().
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--user-agent=' + GET_UA())
options.add_argument('--incognito')

# Locations of the Chrome and chromedriver executables. Both can be overridden
# through environment variables (same approach as MONGO_HOST); the defaults
# are the macOS / Homebrew paths this script was originally written for.
options.binary_location = os.environ.get(
    'CHROME_BINARY',
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
)
chrome_driver_binary = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/opt/homebrew/bin/chromedriver',
)
driver = webdriver.Chrome(service=Service(chrome_driver_binary), options=options)
def _page_url(base_url, page_number):
    """Return base_url with a ``page=<page_number>`` query parameter appended.

    Uses '&' instead of '?' when base_url already carries a query string, so
    the existing parameters are not corrupted.
    """
    separator = '&' if '?' in base_url else '?'
    return base_url + separator + 'page=' + str(page_number)


# Crawl every category URL, walk through its result pages and store each
# product link either in MongoDB (db_mode) or in product_urls.json.

# Collected across ALL categories so the JSON output is not limited to the
# last category processed.
product_urls = []
# Cleaned URLs already handled in this run; set membership is O(1).
seen_urls = set()

try:
    for url in urls:
        url = url.strip()
        if not url:
            continue  # Blank line in the input file
        print('Processing ' + url)
        try:
            driver.get(url)
        except Exception as e:
            # One unreachable category must not abort the remaining ones.
            print('Error loading category: ' + str(e))
            continue

        # The pagination label contains the page count; keep only its digits.
        try:
            pagination_text = driver.find_element(By.CLASS_NAME, 'sui-pagination__total').text
            pagination_number = re.sub(r"\D", "", pagination_text)
            max_pages = int(pagination_number)
            if debug:
                max_pages = min(1, max_pages)  # Limit to 1 page in debug mode
            print(f'Found {max_pages} pages')
        except Exception as e:
            print('Error getting pagination: ' + str(e))
            max_pages = 1  # If no pagination, assume 1 page of product

        for i in range(1, max_pages + 1):
            try:
                print(f'Processing page {i} of {max_pages}')  # Progress update
                driver.get(_page_url(url, i))
                product_elements = driver.find_elements(By.CLASS_NAME, 'product-list__item')
                for product in product_elements:
                    # find_elements returns an empty list instead of raising,
                    # so a tile without a link only skips that tile rather
                    # than the rest of the page.
                    anchors = product.find_elements(By.TAG_NAME, 'a')
                    if not anchors:
                        continue
                    href = anchors[0].get_attribute('href')
                    if not href or included_in_string(href, blacklistedWords):
                        continue

                    # Drop query string and fragment so the same product is
                    # always stored under one URL.
                    parsed_url = urlparse(href)
                    cleaned_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                    if domain not in parsed_url.netloc or cleaned_url in seen_urls:
                        continue

                    try:
                        if db_mode:
                            if collection.find_one({'url': cleaned_url}):
                                print('URL already exists in MongoDB')
                            else:
                                print('Adding ' + cleaned_url + ' to MongoDB')
                                # NOTE(review): datetime.now() is naive local
                                # time; PyMongo treats naive datetimes as UTC
                                # - confirm this is the intended timestamp.
                                collection.insert_one({'url': cleaned_url, 'status': 'pending', 'timestamp': datetime.now()})
                        else:
                            product_urls.append(cleaned_url)
                        # Only mark as seen after success so a failed insert
                        # can be retried if the link shows up again.
                        seen_urls.add(cleaned_url)
                    except Exception as e:
                        print('Error adding URL to MongoDB: ' + str(e))
            except Exception as e:
                print('Error processing page: ' + str(e))
                continue

    if not db_mode:
        print('Writing to JSON file')
        with open('product_urls.json', 'w') as outfile:
            json.dump(product_urls, outfile)
finally:
    # Always release the browser and the database connection, even when the
    # crawl is interrupted by an unexpected error.
    driver.quit()
    if db_mode:
        client.close()