Skip to content

Commit

Permalink
Updated the second performance crawl list and fixed code bugs (issue #9)
Browse files Browse the repository at this point in the history
  • Loading branch information
atlasharry committed Oct 25, 2024
1 parent 94a8ce3 commit 26900e7
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ def select_sites(country, num_sites):
# Build the final DataFrame, dropping the helper column used only for
# de-duplication during selection.
final_sites_df = pd.DataFrame(selected_sites).drop(columns=['cleaned_url'])

# Output the final list for the second performance crawl.
# NOTE(review): the diff showed both the crawl-1 and crawl-2 filenames;
# the committed (new) path is the crawl_2 one kept here.
output_path = 'random_50_sites_by_countries_crawl_2.csv'
final_sites_df[['Site URL']].to_csv(output_path, index=False)
print(f"Selected sites saved to {output_path}")
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
import random
import re
import os
import pandas as pd
from tqdm import tqdm

def clean_url(url):
"""
Expand All @@ -19,28 +20,29 @@ def clean_url(url):
def select_sites(folder, num_sites_per_file):
    """Randomly select ``num_sites_per_file`` unique sites from every CSV in ``folder``.

    Each CSV must have a 'Site URL' column; a 'cleaned_url' column is added
    via the module-level ``clean_url`` helper. A candidate row is accepted
    only if its cleaned URL has not appeared in the global ``current_list``
    (sites used in the first crawl) nor in anything selected so far, so the
    result is duplicate-free across files and across crawls.

    Returns a list of pandas Series rows (each still carrying 'cleaned_url').
    """
    selected_sites = []
    files = [f for f in os.listdir(folder) if f.endswith('.csv')]

    # Hoist the already-crawled URLs into a set once: O(1) membership tests
    # instead of scanning the DataFrame column for every sampled candidate.
    # ``seen`` also tracks everything accepted in this run.
    seen = set(current_list['cleaned_url'].values)

    for file in tqdm(files, desc=folder):
        df = pd.read_csv(os.path.join(folder, file))
        df['cleaned_url'] = df['Site URL'].apply(clean_url)

        # Sample per file so every CSV contributes exactly
        # num_sites_per_file sites (the pre-commit loop counted across
        # files and could draw unevenly from a single file).
        selected_sites_per_file = []
        while len(selected_sites_per_file) < num_sites_per_file:
            potential_site = df.sample(n=1).iloc[0]
            cleaned_url = potential_site['cleaned_url']
            if cleaned_url not in seen:
                seen.add(cleaned_url)
                selected_sites_per_file.append(potential_site)
        selected_sites.extend(selected_sites_per_file)
    return selected_sites

# Select sites from each folder. Crawl 2 rebalances the mix: 2 per
# fingerprinting file and 8 per location file (presumably 5 CSVs per
# folder, yielding 10 fingerprinting + 40 location sites — TODO confirm
# folder contents match the README's 10/40 split).
fingerprint_sites = select_sites(fingerprint_folder, 2)
location_sites = select_sites(location_folder, 8)

selected_sites = fingerprint_sites + location_sites
# Drop the helper column used only for de-duplication during selection.
final_sites_df = pd.DataFrame(selected_sites).drop(columns=['cleaned_url'])

# Output the final list for the second performance crawl.
output_path = 'random_50_sites_by_tracking_tech_crawl_2.csv'
final_sites_df[['Site URL']].to_csv(output_path, index=False)

print(f"Selected sites saved to {output_path}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
Site URL
https://ndis.gov.au
https://engineersaustralia.org.au
https://avivo.org.au
https://danmurphys.com.au
https://unisa.edu.au
https://marisa.com.br
https://zoom.com.br
https://jovemnerd.com.br
https://smiles.com.br
https://oab.org.br
https://primus.ca
https://ipolitics.ca
https://amazon.ca
https://yahoo.ca
https://jvns.ca
https://1und1.de
https://ozerov.de
https://ruhr24.de
https://deutsche-bank.de
https://united-domains.de
https://updeled.gov.in
https://guvi.in
https://delhi.gov.in
https://mahadiscom.in
https://shriramfinance.in
https://iamport.kr
https://icoop.or.kr
https://boardlife.co.kr
https://airbnb.co.kr
https://helpstart.co.kr
https://homage.sg
https://ofs.edu.sg
https://roots.gov.sg
https://fortytwo.sg
https://nyp.edu.sg
https://carmag.co.za
https://webtickets.co.za
https://okfurniture.co.za
https://isoft.co.za
https://www.budgetinsurance.co.za
https://zalando.es
https://rae.es
https://umh.es
https://ebay.es
https://castillalamancha.es
http://amplitude.com
http://tripadvisor.com
http://litnet.com
http://w3schools.com
http://latimes.com
https://nasm.org
https://beddinginn.com
https://knfilters.com
https://bobevans.com
https://officefurnitureitaly.com
https://borderlinx.com
https://curvage.org
https://truthfinder.com
https://221616.com
https://openet.com
https://opendns.com
https://nypl.org
https://cvent.com
https://betterment.com
https://canvaslms.com
https://billtrust.com
https://catholic.com
https://on24.com
https://theayurvedaexperience.com
https://trademarkia.com
https://cleverism.com
https://circleci.com
https://bestself.co
https://knapheide.com
https://eu.travelpro.com
https://buckmason.com
https://thediyoutlet.com
https://ninelineapparel.com
https://omnicell.com
https://trainz.com
https://newswire.com
https://tiffin.edu
https://cityu.edu
https://net10.com
https://cnet.com
https://thehill.com
https://media.chase.com
https://carfax.com
https://tasteofhome.com
https://britannica.com
https://nickiswift.com
https://nbcsports.com
https://bristol.ac.uk
https://bodymeasurements.org
https://badboymowers.com
https://caribjournal.com
https://transloc.com
https://sites.toro.com
https://bris.ac.uk
https://byucougars.com

10 changes: 9 additions & 1 deletion selenium-crawler/test-lists-preparation/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Crawler Performance Test List Preparation

We are testing our crawler using a list of 100 websites called `100_combined_test_list.csv`. The folder `test-lists-preparation` contains scripts and data we use to make this list. This README described how we build up this list and the information about this list.
We are testing our crawler using a list of 100 websites called `100_sites_test_list_crawl_1.csv`. The folder `test-lists-preparation` contains the scripts and data we use to make this list. This README describes how we build this list and provides information about it.

Our 100-site crawler performance test list consists of 2 parts:

Expand All @@ -16,6 +16,14 @@ Our 100-site crawler performance test list is consisted of 2 parts:

From our experience, the `Monetization` category and `Tracking Pixel` are the most common tracking technologies used by websites. Therefore, we did not deliberately collect websites that use those technologies. Instead, to make sure we have good coverage of websites, our focus is mainly on collecting websites that may track users' `location` (including IP Address, City, Fine Location, Coarse Location, Region, and ZIP Code) and use `Browser Fingerprinting`. In the end, the 50 websites by tracking technologies consist of 30 websites that use `location tracking` technologies and 20 websites that use `browser fingerprinting` technologies. The script to build this second half of the list `random_50_sites_by_tracking_tech.csv` is `02_create_50_random_sites_by_technologies.py`.

### Updated on Sept 24, 2024:

We have performed a second performance crawl. For the second crawl, we chose 50 random websites comprising 5 sites from each of the 10 countries' top-525 lists, and 50 random websites with a high probability of using tracking technologies.

In the end, the 50 websites by tracking technologies consist of 40 websites that use `location tracking` technologies and 10 websites that use `browser fingerprinting` technologies.

The final 100 sites list for the second crawl is saved in the `100_sites_test_list_crawl_2.csv`.

## Location

To collect sites that may track the users' location, we have randomly selected 5 location collection widgets from this list:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,3 +323,103 @@ https://310nutrition.com/
https://www.izzyandliv.com/
https://www.sitnsleep.com/
https://www.jackrogersusa.com/
https://whitefoxboutique.com.au
https://canberra.edu.au
https://bodyandsoul.com.au
https://jcu.edu.au
https://4wdsupacentre.com.au
https://elo7.com.br
https://zattini.com.br
https://ufpr.br/
https://radios.com.br
https://mundi.com.br
https://indigo.ca
https://newspark.ca
https://sportsexperts.ca
https://ulaval.ca
https://nfb.ca
https://sma.de
https://nintendo.de
https://boell.de
https://bertelsmann-stiftung.de
https://reservix.de
https://esic.gov.in
https://logix.in
https://sony.co.in
https://trainman.in
https://airtelxstream.in
https://yonsei.ac.kr
https://cafe24.co.kr
https://etri.re.kr
https://tf.co.kr
https://pknu.ac.kr
https://sla.gov.sg
https://m1.com.sg
https://callhome.sg
https://justit.com.sg
https://8days.sg
https://cellc.co.za
https://skywire.co.za
https://pmg.org.za
https://webuycars.co.za
https://maroelamedia.co.za
https://unav.es
https://selwomarina.es
https://vermiip.es
https://www.repsol.es/particulares/
https://lowi.es
http://gofundme.com
http://checkpoint.com
http://dribbble.com
http://chartboost.com
http://n-able.com
https://keepersecurity.com
https://comptia.org
https://subscribe.nationalenquirer.com
https://cybrary.it
https://becker.com
https://tokopedia.com
https://sandbox.turnitin.com
https://olukai.com
https://kaptest.com
https://nautica.com
https://bamboohr.com
https://taosfootwear.com
https://com-date.org
https://checkbook.org
https://thecbdistillery.com
https://cupshe.com
https://californiapsychics.com
https://chubb.com
https://tomtop.com
https://bodybuilding.com
https://calendly.com
https://commercialtrucktrader.com
https://carbonite.com
https://newsobserver.com
https://clearcompany.com
https://tulsaworld.com
https://ontrac.com
https://ballotpedia.org
https://carsdirect.com
https://chegg.com
https://code.org
https://buffalonews.com
https://charitynavigator.org
https://bigthink.com
https://nasdaq.com
https://cloudinary.com
https://nielsen.com
https://tucson.com
https://techrepublic.com
https://newsweek.com
https://travelers.com
https://kiwireport.com
https://okta.com
https://social.cubcadet.com
https://cloudflare.com
https://nwitimes.com
https://touchnet.com
https://campuslabs.com
https://brightspace.com
https://teamusa.org
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Site URL
https://ndis.gov.au
https://engineersaustralia.org.au
https://avivo.org.au
https://danmurphys.com.au
https://unisa.edu.au
https://marisa.com.br
https://zoom.com.br
https://jovemnerd.com.br
https://smiles.com.br
https://oab.org.br
https://primus.ca
https://ipolitics.ca
https://amazon.ca
https://yahoo.ca
https://jvns.ca
https://1und1.de
https://ozerov.de
https://ruhr24.de
https://deutsche-bank.de
https://united-domains.de
https://updeled.gov.in
https://guvi.in
https://delhi.gov.in
https://mahadiscom.in
https://shriramfinance.in
https://iamport.kr
https://icoop.or.kr
https://boardlife.co.kr
https://airbnb.co.kr
https://helpstart.co.kr
https://homage.sg
https://ofs.edu.sg
https://roots.gov.sg
https://fortytwo.sg
https://nyp.edu.sg
https://carmag.co.za
https://webtickets.co.za
https://okfurniture.co.za
https://isoft.co.za
https://www.budgetinsurance.co.za
https://zalando.es
https://rae.es
https://umh.es
https://ebay.es
https://castillalamancha.es
http://amplitude.com
http://tripadvisor.com
http://litnet.com
http://w3schools.com
http://latimes.com

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Site URL
https://nasm.org
https://beddinginn.com
https://knfilters.com
https://bobevans.com
https://officefurnitureitaly.com
https://borderlinx.com
https://curvage.org
https://truthfinder.com
https://221616.com
https://openet.com
https://opendns.com
https://nypl.org
https://cvent.com
https://betterment.com
https://canvaslms.com
https://billtrust.com
https://catholic.com
https://on24.com
https://theayurvedaexperience.com
https://trademarkia.com
https://cleverism.com
https://circleci.com
https://bestself.co
https://knapheide.com
https://eu.travelpro.com
https://buckmason.com
https://thediyoutlet.com
https://ninelineapparel.com
https://omnicell.com
https://trainz.com
https://newswire.com
https://tiffin.edu
https://cityu.edu
https://net10.com
https://cnet.com
https://thehill.com
https://media.chase.com
https://carfax.com
https://tasteofhome.com
https://britannica.com
https://nickiswift.com
https://nbcsports.com
https://bristol.ac.uk
https://bodymeasurements.org
https://badboymowers.com
https://caribjournal.com
https://transloc.com
https://sites.toro.com
https://bris.ac.uk
https://byucougars.com

0 comments on commit 26900e7

Please sign in to comment.