From 9e0c240675a024a064072b855d36ef8774f96649 Mon Sep 17 00:00:00 2001 From: Ben Moon Date: Thu, 10 Aug 2023 15:05:23 +0100 Subject: [PATCH] Add support for producing data_by_site --- config.json | 1 + src/trackers/classes/crawl.js | 27 +++++++++++++++++++++++++++ test/fixtures/example.com.json | 3 +++ test/process-crawl.test.js | 28 ++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) diff --git a/config.json b/config.json index a6fe373..5c2fb55 100644 --- a/config.json +++ b/config.json @@ -17,6 +17,7 @@ "cname_ignore_subdomains": ["", "www"], "includePages": true, "pageMapLoc": "path to store page maps", + "analyseScripts": [], "topExampleSites": "JSON list of top sites to include as example sites", "includeExampleSites": "number of example sites to include for each resource", "overrideFingerprintWeights": true diff --git a/src/trackers/classes/crawl.js b/src/trackers/classes/crawl.js index a2b265c..978e2df 100644 --- a/src/trackers/classes/crawl.js +++ b/src/trackers/classes/crawl.js @@ -47,6 +47,9 @@ class Crawl { scripts: {tracking: 0, nontracking: 0}, apis: {} } + + // Per-site data + this.dataBySite = {} } exportEntities() { @@ -178,6 +181,25 @@ async function _processSite (crawl, site) { } } + // analyse per-site script fingerprinting + for (const [script, apis] of Object.entries(site.siteData.data.apis.callStats)) { + const scriptMatch = shared.config.analyseScripts.find(r => script.match(r)) + if (scriptMatch === undefined) { + continue + } + if (!crawl.dataBySite[site.domain]) { + crawl.dataBySite[site.domain] = {} + } + if (!crawl.dataBySite[site.domain][site.host]) { + crawl.dataBySite[site.domain][site.host] = {fingerprinting: {}} + } + const fp = crawl.dataBySite[site.domain][site.host].fingerprinting + if (!fp[scriptMatch]) { + fp[scriptMatch] = {apis: []} + } + fp[scriptMatch].apis = [...new Set([...fp[scriptMatch].apis, ...Object.keys(apis)])].sort() + } + for (const apis of Object.values(site.siteData.data.apis.callStats)) { const apisUsed = Object.keys(apis) @@ -263,6 +285,10 @@ function _getDomainSummaries (crawl) { return domainSummary } +function _getDataBySite (crawl) { + return crawl.dataBySite +} + function _getEntitySummaries (crawl) { delete crawl.entityPrevalence.undefined @@ -281,6 +307,7 @@ function _getEntitySummaries (crawl) { function _writeSummaries (crawl) { fs.writeFileSync(`${shared.config.trackerDataLoc}/build-data/generated/domain_summary.json`, JSON.stringify(_getDomainSummaries(crawl), null, 4)) + fs.writeFileSync(`${shared.config.trackerDataLoc}/build-data/generated/data_by_site.json`, JSON.stringify(_getDataBySite(crawl), null, 4)) fs.writeFileSync(`${shared.config.trackerDataLoc}/crawlStats.json`, JSON.stringify(crawl.stats, null, 4)) diff --git a/test/fixtures/example.com.json b/test/fixtures/example.com.json index 7ebda20..5251a1d 100644 --- a/test/fixtures/example.com.json +++ b/test/fixtures/example.com.json @@ -18,6 +18,9 @@ "Date.prototype.getTime": 14, "Navigator.prototype.userAgent": 2, "Document.cookie setter": 1 + }, + "http://www.google-analytics.com/analytics.js": { + "Document.cookie getter": 1 } }, "savedCalls": [ diff --git a/test/process-crawl.test.js b/test/process-crawl.test.js index 9db188a..9e8b810 100644 --- a/test/process-crawl.test.js +++ b/test/process-crawl.test.js @@ -25,6 +25,9 @@ describe('Process Crawl', () => { before(async () => { // Mock owner of 3rd party domains sharedData.entityMap.set('google-analytics.com', 'Google LLC') + sharedData.analyseScripts = [ + 'google-analytics\\.com/analytics\\.js', + ] site = new Site(mockSiteData) for (const request of mockSiteData.data.requests) { await site.processRequest(request) @@ -141,6 +144,31 @@ describe('Process Crawl', () => { }) }) + it('extracts data by site', () => { + assertObjectPartial(crawl.dataBySite, { + "example.com": { + "test.example.com": { + "fingerprinting": { + "google-analytics\\.com/analytics\\.js": { + "apis": [ + "Date.prototype.getTime", + "Document.cookie getter", + "Document.cookie setter", + "Navigator.prototype.javaEnabled", + "Navigator.prototype.language", + "Navigator.prototype.plugins", + "Navigator.prototype.userAgent", + "Screen.prototype.colorDepth", + "Screen.prototype.height", + "Screen.prototype.width", + ], + }, + }, + }, + }, + }) + }) + it('extracts domain cloaks', () => { assert.deepStrictEqual(crawl.domainCloaks, {}) })