Skip to content

Commit

Permalink
Add support for producing data_by_site
Browse files Browse the repository at this point in the history
  • Loading branch information
GuiltyDolphin committed Aug 10, 2023
1 parent c700207 commit 9e0c240
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 0 deletions.
1 change: 1 addition & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"cname_ignore_subdomains": ["", "www"],
"includePages": true,
"pageMapLoc": "path to store page maps",
"analyseScripts": [],
"topExampleSites": "JSON list of top sites to include as example sites",
"includeExampleSites": "number of example sites to include for each resource",
"overrideFingerprintWeights": true
Expand Down
27 changes: 27 additions & 0 deletions src/trackers/classes/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class Crawl {
scripts: {tracking: 0, nontracking: 0},
apis: {}
}

// Per-site data
this.dataBySite = {}
}

exportEntities() {
Expand Down Expand Up @@ -178,6 +181,25 @@ async function _processSite (crawl, site) {
}
}

// analyse per-site script fingerprinting
for (const [script, apis] of Object.entries(site.siteData.data.apis.callStats)) {
const scriptMatch = shared.config.analyseScripts.find(r => script.match(r))
if (scriptMatch === undefined) {
continue
}
if (!crawl.dataBySite[site.domain]) {
crawl.dataBySite[site.domain] = {}
}
if (!crawl.dataBySite[site.domain][site.host]) {
crawl.dataBySite[site.domain][site.host] = {fingerprinting: {}}
}
const fp = crawl.dataBySite[site.domain][site.host].fingerprinting
if (!fp[scriptMatch]) {
fp[scriptMatch] = {apis: []}
}
fp[scriptMatch].apis = [...new Set([...fp[scriptMatch].apis, ...Object.keys(apis)])].sort()
}

for (const apis of Object.values(site.siteData.data.apis.callStats)) {
const apisUsed = Object.keys(apis)

Expand Down Expand Up @@ -263,6 +285,10 @@ function _getDomainSummaries (crawl) {
return domainSummary
}

/**
 * Extract the per-site data accumulated on a crawl.
 *
 * @param {Crawl} crawl - crawl whose per-site data should be returned.
 * @returns {Object} mapping of domain -> host -> collected site data.
 */
function _getDataBySite (crawl) {
    const { dataBySite } = crawl
    return dataBySite
}

function _getEntitySummaries (crawl) {
delete crawl.entityPrevalence.undefined

Expand All @@ -281,6 +307,7 @@ function _getEntitySummaries (crawl) {

function _writeSummaries (crawl) {
fs.writeFileSync(`${shared.config.trackerDataLoc}/build-data/generated/domain_summary.json`, JSON.stringify(_getDomainSummaries(crawl), null, 4))
fs.writeFileSync(`${shared.config.trackerDataLoc}/build-data/generated/data_by_site.json`, JSON.stringify(_getDataBySite(crawl), null, 4))

fs.writeFileSync(`${shared.config.trackerDataLoc}/crawlStats.json`, JSON.stringify(crawl.stats, null, 4))

Expand Down
3 changes: 3 additions & 0 deletions test/fixtures/example.com.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
"Date.prototype.getTime": 14,
"Navigator.prototype.userAgent": 2,
"Document.cookie setter": 1
},
"http://www.google-analytics.com/analytics.js": {
"Document.cookie getter": 1
}
},
"savedCalls": [
Expand Down
28 changes: 28 additions & 0 deletions test/process-crawl.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ describe('Process Crawl', () => {
before(async () => {
// Mock owner of 3rd party domains
sharedData.entityMap.set('google-analytics.com', 'Google LLC')
sharedData.analyseScripts = [
'google-analytics\\.com/analytics\\.js',
]
site = new Site(mockSiteData)
for (const request of mockSiteData.data.requests) {
await site.processRequest(request)
Expand Down Expand Up @@ -141,6 +144,31 @@ describe('Process Crawl', () => {
})
})

it('extracts data by site', () => {
assertObjectPartial(crawl.dataBySite, {
"example.com": {
"test.example.com": {
"fingerprinting": {
"google-analytics\\.com/analytics\\.js": {
"apis": [
"Date.prototype.getTime",
"Document.cookie getter",
"Document.cookie setter",
"Navigator.prototype.javaEnabled",
"Navigator.prototype.language",
"Navigator.prototype.plugins",
"Navigator.prototype.userAgent",
"Screen.prototype.colorDepth",
"Screen.prototype.height",
"Screen.prototype.width",
],
},
},
},
},
})
})

it('extracts domain cloaks', () => {
assert.deepStrictEqual(crawl.domainCloaks, {})
})
Expand Down

0 comments on commit 9e0c240

Please sign in to comment.