Skip to content

Commit

Permalink
Update entity data in-memory and write files only once (#44)
Browse files Browse the repository at this point in the history
* Update entity data in-memory and write files only once

* Fix test
  • Loading branch information
muodov authored Feb 2, 2023
1 parent 9008482 commit 9073db5
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 61 deletions.
73 changes: 67 additions & 6 deletions src/trackers/classes/crawl.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* eslint-disable max-depth */
const fs = require('fs')
const path = require('path')
const {median, std} = require('mathjs')
Expand Down Expand Up @@ -27,6 +28,9 @@ class Crawl {
// overall entity prevalence split into tracking and non-tracking
this.entityPrevalence = {}

// updated entity data that will be exported after the analysis
this.entityData = {}

// summary of 3p requests seen in the crawl
this.commonRequests = {}

Expand All @@ -45,12 +49,19 @@ class Crawl {
}
}

exportEntities() {
for (const [entityName, data] of Object.entries(this.entityData)) {
const entityFile = path.join(shared.config.trackerDataLoc, 'entities', `${entityName}.json`)
fs.writeFileSync(entityFile, JSON.stringify(data, null, 4))
}
}

/**
 * Delegate to the module-level _writeSummaries helper, passing this crawl instance.
 */
writeSummaries () {
_writeSummaries(this)
}

processSite (site) {
_processSite(this, site)
async processSite (site) {
await _processSite(this, site)
}

finalizeRequests () {
Expand All @@ -67,7 +78,32 @@ class Crawl {
}
}

function _processSite (crawl, site) {
/**
 * Add domain to entity property list when nameserver match is found.
 *
 * The entity JSON file is read the first time an entity is needed and the
 * parsed object is cached on crawl.entityData. The update is applied to that
 * in-memory copy only; nothing is written to disk here (Crawl#exportEntities
 * writes crawl.entityData out). If the file cannot be read or parsed, or does
 * not contain a JSON object, the error is logged, nothing is cached and the
 * call is a no-op.
 *
 * @param {Crawl} crawl - reference to the current crawl
 * @param {string} entityName - entity file name to update
 * @param {string} domain - domain name to add to the entity properties list
 */
function _updateEntityProperties (crawl, entityName, domain) {
    // Own-property check: `in` would also match inherited keys such as
    // "constructor" or "toString" and skip loading the entity file.
    if (!Object.prototype.hasOwnProperty.call(crawl.entityData, entityName)) {
        const entityFile = path.join(shared.config.trackerDataLoc, 'entities', `${entityName}.json`)

        try {
            const parsed = JSON.parse(fs.readFileSync(entityFile, 'utf8'))
            if (parsed === null || typeof parsed !== 'object') {
                throw new TypeError(`${entityFile} does not contain a JSON object`)
            }
            crawl.entityData[entityName] = parsed
        } catch (e) {
            console.error(`Could not update entity data: ${e} ${e.stack}`)
            return
        }
    }

    const entityData = crawl.entityData[entityName]

    // An entity file without a properties list must not throw here: the caller
    // is async, so a TypeError would reject processing of the whole site.
    if (!Array.isArray(entityData.properties)) {
        entityData.properties = []
    }

    if (!entityData.properties.includes(domain)) {
        entityData.properties.push(domain)
    }
}

async function _processSite (crawl, site) {
// go through the uniqueDomains found on the site and update the crawl domain prevalence, fingerprinting, and cookies
Object.keys(site.uniqueDomains).forEach(domain => {
crawl.domainPrevalence[domain] ? crawl.domainPrevalence[domain] += 1 : crawl.domainPrevalence[domain] = 1
Expand Down Expand Up @@ -102,10 +138,35 @@ function _processSite (crawl, site) {
}

// add common requests entries for each of the requests found on the site
site.requests.forEach(request => {
for (const request of site.requests) {
if (!request.domain) {
crawl.stats.requestsSkipped++
return
continue
}

if (!site.isFirstParty(request.url)) {
const nameservers = await shared.nameservers.resolveNs(request.domain)

if (nameservers && nameservers.length) {
request.nameservers = nameservers

// The option to group by nameservers is set in the config
// All nameservers must match so we can do a quick check to see that the first nameserver exists in our data
if (shared.nameserverList && shared.nameserverToEntity[request.nameservers[0]]) {
for (const nsEntry of shared.nameserverList) {
const entityNS = new Set(nsEntry.nameservers)

// all nameservers in set must match
const nsDiff = request.nameservers.filter(x => !entityNS.has(x))

if (nsDiff && nsDiff.length === 0) {
_updateEntityProperties(crawl, nsEntry.name, request.domain)
request.owner = nsEntry.name
break
}
}
}
}
}

const key = _getCommonRequestKey(request)
Expand All @@ -115,7 +176,7 @@ function _processSite (crawl, site) {
} else {
crawl.commonRequests[key].update(request, site)
}
})
}

for (const apis of Object.values(site.siteData.data.apis.callStats)) {
const apisUsed = Object.keys(apis)
Expand Down
53 changes: 0 additions & 53 deletions src/trackers/classes/site.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
const tldts = require('tldts-experimental')
const fs = require('fs')
const path = require('path')
const Request = require('./request.js')
const shared = require('./../helpers/sharedData.js')
const URL = require('./../helpers/url.js')
Expand Down Expand Up @@ -141,31 +139,6 @@ function isRootSite(request, site) {
return isInitial || isFinal
}

/**
 * Add domain to entity property list when nameserver match is found.
 *
 * Reads the entity JSON file asynchronously and, if the domain is not yet in
 * its properties list, appends it and writes the file back. Read and write
 * errors are logged to the console; nothing is returned to the caller.
 *
 * @param {string} entityName - entity file name to update
 * @param {string} domain - domain name to add to the entity properties list
 */
function _updateEntityProperties (entityName, domain) {
    const entityFile = path.join(shared.config.trackerDataLoc, 'entities', `${entityName}.json`)

    // log a failed write; there is nothing else to do on completion
    const onWritten = writeError => {
        if (writeError) {
            console.error(writeError)
        }
    }

    fs.readFile(entityFile, 'utf8', (readError, contents) => {
        if (readError) {
            console.error(readError)
            return
        }

        const entity = JSON.parse(contents)
        if (entity.properties.includes(domain)) {
            // already listed, leave the file untouched
            return
        }

        entity.properties.push(domain)
        fs.writeFile(entityFile, JSON.stringify(entity, null, 4), onWritten)
    })
}

/**
* Process a single request, resolve CNAME's (if any)
* @param {Object} requestData - The raw request data
Expand All @@ -174,32 +147,6 @@ function _updateEntityProperties (entityName, domain) {
async function _processRequest (requestData, site) {
const request = new Request(requestData, site)

if (!site.isFirstParty(request.url)) {
const nameservers = await shared.nameservers.resolveNs(request.domain)

if (nameservers && nameservers.length) {
request.nameservers = nameservers

// The option to group by nameservers is set in the config
// All nameservers must match so we can do a quick check to see that the first nameserver exists in our data
if (shared.nameserverList && shared.nameserverToEntity[request.nameservers[0]]) {
for (const nsEntry of shared.nameserverList) {
const entityNS = new Set(nsEntry.nameservers)

// all nameservers in set must match
const nsDiff = request.nameservers.filter(x => !entityNS.has(x))

// eslint-disable-next-line max-depth
if (nsDiff && nsDiff.length === 0) {
_updateEntityProperties(nsEntry.name, request.domain)
request.owner = nsEntry.name
break
}
}
}
}
}

// If this request is a subdomain of the site, see if it is cnamed
if (site.isFirstParty(request.url) &&
!shared.config.treatCnameAsFirstParty &&
Expand Down
3 changes: 2 additions & 1 deletion src/trackers/process-crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ async function processSite(siteData) {
}

// update crawl level domain prevalence, entity prevalence, and fingerprinting
crawl.processSite(site)
await crawl.processSite(site)
crawl.stats.sites++
bar.tick()
}
Expand All @@ -55,6 +55,7 @@ async function processCrawl() {
sitesQueue.push(processSite(siteData))
}
await Promise.allSettled(sitesQueue)
crawl.exportEntities()
crawl.finalizeRequests()
crawl.writeSummaries()
console.log(`${chalk.blue(crawl.stats.sites)} sites processed\n${chalk.blue(crawl.stats.requests)} requests processed\n${chalk.blue(crawl.stats.requestsSkipped)} requests skipped`)
Expand Down
2 changes: 1 addition & 1 deletion test/process-crawl.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ describe('Process Crawl', () => {
await site.processRequest(request)
crawl.stats.requests++
}
crawl.processSite(site)
await crawl.processSite(site)
Object.values(crawl.commonRequests).forEach(req => req.finalize(2))
})

Expand Down

0 comments on commit 9073db5

Please sign in to comment.