Skip to content

Commit

Permalink
Merge pull request #3418 from owid/available-entities-accurate
Browse files Browse the repository at this point in the history
feat: index chart entities into `charts_x_entities` table
  • Loading branch information
marcelgerber authored Mar 28, 2024
2 parents 5f9cc6e + 75f90b7 commit e352e83
Show file tree
Hide file tree
Showing 6 changed files with 331 additions and 16 deletions.
37 changes: 21 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,27 @@ help:
@echo 'Available commands:'
@echo
@echo ' GRAPHER ONLY'
@echo ' make up start dev environment via docker-compose and tmux'
@echo ' make down stop any services still running'
@echo ' make refresh (while up) download a new grapher snapshot and update MySQL'
@echo ' make refresh.pageviews (while up) download and load pageviews from the private datasette instance'
@echo ' make migrate (while up) run any outstanding db migrations'
@echo ' make test run full suite (except db tests) of CI checks including unit tests'
@echo ' make dbtest run db test suite that needs a running mysql db'
@echo ' make svgtest compare current rendering against reference SVGs'
@echo ' make up start dev environment via docker-compose and tmux'
@echo ' make down stop any services still running'
@echo ' make refresh (while up) download a new grapher snapshot and update MySQL'
@echo ' make refresh.pageviews (while up) download and load pageviews from the private datasette instance'
@echo ' make migrate (while up) run any outstanding db migrations'
@echo ' make test run full suite (except db tests) of CI checks including unit tests'
@echo ' make dbtest run db test suite that needs a running mysql db'
@echo ' make svgtest compare current rendering against reference SVGs'
@echo
@echo ' GRAPHER + WORDPRESS (staff-only)'
@echo ' make up.full start dev environment via docker-compose and tmux'
@echo ' make down.full stop any services still running'
@echo ' make refresh.wp download a new wordpress snapshot and update MySQL'
@echo ' make refresh.full do a full MySQL update of both wordpress and grapher'
@echo ' make sync-images sync all images from the remote master'
@echo ' make reindex reindex (or initialise) search in Algolia'
@echo ' make up.full start dev environment via docker-compose and tmux'
@echo ' make down.full stop any services still running'
@echo ' make refresh.wp download a new wordpress snapshot and update MySQL'
@echo ' make refresh.full do a full MySQL update of both wordpress and grapher'
@echo ' make sync-images sync all images from the remote master'
@echo ' make update.chart-entities update the charts_x_entities join table'
@echo ' make reindex reindex (or initialise) search in Algolia'
@echo
@echo ' OPS (staff-only)'
@echo ' make deploy Deploy your local site to production'
@echo ' make stage Deploy your local site to staging'
@echo ' make deploy Deploy your local site to production'
@echo ' make stage Deploy your local site to staging'
@echo

up: export DEBUG = 'knex:query'
Expand Down Expand Up @@ -347,6 +348,10 @@ itsJustJavascript: node_modules
yarn run tsc -b
touch $@

# Runs the compiled baker script that updates the charts_x_entities join table (`--all` mode).
# Depends on itsJustJavascript so that the TypeScript sources are built before node runs the .js file.
update.chart-entities: itsJustJavascript
@echo '==> Updating chart entities table'
node --enable-source-maps itsJustJavascript/baker/updateChartEntities.js --all

reindex: itsJustJavascript
@echo '==> Reindexing search in Algolia'
node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js
Expand Down
240 changes: 240 additions & 0 deletions baker/updateChartEntities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
/**
* Updates the charts_x_entities table with the available entities for all published charts.
* This is useful in search, where we want to be able to filter charts by entities that can be selected.
* To do this, we need to instantiate a grapher, download its data, and then look at the available entities.
*/

import { Grapher } from "@ourworldindata/grapher"
import {
ChartsTableName,
ChartsXEntitiesTableName,
DbRawChart,
GrapherInterface,
GrapherTabOption,
MultipleOwidVariableDataDimensionsMap,
OwidVariableDataMetadataDimensions,
} from "@ourworldindata/types"
import * as db from "../db/db.js"
import pMap from "p-map"
import { mapEntityNamesToEntityIds } from "../db/model/Entity.js"
import { getVariableData } from "../db/model/Variable.js"
import { uniq } from "@ourworldindata/utils"
import yargs from "yargs"
import { hideBin } from "yargs/helpers"

// Passed to pMap as its `concurrency` option when pre-fetching variables and when processing charts.
const FETCH_CONCURRENCY = 10
// How many of the most frequently used variables are loaded up-front into the in-memory cache.
const VARIABLES_TO_PREFETCH = 300

// In-memory cache of pre-fetched variable data, keyed by variable id.
// Stays undefined until preFetchCommonVariables has populated it.
let _commonVariablesMap:
| Map<number, OwidVariableDataMetadataDimensions>
| undefined = undefined

// Running tallies of variable loads served from the cache vs. fetched on demand;
// only used for the "Fetch stats" log output.
const _fetchVariablesCounters = { cached: 0, fetched: 0 }

// Poor-man's cache for variable data.
// Holding every variable in memory is unrealistic (about 8000 distinct variables at the time of writing),
// so only the variables used by the largest number of published charts are loaded up-front.
// These tend to be the very common ones: Continents, Population, GDP per capita, etc.
//
// Populates the module-level `_commonVariablesMap`; resolves once all pre-fetches have finished.
const preFetchCommonVariables = async (
    trx: db.KnexReadonlyTransaction
): Promise<void> => {
    type VariableUseCount = { variableId: number; useCount: number }

    // NOTE(review): `??` is knex's identifier placeholder; presumably it is used here so that the
    // numeric limit is inlined rather than bound as a value - confirm before changing it to `?`.
    const mostUsedVariables = (await db.knexRaw(
        trx,
        `-- sql
SELECT variableId, COUNT(variableId) AS useCount
FROM chart_dimensions cd
JOIN charts c ON cd.chartId = c.id
WHERE config ->> "$.isPublished" = "true"
GROUP BY variableId
ORDER BY COUNT(variableId) DESC
LIMIT ??`,
        [VARIABLES_TO_PREFETCH]
    )) as VariableUseCount[]

    // Loads a single variable and reports it, yielding a [key, value] pair for the cache map
    const loadCacheEntry = async (row: VariableUseCount) => {
        const data = await getVariableData(row.variableId)
        console.log(
            `Pre-fetched variable ${row.variableId}: ${data.metadata.name} (${row.useCount} uses)`
        )
        return [row.variableId, data] as const
    }

    const cacheEntries = await pMap(mostUsedVariables, loadCacheEntry, {
        concurrency: FETCH_CONCURRENCY,
    })
    _commonVariablesMap = new Map(cacheEntries)
}

/**
 * Returns the data for a variable, serving it from the pre-fetched cache when possible
 * and falling back to `getVariableData` otherwise.
 * Each call increments the matching counter in `_fetchVariablesCounters`.
 */
const getVariableDataUsingCache = async (
    variableId: number
): Promise<OwidVariableDataMetadataDimensions> => {
    // The cache only ever stores loaded variable objects, so `undefined` means
    // "not cached" (or the cache has not been populated yet).
    const preFetched = _commonVariablesMap?.get(variableId)

    if (preFetched === undefined) {
        _fetchVariablesCounters.fetched++
        return getVariableData(variableId)
    }

    _fetchVariablesCounters.cached++
    return preFetched
}

/**
 * Instantiates a grapher for the given config, feeds it its variable data, and returns
 * the entity names considered "available" for that chart:
 *  - chart tab present: all selectable entities if the user can change/multi-select entities
 *    or the chart type shows unselected entities; otherwise only the default selection
 *  - map tab only: all entities the map has data for
 *  - neither tab: an empty list
 */
const obtainAvailableEntitiesForGrapherConfig = async (
    grapherConfig: GrapherInterface
) => {
    const grapher = new Grapher({ ...grapherConfig, manuallyProvideData: true })

    // Load the data ourselves instead of letting grapher fetch it, so loads go through the cache
    const distinctVariableIds = uniq(
        grapher.dimensions.map((dimension) => dimension.variableId)
    )
    const loadedEntries = await pMap(distinctVariableIds, async (id) => {
        const data = await getVariableDataUsingCache(id)
        return [id, data] as const
    })
    const variableData: MultipleOwidVariableDataDimensionsMap = new Map(
        loadedEntries
    )
    grapher.receiveOwidData(variableData)

    if (!grapher.hasChartTab) {
        if (!grapher.hasMapTab) return []

        grapher.tab = GrapherTabOption.map
        // On a map tab, tableAfterAuthorTimelineAndActiveChartTransform contains all
        // mappable entities for which data is available
        return grapher.tableAfterAuthorTimelineAndActiveChartTransform
            .availableEntityNames as string[]
    }

    // The chart tab's entities are the "most interesting" ones to us, so it takes precedence
    grapher.tab = GrapherTabOption.chart

    // When entities can be changed or multi-selected, every entity the user can pick is indexed
    const canChangeEntities =
        grapher.canChangeEntity || grapher.canSelectMultipleEntities

    // Scatter, slope and marimekko charts display entities even when they are not selected
    const chartTypeShowsUnselectedEntities =
        grapher.isScatter || grapher.isSlopeChart || grapher.isMarimekko

    const shouldIndexAllEntities =
        canChangeEntities || chartTypeShowsUnselectedEntities

    return shouldIndexAllEntities
        ? (grapher.tableForSelection.availableEntityNames as string[])
        : grapher.selectedEntityNames
}

/**
 * Computes the available entity ids for every published chart.
 *
 * Loads all charts whose config has `isPublished = true`, determines the available entity
 * names for each one (at most FETCH_CONCURRENCY charts at a time), and translates the names
 * to entity ids. Names with no matching entity are logged via console.error and skipped.
 *
 * @param trx transaction used to read the entities and charts tables
 * @returns a map from chart id to that chart's distinct entity ids, in first-seen order
 */
const obtainAvailableEntitiesForAllGraphers = async (
    trx: db.KnexReadonlyTransaction
) => {
    const entityNameToIdMap = await mapEntityNamesToEntityIds(trx)

    const allPublishedGraphers = (await trx
        .select("id", "config")
        .from(ChartsTableName)
        .whereRaw("config ->> '$.isPublished' = 'true'")) as Pick<
        DbRawChart,
        "id" | "config"
    >[]

    const availableEntitiesByChartId = new Map<number, number[]>()
    await pMap(
        allPublishedGraphers,
        async (grapher) => {
            const config = JSON.parse(grapher.config) as GrapherInterface
            const availableEntities =
                await obtainAvailableEntitiesForGrapherConfig(config)

            // Collect ids in a Set: the ids end up as (chartId, entityId) rows in a table whose
            // primary key is exactly that pair, so a repeated entity name would otherwise cause
            // a duplicate-key error and fail the whole update. A Set keeps insertion order.
            const distinctEntityIds = new Set<number>()
            for (const entityName of availableEntities) {
                const entityId = entityNameToIdMap.get(entityName)
                if (entityId === undefined) {
                    console.error(
                        `Entity not found for chart ${grapher.id}: "${entityName}"`
                    )
                    continue
                }
                distinctEntityIds.add(entityId)
            }
            availableEntitiesByChartId.set(
                grapher.id,
                Array.from(distinctEntityIds)
            )

            console.log(
                grapher.id,
                config.slug,
                `[${availableEntities.length} entities]`
            )
        },
        { concurrency: FETCH_CONCURRENCY }
    )

    return availableEntitiesByChartId
}

// Recomputes the available entities of ALL published graphers and rewrites the
// charts_x_entities table from scratch: every existing row is deleted, then the
// freshly computed (chartId, entityId) pairs are inserted chart by chart.
const updateAvailableEntitiesForAllGraphers = async (
    trx: db.KnexReadWriteTransaction
) => {
    console.log(
        `--- Pre-fetching ${VARIABLES_TO_PREFETCH} most common variables ---`
    )
    await preFetchCommonVariables(trx)

    console.log(
        "--- Obtaining available entity ids for all published graphers ---"
    )
    const entityIdsByChartId = await obtainAvailableEntitiesForAllGraphers(trx)

    const { fetched, cached } = _fetchVariablesCounters
    console.log("--- Fetch stats ---")
    console.log(
        `Fetched ${fetched} variables; cached ${cached} variable loads using ${VARIABLES_TO_PREFETCH} pre-fetched variables`
    )

    console.log("--- Updating charts_x_entities ---")

    // Wipe the WHOLE table before re-inserting
    await trx.delete().from(ChartsXEntitiesTableName)

    for (const [chartId, entityIds] of entityIdsByChartId.entries()) {
        // Charts without any available entity contribute no rows
        if (entityIds.length === 0) continue

        const rows = entityIds.map((entityId) => ({ chartId, entityId }))
        await trx(ChartsXEntitiesTableName).insert(rows)
    }

    console.log("--- ✅ Done ---")
}

// Treat any unhandled promise rejection as fatal: log it and exit with a non-zero status.
process.on("unhandledRejection", (e) => {
console.error(e)
process.exit(1)
})

// CLI entry point - only runs when this file is executed directly, not when it is imported.
if (require.main === module) {
void yargs(hideBin(process.argv))
.command(
// "$0" makes this the default command
"$0",
"Update charts_x_entities table",
(yargs) => {
yargs
.option("all", {
boolean: true,
default: false,
description:
"Update available entities for all published charts",
})
// --all is currently the only supported mode, so argument validation fails without it
.check(({ all }) => {
if (!all) {
console.error(
"Please use --all. Currently, no other mode is supported."
)
return false
}
return true
})
},
async ({ all }) => {
// Run the full update inside a single read-write transaction.
// NOTE(review): TransactionCloseMode.Close presumably closes the db connection afterwards - confirm in db.ts
if (all)
await db.knexReadWriteTransaction(
updateAvailableEntitiesForAllGraphers,
db.TransactionCloseMode.Close
)
}
)
.help()
.strict().argv
}
27 changes: 27 additions & 0 deletions db/migration/1711549786507-CreateChartEntitiesTable.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { MigrationInterface, QueryRunner } from "typeorm"

/**
 * Migration that creates the `charts_x_entities` join table, which links charts to entities.
 */
export class CreateChartEntitiesTable1711549786507
implements MigrationInterface
{
/**
 * Creates `charts_x_entities` with a composite primary key (chartId, entityId).
 * Rows are removed automatically when their chart is deleted (ON DELETE CASCADE),
 * whereas an entity that is still referenced cannot be deleted (ON DELETE RESTRICT).
 */
public async up(queryRunner: QueryRunner): Promise<void> {
await queryRunner.query(`-- sql
CREATE TABLE charts_x_entities (
chartId integer NOT NULL,
entityId integer NOT NULL,
FOREIGN KEY (chartId) REFERENCES charts (id) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (entityId) REFERENCES entities (id) ON DELETE RESTRICT ON UPDATE RESTRICT,
PRIMARY KEY (chartId, entityId),
-- we can use the primary key to look up by chartId, but might also want fast
-- lookups by entityId, so we add an index explicitly
INDEX (entityId)
)
`)
}

/** Reverts the migration by dropping the `charts_x_entities` table. */
public async down(queryRunner: QueryRunner): Promise<void> {
await queryRunner.query(`DROP TABLE charts_x_entities`)
}
}
30 changes: 30 additions & 0 deletions db/model/Entity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { DbPlainEntity, EntitiesTableName } from "@ourworldindata/types"
import * as db from "../db"

/**
 * Reads every row of the entities table and builds a lookup from entity name to entity id.
 * If several rows share a name, the id of the last row returned wins.
 */
export async function mapEntityNamesToEntityIds(
    knex: db.KnexReadonlyTransaction
): Promise<Map<string, number>> {
    const rows = (await knex(EntitiesTableName).select("id", "name")) as Pick<
        DbPlainEntity,
        "id" | "name"
    >[]

    const idsByName = new Map<string, number>()
    for (const { id, name } of rows) idsByName.set(name, id)

    return idsByName
}

/**
 * Reads every row of the entities table and builds a lookup from entity id to entity name.
 */
export async function mapEntityIdsToEntityNames(
    knex: db.KnexReadonlyTransaction
): Promise<Map<number, string>> {
    const rows = (await knex(EntitiesTableName).select("id", "name")) as Pick<
        DbPlainEntity,
        "id" | "name"
    >[]

    const namesById = new Map<number, string>()
    for (const { id, name } of rows) namesById.set(id, name)

    return namesById
}
8 changes: 8 additions & 0 deletions packages/@ourworldindata/types/src/dbTypes/ChartsXEntities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/** Name of the join table that links charts to entities. */
export const ChartsXEntitiesTableName = "charts_x_entities"

/** Shape of a row to insert into `charts_x_entities`: one (chart, entity) pair. */
export interface DbInsertChartXEntity {
// id of the chart (charts.id)
chartId: number
// id of the entity (entities.id)
entityId: number
}

/** Shape of a row read back from `charts_x_entities`; every column is required. */
export type DbPlainChartXEntity = Required<DbInsertChartXEntity>
5 changes: 5 additions & 0 deletions packages/@ourworldindata/types/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,11 @@ export {
ChartTagsTableName,
type DbChartTagJoin,
} from "./dbTypes/ChartTags.js"
export {
ChartsXEntitiesTableName,
type DbInsertChartXEntity,
type DbPlainChartXEntity,
} from "./dbTypes/ChartsXEntities.js"
export {
type DbPlainCountryLatestData,
type DbInsertCountryLatestData,
Expand Down

0 comments on commit e352e83

Please sign in to comment.