diff --git a/Makefile b/Makefile
index 36b0003383b..aa073fab758 100644
--- a/Makefile
+++ b/Makefile
@@ -24,26 +24,27 @@ help:
 	@echo 'Available commands:'
 	@echo
 	@echo '  GRAPHER ONLY'
-	@echo '  make up                 start dev environment via docker-compose and tmux'
-	@echo '  make down               stop any services still running'
-	@echo '  make refresh            (while up) download a new grapher snapshot and update MySQL'
-	@echo '  make refresh.pageviews  (while up) download and load pageviews from the private datasette instance'
-	@echo '  make migrate            (while up) run any outstanding db migrations'
-	@echo '  make test               run full suite (except db tests) of CI checks including unit tests'
-	@echo '  make dbtest             run db test suite that needs a running mysql db'
-	@echo '  make svgtest            compare current rendering against reference SVGs'
+	@echo '  make up                     start dev environment via docker-compose and tmux'
+	@echo '  make down                   stop any services still running'
+	@echo '  make refresh                (while up) download a new grapher snapshot and update MySQL'
+	@echo '  make refresh.pageviews      (while up) download and load pageviews from the private datasette instance'
+	@echo '  make migrate                (while up) run any outstanding db migrations'
+	@echo '  make test                   run full suite (except db tests) of CI checks including unit tests'
+	@echo '  make dbtest                 run db test suite that needs a running mysql db'
+	@echo '  make svgtest                compare current rendering against reference SVGs'
 	@echo
 	@echo '  GRAPHER + WORDPRESS (staff-only)'
-	@echo '  make up.full            start dev environment via docker-compose and tmux'
-	@echo '  make down.full          stop any services still running'
-	@echo '  make refresh.wp         download a new wordpress snapshot and update MySQL'
-	@echo '  make refresh.full       do a full MySQL update of both wordpress and grapher'
-	@echo '  make sync-images        sync all images from the remote master'
-	@echo '  make reindex            reindex (or initialise) search in Algolia'
+	@echo '  make up.full                start dev environment via docker-compose and tmux'
+	@echo '  make down.full              stop any services still running'
+	@echo '  make refresh.wp             download a new wordpress snapshot and update MySQL'
+	@echo '  make refresh.full           do a full MySQL update of both wordpress and grapher'
+	@echo '  make sync-images            sync all images from the remote master'
+	@echo '  make update.chart-entities  update the charts_x_entities join table'
+	@echo '  make reindex                reindex (or initialise) search in Algolia'
 	@echo
 	@echo '  OPS (staff-only)'
-	@echo '  make deploy             Deploy your local site to production'
-	@echo '  make stage              Deploy your local site to staging'
+	@echo '  make deploy                 Deploy your local site to production'
+	@echo '  make stage                  Deploy your local site to staging'
 	@echo
 
 up: export DEBUG = 'knex:query'
@@ -347,6 +348,10 @@ itsJustJavascript: node_modules
 	yarn run tsc -b
 	touch $@
 
+update.chart-entities: itsJustJavascript
+	@echo '==> Updating chart entities table'
+	node --enable-source-maps itsJustJavascript/baker/updateChartEntities.js --all
+
 reindex: itsJustJavascript
 	@echo '==> Reindexing search in Algolia'
 	node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js
diff --git a/baker/updateChartEntities.ts b/baker/updateChartEntities.ts
new file mode 100644
index 00000000000..412b80ec361
--- /dev/null
+++ b/baker/updateChartEntities.ts
@@ -0,0 +1,254 @@
+/**
+ * Updates the charts_x_entities table with the available entities for all published charts.
+ * This is useful in search, where we want to be able to filter charts by entities that can be selected.
+ * To do this, we need to instantiate a grapher, download its data, and then look at the available entities.
+ */
+
+import { Grapher } from "@ourworldindata/grapher"
+import {
+    ChartsTableName,
+    ChartsXEntitiesTableName,
+    DbRawChart,
+    GrapherInterface,
+    GrapherTabOption,
+    MultipleOwidVariableDataDimensionsMap,
+    OwidVariableDataMetadataDimensions,
+} from "@ourworldindata/types"
+import * as db from "../db/db.js"
+import pMap from "p-map"
+import { mapEntityNamesToEntityIds } from "../db/model/Entity.js"
+import { getVariableData } from "../db/model/Variable.js"
+import { uniq } from "@ourworldindata/utils"
+import yargs from "yargs"
+import { hideBin } from "yargs/helpers"
+
+// Maximum number of concurrent pMap tasks (variable pre-fetches and per-chart processing)
+const FETCH_CONCURRENCY = 10
+// Number of most-used variables that get loaded into the in-memory cache up front
+const VARIABLES_TO_PREFETCH = 300
+
+// variableId -> pre-fetched variable data; stays undefined until preFetchCommonVariables has run
+let _commonVariablesMap:
+    | Map<number, OwidVariableDataMetadataDimensions>
+    | undefined = undefined
+
+// Counts cache hits vs. direct fetches; reported in the "Fetch stats" log output
+const _fetchVariablesCounters = { cached: 0, fetched: 0 }
+
+// This is a poor-man's cache for variable data.
+// It is unrealistic to cache all variables in memory - at the time of writing, there are about 8000 distinct variables.
+// Instead, we pre-fetch the most common variables and cache them in memory.
+// These include very common variables: Continents, Population, GDP per capita, etc.
+const preFetchCommonVariables = async (
+    trx: db.KnexReadonlyTransaction
+): Promise<void> => {
+    // NOTE(review): "??" is knex's identifier placeholder, "?" is its value placeholder.
+    // Confirm that binding the numeric LIMIT through "??" is intentional.
+    const commonVariables = (await db.knexRaw(
+        trx,
+        `-- sql
+        SELECT variableId, COUNT(variableId) AS useCount
+        FROM chart_dimensions cd
+        JOIN charts c ON cd.chartId = c.id
+        WHERE config ->> "$.isPublished" = "true"
+        GROUP BY variableId
+        ORDER BY COUNT(variableId) DESC
+        LIMIT ??`,
+        [VARIABLES_TO_PREFETCH]
+    )) as { variableId: number; useCount: number }[]
+
+    _commonVariablesMap = new Map(
+        await pMap(
+            commonVariables,
+            async ({ variableId, useCount }) => {
+                const variableData = await getVariableData(variableId)
+                console.log(
+                    `Pre-fetched variable ${variableId}: ${variableData.metadata.name} (${useCount} uses)`
+                )
+                return [variableId, variableData]
+            },
+            { concurrency: FETCH_CONCURRENCY }
+        )
+    )
+}
+
+// Returns the data for a variable, served from the pre-fetched cache when it is in there and
+// fetched via getVariableData otherwise. Increments the matching hit/miss counter either way.
+const getVariableDataUsingCache = async (
+    variableId: number
+): Promise<OwidVariableDataMetadataDimensions> => {
+    if (_commonVariablesMap?.has(variableId)) {
+        _fetchVariablesCounters.cached++
+        return _commonVariablesMap.get(variableId)!
+    }
+
+    _fetchVariablesCounters.fetched++
+    return getVariableData(variableId)
+}
+
+// Instantiates a Grapher for the given config, feeds it its variable data, and returns the entity
+// names available on its chart tab (or on its map tab if there is no chart tab; else an empty list).
+const obtainAvailableEntitiesForGrapherConfig = async (
+    grapherConfig: GrapherInterface
+) => {
+    const grapher = new Grapher({ ...grapherConfig, manuallyProvideData: true })
+
+    // Manually fetch data for grapher, so we can employ caching
+    const variableIds = uniq(grapher.dimensions.map((d) => d.variableId))
+    const variableData: MultipleOwidVariableDataDimensionsMap = new Map(
+        await pMap(variableIds, async (variableId) => [
+            variableId,
+            await getVariableDataUsingCache(variableId),
+        ])
+    )
+    grapher.receiveOwidData(variableData)
+
+    // If the grapher has a chart tab, then the available entities there are the "most interesting" ones to us
+    if (grapher.hasChartTab) {
+        grapher.tab = GrapherTabOption.chart
+
+        // If the grapher allows for changing or multi-selecting entities, then let's index all entities the
+        // user can choose from. Otherwise, we'll just use the default-selected entities.
+        const canChangeEntities =
+            grapher.canChangeEntity || grapher.canSelectMultipleEntities
+
+        // In these chart types, an unselected entity is still shown
+        const chartTypeShowsUnselectedEntities =
+            grapher.isScatter || grapher.isSlopeChart || grapher.isMarimekko
+
+        if (canChangeEntities || chartTypeShowsUnselectedEntities)
+            return grapher.tableForSelection.availableEntityNames as string[]
+        else return grapher.selectedEntityNames
+    } else if (grapher.hasMapTab) {
+        grapher.tab = GrapherTabOption.map
+        // On a map tab, tableAfterAuthorTimelineAndActiveChartTransform contains all
+        // mappable entities for which data is available
+        return grapher.tableAfterAuthorTimelineAndActiveChartTransform
+            .availableEntityNames as string[]
+    } else return []
+}
+
+// Computes the available entity ids for every published chart and returns them keyed by chart id.
+// Entity names that are missing from the name -> id map are logged and left out.
+const obtainAvailableEntitiesForAllGraphers = async (
+    trx: db.KnexReadonlyTransaction
+) => {
+    const entityNameToIdMap = await mapEntityNamesToEntityIds(trx)
+
+    const allPublishedGraphers = (await trx
+        .select("id", "config")
+        .from(ChartsTableName)
+        .whereRaw("config ->> '$.isPublished' = 'true'")) as Pick<
+        DbRawChart,
+        "id" | "config"
+    >[]
+
+    const availableEntitiesByChartId = new Map<number, number[]>()
+    await pMap(
+        allPublishedGraphers,
+        async (grapher) => {
+            const config = JSON.parse(grapher.config) as GrapherInterface
+            const availableEntities =
+                await obtainAvailableEntitiesForGrapherConfig(config)
+            const availableEntityIds = availableEntities.flatMap(
+                (entityName) => {
+                    const entityId = entityNameToIdMap.get(entityName)
+                    if (entityId === undefined) {
+                        console.error(
+                            `Entity not found for chart ${grapher.id}: "${entityName}"`
+                        )
+                        return []
+                    }
+                    return [entityId]
+                }
+            )
+            availableEntitiesByChartId.set(grapher.id, availableEntityIds)
+
+            console.log(
+                grapher.id,
+                config.slug,
+                `[${availableEntities.length} entities]`
+            )
+        },
+        { concurrency: FETCH_CONCURRENCY }
+    )
+
+    return availableEntitiesByChartId
+}
+
+// Obtains available entities for ALL published graphers and updates the charts_x_entities table
+// (by clearing it out and re-inserting all entries).
+const updateAvailableEntitiesForAllGraphers = async (
+    trx: db.KnexReadWriteTransaction
+) => {
+    console.log(
+        `--- Pre-fetching ${VARIABLES_TO_PREFETCH} most common variables ---`
+    )
+    await preFetchCommonVariables(trx)
+
+    console.log(
+        "--- Obtaining available entity ids for all published graphers ---"
+    )
+    const availableEntitiesByChartId =
+        await obtainAvailableEntitiesForAllGraphers(trx)
+
+    console.log("--- Fetch stats ---")
+    console.log(
+        `Fetched ${_fetchVariablesCounters.fetched} variables; cached ${_fetchVariablesCounters.cached} variable loads using ${VARIABLES_TO_PREFETCH} pre-fetched variables`
+    )
+
+    console.log("--- Updating charts_x_entities ---")
+
+    await trx.delete().from(ChartsXEntitiesTableName) // clears out the WHOLE table
+    for (const [chartId, availableEntityIds] of availableEntitiesByChartId) {
+        const rows = availableEntityIds.map((entityId) => ({
+            chartId,
+            entityId,
+        }))
+        if (rows.length) await trx(ChartsXEntitiesTableName).insert(rows)
+    }
+
+    console.log("--- ✅ Done ---")
+}
+
+// Make sure a promise rejection that nobody handles fails the script with a non-zero exit code
+process.on("unhandledRejection", (e) => {
+    console.error(e)
+    process.exit(1)
+})
+
+// CLI entry point: only runs when this file is executed directly, not when it is imported
+if (require.main === module) {
+    void yargs(hideBin(process.argv))
+        .command(
+            "$0",
+            "Update charts_x_entities table",
+            (yargs) => {
+                yargs
+                    .option("all", {
+                        boolean: true,
+                        default: false,
+                        description:
+                            "Update available entities for all published charts",
+                    })
+                    .check(({ all }) => {
+                        if (!all) {
+                            console.error(
+                                "Please use --all. Currently, no other mode is supported."
+                            )
+                            return false
+                        }
+                        return true
+                    })
+            },
+            async ({ all }) => {
+                if (all)
+                    await db.knexReadWriteTransaction(
+                        updateAvailableEntitiesForAllGraphers,
+                        db.TransactionCloseMode.Close
+                    )
+            }
+        )
+        .help()
+        .strict().argv
+}
diff --git a/db/migration/1711549786507-CreateChartEntitiesTable.ts b/db/migration/1711549786507-CreateChartEntitiesTable.ts
new file mode 100644
index 00000000000..c23aeb405cd
--- /dev/null
+++ b/db/migration/1711549786507-CreateChartEntitiesTable.ts
@@ -0,0 +1,27 @@
+import { MigrationInterface, QueryRunner } from "typeorm"
+
+export class CreateChartEntitiesTable1711549786507
+    implements MigrationInterface
+{
+    public async up(queryRunner: QueryRunner): Promise<void> {
+        await queryRunner.query(`-- sql
+        CREATE TABLE charts_x_entities (
+            chartId integer NOT NULL,
+            entityId integer NOT NULL,
+
+            FOREIGN KEY (chartId) REFERENCES charts (id) ON DELETE CASCADE ON UPDATE CASCADE,
+            FOREIGN KEY (entityId) REFERENCES entities (id) ON DELETE RESTRICT ON UPDATE RESTRICT,
+
+            PRIMARY KEY (chartId, entityId),
+
+            -- we can use the primary key to look up by chartId, but might also want fast
+            -- lookups by entityId, so we add an index explicitly
+            INDEX (entityId)
+        )
+        `)
+    }
+
+    public async down(queryRunner: QueryRunner): Promise<void> {
+        await queryRunner.query(`DROP TABLE charts_x_entities`)
+    }
+}
diff --git a/db/model/Entity.ts b/db/model/Entity.ts
new file mode 100644
index 00000000000..ba190d3f757
--- /dev/null
+++ b/db/model/Entity.ts
@@ -0,0 +1,32 @@
+import { DbPlainEntity, EntitiesTableName } from "@ourworldindata/types"
+import * as db from "../db"
+
+/** Builds a lookup from entity name to entity id over all rows of the entities table. */
+export async function mapEntityNamesToEntityIds(
+    knex: db.KnexReadonlyTransaction
+): Promise<Map<string, number>> {
+    const entities = (await knex(EntitiesTableName).select(
+        "id",
+        "name"
+    )) as Pick<DbPlainEntity, "id" | "name">[]
+    const entityNameToIdMap = new Map<string, number>(
+        entities.map((entity) => [entity.name, entity.id])
+    )
+
+    return entityNameToIdMap
+}
+
+/** Builds a lookup from entity id to entity name over all rows of the entities table. */
+export async function mapEntityIdsToEntityNames(
+    knex: db.KnexReadonlyTransaction
+): Promise<Map<number, string>> {
+    const entities = (await knex(EntitiesTableName).select(
+        "id",
+        "name"
+    )) as Pick<DbPlainEntity, "id" | "name">[]
+    const entityIdToNameMap = new Map<number, string>(
+        entities.map((entity) => [entity.id, entity.name])
+    )
+
+    return entityIdToNameMap
+}
diff --git a/packages/@ourworldindata/types/src/dbTypes/ChartsXEntities.ts b/packages/@ourworldindata/types/src/dbTypes/ChartsXEntities.ts
new file mode 100644
index 00000000000..b5c31d664be
--- /dev/null
+++ b/packages/@ourworldindata/types/src/dbTypes/ChartsXEntities.ts
@@ -0,0 +1,8 @@
+export const ChartsXEntitiesTableName = "charts_x_entities"
+
+export interface DbInsertChartXEntity {
+    chartId: number
+    entityId: number
+}
+
+export type DbPlainChartXEntity = Required<DbInsertChartXEntity>
diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts
index 543dae7d134..679d3fb9dee 100644
--- a/packages/@ourworldindata/types/src/index.ts
+++ b/packages/@ourworldindata/types/src/index.ts
@@ -449,6 +449,11 @@ export {
     ChartTagsTableName,
     type DbChartTagJoin,
 } from "./dbTypes/ChartTags.js"
+export {
+    ChartsXEntitiesTableName,
+    type DbInsertChartXEntity,
+    type DbPlainChartXEntity,
+} from "./dbTypes/ChartsXEntities.js"
 export {
     type DbPlainCountryLatestData,
     type DbInsertCountryLatestData,