
🔨 Refactor upserts to MySQL
Marigold committed Sep 9, 2024
1 parent 48f2fa0 commit 5a6ae23
Showing 9 changed files with 289 additions and 247 deletions.
40 changes: 0 additions & 40 deletions apps/backport/datasync/data_metadata.py
@@ -253,23 +253,17 @@ def _variable_metadata(

schemaVersion = row.pop("schemaVersion")
processingLevel = row.pop("processingLevel")
- grapherConfigETLJson = row.pop("grapherConfigETL")
- grapherConfigAdminJson = row.pop("grapherConfigAdmin")
licenseJson = row.pop("license")
descriptionKeyJson = row.pop("descriptionKey")
sortJson = row.pop("sort")

display = json.loads(displayJson)
- grapherConfigETL = json.loads(grapherConfigETLJson) if grapherConfigETLJson else None
- grapherConfigAdmin = json.loads(grapherConfigAdminJson) if grapherConfigAdminJson else None
license = json.loads(licenseJson) if licenseJson else None
descriptionKey = json.loads(descriptionKeyJson) if descriptionKeyJson else None
sort = json.loads(sortJson) if sortJson else None

# group fields from flat structure into presentation field
presentation = dict(
- grapherConfigETL=grapherConfigETL,
- grapherConfigAdmin=grapherConfigAdmin,
titlePublic=row.pop("titlePublic"),
titleVariant=row.pop("titleVariant"),
attributionShort=row.pop("attributionShort"),
@@ -398,37 +392,3 @@ def _convert_strings_to_numeric(lst: List[str]) -> List[Union[int, float, str]]:

def _omit_nullable_values(d: dict) -> dict:
return {k: v for k, v in d.items() if v is not None and (isinstance(v, list) and len(v) or not pd.isna(v))}
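
Note: `_omit_nullable_values` (unchanged context above) keeps only keys whose values are present: None is dropped, lists must be non-empty, and scalar NaNs are filtered via pandas. A minimal standalone illustration with made-up sample data:

import pandas as pd

def _omit_nullable_values(d: dict) -> dict:
    # Keep a key only if its value is not None and is either a non-empty list
    # or a scalar that is not NaN.
    return {k: v for k, v in d.items() if v is not None and (isinstance(v, list) and len(v) or not pd.isna(v))}

print(_omit_nullable_values({"unit": "people", "shortUnit": None, "descriptionKey": ["a key"], "conversionFactor": float("nan")}))
# -> {'unit': 'people', 'descriptionKey': ['a key']}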


- def checksum_data_str(var_data_str: str) -> str:
- return files.checksum_str(var_data_str)
-
-
- def checksum_metadata(meta: Dict[str, Any]) -> str:
- """Calculate checksum for metadata. It modifies the metadata dict!"""
- # Drop fields not needed for checksum computation
- meta = filter_out_fields_in_metadata_for_checksum(meta)
-
- return files.checksum_str(json.dumps(meta, default=str))
-
-
- def filter_out_fields_in_metadata_for_checksum(meta: Dict[str, Any]) -> Dict[str, Any]:
- """Drop fields that are not needed to estimate the checksum."""
- meta_ = deepcopy(meta)
-
- # Drop checksums, they shouldn't be part of variable metadata, otherwise we get a
- # feedback loop with changing checksums
- meta_.pop("dataChecksum", None)
- meta_.pop("metadataChecksum", None)
-
- # Drop all IDs. If we create the same dataset on the staging server, it might have different
- # IDs, but the metadata should be the same.
- meta_.pop("id", None)
- meta_.pop("datasetId", None)
- for origin in meta_.get("origins", []):
- origin.pop("id", None)
-
- # Ignore updatedAt timestamps
- meta_.pop("updatedAt", None)
-
- return meta_
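
Note: the checksum helpers removed above hash a filtered JSON dump of the metadata, dropping volatile fields (checksums, IDs, timestamps) so that the same dataset produces the same checksum on different servers. A minimal standalone sketch of that pattern; `hashlib.md5` stands in for `files.checksum_str`, whose actual implementation is not shown in this diff:

import hashlib
import json
from copy import deepcopy
from typing import Any, Dict

def metadata_checksum_sketch(meta: Dict[str, Any]) -> str:
    # Drop fields that would make the checksum unstable across servers and updates.
    meta_ = deepcopy(meta)
    for field in ("dataChecksum", "metadataChecksum", "id", "datasetId", "updatedAt"):
        meta_.pop(field, None)
    for origin in meta_.get("origins", []):
        origin.pop("id", None)
    # Hash the remaining, stable metadata (md5 here is an assumption).
    return hashlib.md5(json.dumps(meta_, default=str).encode()).hexdigest()

print(metadata_checksum_sketch({"id": 1, "title": "Population", "updatedAt": "2024-09-09", "origins": [{"id": 7, "producer": "UN"}]}))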
19 changes: 19 additions & 0 deletions apps/chart_sync/admin_api.py
@@ -81,6 +81,25 @@ def set_tags(self, chart_id: int, tags: List[Dict[str, Any]]) -> dict:
assert js["success"]
return js

+ def put_grapher_config(self, variable_id: int, grapher_config: Dict[str, Any]) -> dict:
+ resp = requests.put(
+ self.base_url + f"/admin/api/variables/{variable_id}/grapherConfigETL",
+ cookies={"sessionid": self.session_id},
+ json=grapher_config,
+ )
+ js = self._json_from_response(resp)
+ assert js["success"]
+ return js
+
+ def delete_grapher_config(self, variable_id: int) -> dict:
+ resp = requests.delete(
+ self.base_url + f"/admin/api/variables/{variable_id}/grapherConfigETL",
+ cookies={"sessionid": self.session_id},
+ )
+ js = self._json_from_response(resp)
+ assert js["success"]
+ return js


def _generate_random_string(length=32) -> str:
letters_and_digits = string.ascii_letters + string.digits
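Note: the two methods added above let chart-sync push or delete a variable's ETL grapher config through the admin API instead of writing it to MySQL directly. A hypothetical usage sketch; the `AdminAPI` class name and its constructor arguments are assumptions, only `put_grapher_config` and `delete_grapher_config` appear in this diff:

from apps.chart_sync.admin_api import AdminAPI  # class name assumed

api = AdminAPI(base_url="http://staging-site:8000")  # constructor arguments assumed
# Upsert the ETL-managed grapher config for a variable...
api.put_grapher_config(variable_id=123456, grapher_config={"hasMapTab": True, "tab": "map"})
# ...or remove it again.
api.delete_grapher_config(variable_id=123456)
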
1 change: 0 additions & 1 deletion apps/wizard/app_pages/expert/prompts.py
@@ -607,7 +607,6 @@ def read_page_md(page_path: str) -> str:
- name: display
- name: columnOrder
- name: originalMetadata
- name: grapherConfigAdmin
- name: shortName
- name: catalogPath
- name: dimensions
31 changes: 15 additions & 16 deletions etl/grapher_helpers.py
Expand Up @@ -18,6 +18,7 @@
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session

+ from apps.backport.datasync import data_metadata as dm
from etl.db import get_engine, read_sql
from etl.files import checksum_str

@@ -92,8 +93,8 @@ def _yield_wide_table(
# Validation
if "year" not in table.primary_key:
raise Exception("Table is missing `year` primary key")
if "entity_id" not in table.primary_key:
raise Exception("Table is missing `entity_id` primary key")
if "entityId" not in table.primary_key:
raise Exception("Table is missing `entityId` primary key")
if na_action == "raise":
for col in table.columns:
if table[col].isna().any():
@@ -102,7 +103,7 @@
if cols_with_none_units:
raise Exception("Columns with missing units: " + ", ".join(cols_with_none_units))

- dim_names = [k for k in table.primary_key if k not in ("year", "entity_id")]
+ dim_names = [k for k in table.primary_key if k not in ("year", "entityId", "entityCode", "entityName")]

# Keep only entity_id and year in index
table = table.reset_index(level=dim_names)
@@ -188,7 +189,6 @@ def _yield_wide_table(
# traverse metadata and expand Jinja
tab[short_name].metadata = _expand_jinja(tab[short_name].metadata, dim_dict)

- # Keep only entity_id and year in index
yield tab


@@ -504,20 +504,14 @@ def _adapt_dataset_metadata_for_grapher(
return metadata


- def _adapt_table_for_grapher(
- table: catalog.Table, engine: Engine | None = None, country_col: str = "country", year_col: str = "year"
- ) -> catalog.Table:
+ def _adapt_table_for_grapher(table: catalog.Table, engine: Engine) -> catalog.Table:
"""Adapt table (from a garden dataset) to be used in a grapher step. This function
is not meant to be run explicitly, but by default in the grapher step.
Parameters
----------
table : catalog.Table
Table from garden dataset.
- country_col : str
- Name of country column in table.
- year_col : str
- Name of year column in table.
Returns
-------
@@ -534,7 +528,7 @@
), f"Variable titles are not unique ({variable_titles_counts[variable_titles_counts > 1].index})."

# Remember original dimensions
- dim_names = [n for n in table.index.names if n and n not in ("year", "date", "entity_id", country_col)]
+ dim_names = [n for n in table.index.names if n and n not in ("year", "date", "entity_id", "country")]

# Reset index unless we have default index
if table.index.names != [None]:
@@ -546,14 +540,19 @@
assert "year" not in table.columns, "Table cannot have both `date` and `year` columns."
table = adapt_table_with_dates_to_grapher(table)

assert {"year", country_col} <= set(table.columns), f"Table must have columns {country_col} and year."
assert {"year", "country"} <= set(table.columns), "Table must have columns country and year."
assert "entity_id" not in table.columns, "Table must not have column entity_id."

# Grapher needs a column entity id, that is constructed based on the unique entity names in the database.
table["entity_id"] = country_to_entity_id(table[country_col], create_entities=True, engine=engine)
table = table.drop(columns=[country_col]).rename(columns={year_col: "year"})
table["entityId"] = country_to_entity_id(table["country"], create_entities=True, engine=engine)
table = table.drop(columns=["country"])

+ # Add entity code and name
+ # less than 10ms per variable
+ with Session(engine) as session:
+ table = dm.add_entity_code_and_name(session, table).copy_metadata(table)

table = table.set_index(["entity_id", "year"] + dim_names)
table = table.set_index(["entityId", "entityCode", "entityName", "year"] + dim_names)

# Ensure the default source of each column includes the description of the table (since that is the description that
# will appear in grapher on the SOURCES tab).
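Note: after this change the grapher step indexes tables by `entityId`, `entityCode`, `entityName` and `year` (plus any extra dimensions) rather than by `entity_id` and `year` alone. A toy illustration of the resulting shape, with made-up data:

import pandas as pd

df = pd.DataFrame(
    {
        "entityId": [13, 13],
        "entityCode": ["USA", "USA"],
        "entityName": ["United States", "United States"],
        "year": [2020, 2021],
        "population": [331.5, 332.0],
    }
).set_index(["entityId", "entityCode", "entityName", "year"])
print(df)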
(Diffs for the remaining 5 changed files are not shown.)
