✨ Use real users on staging servers (#3341)

* ✨ Use real users on staging servers
owid · Nov 1, 2024 · e49e43a · e49e43a
1 parent 22ff46a
commit e49e43a
Show file tree

Hide file tree

Showing 15 changed files with 90 additions and 69 deletions.
diff --git a/apps/chart_sync/admin_api.py b/apps/chart_sync/admin_api.py
@@ -25,7 +25,7 @@ def is_502_error(exception):
 
 
 class AdminAPI(object):
-    def __init__(self, owid_env: OWIDEnv, grapher_user_id: Optional[int] = None):
+    def __init__(self, owid_env: OWIDEnv, grapher_user_id: Optional[int] = GRAPHER_USER_ID):
         self.owid_env = owid_env
         self.session_id = create_session_id(owid_env, grapher_user_id)
 
@@ -39,6 +39,12 @@ def _json_from_response(self, resp: requests.Response) -> dict:
             raise AdminAPIError(resp.text) from e
         return js
 
+    def _get_session_id(self, user_id: Optional[int] = None) -> str:
+        """Return the session id for the given user, or the default session id."""
+        if user_id:
+            return create_session_id(self.owid_env, user_id)
+        return self.session_id
+
     def get_chart_config(self, chart_id: int) -> dict:
         resp = requests.get(
             f"{self.owid_env.admin_api}/charts/{chart_id}.config.json",
@@ -47,32 +53,32 @@ def get_chart_config(self, chart_id: int) -> dict:
         js = self._json_from_response(resp)
         return js
 
-    def create_chart(self, chart_config: dict) -> dict:
+    def create_chart(self, chart_config: dict, user_id: Optional[int] = None) -> dict:
         resp = requests.post(
             self.owid_env.admin_api + "/charts",
-            cookies={"sessionid": self.session_id},
+            cookies={"sessionid": self._get_session_id(user_id)},
             json=chart_config,
         )
         js = self._json_from_response(resp)
         if not js["success"]:
             raise AdminAPIError({"error": js["error"], "chart_config": chart_config})
         return js
 
-    def update_chart(self, chart_id: int, chart_config: dict) -> dict:
+    def update_chart(self, chart_id: int, chart_config: dict, user_id: Optional[int] = None) -> dict:
         resp = requests.put(
             f"{self.owid_env.admin_api}/charts/{chart_id}",
-            cookies={"sessionid": self.session_id},
+            cookies={"sessionid": self._get_session_id(user_id)},
             json=chart_config,
         )
         js = self._json_from_response(resp)
         if not js["success"]:
             raise AdminAPIError({"error": js["error"], "chart_config": chart_config})
         return js
 
-    def set_tags(self, chart_id: int, tags: List[Dict[str, Any]]) -> dict:
+    def set_tags(self, chart_id: int, tags: List[Dict[str, Any]], user_id: Optional[int] = None) -> dict:
         resp = requests.post(
             f"{self.owid_env.admin_api}/charts/{chart_id}/setTags",
-            cookies={"sessionid": self.session_id},
+            cookies={"sessionid": self._get_session_id(user_id)},
             json={"tags": tags},
         )
         js = self._json_from_response(resp)
@@ -107,13 +113,10 @@ def delete_grapher_config(self, variable_id: int) -> dict:
 
 
 @cache
-def create_session_id(owid_env: OWIDEnv, grapher_user_id: Optional[int] = None) -> str:
+def create_session_id(owid_env: OWIDEnv, grapher_user_id: int) -> str:
     engine = owid_env.get_engine()
     with Session(engine) as session:
-        if grapher_user_id:
-            user = session.get(gm.User, grapher_user_id)
-        else:
-            user = session.get(gm.User, GRAPHER_USER_ID)
+        user = session.get(gm.User, grapher_user_id)
         assert user
         session_id = _create_user_session(session, user.email)
         session.commit()

diff --git a/apps/chart_sync/cli.py b/apps/chart_sync/cli.py
@@ -157,6 +157,9 @@ def cli(
                 # Map variable IDs from source to target
                 migrated_config = diff.source_chart.migrate_config(source_session, target_session)
 
+                # Get user who edited the chart
+                user_id = diff.source_chart.lastEditedByUserId
+
                 # Chart in target exists, update it
                 if diff.target_chart:
                     # Configs are equal, no need to update
@@ -174,7 +177,7 @@ def cli(
                         log.info("chart_sync.chart_update", slug=chart_slug, chart_id=chart_id)
                         charts_synced += 1
                         if not dry_run:
-                            target_api.update_chart(chart_id, migrated_config)
+                            target_api.update_chart(chart_id, migrated_config, user_id=user_id)
 
                     # Rejected chart diff
                     elif diff.is_rejected:
@@ -202,8 +205,8 @@ def cli(
                     if diff.is_approved:
                         charts_synced += 1
                         if not dry_run:
-                            resp = target_api.create_chart(migrated_config)
-                            target_api.set_tags(resp["chartId"], chart_tags)
+                            resp = target_api.create_chart(migrated_config, user_id=user_id)
+                            target_api.set_tags(resp["chartId"], chart_tags, user_id=user_id)
                         else:
                             resp = {"chartId": None}
                         log.info(

diff --git a/apps/owidbot/chart_diff.py b/apps/owidbot/chart_diff.py
@@ -119,11 +119,9 @@ def format_chart_diff(df: pd.DataFrame) -> str:
     return f"""
 <ul>
     <li>{num_charts_reviewed}/{num_charts} reviewed charts</li>
-    <ul>
-        <li>Modified: {num_charts_modified_reviewed}/{num_charts_modified}</li>
-        <li>New: {num_charts_new_reviewed}/{num_charts_new}</li>
-        <li>Rejected: {num_charts_rejected}</li>
-        {errors}
-    </ul>
+    <li>Modified: {num_charts_modified_reviewed}/{num_charts_modified}</li>
+    <li>New: {num_charts_new_reviewed}/{num_charts_new}</li>
+    <li>Rejected: {num_charts_rejected}</li>
+    {errors}
 </ul>
     """.strip()
diff --git a/apps/wizard/app_pages/chart_diff/app.py b/apps/wizard/app_pages/chart_diff/app.py
@@ -13,7 +13,6 @@
 from apps.wizard.app_pages.chart_diff.utils import WARN_MSG, get_engines
 from apps.wizard.utils import set_states
 from apps.wizard.utils.components import Pagination
-from etl import config
 from etl.config import OWID_ENV
 
 log = get_logger()
@@ -50,11 +49,11 @@
 CHART_PER_PAGE = 10
 # WARN_MSG += ["This tool is being developed! Please report any issues you encounter in `#proj-new-data-workflow`"]
 
-if str(config.GRAPHER_USER_ID) != "1":
-    WARN_MSG.append(
-        "`GRAPHER_USER_ID` from your .env is not set to 1 (Admin user). Please modify your .env or use STAGING=1 flag to set it automatically. "
-        "All changes on staging servers must be done with Admin user."
-    )
+# if str(config.GRAPHER_USER_ID) != "1":
+#     WARN_MSG.append(
+#         "`GRAPHER_USER_ID` from your .env is not set to 1 (Admin user). Please modify your .env or use STAGING=1 flag to set it automatically. "
+#         "All changes on staging servers must be done with Admin user."
+#     )
 
 if WARN_MSG:
     st.warning("- " + "\n\n- ".join(WARN_MSG))

diff --git a/apps/wizard/app_pages/chart_diff/chart_diff.py b/apps/wizard/app_pages/chart_diff/chart_diff.py
@@ -477,7 +477,7 @@ def load_df(self, chart_ids: List[int] | None = None) -> pd.DataFrame:
         """Load changes in charts between environments from sessions."""
         with Session(self.source_engine) as source_session:
             with Session(self.target_engine) as target_session:
-                return modified_charts_by_admin(source_session, target_session, chart_ids=chart_ids)
+                return modified_charts_on_staging(source_session, target_session, chart_ids=chart_ids)
 
     @property
     def chart_ids_all(self):
@@ -540,7 +540,7 @@ def get_diffs_summary_df(self, cache: bool = False, **kwargs) -> pd.DataFrame:
         return pd.DataFrame(summary)
 
 
-def _modified_data_metadata_by_admin(
+def _modified_data_metadata_on_staging(
     source_session: Session, target_session: Session, chart_ids: Optional[List[int]] = None
 ) -> pd.DataFrame:
     """
@@ -570,23 +570,20 @@ def _modified_data_metadata_by_admin(
     join datasets as d on v.datasetId = d.id
     where v.dataChecksum is not null and v.metadataChecksum is not null and
     """
-    # NOTE: We assume that all changes on staging server are done by Admin user with ID = 1. This is
-    #   set automatically if you use STAGING env variable.
     where = """
-        -- include all charts from datasets that have been updated
-        (d.dataEditedByUserId = 1 or d.metadataEditedByUserId = 1)
+        -- only compare data/metadata that have been updated on staging server
+        (d.dataEditedAt >= %(timestamp_staging_creation)s or d.metadataEditedAt >= %(timestamp_staging_creation)s)
     """
     query_source = base_q + where
+    params = {"timestamp_staging_creation": get_staging_creation_time(source_session)}
     # Add filter for chart IDs
     if chart_ids is not None:
         where_charts = """
             -- filter and get only charts with given IDs
             cd.chartId in %(chart_ids)s
         """
         query_source += " and " + where_charts
-        params = {"chart_ids": tuple(chart_ids)}
-    else:
-        params = {}
+        params["chart_ids"] = tuple(chart_ids)
     source_df = read_sql(query_source, source_session, params=params)
 
     # no charts, return empty dataframe
@@ -634,7 +631,7 @@ def _modified_data_metadata_by_admin(
     return diff
 
 
-def _modified_chart_configs_by_admin(
+def _modified_chart_configs_on_staging(
     source_session: Session, target_session: Session, chart_ids: Optional[List[int]] = None
 ) -> pd.DataFrame:
     TIMESTAMP_STAGING_CREATION = get_staging_creation_time(source_session)
@@ -651,25 +648,22 @@ def _modified_chart_configs_by_admin(
     join chart_configs as cc on c.configId = cc.id
     where
     """
-    # NOTE: We assume that all changes on staging server are done by Admin user with ID = 1. This is
-    #   set automatically if you use STAGING env variable.
     where = """
-        -- only compare charts that have been updated on staging server by Admin user
+        -- only compare charts that have been updated on staging server
         (
-            c.lastEditedByUserId = 1 or c.publishedByUserId = 1
+            c.lastEditedAt >= %(timestamp_staging_creation)s
         )
     """
     query_source = base_q + where
+    params = {"timestamp_staging_creation": TIMESTAMP_STAGING_CREATION}
     # Add filter for chart IDs
     if chart_ids is not None:
         where_charts = """
             -- filter and get only charts with given IDs
             c.id in %(chart_ids)s
         """
         query_source += " and " + where_charts
-        params = {"chart_ids": tuple(chart_ids)}
-    else:
-        params = {}
+        params["chart_ids"] = tuple(chart_ids)
     source_df = read_sql(query_source, source_session, params=params)
 
     # no charts, return empty dataframe
@@ -694,23 +688,19 @@ def _modified_chart_configs_by_admin(
     diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"]
 
     # Add flag 'edited in staging'
-    diff["chartEditedInStaging"] = source_df["chartLastEditedAt"] >= TIMESTAMP_STAGING_CREATION
-
-    assert (
-        diff["chartEditedInStaging"].notna().all()
-    ), "chartEditedInStaging has missing values! This might be due to `diff` and `eidted` dataframes not having the same number of rows."
+    diff["chartEditedInStaging"] = True
 
     # Remove charts with no changes
     return diff[["configEdited", "chartEditedInStaging"]]
 
 
-def modified_charts_by_admin(
+def modified_charts_on_staging(
     source_session: Session, target_session: Session, chart_ids: Optional[List[int]] = None
 ) -> pd.DataFrame:
     """Get charts that have been modified in staging.
 
     - It includes charts with different config, data or metadata checksums.
-    - It assumes that all changes on staging server are done by Admin user with ID = 1. That is, if there are changes by a different user in staging, they are not included.
+    - It detects changes by comparing updatedAt timestamps to staging creation time.
 
     Optionally, you can provide a list of chart IDs to filter the results.
 
@@ -723,8 +713,8 @@ def modified_charts_by_admin(
         TESTING:
         - chartEditedInStaging: True if the chart config has been edited in staging.
     """
-    df_config = _modified_chart_configs_by_admin(source_session, target_session, chart_ids=chart_ids)
-    df_data_metadata = _modified_data_metadata_by_admin(source_session, target_session, chart_ids=chart_ids)
+    df_config = _modified_chart_configs_on_staging(source_session, target_session, chart_ids=chart_ids)
+    df_data_metadata = _modified_data_metadata_on_staging(source_session, target_session, chart_ids=chart_ids)
 
     df = df_config.join(df_data_metadata, how="outer").fillna(False)
 

diff --git a/apps/wizard/app_pages/chart_diff/conflict_resolver.py b/apps/wizard/app_pages/chart_diff/conflict_resolver.py
@@ -155,12 +155,16 @@ def resolve_conflicts(self, rerun: bool = False):
             # Verify config
             config_new = validate_chart_config_and_set_defaults(config, schema=get_schema_from_url(config["$schema"]))
 
-            api = AdminAPI(SOURCE, grapher_user_id=1)
+            # User who last edited the chart
+            user_id = self.diff.source_chart.lastEditedByUserId
+
+            api = AdminAPI(SOURCE)
             try:
                 # Push new chart to staging
                 api.update_chart(
                     chart_id=self.diff.chart_id,
                     chart_config=config_new,
+                    user_id=user_id,
                 )
             except HTTPError as e:
                 log.error(e)

diff --git a/apps/wizard/app_pages/indicator_upgrade/charts_update.py b/apps/wizard/app_pages/indicator_upgrade/charts_update.py
@@ -93,8 +93,7 @@ def get_affected_charts_and_preview(indicator_mapping: Dict[int, int]) -> List[g
 def push_new_charts(charts: List[gm.Chart]) -> None:
     """Updating charts in the database."""
     # API to interact with the admin tool
-    # HACK: Forcing grapher user to be Admin so that it is detected by chart sync.
-    api = AdminAPI(OWID_ENV, grapher_user_id=1)
+    api = AdminAPI(OWID_ENV)
     # Update charts
     progress_text = "Updating charts..."
     bar = st.progress(0, progress_text)
@@ -114,10 +113,7 @@ def push_new_charts(charts: List[gm.Chart]) -> None:
                 chart_id = chart.config["id"]
             else:
                 raise ValueError(f"Chart {chart} does not have an ID in config.")
-            api.update_chart(
-                chart_id=chart_id,
-                chart_config=config_new,
-            )
+            api.update_chart(chart_id=chart_id, chart_config=config_new, user_id=chart.lastEditedByUserId)
             # Show progress bar
             percent_complete = int(100 * (i + 1) / len(charts))
             bar.progress(percent_complete, text=f"{progress_text} {percent_complete}%")

diff --git a/etl/helpers.py b/etl/helpers.py
@@ -3,6 +3,7 @@
 #  etl
 #
 
+import datetime as dt
 import re
 import sys
 import tempfile
@@ -1216,3 +1217,10 @@ def get_schema_from_url(schema_url: str) -> dict:
         Schema of a chart configuration.
     """
     return requests.get(schema_url, timeout=20, verify=TLS_VERIFY).json()
+
+
+def last_date_accessed(tb: Table) -> str:
+    """Get maximum date_accessed from all origins in the table and display it in a specific format. This
+    should be a replacement for {TODAY} in YAML templates."""
+    date_accessed = max([origin.date_accessed for col in tb.columns for origin in tb[col].m.origins])
+    return dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")
diff --git a/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml b/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml
@@ -10,7 +10,7 @@ definitions:
   desc_wildfires: &desc_wildfires
     - Wildfires are detected through the use of satellite imagery obtained from MODIS (Moderate Resolution Imaging Spectroradiometer) and VIIRS (Visible Infrared Imaging Radiometer Suite). These satellite systems are capable of identifying thermal anomalies and alterations in landscape patterns, which are indicative of burning.
     - The data provider is presently engaged in a global accuracy assessment and acknowledged that they might be underestimating the genuine impact of wildfires, primarily due to constraints imposed by the spatial resolution of the sensors they employ.
-  desc_update: The 2024 data is incomplete and was last updated {TODAY}.
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
 # Learn more about the available fields:
 # http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
 dataset:

diff --git a/etl/steps/data/garden/climate/latest/weekly_wildfires.py b/etl/steps/data/garden/climate/latest/weekly_wildfires.py
@@ -1,11 +1,12 @@
 """Load a meadow dataset and create a garden dataset."""
 
+
 import numpy as np
 import owid.catalog.processing as pr
 import pandas as pd
 
 from etl.data_helpers import geo
-from etl.helpers import PathFinder, create_dataset
+from etl.helpers import PathFinder, create_dataset, last_date_accessed
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
@@ -103,7 +104,11 @@ def run(dest_dir: str) -> None:
     #
     # Create a new garden dataset with the same metadata as the meadow dataset.
     ds_garden = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+        dest_dir,
+        tables=[tb],
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
+        yaml_params={"date_accessed": last_date_accessed(tb)},
     )
 
     # Save changes in the new garden dataset.

diff --git a/etl/steps/data/grapher/climate/2023-12-20/surface_temperature_annual_average.meta.yml b/etl/steps/data/grapher/climate/2023-12-20/surface_temperature_annual_average.meta.yml
@@ -28,7 +28,7 @@ definitions:
 
   temperature_anomaly_above: Calculated when the average surface temperature for a year is higher than the 1991-2020 average, this anomaly indicates a hotter-than-average temperature for that year.
   temperature_anomaly_below: Calculated when the average surface temperature for a year is lower than the 1991-2020 average, this anomaly indicates a colder-than-average temperature for that year.
-  desc_update: The 2024 data is incomplete and was last updated {TODAY}.
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
 
 dataset:
   title: Annual surface temperatures and anomalies by country

diff --git a/etl/steps/data/grapher/climate/2023-12-20/surface_temperature_annual_average.py b/etl/steps/data/grapher/climate/2023-12-20/surface_temperature_annual_average.py
@@ -3,7 +3,7 @@
 import numpy as np
 import owid.catalog.processing as pr
 
-from etl.helpers import PathFinder, create_dataset
+from etl.helpers import PathFinder, create_dataset, last_date_accessed
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
@@ -65,7 +65,11 @@ def run(dest_dir: str) -> None:
     #
     # Create a new grapher dataset with the same metadata as the garden dataset.
     ds_grapher = create_dataset(
-        dest_dir, tables=[combined], default_metadata=ds_garden.metadata, check_variables_metadata=True
+        dest_dir,
+        tables=[combined],
+        default_metadata=ds_garden.metadata,
+        check_variables_metadata=True,
+        yaml_params={"date_accessed": last_date_accessed(combined)},
     )
 
     ds_grapher.save()