diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py
index 058598ee9f2..413ebfaa66f 100644
--- a/etl/grapher_helpers.py
+++ b/etl/grapher_helpers.py
@@ -36,7 +36,9 @@
 )
 
 # this might work too pd.api.types.is_integer_dtype(col)
-INT_TYPES = tuple({f"{n}{b}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64")})
+INT_TYPES = tuple(
+    {f"{n}{b}{p}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64") for p in ("", "[pyarrow]")}
+)
 
 
 def as_table(df: pd.DataFrame, table: catalog.Table) -> catalog.Table:
diff --git a/lib/repack/owid/repack/__init__.py b/lib/repack/owid/repack/__init__.py
index 1aa6461a306..92eaeba4a3d 100644
--- a/lib/repack/owid/repack/__init__.py
+++ b/lib/repack/owid/repack/__init__.py
@@ -157,18 +157,18 @@ def series_eq(lhs: pd.Series, rhs: pd.Series, cast: Any, rtol: float = 1e-5, ato
 def _safe_dtype(dtype: Any) -> str:
     """Determine the appropriate dtype string based on pandas dtype."""
     if pd.api.types.is_integer_dtype(dtype):
-        return "Int64"
+        return "int64[pyarrow]"
     elif pd.api.types.is_float_dtype(dtype):
-        return "Float64"
+        return "float64[pyarrow]"
     elif isinstance(dtype, pd.CategoricalDtype):
-        return "string[python]"
+        return "string[pyarrow]"
     else:
         return dtype
 
 
 def to_safe_types(t: pd.DataFrame) -> pd.DataFrame:
-    """Convert numeric columns to Float64 and Int64 and categorical
-    columns to string[python]. This can significantly increase memory usage."""
+    """Convert numeric columns to float64[pyarrow] and int64[pyarrow] and categorical
+    columns to string[pyarrow]."""
     t = t.astype({col: _safe_dtype(t[col].dtype) for col in t.columns})
 
     if isinstance(t.index, pd.MultiIndex):