Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Nov 5, 2024
1 parent 9ec8b6c commit b21d8fd
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
4 changes: 3 additions & 1 deletion etl/grapher_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
)

# NOTE: pd.api.types.is_integer_dtype(col) might work here too
INT_TYPES = tuple({f"{n}{b}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64")})
INT_TYPES = tuple(
{f"{n}{b}{p}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64") for p in ("", "[pyarrow]")}
)


def as_table(df: pd.DataFrame, table: catalog.Table) -> catalog.Table:
Expand Down
10 changes: 5 additions & 5 deletions lib/repack/owid/repack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,18 +157,18 @@ def series_eq(lhs: pd.Series, rhs: pd.Series, cast: Any, rtol: float = 1e-5, ato
def _safe_dtype(dtype: Any) -> str:
"""Determine the appropriate dtype string based on pandas dtype."""
if pd.api.types.is_integer_dtype(dtype):
return "Int64"
return "int64[pyarrow]"
elif pd.api.types.is_float_dtype(dtype):
return "Float64"
return "float64[pyarrow]"
elif isinstance(dtype, pd.CategoricalDtype):
return "string[python]"
return "string[pyarrow]"
else:
return dtype


def to_safe_types(t: pd.DataFrame) -> pd.DataFrame:
"""Convert numeric columns to Float64 and Int64 and categorical
columns to string[python]. This can significantly increase memory usage."""
"""Convert numeric columns to float64[pyarrow] and int64[pyarrow] and categorical
columns to string[pyarrow]."""
t = t.astype({col: _safe_dtype(t[col].dtype) for col in t.columns})

if isinstance(t.index, pd.MultiIndex):
Expand Down

0 comments on commit b21d8fd

Please sign in to comment.