From a33a0aa161d14346c03616cfe72d98c3f6c05cb4 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:41:05 -0800 Subject: [PATCH 01/20] Fix bug in iter_timestamps + refactor to IndexTimeRange --- src/chronify/__init__.py | 6 +-- src/chronify/datetime_range_generator.py | 50 +++++++++++++------- src/chronify/time.py | 16 +++---- src/chronify/time_configs.py | 52 +++++---------------- src/chronify/time_series_checker.py | 1 - src/chronify/time_zone_converter.py | 12 +++-- tests/test_mapper_index_time_to_datetime.py | 30 +++++------- 7 files changed, 76 insertions(+), 91 deletions(-) diff --git a/src/chronify/__init__.py b/src/chronify/__init__.py index a61ca6c..4637d5d 100644 --- a/src/chronify/__init__.py +++ b/src/chronify/__init__.py @@ -25,8 +25,7 @@ AnnualTimeRange, DatetimeRange, DatetimeRangeWithTZColumn, - IndexTimeRangeNTZ, - IndexTimeRangeTZ, + IndexTimeRange, IndexTimeRangeWithTZColumn, RepresentativePeriodTimeNTZ, RepresentativePeriodTimeTZ, @@ -42,9 +41,8 @@ "CsvTableSchema", "DatetimeRange", "DatetimeRangeWithTZColumn", + "IndexTimeRange", "IndexTimeRangeWithTZColumn", - "IndexTimeRangeNTZ", - "IndexTimeRangeTZ", "InvalidOperation", "InvalidParameter", "InvalidTable", diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index b030e68..91dbd15 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -7,6 +7,7 @@ from chronify.time import ( LeapDayAdjustmentType, + TimeDataType, ) from chronify.time_configs import DatetimeRanges, DatetimeRange, DatetimeRangeWithTZColumn from chronify.time_utils import adjust_timestamp_by_dst_offset, get_tzname @@ -118,42 +119,59 @@ def __init__( raise InvalidValue(msg) def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: - """always return tz-naive timestamps relative to input time_zone""" - if self._model.start_time_is_tz_naive(): - if time_zone: + """return timestamps for a given time_zone expected in the dataframe + returned timestamp dtype matches self._model.dtype, e.g., + if time_zone is None, return tz-naive timestamps else return tz-aware timestamps + """ + match (self._model.start_time_is_tz_naive(), self._model.dtype): + case (True, TimeDataType.TIMESTAMP_NTZ): + # align in local time of the time zone, all time zones have the same tz-naive timestamps + start = self._model.start + case (True, TimeDataType.TIMESTAMP_TZ): + # align in local time of the time zone, all time zones have different tz-aware timestamps that are aligned when adjusted by time zone start = self._model.start.replace(tzinfo=time_zone) - else: - start = None - else: - if time_zone: - start = self._model.start.astimezone(time_zone) - else: - start = self._model.start.replace(tzinfo=None) - timestamps = list(self._iter_timestamps(start=start)) - return [x.replace(tzinfo=None) for x in timestamps] + case (False, TimeDataType.TIMESTAMP_NTZ): + # align in absolute time, all time zones have different tz-naive timestamps that are aligned when localized to the time zone + if time_zone: + start = self._model.start.astimezone(time_zone).replace(tzinfo=None) + else: + start = self._model.start.replace(tzinfo=None) + case (False, TimeDataType.TIMESTAMP_TZ): + # align in absolute time, all time zones have the same tz-aware timestamps + start = self._model.start + case _: + msg = f"Unsupported combination of start_time_is_tz_naive and dtype: {self._model}" + raise InvalidValue(msg) + return list(self._iter_timestamps(start=start)) def list_timestamps(self) -> list[datetime]: - """return ordered timestamps across all time zones in the order of the time zones.""" + """return ordered tz-naive timestamps across all time zones in the order of the time zones.""" dct = self.list_timestamps_by_time_zone() return list(chain(*dct.values())) def list_timestamps_by_time_zone(self) -> dict[str, list[datetime]]: - """for each time zone, returns full timestamp iteration (duplicates allowed)""" + """for each time zone, returns full timestamp iteration + (duplicates allowed)""" dct = {} for tz in self._model.get_time_zones(): tz_name = get_tzname(tz) dct[tz_name] = self._list_timestamps(tz) - return dct def list_distinct_timestamps_by_time_zone_from_dataframe( self, df: pd.DataFrame ) -> dict[str, list[datetime]]: + """ + from the dataframe, for each time zone, returns distinct timestamps + """ tz_col = self._model.get_time_zone_column() t_col = self._model.time_column df[t_col] = pd.to_datetime(df[t_col]) df2 = df[[tz_col, t_col]].drop_duplicates() dct = {} for tz_name in sorted(df2[tz_col].unique()): - dct[tz_name] = sorted(df2.loc[df2[tz_col] == tz_name, t_col].tolist()) + timestamps = sorted(df2.loc[df2[tz_col] == tz_name, t_col].tolist()) + # if timestamps[0].tzinfo: + # timestamps = [x.astimezone(tz_name).replace(tzinfo=None) for x in timestamps] + dct[tz_name] = timestamps return dct diff --git a/src/chronify/time.py b/src/chronify/time.py index 08651ef..26d273f 100644 --- a/src/chronify/time.py +++ b/src/chronify/time.py @@ -8,13 +8,12 @@ class TimeType(StrEnum): - """Defines the supported time formats in the load data.""" + """Defines the formats of time config / representation.""" DATETIME = "datetime" DATETIME_TZ_COL = "datetime_tz_col" ANNUAL = "annual" - INDEX_NTZ = "index_ntz" - INDEX_TZ = "index_tz" + INDEX = "index" INDEX_TZ_COL = "index_tz_col" REPRESENTATIVE_PERIOD_NTZ = "representative_period_ntz" REPRESENTATIVE_PERIOD_TZ = "representative_period_tz" @@ -23,12 +22,13 @@ class TimeType(StrEnum): YEAR_MONTH_DAY_PERIOD_NTZ = "year_month_day_period" -class DatetimeFormat(StrEnum): - """Defines the time format of the datetime config model""" +class TimeDataType(StrEnum): + """Defines the data types of datetime columns in load_data.""" - ALIGNED = "aligned" - LOCAL = "local" - LOCAL_AS_STRINGS = "local_as_strings" + TIMESTAMP_TZ = "timestamp_tz" + TIMESTAMP_NTZ = "timestamp_ntz" + STRING = "string" + TIMESTAMP_IN_PARTS = "timestamp_in_parts" class RepresentativePeriodFormat(StrEnum): diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index e14709c..8e4a883 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -11,6 +11,7 @@ DaylightSavingAdjustmentType, LeapDayAdjustmentType, MeasurementType, + TimeDataType, TimeIntervalType, TimeType, RepresentativePeriodFormat, @@ -67,6 +68,9 @@ def get_time_zones(self) -> list[tzinfo | None]: class DatetimeRangeBase(TimeBaseModel): """Defines a time range base class that uses Python datetime instances.""" + dtype: Literal[ + TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ + ] = TimeDataType.TIMESTAMP_TZ time_column: str = Field(description="Column in the table that represents time.") length: int resolution: timedelta @@ -161,7 +165,7 @@ class IndexTimeRangeBase(TimeBaseModel): start: int = Field(description="starting index") length: int start_timestamp: datetime = Field( - description="The timestamp represented by the starting index." + description="The timestamp represented by the starting index. Can be tz-aware or tz-naive." ) resolution: timedelta = Field(description="The resolution of time represented by the indexes.") time_type: TimeType @@ -174,44 +178,12 @@ def list_time_columns(self) -> list[str]: return [self.time_column] -class IndexTimeRangeNTZ(IndexTimeRangeBase): - """Index time that represents tz-naive timestamps. - start_timestamp is tz-naive +class IndexTimeRange(IndexTimeRangeBase): + """Index time that represents timestamps. + start_timestamp can be tz-aware or tz-naive """ - time_type: Literal[TimeType.INDEX_NTZ] = TimeType.INDEX_NTZ - - @field_validator("start_timestamp") - @classmethod - def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: - if start_timestamp.tzinfo is not None: - msg = "start_timestamp must be tz-naive for IndexTimeRangeNTZ" - raise InvalidValue(msg) - return start_timestamp - - def get_time_zone_column(self) -> None: - return None - - def get_time_zones(self) -> list[tzinfo | None]: - return [] - - -class IndexTimeRangeTZ(IndexTimeRangeBase): - """Index time that represents tz-aware timestamps of a single time zone. - start_timestamp is tz-aware. - Used for dataset where the timeseries for all geographies start at the same - absolute time. - """ - - time_type: Literal[TimeType.INDEX_TZ] = TimeType.INDEX_TZ - - @field_validator("start_timestamp") - @classmethod - def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: - if start_timestamp.tzinfo is None: - msg = "start_timestamp must be tz-aware for IndexTimeRangeTZ" - raise InvalidValue(msg) - return start_timestamp + time_type: Literal[TimeType.INDEX] = TimeType.INDEX def get_time_zone_column(self) -> None: return None @@ -248,8 +220,7 @@ def get_time_zones(self) -> list[tzinfo | None]: IndexTimeRanges = Union[ - IndexTimeRangeNTZ, - IndexTimeRangeTZ, + IndexTimeRange, IndexTimeRangeWithTZColumn, ] @@ -431,8 +402,7 @@ def default_config(cls, length: int, year: int) -> "MonthDayHourTimeNTZ": AnnualTimeRange, DatetimeRange, DatetimeRangeWithTZColumn, - IndexTimeRangeNTZ, - IndexTimeRangeTZ, + IndexTimeRange, IndexTimeRangeWithTZColumn, RepresentativePeriodTimeNTZ, RepresentativePeriodTimeTZ, diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index 17c94c6..bb27f2f 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -86,7 +86,6 @@ def _check_expected_timestamps_with_external_time_zone(self) -> int: stmt = stmt.where(self._table.c[col].is_not(None)) df = read_database(stmt, self._conn, self._schema.time_config) actual_dct = self._time_generator.list_distinct_timestamps_by_time_zone_from_dataframe(df) - if sorted(expected_dct.keys()) != sorted(actual_dct.keys()): msg = ( "Time zone records do not match between expected and actual from table " diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 115c112..30f3baa 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -20,7 +20,7 @@ from chronify.time_series_mapper_base import apply_mapping from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database -from chronify.time import TimeType +from chronify.time import TimeDataType, TimeType from chronify.time_utils import wrapped_time_timestamps, get_tzname @@ -171,7 +171,9 @@ def convert_time_zone( class TimeZoneConverter(TimeZoneConverterBase): - """Class for time zone conversion of time series data to a specified time zone.""" + """Class for time zone conversion of time series data to a specified time zone. + Output timestamp is tz-naive with time_zone recorded in a column. + """ def __init__( self, @@ -200,6 +202,7 @@ def generate_to_time_config(self) -> DatetimeRangeWithTZColumn: time_kwargs.items(), ) ) + time_kwargs["dtype"] = TimeDataType.TIMESTAMP_NTZ time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL time_kwargs["time_zone_column"] = "time_zone" time_kwargs["time_zones"] = [self._to_time_zone] @@ -274,7 +277,9 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: class TimeZoneConverterByColumn(TimeZoneConverterBase): - """Class for time zone conversion of time series data based on a time zone column.""" + """Class for time zone conversion of time series data based on a time zone column. + Output timestamp is tz-naive with time_zone recorded in a column. + """ def __init__( self, @@ -304,6 +309,7 @@ def generate_to_time_config(self) -> DatetimeRangeBase: time_kwargs.items(), ) ) + time_kwargs["dtype"] = TimeDataType.TIMESTAMP_NTZ time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL time_kwargs["time_zone_column"] = self.time_zone_column time_kwargs["time_zones"] = self._get_time_zones() diff --git a/tests/test_mapper_index_time_to_datetime.py b/tests/test_mapper_index_time_to_datetime.py index 580537e..df18a30 100644 --- a/tests/test_mapper_index_time_to_datetime.py +++ b/tests/test_mapper_index_time_to_datetime.py @@ -9,8 +9,7 @@ from chronify.time_series_mapper import map_time from chronify.time_configs import ( DatetimeRange, - IndexTimeRangeNTZ, - IndexTimeRangeTZ, + IndexTimeRange, IndexTimeRangeWithTZColumn, TimeBasedDataAdjustment, ) @@ -57,23 +56,18 @@ def data_for_simple_mapping( src_df = pd.DataFrame({"index_time": range(1, nts + 1), "value": range(1, nts + 1)}) if tz_naive: - time_config = IndexTimeRangeNTZ( - start=1, - length=nts, - start_timestamp=start_timestamp, - resolution=interval_resolution, - interval_type=TimeIntervalType.PERIOD_BEGINNING, - time_column="index_time", - ) + st_ts = start_timestamp else: - time_config = IndexTimeRangeTZ( - start=1, - length=nts, - start_timestamp=start_timestamp.tz_localize("US/Mountain"), - resolution=interval_resolution, - interval_type=TimeIntervalType.PERIOD_BEGINNING, - time_column="index_time", - ) + st_ts = start_timestamp.tz_localize("US/Mountain") + + time_config = IndexTimeRange( + start=1, + length=nts, + start_timestamp=st_ts, + resolution=interval_resolution, + interval_type=TimeIntervalType.PERIOD_BEGINNING, + time_column="index_time", + ) src_schema = TableSchema( name="input_data", time_config=time_config, From f74029f80330b7b0f84470192ae0510f1394971e Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Mon, 29 Dec 2025 12:31:02 -0800 Subject: [PATCH 02/20] Initial commit of localizer --- src/chronify/datetime_range_generator.py | 8 +- src/chronify/time_zone_converter.py | 56 ++- src/chronify/time_zone_localizer.py | 462 +++++++++++++++++++++++ 3 files changed, 512 insertions(+), 14 deletions(-) create mode 100644 src/chronify/time_zone_localizer.py diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 91dbd15..35585b9 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -125,19 +125,19 @@ def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: """ match (self._model.start_time_is_tz_naive(), self._model.dtype): case (True, TimeDataType.TIMESTAMP_NTZ): - # align in local time of the time zone, all time zones have the same tz-naive timestamps + # aligned_in_local_time of the time zone, all time zones have the same tz-naive timestamps start = self._model.start case (True, TimeDataType.TIMESTAMP_TZ): - # align in local time of the time zone, all time zones have different tz-aware timestamps that are aligned when adjusted by time zone + # aligned_in_local_time of the time zone, all time zones have different tz-aware timestamps that are aligned when adjusted by time zone start = self._model.start.replace(tzinfo=time_zone) case (False, TimeDataType.TIMESTAMP_NTZ): - # align in absolute time, all time zones have different tz-naive timestamps that are aligned when localized to the time zone + # aligned_in_absolute_time, all time zones have different tz-naive timestamps that are aligned when localized to the time zone if time_zone: start = self._model.start.astimezone(time_zone).replace(tzinfo=None) else: start = self._model.start.replace(tzinfo=None) case (False, TimeDataType.TIMESTAMP_TZ): - # align in absolute time, all time zones have the same tz-aware timestamps + # aligned_in_absolute_time, all time zones have the same tz-aware timestamps start = self._model.start case _: msg = f"Unsupported combination of start_time_is_tz_naive and dtype: {self._model}" diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 30f3baa..44b14c9 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -144,14 +144,15 @@ def _check_from_schema(self, from_schema: TableSchema) -> None: msg = "" if not isinstance(from_schema.time_config, DatetimeRange): msg += "Source schema does not have DatetimeRange time config. " - if ( - isinstance(from_schema.time_config, DatetimeRange) - and from_schema.time_config.start_time_is_tz_naive() - ): + if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_TZ: + msg += "Source schema time config dtype must be Timestamp_TZ. " + if from_schema.time_config.start_time_is_tz_naive(): msg += ( "Source schema start_time must be timezone-aware. " - "To convert from timezone-naive to timezone-aware, " - "use the TimeSeriesMapperDatetime.map_time() method instead. " + "This converter will convert time zones and return timestamps as tz-naive " + "along with time zone information in a column. " + "To localize timestamps from timezone-naive to timezone-aware, " + "use TimeZoneLocalizer() or TimeZoneLocalizerByColumn() instead. " ) if msg != "": raise InvalidParameter(msg) @@ -171,8 +172,16 @@ def convert_time_zone( class TimeZoneConverter(TimeZoneConverterBase): - """Class for time zone conversion of time series data to a specified time zone. - Output timestamp is tz-naive with time_zone recorded in a column. + """Class for time zone conversion of tz-aware, aligned_in_absolute_time + time series data to a specified time zone. + + Input data table must contain tz-aware timestamps. + Input time config must be of type DatetimeRange with Timestamp_TZ dtype and tz-aware start time. + Output data table will contain tz-naive timestamps with time zone recorded in a column + Output time config will be of type DatetimeRange with Timestamp_NTZ dtype and tz-naive start time. + + # TODO: support DatetimeRangeWithTZColumn as input time config + # TODO: support wrap_time_allowed option """ def __init__( @@ -277,8 +286,35 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: class TimeZoneConverterByColumn(TimeZoneConverterBase): - """Class for time zone conversion of time series data based on a time zone column. - Output timestamp is tz-naive with time_zone recorded in a column. + """Class for time zone conversion of tz-aware, aligned_in_absolute_time + time series data based on a time zone column. + + Input data table must contain tz-aware timestamps and a time zone column. + Input time config must be of type DatetimeRangeWithTZColumn or DatetimeRange with Timestamp_TZ dtype. + - If DatetimeRange is used, time_zone_column must be provided. + - If DatetimeRangeWithTZColumn is used, it is converted to DatetimeRange internally. + time_zone_column, if provided, is ignored and instead taken from the time_config. + Output data table will contain tz-naive timestamps and the original time zone column. + Output time config will be of type DatetimeRangeWithTZColumn with Timestamp_NTZ dtype (see scenarios). + + I/O Time config scenarios: + -------------------------------- + To convert tz-aware timestamps aligned_in_absolute_time to multiple time zones specified in a column: + - wrap_time_allowed = False + - Input time config: DatetimeRange with tz-aware start time, Timestamp_TZ dtype + - Output time config: DatetimeRangeWithTZColumn with tz-aware start time, Timestamp_NTZ dtype + + To convert tz-aware timestamps aligned_in_absolute_time to multiple time zones specified in a column + and aligned_in_local_time: + - wrap_time_allowed = True + - Input time config: DatetimeRange with tz-aware start time, Timestamp_TZ dtype + - Output time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_NTZ dtype + Note: converted time is wrapped within the local time range of the original timestamps. + -------------------------------- + + # TODO: support DatetimeRangeWithTZColumn as input time config + # TODO: add util func to reduce code duplication with TimeZoneLocalizerByColumn + # TODO: add util func to reduce DatetimeRangeWithTZColumn aligned_in_absolute_time to DatetimeRange """ def __init__( diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py new file mode 100644 index 0000000..d57213d --- /dev/null +++ b/src/chronify/time_zone_localizer.py @@ -0,0 +1,462 @@ +import abc +from zoneinfo import ZoneInfo +from datetime import tzinfo +from sqlalchemy import Engine, MetaData, Table, select +from typing import Optional +from pathlib import Path +import pandas as pd + +from chronify.models import TableSchema, MappingTableSchema +from chronify.time_configs import ( + DatetimeRangeBase, + DatetimeRange, + DatetimeRangeWithTZColumn, + TimeBasedDataAdjustment, +) +from chronify.datetime_range_generator import ( + DatetimeRangeGenerator, + DatetimeRangeGeneratorExternalTimeZone, +) +from chronify.exceptions import InvalidParameter, MissingValue +from chronify.time_series_mapper_base import apply_mapping +from chronify.time_range_generator_factory import make_time_range_generator +from chronify.sqlalchemy.functions import read_database +from chronify.time import TimeDataType, TimeType +from chronify.time_series_mapper import map_time + + +def localize_time_zone( + engine: Engine, + metadata: MetaData, + src_schema: TableSchema, + to_time_zone: tzinfo | None, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, +) -> TableSchema: + """Localize TIMESTAMP_NTZ time column in a table to a specified time zone. + Updates table to TIMESTAMP_TZ time column and returns a new time config. + + Parameters + ---------- + engine : sqlalchemy.Engine + SQLAlchemy engine. + metadata : sqlalchemy.MetaData + SQLAlchemy metadata. + src_schema : TableSchema + Defines the source table in the database. + to_time_zone : tzinfo or None + Time zone to convert to. If None, convert to tz-naive. + scratch_dir : pathlib.Path, optional + Directory to use for temporary writes. Defaults to the system's tmp filesystem. + output_file : pathlib.Path, optional + If set, write the mapped table to this Parquet file. + check_mapped_timestamps : bool, optional + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Returns + ------- + TableSchema + Schema of output table with converted timestamps. + """ + tzc = TimeZoneLocalizer(engine, metadata, src_schema, to_time_zone) + tzc.localize_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + return tzc._to_schema + + +def localize_time_zone_by_column( + engine: Engine, + metadata: MetaData, + src_schema: TableSchema, + time_zone_column: Optional[str] = None, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, +) -> TableSchema: + """Localize TIMESTAMP_NTZ time column in a table to multiple time zones specified by a column. + Updates table to TIMESTAMP_TZ time column and returns a new time config. + + Parameters + ---------- + engine : sqlalchemy.Engine + SQLAlchemy engine. + metadata : sqlalchemy.MetaData + sqlalchemy metadata + src_schema : TableSchema + Defines the source table in the database. + time_zone_column : Optional[str] + Column name in the source table that contains the time zone information. + - Required if src_schema.time_config is of type DatetimeRange. + - Ignored if src_schema.time_config is of type DatetimeRangeWithTZColumn. + scratch_dir : pathlib.Path, optional + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file : pathlib.Path, optional + If set, write the mapped table to this Parquet file. + check_mapped_timestamps : bool, optional + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Returns + ------- + dst_schema : TableSchema + schema of output table with converted timestamps + """ + if isinstance(src_schema.time_config, DatetimeRange) and time_zone_column is None: + msg = ( + "time_zone_column must be provided when localizing time zones " + "by column for DatetimeRange time config." + ) + raise MissingValue(msg) + + tzc = TimeZoneLocalizerByColumn( + engine, metadata, src_schema, time_zone_column=time_zone_column + ) + tzc.localize_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + return tzc._to_schema + + +class TimeZoneLocalizerBase(abc.ABC): + """Base class for time zone localization of time series data.""" + + def __init__( + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + ): + self._engine = engine + self._metadata = metadata + self._from_schema = from_schema + + @abc.abstractmethod + @staticmethod + def _check_from_schema(from_schema: TableSchema) -> None: + """Check that from_schema is valid for time zone localization""" + + @abc.abstractmethod + def generate_to_schema(self) -> TableSchema: + """Generate to_schema based on from_schema""" + + @abc.abstractmethod + def localize_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> None: + """Localize tz-naive timestamps to the time zone of the from_schema""" + + +class TimeZoneLocalizer(TimeZoneLocalizerBase): + """Class for time zone localization of tz-naive time series data to a specified time zone. + + Input data table must contain tz-naive timestamps. + Input time config must be of type DatetimeRange with Timestamp_NTZ dtype and tz-naive start time. + Output data table will contain tz-aware timestamps. + Output time config will be of type DatetimeRange with Timestamp_TZ dtype and tz-aware start time. + """ + + def __init__( + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + to_time_zone: tzinfo | None, + ): + self._check_from_schema(from_schema) + super().__init__(engine, metadata, from_schema) + self._to_time_zone = to_time_zone + self._to_schema = self.generate_to_schema() + + @staticmethod + def _check_from_schema(from_schema: TableSchema) -> None: + msg = "" + if not isinstance(from_schema.time_config, DatetimeRange): + msg += "Source schema does not have DatetimeRange time config. " + if isinstance(from_schema.time_config, DatetimeRangeWithTZColumn): + msg += ( + "Instead, time config is of type DatetimeRangeWithTZColumn, " + f"try using TimeZoneLocalizerByColumn(). {from_schema.time_config}" + ) + raise InvalidParameter(msg) + if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ: + msg += "Source schema time config dtype must be TIMESTAMP_NTZ. " + if from_schema.time_config.start_time_is_tz_naive() is False: + msg += ( + "Source schema time config start time must be tz-naive." + "To convert between time zones for tz-aware timestamps, " + "try using TimeZoneConverter() " + ) + if msg != "": + msg += f"\n{from_schema.time_config}" + raise InvalidParameter(msg) + + def generate_to_time_config(self) -> DatetimeRange: + assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy + to_time_config: DatetimeRange = self._from_schema.time_config.model_copy( + update={ + "start": self._from_schema.time_config.start.replace(tzinfo=self._to_time_zone) + } + ) + + return to_time_config + + def generate_to_schema(self) -> TableSchema: + to_time_config = self.generate_to_time_config() + to_schema: TableSchema = self._from_schema.model_copy( + update={ + "name": f"{self._from_schema.name}_tz_converted", + "time_config": to_time_config, + } + ) + return to_schema + + def localize_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> None: + map_time( + engine=self._engine, + metadata=self._metadata, + from_schema=self._from_schema, + to_schema=self._to_schema, + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + +class TimeZoneLocalizerByColumn(TimeZoneLocalizerBase): + """Class for time zone localization of tz-naive time series data based on a time zone column. + + Input data table must contain tz-naive timestamps and a time zone column. + Input time config must be of type DatetimeRangeWithTZColumn or DatetimeRange with Timestamp_NTZ dtype. + - If DatetimeRangeWithTZColumn is used, time_zone_column, if provided, is ignored. + - If DatetimeRange is used, time_zone_column must be provided. It is then converted to + DatetimeRangeWithTZColumn internally. + Output data table will contain tz-aware timestamps and the original time zone column. + Output time config can be of type DatetimeRange or DatetimeRangeWithTZColumn with Timestamp_TZ dtype (see scenarios). + + I/O Time config scenarios: + -------------------------------- + To localize tz-naive timestamps aligned_in_local_time to multiple time zones specified in a column: + - Input time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_NTZ dtype + - Output time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_TZ dtype + + To localize tz-naive timestamps aligned_in_absolute_time to multiple time zones specified in a column: + - Input time config: DatetimeRangeWithTZColumn with tz-aware start time, Timestamp_NTZ dtype + - Output time config: DatetimeRange with tz-aware start time, Timestamp_TZ dtype + Note: output time config is reduced to DatetimeRange (from DatetimeRangeWithTZColumn) + since all timestamps are tz-aware and aligned in absolute time. + -------------------------------- + + # TODO: add tests + """ + + def __init__( + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + time_zone_column: Optional[str] = None, + ): + self._check_from_schema(from_schema) + self._check_time_zone_column(from_schema, time_zone_column) + super().__init__(engine, metadata, from_schema) + if isinstance(self._from_schema.time_config, DatetimeRange): + self.time_zone_column = time_zone_column + self._convert_from_time_config_to_datetime_range_with_tz_column() + else: + self.time_zone_column = self._from_schema.time_config.time_zone_column + self._to_schema = self.generate_to_schema() + + @staticmethod + def _check_from_schema(from_schema: TableSchema) -> None: + msg = "" + if not isinstance(from_schema.time_config, (DatetimeRange, DatetimeRangeWithTZColumn)): + msg += ( + "Source schema must have DatetimeRange or DatetimeRangeWithTZColumn time config. " + ) + if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ: + msg += "Source schema time config dtype must be TIMESTAMP_NTZ. " + if msg != "": + msg += f"\n{from_schema.time_config}" + raise InvalidParameter(msg) + + @staticmethod + def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional[str]) -> None: + if ( + isinstance(from_schema.time_config, DatetimeRangeWithTZColumn) + and time_zone_column is None + ): + msg = f"Input {time_zone_column=} will be ignored. time_zone_column is already defined in the time_config." + raise Warning(msg) + + msg = "" + if isinstance(from_schema.time_config, DatetimeRange) and time_zone_column is None: + msg += "time_zone_column must be provided when source schema time config is of type DatetimeRange. " + if time_zone_column not in from_schema.time_array_id_columns: + msg = f"{time_zone_column=} must be in source schema time_array_id_columns." + if msg != "": + msg += f"\n{from_schema}" + raise InvalidParameter(msg) + + def _convert_from_time_config_to_datetime_range_with_tz_column(self) -> None: + """Convert DatetimeRange from_schema time config to DatetimeRangeWithTZColumn time config + for the rest of the workflow + """ + assert isinstance(self._from_schema.time_config, DatetimeRange) + time_kwargs = self._from_schema.time_config.model_dump() + time_kwargs = dict( + filter( + lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, + time_kwargs.items(), + ) + ) + time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL + time_kwargs["time_zone_column"] = self.time_zone_column + time_kwargs["time_zones"] = self._get_time_zones() + + self._from_schema.time_config = DatetimeRangeWithTZColumn(**time_kwargs) + + def generate_to_time_config(self) -> DatetimeRangeBase: + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + match self._from_schema.time_config.start_time_is_tz_naive(): + case True: + # tz-naive start, aligned in local time of the time zones + to_time_config: DatetimeRangeWithTZColumn = ( + self._from_schema.time_config.model_copy( + update={ + "dtype": TimeDataType.TIMESTAMP_TZ, + } + ) + ) + return to_time_config + case False: + # tz-aware start, aligned in absolute time, convert to DatetimeRange config + time_kwargs = self._from_schema.time_config.model_dump() + time_kwargs = dict( + filter( + lambda k_v: k_v[0] in DatetimeRange.model_fields, + time_kwargs.items(), + ) + ) + time_kwargs["dtype"] = TimeDataType.TIMESTAMP_TZ + time_kwargs["time_type"] = TimeType.DATETIME + return DatetimeRange(**time_kwargs) + case _: + msg = ( + "Unable to determine if start time is tz-naive or tz-aware " + f"from time config: {self._from_schema.time_config}" + ) + raise InvalidParameter(msg) + + def generate_to_schema(self) -> TableSchema: + id_cols = self._from_schema.time_array_id_columns + if "time_zone" not in id_cols: + id_cols.append("time_zone") + to_schema: TableSchema = self._from_schema.model_copy( + update={ + "name": f"{self._from_schema.name}_tz_converted", + "time_config": self.generate_to_time_config(), + "time_array_id_columns": id_cols, + } + ) + return to_schema + + def localize_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> None: + df, mapping_schema = self._create_mapping() + + apply_mapping( + df, + mapping_schema, + self._from_schema, + self._to_schema, + self._engine, + self._metadata, + TimeBasedDataAdjustment(), + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + def _get_time_zones(self) -> list[tzinfo | None]: + with self._engine.connect() as conn: + table = Table(self._from_schema.name, self._metadata) + stmt = ( + select(table.c[self.time_zone_column]) + .distinct() + .where(table.c[self.time_zone_column].is_not(None)) + ) + time_zones = read_database(stmt, conn, self._from_schema.time_config)[ + self.time_zone_column + ].to_list() + + time_zones = [None if tz == "None" else ZoneInfo(tz) for tz in time_zones] + return time_zones + + def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: + """Create mapping dataframe for localizing tz-naive datetime to column time zones""" + # assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + time_col = self._from_schema.time_config.time_column + from_time_col = "from_" + time_col + from_time_generator = make_time_range_generator(self._from_schema.time_config) + assert isinstance(from_time_generator, DatetimeRangeGeneratorExternalTimeZone) # mypy + from_time_data_dct = from_time_generator.list_timestamps_by_time_zone() + + to_time_generator = make_time_range_generator(self._to_schema.time_config) + match to_time_generator: + case DatetimeRangeGeneratorExternalTimeZone(): # mypy + to_time_data_dct = to_time_generator.list_timestamps_by_time_zone() + case DatetimeRangeGenerator(): # mypy + to_time_data = to_time_generator.list_timestamps() + to_time_data_dct = {tz_name: to_time_data for tz_name in from_time_data_dct.keys()} + case _: + msg = ( + "to_time_generator must be of type " + "DatetimeRangeGeneratorExternalTimeZone or DatetimeRangeGenerator. " + f"Got {type(to_time_generator)}" + ) + raise InvalidParameter(msg) + + from_tz_col = "from_" + self.time_zone_column + from_time_config = self._from_schema.time_config.model_copy( + update={"time_column": from_time_col} + ) + to_time_config = self._to_schema.time_config + + df_tz = [] + for tz_name, from_time_data in from_time_data_dct.items(): + df_tz.append( + pd.DataFrame( + { + from_time_col: from_time_data, + from_tz_col: tz_name, + time_col: to_time_data_dct[tz_name], + } + ) + ) + df = pd.concat(df_tz, ignore_index=True) + + mapping_schema = MappingTableSchema( + name="mapping_table_gtz_conversion", + time_configs=[from_time_config, to_time_config], + ) + return df, mapping_schema From 1ca60ed553525cec39bf701100ce62673891b504 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 8 Jan 2026 16:52:02 -0700 Subject: [PATCH 03/20] commit for now --- src/chronify/datetime_range_generator.py | 4 +- src/chronify/sqlalchemy/functions.py | 42 ++-- src/chronify/time_configs.py | 40 +++- src/chronify/time_series_checker.py | 5 +- src/chronify/time_zone_converter.py | 27 ++- src/chronify/time_zone_localizer.py | 49 +++-- tests/test_time_zone_converter.py | 22 +-- tests/test_time_zone_localizer.py | 236 +++++++++++++++++++++++ 8 files changed, 366 insertions(+), 59 deletions(-) create mode 100644 tests/test_time_zone_localizer.py diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 35585b9..b0a5651 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -120,8 +120,8 @@ def __init__( def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: """return timestamps for a given time_zone expected in the dataframe - returned timestamp dtype matches self._model.dtype, e.g., - if time_zone is None, return tz-naive timestamps else return tz-aware timestamps + returned timestamp dtype matches that in the dataframe, i.e. self._model.dtype + (e.g., if time_zone is None, return tz-naive timestamps else return tz-aware timestamps) """ match (self._model.start_time_is_tz_naive(), self._model.dtype): case (True, TimeDataType.TIMESTAMP_NTZ): diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 6539ec8..3819c64 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -13,10 +13,11 @@ import pandas as pd from numpy.dtypes import DateTime64DType, ObjectDType from pandas import DatetimeTZDtype +from chronify.time import TimeDataType from sqlalchemy import Connection, Engine, Selectable, text from chronify.exceptions import InvalidOperation, InvalidParameter -from chronify.time_configs import DatetimeRangeBase, DatetimeRange, TimeBaseModel +from chronify.time_configs import DatetimeRangeBase, TimeBaseModel from chronify.utils.path_utils import check_overwrite, delete_if_exists, to_path # Copied from Pandas/Polars @@ -35,7 +36,7 @@ def read_database( df = conn.execute(query).cursor.fetch_df() # type: ignore case "sqlite": df = pd.read_sql(query, conn, params=params) - if isinstance(config, DatetimeRange): + if isinstance(config, DatetimeRangeBase): _convert_database_output_for_datetime(df, config) case "hive": df = _read_from_hive(query, conn, config, params) @@ -61,7 +62,7 @@ def write_database( """ match conn.engine.name: case "duckdb": - _write_to_duckdb(df, conn, table_name, if_table_exists) + _write_to_duckdb(df, conn, table_name, configs, if_table_exists) case "sqlite": _write_to_sqlite(df, conn, table_name, configs, if_table_exists) case "hive": @@ -81,9 +82,9 @@ def _check_one_config_per_datetime_column(configs: Sequence[TimeBaseModel]) -> N def _convert_database_input_for_datetime( - df: pd.DataFrame, config: DatetimeRange, copied: bool + df: pd.DataFrame, config: DatetimeRangeBase, copied: bool ) -> tuple[pd.DataFrame, bool]: - if config.start_time_is_tz_naive(): + if config.dtype == TimeDataType.TIMESTAMP_NTZ: return df, copied if copied: @@ -91,7 +92,6 @@ def _convert_database_input_for_datetime( else: df2 = df.copy() copied = True - if isinstance(df2[config.time_column].dtype, DatetimeTZDtype): df2[config.time_column] = df2[config.time_column].dt.tz_convert("UTC") else: @@ -100,9 +100,9 @@ def _convert_database_input_for_datetime( return df2, copied -def _convert_database_output_for_datetime(df: pd.DataFrame, config: DatetimeRange) -> None: +def _convert_database_output_for_datetime(df: pd.DataFrame, config: DatetimeRangeBase) -> None: if config.time_column in df.columns: - if not config.start_time_is_tz_naive(): + if config.dtype == TimeDataType.TIMESTAMP_TZ: if isinstance(df[config.time_column].dtype, ObjectDType): df[config.time_column] = pd.to_datetime(df[config.time_column], utc=True) else: @@ -116,21 +116,37 @@ def _write_to_duckdb( df: pd.DataFrame, conn: Connection, table_name: str, + configs: Sequence[TimeBaseModel], if_table_exists: DbWriteMode, ) -> None: assert conn._dbapi_connection is not None assert conn._dbapi_connection.driver_connection is not None + # time_cols = [] + # non_time_cols = df.columns.tolist() + # for config in configs: + # if isinstance(config, DatetimeRangeBase): + # time_col = config.time_column + # if config.dtype == TimeDataType.TIMESTAMP_TZ: + # time_cols.append(f"{time_col}::TIMESTAMPTZ AS {time_col}") + # else: + # time_cols.append(f"{time_col}::TIMESTAMP AS {time_col}") + # non_time_cols.remove(time_col) + + # col_stmt = ", ".join(time_cols + non_time_cols) + col_stmt = "*" + match if_table_exists: case "append": - query = f"INSERT INTO {table_name} SELECT * FROM df" + query = f"INSERT INTO {table_name} SELECT {col_stmt} FROM df" case "replace": conn._dbapi_connection.driver_connection.sql(f"DROP TABLE IF EXISTS {table_name}") - query = f"CREATE TABLE {table_name} AS SELECT * FROM df" + query = f"CREATE TABLE {table_name} AS SELECT {col_stmt} FROM df" case "fail": - query = f"CREATE TABLE {table_name} AS SELECT * FROM df" + query = f"CREATE TABLE {table_name} AS SELECT {col_stmt} FROM df" case _: msg = f"{if_table_exists=}" raise InvalidOperation(msg) + conn._dbapi_connection.driver_connection.sql(query) @@ -190,7 +206,7 @@ def _read_from_hive( ) -> pd.DataFrame: df = pd.read_sql_query(query, conn, params=params) if ( - isinstance(config, DatetimeRange) + isinstance(config, DatetimeRangeBase) and config.time_column in df.columns and not config.start_time_is_tz_naive() ): @@ -210,7 +226,7 @@ def _write_to_sqlite( _check_one_config_per_datetime_column(configs) copied = False for config in configs: - if isinstance(config, DatetimeRange): + if isinstance(config, DatetimeRangeBase): df, copied = _convert_database_input_for_datetime(df, config, copied) df.to_sql(table_name, conn, if_exists=if_table_exists, index=False) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 8e4a883..2cb9f49 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -3,7 +3,7 @@ from zoneinfo import ZoneInfo from datetime import datetime, timedelta, tzinfo from typing import Union, Literal, Optional -from pydantic import Field, field_validator +from pydantic import Field, field_validator, ValidationInfo from typing_extensions import Annotated from chronify.base_models import ChronifyBaseModel @@ -68,9 +68,6 @@ def get_time_zones(self) -> list[tzinfo | None]: class DatetimeRangeBase(TimeBaseModel): """Defines a time range base class that uses Python datetime instances.""" - dtype: Literal[ - TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ - ] = TimeDataType.TIMESTAMP_TZ time_column: str = Field(description="Column in the table that represents time.") length: int resolution: timedelta @@ -92,6 +89,10 @@ class DatetimeRange(DatetimeRangeBase): description="Start time of the range. If it includes a time zone, the timestamps in " "the data must be time zone-aware." ) + dtype: Optional[Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]] = Field( + description="Data type of the timestamps in the time column.", + default=None, + ) def get_time_zone_column(self) -> None: return None @@ -99,6 +100,34 @@ def get_time_zone_column(self) -> None: def get_time_zones(self) -> list[tzinfo | None]: return [] + @field_validator("dtype", mode="after") + @classmethod + def check_dtype_start_time_consistency( + cls, + dtype: Optional[Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]], + info: ValidationInfo, + ) -> Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]: + match (info.data["start"].tzinfo is None, dtype): + # assign default dtype if not provided + case (True, None): + dtype = TimeDataType.TIMESTAMP_NTZ + case (False, None): + dtype = TimeDataType.TIMESTAMP_TZ + # validate dype if provided + case (True, TimeDataType.TIMESTAMP_TZ): + msg = ( + "DatetimeRange with tz-naive start time must have dtype TIMESTAMP_NTZ: " + f"\n{info.data['start']=}, {dtype=}" + ) + raise InvalidValue(msg) + case (False, TimeDataType.TIMESTAMP_NTZ): + msg = ( + "DatetimeRange with tz-aware start time must have dtype TIMESTAMP_TZ: " + f"\n{info.data['start']=}, {dtype=}" + ) + raise InvalidValue(msg) + return dtype + class DatetimeRangeWithTZColumn(DatetimeRangeBase): """Defines a time range that uses an external time zone column to interpret timestamps.""" @@ -117,6 +146,9 @@ class DatetimeRangeWithTZColumn(DatetimeRangeBase): time_zones: list[tzinfo | ZoneInfo | None] = Field( description="Unique time zones from the table." ) + dtype: Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ] = Field( + description="Data type of the timestamps in the time column." + ) def get_time_zone_column(self) -> str: return self.time_zone_column diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index bb27f2f..afaef59 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -27,7 +27,10 @@ def check_timestamps( class TimeSeriesChecker: - """Performs checks on time series arrays in a table.""" + """Performs checks on time series arrays in a table. + Timestamps in the table will be checked against expected timestamps generated from the + TableSchema's time_config. TZ-awareness of the generated timestamps will match that of thetable. + """ def __init__( self, diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 44b14c9..18c8da9 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -143,12 +143,12 @@ def __init__( def _check_from_schema(self, from_schema: TableSchema) -> None: msg = "" if not isinstance(from_schema.time_config, DatetimeRange): - msg += "Source schema does not have DatetimeRange time config. " + msg += "Source schema must have DatetimeRange time config. " if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_TZ: msg += "Source schema time config dtype must be Timestamp_TZ. " if from_schema.time_config.start_time_is_tz_naive(): msg += ( - "Source schema start_time must be timezone-aware. " + "Source schema time config start time must be timezone-aware. " "This converter will convert time zones and return timestamps as tz-naive " "along with time zone information in a column. " "To localize timestamps from timezone-naive to timezone-aware, " @@ -197,20 +197,19 @@ def __init__( def generate_to_time_config(self) -> DatetimeRangeWithTZColumn: assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy - to_time_config = self._from_schema.time_config.model_copy() - if self._to_time_zone: - to_time_config.start = to_time_config.start.astimezone(self._to_time_zone).replace( - tzinfo=None - ) - else: - to_time_config.start = to_time_config.start.replace(tzinfo=None) - time_kwargs = to_time_config.model_dump() + time_kwargs = self._from_schema.time_config.model_dump() time_kwargs = dict( filter( lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, time_kwargs.items(), ) ) + if self._to_time_zone: + time_kwargs["start"] = ( + time_kwargs["start"].astimezone(self._to_time_zone).replace(tzinfo=None) + ) + else: + time_kwargs["start"] = time_kwargs["start"].replace(tzinfo=None) time_kwargs["dtype"] = TimeDataType.TIMESTAMP_NTZ time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL time_kwargs["time_zone_column"] = "time_zone" @@ -335,16 +334,16 @@ def __init__( def generate_to_time_config(self) -> DatetimeRangeBase: assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy - to_time_config = self._from_schema.time_config.model_copy() - if self._wrap_time_allowed: - to_time_config.start = to_time_config.start.replace(tzinfo=None) - time_kwargs = to_time_config.model_dump() + time_kwargs = self._from_schema.time_config.model_dump() time_kwargs = dict( filter( lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, time_kwargs.items(), ) ) + if self._wrap_time_allowed: + time_kwargs["start"] = time_kwargs["start"].replace(tzinfo=None) + time_kwargs["dtype"] = TimeDataType.TIMESTAMP_NTZ time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL time_kwargs["time_zone_column"] = self.time_zone_column diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index d57213d..1d0da4f 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -5,6 +5,7 @@ from typing import Optional from pathlib import Path import pandas as pd +from pandas import DatetimeTZDtype from chronify.models import TableSchema, MappingTableSchema from chronify.time_configs import ( @@ -60,14 +61,14 @@ def localize_time_zone( TableSchema Schema of output table with converted timestamps. """ - tzc = TimeZoneLocalizer(engine, metadata, src_schema, to_time_zone) - tzc.localize_time_zone( + tzl = TimeZoneLocalizer(engine, metadata, src_schema, to_time_zone) + tzl.localize_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, ) - return tzc._to_schema + return tzl._to_schema def localize_time_zone_by_column( @@ -114,15 +115,15 @@ def localize_time_zone_by_column( ) raise MissingValue(msg) - tzc = TimeZoneLocalizerByColumn( + tzl = TimeZoneLocalizerByColumn( engine, metadata, src_schema, time_zone_column=time_zone_column ) - tzc.localize_time_zone( + tzl.localize_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, ) - return tzc._to_schema + return tzl._to_schema class TimeZoneLocalizerBase(abc.ABC): @@ -138,8 +139,8 @@ def __init__( self._metadata = metadata self._from_schema = from_schema - @abc.abstractmethod @staticmethod + @abc.abstractmethod def _check_from_schema(from_schema: TableSchema) -> None: """Check that from_schema is valid for time zone localization""" @@ -205,7 +206,10 @@ def generate_to_time_config(self) -> DatetimeRange: assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy to_time_config: DatetimeRange = self._from_schema.time_config.model_copy( update={ - "start": self._from_schema.time_config.start.replace(tzinfo=self._to_time_zone) + "dtype": TimeDataType.TIMESTAMP_TZ + if self._to_time_zone + else TimeDataType.TIMESTAMP_NTZ, + "start": self._from_schema.time_config.start.replace(tzinfo=self._to_time_zone), } ) @@ -307,8 +311,8 @@ def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional msg = "" if isinstance(from_schema.time_config, DatetimeRange) and time_zone_column is None: msg += "time_zone_column must be provided when source schema time config is of type DatetimeRange. " - if time_zone_column not in from_schema.time_array_id_columns: - msg = f"{time_zone_column=} must be in source schema time_array_id_columns." + # if time_zone_column not in from_schema.time_array_id_columns: + # msg = f"{time_zone_column=} must be in source schema time_array_id_columns." if msg != "": msg += f"\n{from_schema}" raise InvalidParameter(msg) @@ -335,7 +339,7 @@ def generate_to_time_config(self) -> DatetimeRangeBase: assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy match self._from_schema.time_config.start_time_is_tz_naive(): case True: - # tz-naive start, aligned in local time of the time zones + # tz-naive start, aligned_in_local_time of the time zones to_time_config: DatetimeRangeWithTZColumn = ( self._from_schema.time_config.model_copy( update={ @@ -345,7 +349,7 @@ def generate_to_time_config(self) -> DatetimeRangeBase: ) return to_time_config case False: - # tz-aware start, aligned in absolute time, convert to DatetimeRange config + # tz-aware start, aligned_in_absolute_time, convert to DatetimeRange config time_kwargs = self._from_schema.time_config.model_dump() time_kwargs = dict( filter( @@ -409,12 +413,19 @@ def _get_time_zones(self) -> list[tzinfo | None]: self.time_zone_column ].to_list() + if "None" in time_zones and len(time_zones) > 1: + msg = ( + "Chronify does not support mix of None and time zones in time_zone_column." + "This is because databases do not support tz-aware and tz-naive timestamps " + f"in the same column: {time_zones}" + ) + raise InvalidParameter(msg) + time_zones = [None if tz == "None" else ZoneInfo(tz) for tz in time_zones] return time_zones def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for localizing tz-naive datetime to column time zones""" - # assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_generator = make_time_range_generator(self._from_schema.time_config) @@ -443,17 +454,27 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: to_time_config = self._to_schema.time_config df_tz = [] + primary_tz = ZoneInfo(list(from_time_data_dct.keys())[0]) for tz_name, from_time_data in from_time_data_dct.items(): + # convert tz-aware timestamps to a single time zone for mapping + # this is because pandas coerces tz-aware timestamps with mixed time zones to object dtype otherwise + to_time_data = [ts.astimezone(primary_tz) for ts in to_time_data_dct[tz_name]] df_tz.append( pd.DataFrame( { from_time_col: from_time_data, from_tz_col: tz_name, - time_col: to_time_data_dct[tz_name], + time_col: to_time_data, } ) ) df = pd.concat(df_tz, ignore_index=True) + if not isinstance(df[time_col].dtype, DatetimeTZDtype): + msg = ( + "Mapped time column is expected to be of " + f"DatetimeTZDtype but got {df[time_col].dtype}" + ) + raise InvalidParameter(msg) mapping_schema = MappingTableSchema( name="mapping_table_gtz_conversion", diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 3b0230a..aca8cbf 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -16,18 +16,19 @@ ) from chronify.time_configs import DatetimeRange from chronify.models import TableSchema -from chronify.time import TimeIntervalType +from chronify.time import TimeDataType, TimeIntervalType from chronify.datetime_range_generator import DatetimeRangeGenerator from chronify.exceptions import InvalidParameter -def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: # type: ignore - return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) - - def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: - df = pd.DataFrame({schema.time_config.time_column: generate_datetime_data(schema.time_config)}) - + df = pd.DataFrame( + { + schema.time_config.time_column: pd.to_datetime( + DatetimeRangeGenerator(schema.time_config).list_timestamps() + ) + } + ) for i, x in enumerate(schema.time_array_id_columns): df[x] = i df[schema.value_column] = np.random.rand(len(df)) @@ -70,6 +71,7 @@ def get_datetime_schema( schema = TableSchema( name=name, time_config=DatetimeRange( + dtype=TimeDataType.TIMESTAMP_TZ if tzinfo else TimeDataType.TIMESTAMP_NTZ, start=start, resolution=resolution, length=length, @@ -183,10 +185,8 @@ def run_conversion_with_error( def test_src_table_no_time_zone(iter_engines: Engine) -> None: from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema) - error = (InvalidParameter, "Source schema start_time must be timezone-aware") - run_conversion_with_error( - iter_engines, df, from_schema, False, error - ) # TODO, support tz-naive to tz-aware conversion + error = (InvalidParameter, "Source schema time config start time must be timezone-aware") + run_conversion_with_error(iter_engines, df, from_schema, False, error) @pytest.mark.parametrize( diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py new file mode 100644 index 0000000..d33e086 --- /dev/null +++ b/tests/test_time_zone_localizer.py @@ -0,0 +1,236 @@ +from zoneinfo import ZoneInfo +from datetime import datetime, timedelta, tzinfo +import numpy as np +import pytest +from typing import Any + +import pandas as pd +from sqlalchemy import Engine, MetaData + +from chronify.sqlalchemy.functions import read_database, write_database +from chronify.time_zone_localizer import ( + TimeZoneLocalizer, + TimeZoneLocalizerByColumn, + localize_time_zone, + localize_time_zone_by_column, +) +from chronify.time_configs import DatetimeRange, DatetimeRangeWithTZColumn +from chronify.models import TableSchema +from chronify.time import TimeDataType, TimeIntervalType +from chronify.datetime_range_generator import ( + DatetimeRangeGenerator, + DatetimeRangeGeneratorExternalTimeZone, +) +from chronify.exceptions import InvalidParameter + + +def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: + df = pd.DataFrame( + { + schema.time_config.time_column: pd.to_datetime( + DatetimeRangeGenerator(schema.time_config).list_timestamps() + ) + } + ) + for i, x in enumerate(schema.time_array_id_columns): + df[x] = i + df[schema.value_column] = np.random.rand(len(df)) + return df + + +def generate_dataframe_with_tz_col(schema: TableSchema) -> pd.DataFrame: + time_col = schema.time_config.time_column + ts_dct = DatetimeRangeGeneratorExternalTimeZone( + schema.time_config + ).list_timestamps_by_time_zone() + dfo_list = [] + for i, (time_zone, data) in enumerate(ts_dct.items()): + dfo_list.append( + pd.DataFrame( + {x: i for x in schema.time_array_id_columns} + | { + time_col: pd.to_datetime(data).tz_localize(None), + "time_zone": time_zone, + schema.value_column: np.random.rand(len(data)), + } + ) + ) + dfo = pd.concat(dfo_list, ignore_index=True) + dfo = dfo.reset_index() + return dfo + + +def get_datetime_schema( + year: int, + tzinfo: tzinfo | None, + interval_type: TimeIntervalType, + name: str, + has_tz_col: bool = False, +) -> TableSchema: + start = datetime(year=year, month=1, day=1, tzinfo=tzinfo) + end = datetime(year=year, month=1, day=2, tzinfo=tzinfo) + resolution = timedelta(hours=1) + length = (end - start) / resolution + 1 + cols = ["id"] + # cols += ["time_zone"] if has_tz_col else [] + if has_tz_col: + time_config = DatetimeRangeWithTZColumn( + dtype=TimeDataType.TIMESTAMP_NTZ, + start=start, + resolution=resolution, + length=length, + interval_type=interval_type, + time_column="timestamp", + time_zone_column="time_zone", + time_zones=[ + ZoneInfo("US/Eastern"), + ZoneInfo("US/Central"), + ZoneInfo("US/Mountain"), + # None, + ], + ) + else: + time_config = DatetimeRange( + dtype=TimeDataType.TIMESTAMP_TZ if tzinfo else TimeDataType.TIMESTAMP_NTZ, + start=start, + resolution=resolution, + length=length, + interval_type=interval_type, + time_column="timestamp", + ) + schema = TableSchema( + name=name, + time_config=time_config, + time_array_id_columns=cols, + value_column="value", + ) + return schema + + +def ingest_data( + engine: Engine, + metadata: MetaData, + df: pd.DataFrame, + schema: TableSchema, +) -> None: + with engine.begin() as conn: + write_database(df, conn, schema.name, [schema.time_config], if_table_exists="replace") + metadata.reflect(engine, views=True) + + +def get_mapped_dataframe( + engine: Engine, + table_name: str, + time_config: DatetimeRange, +) -> pd.DataFrame: + with engine.connect() as conn: + query = f"select * from {table_name}" + queried = read_database(query, conn, time_config) + queried = queried.sort_values(by=["id", "timestamp"]).reset_index(drop=True) + return queried + + +def run_localization( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, + to_time_zone: tzinfo | None, +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + to_schema = localize_time_zone( + engine, metadata, from_schema, to_time_zone, check_mapped_timestamps=True + ) + dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) + assert df["value"].equals(dfo["value"]) + if to_time_zone is None: + expected = df["timestamp"] + else: + expected = df["timestamp"].dt.tz_localize(to_time_zone) + assert (dfo["timestamp"] == expected).prod() == 1 + + +def run_localization_to_column_time_zones( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + to_schema = localize_time_zone_by_column( + engine, + metadata, + from_schema, + "time_zone", + check_mapped_timestamps=True, + ) + dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) + dfo = dfo[df.columns].sort_values(by="index").reset_index(drop=True) + dfo["timestamp"] = pd.to_datetime(dfo["timestamp"]) # needed for engine 2, not sure why + + assert df["value"].equals(dfo["value"]) + for i in range(len(dfo)): + tzn = dfo.loc[i, "time_zone"] + if tzn == "None": + ts = dfo.loc[i, "timestamp"].replace(tzinfo=None) + else: + ts = dfo.loc[i, "timestamp"].tz_convert(ZoneInfo(tzn)).replace(tzinfo=None) + assert df.loc[i, "timestamp"] == ts + + +def run_localization_with_error( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, + use_tz_col: bool, + error: tuple[Any, str], +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + with pytest.raises(error[0], match=error[1]): + if use_tz_col: + tzl = TimeZoneLocalizerByColumn( + engine, + metadata, + from_schema, + "time_zone", + ) + tzl.localize_time_zone(check_mapped_timestamps=True) + else: + tzl2 = TimeZoneLocalizer(engine, metadata, from_schema, None) + tzl2.localize_time_zone(check_mapped_timestamps=True) + + +def test_src_table_not_tz_naive(iter_engines: Engine) -> None: + from_schema = get_datetime_schema( + 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" + ) + df = generate_datetime_dataframe(from_schema) + error = (InvalidParameter, "Source schema time config start time must be tz-naive.") + run_localization_with_error( + iter_engines, df, from_schema, False, error + ) # TODO, support tz-naive to tz-aware conversion + + +@pytest.mark.parametrize( + "to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("America/Los_Angeles")] +) +def test_time_localization(iter_engines: Engine, to_time_zone: tzinfo | None) -> None: + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + df = generate_datetime_dataframe(from_schema) + run_localization(iter_engines, df, from_schema, to_time_zone) + + +@pytest.mark.parametrize("from_time_tz", [None, ZoneInfo("US/Mountain")]) +def test_time_localization_to_column_time_zones( + iter_engines: Engine, from_time_tz: tzinfo | None +) -> None: + from_schema = get_datetime_schema( + 2018, + from_time_tz, + TimeIntervalType.PERIOD_BEGINNING, + "base_table", + has_tz_col=True, + ) + df = generate_dataframe_with_tz_col(from_schema) + run_localization_to_column_time_zones(iter_engines, df, from_schema) From 9052d2c840feddd87ddb2fe67b13a8709aa9f70e Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 9 Jan 2026 14:55:18 -0700 Subject: [PATCH 04/20] fixed! --- docs/how_tos/map_time_config.md | 2 +- src/chronify/sqlalchemy/functions.py | 22 ++++--------------- src/chronify/store.py | 6 ++--- src/chronify/time_series_mapper_index_time.py | 8 ++++--- ...apper_column_representative_to_datetime.py | 2 +- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/docs/how_tos/map_time_config.md b/docs/how_tos/map_time_config.md index 1a3aa26..c45f823 100644 --- a/docs/how_tos/map_time_config.md +++ b/docs/how_tos/map_time_config.md @@ -35,7 +35,7 @@ dst_table_name = "ev_charging_datetime" hours_per_year = 12 * 7 * 24 num_time_arrays = 3 df = pd.DataFrame({ - "id": np.concat([np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)]), + "id": np.concatenate([np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)]), "month": np.tile(np.repeat(range(1, 13), 7 * 24), num_time_arrays), "day_of_week": np.tile(np.tile(np.repeat(range(7), 24), 12), num_time_arrays), "hour": np.tile(np.tile(range(24), 12 * 7), num_time_arrays), diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 3819c64..1e882f0 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -62,7 +62,7 @@ def write_database( """ match conn.engine.name: case "duckdb": - _write_to_duckdb(df, conn, table_name, configs, if_table_exists) + _write_to_duckdb(df, conn, table_name, if_table_exists) case "sqlite": _write_to_sqlite(df, conn, table_name, configs, if_table_exists) case "hive": @@ -116,33 +116,19 @@ def _write_to_duckdb( df: pd.DataFrame, conn: Connection, table_name: str, - configs: Sequence[TimeBaseModel], if_table_exists: DbWriteMode, ) -> None: assert conn._dbapi_connection is not None assert conn._dbapi_connection.driver_connection is not None - # time_cols = [] - # non_time_cols = df.columns.tolist() - # for config in configs: - # if isinstance(config, DatetimeRangeBase): - # time_col = config.time_column - # if config.dtype == TimeDataType.TIMESTAMP_TZ: - # time_cols.append(f"{time_col}::TIMESTAMPTZ AS {time_col}") - # else: - # time_cols.append(f"{time_col}::TIMESTAMP AS {time_col}") - # non_time_cols.remove(time_col) - - # col_stmt = ", ".join(time_cols + non_time_cols) - col_stmt = "*" match if_table_exists: case "append": - query = f"INSERT INTO {table_name} SELECT {col_stmt} FROM df" + query = f"INSERT INTO {table_name} SELECT * FROM df" case "replace": conn._dbapi_connection.driver_connection.sql(f"DROP TABLE IF EXISTS {table_name}") - query = f"CREATE TABLE {table_name} AS SELECT {col_stmt} FROM df" + query = f"CREATE TABLE {table_name} AS SELECT * FROM df" case "fail": - query = f"CREATE TABLE {table_name} AS SELECT {col_stmt} FROM df" + query = f"CREATE TABLE {table_name} AS SELECT * FROM df" case _: msg = f"{if_table_exists=}" raise InvalidOperation(msg) diff --git a/src/chronify/store.py b/src/chronify/store.py index dec92f6..f0d4259 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -892,7 +892,7 @@ def map_table_time_config( >>> num_time_arrays = 3 >>> df = pd.DataFrame( ... { - ... "id": np.concat( + ... "id": np.concatenate( ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] ... ), ... "month": np.tile(np.repeat(range(1, 13), 7 * 24), num_time_arrays), @@ -983,7 +983,7 @@ def convert_time_zone( >>> num_time_arrays = 1 >>> df = pd.DataFrame( ... { - ... "id": np.concat( + ... "id": np.concatenate( ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] ... ), ... "timestamp": np.tile( @@ -1072,7 +1072,7 @@ def convert_time_zone_by_column( >>> num_time_arrays = 3 >>> df = pd.DataFrame( ... { - ... "id": np.concat( + ... "id": np.concatenate( ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] ... ), ... "timestamp": np.tile( diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py index 34c4094..4ef3925 100644 --- a/src/chronify/time_series_mapper_index_time.py +++ b/src/chronify/time_series_mapper_index_time.py @@ -19,7 +19,7 @@ ) from chronify.time_range_generator_factory import make_time_range_generator from chronify.time_series_mapper_datetime import MapperDatetimeToDatetime -from chronify.time import TimeType, DaylightSavingAdjustmentType, AggregationType +from chronify.time import TimeDataType, TimeType, DaylightSavingAdjustmentType, AggregationType from chronify.sqlalchemy.functions import read_database logger = logging.getLogger(__name__) @@ -35,6 +35,7 @@ def __init__( data_adjustment: Optional[TimeBasedDataAdjustment] = None, wrap_time_allowed: bool = False, ) -> None: + # TODO: refactor to use new time configs super().__init__( engine, metadata, from_schema, to_schema, data_adjustment, wrap_time_allowed ) @@ -181,7 +182,7 @@ def _create_local_time_config(self, time_zone: str) -> DatetimeRange: return time_config def _create_interm_map(self) -> tuple[pd.DataFrame, MappingTableSchema, TableSchema]: - """Create mapping dataframe for converting INDEX_TZ or INDEX_NTZ time to its represented datetime""" + """Create mapping dataframe for converting INDEX time to its represented datetime""" mapped_schema = self._create_intermediate_schema() assert isinstance(mapped_schema.time_config, DatetimeRange) mapped_time_col = mapped_schema.time_config.time_column @@ -195,7 +196,6 @@ def _create_interm_map(self) -> tuple[pd.DataFrame, MappingTableSchema, TableSch name="mapping_table", time_configs=[from_time_config, mapped_schema.time_config], ) - df = pd.DataFrame( { from_time_col: from_time_data, @@ -251,6 +251,7 @@ def _create_interm_map_with_time_zone( # Update mapped_schema mapped_schema.time_config.start = df[mapped_time_col].min() mapped_schema.time_config.length = df[mapped_time_col].nunique() + mapped_schema.time_config.dtype = TimeDataType.TIMESTAMP_TZ mapping_schema = MappingTableSchema( name="mapping_table_index_time", @@ -316,6 +317,7 @@ def _create_interm_map_with_time_zone_and_dst_adjustment( # Update mapped_schema mapped_schema.time_config.start = df[mapped_time_col].min() mapped_schema.time_config.length = df[mapped_time_col].nunique() + mapped_schema.time_config.dtype = TimeDataType.TIMESTAMP_TZ mapping_schema = MappingTableSchema( name="mapping_table_index_time_with_dst_adjustment", diff --git a/tests/test_mapper_column_representative_to_datetime.py b/tests/test_mapper_column_representative_to_datetime.py index 110c816..53a43db 100644 --- a/tests/test_mapper_column_representative_to_datetime.py +++ b/tests/test_mapper_column_representative_to_datetime.py @@ -169,7 +169,7 @@ def test_NYMDPV_mapper(time_series_NYMDPV, iter_store: Store): mapped_table = read_database( f"SELECT * FROM {to_schema.name}", conn, to_schema.time_config ).sort_values("timestamp") - values = np.concat( + values = np.concatenate( [ np.ones(5) * 100, np.ones(7) * 200, From 217a91fb28ac477231f822d1dc64290f82369129 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 9 Jan 2026 16:23:04 -0700 Subject: [PATCH 05/20] fix datetime_generator._iter_timestamps() --- src/chronify/datetime_range_generator.py | 95 ++++++++++--------- tests/test_mapper_datetime_to_datetime.py | 4 +- ..._mapper_representative_time_to_datetime.py | 4 +- 3 files changed, 52 insertions(+), 51 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index b0a5651..2c4bb60 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -1,7 +1,7 @@ from datetime import datetime, tzinfo from typing import Generator, Optional -from zoneinfo import ZoneInfo from itertools import chain +from calendar import isleap import pandas as pd @@ -10,7 +10,7 @@ TimeDataType, ) from chronify.time_configs import DatetimeRanges, DatetimeRange, DatetimeRangeWithTZColumn -from chronify.time_utils import adjust_timestamp_by_dst_offset, get_tzname +from chronify.time_utils import get_tzname from chronify.time_range_generator_base import TimeRangeGeneratorBase from chronify.exceptions import InvalidValue @@ -26,51 +26,52 @@ def __init__( self._model = model self._adjustment = leap_day_adjustment or LeapDayAdjustmentType.NONE - def _iter_timestamps( - self, start: Optional[datetime] = None - ) -> Generator[datetime, None, None]: - """ + def _list_timestamps(self, start: Optional[datetime] = None) -> list[datetime]: + """Return all timestamps as a list. if start is supplied, override self._model.start """ if start is None: start = self._model.start - tz = start.tzinfo - - for i in range(self._model.length): - if not tz: - cur = adjust_timestamp_by_dst_offset( - start + i * self._model.resolution, self._model.resolution - ) - else: - # always step in standard time - cur_utc = start.astimezone(ZoneInfo("UTC")) + i * self._model.resolution - cur = adjust_timestamp_by_dst_offset( - cur_utc.astimezone(tz), self._model.resolution - ) - - is_leap_year = ( - pd.Timestamp(f"{cur.year}-01-01") + pd.Timedelta(days=365) - ).year == cur.year - if not is_leap_year: - yield pd.Timestamp(cur) - continue - - month = cur.month - day = cur.day - if not ( - self._adjustment == LeapDayAdjustmentType.DROP_FEB29 and month == 2 and day == 29 - ): - if not ( - self._adjustment == LeapDayAdjustmentType.DROP_DEC31 - and month == 12 - and day == 31 - ): - if not ( - self._adjustment == LeapDayAdjustmentType.DROP_JAN1 - and month == 1 - and day == 1 - ): - yield pd.Timestamp(cur) + + timestamps = pd.date_range( + start=start, + periods=self._model.length, + freq=self._model.resolution, + ).tolist() + + match self._adjustment: + case LeapDayAdjustmentType.DROP_FEB29: + timestamps = [ + ts + for ts in timestamps + if not (isleap(ts.year) and ts.month == 2 and ts.day == 29) + ] + case LeapDayAdjustmentType.DROP_DEC31: + timestamps = [ + ts + for ts in timestamps + if not (isleap(ts.year) and ts.month == 12 and ts.day == 31) + ] + case LeapDayAdjustmentType.DROP_JAN1: + timestamps = [ + ts + for ts in timestamps + if not (isleap(ts.year) and ts.month == 1 and ts.day == 1) + ] + case _: + pass + + return timestamps # type: ignore + + def _iter_timestamps( + self, start: Optional[datetime] = None + ) -> Generator[datetime, None, None]: + """Generator from pd.date_range(). + Note: Established time library already handles historical changes in time zone conversion to UTC. + (e.g. Algeria (Africa/Algiers) changed from UTC+0 to UTC+1 on April 25, 1980) + """ + for ts in self._list_timestamps(start=start): + yield ts def list_time_columns(self) -> list[str]: return self._model.list_time_columns() @@ -94,7 +95,7 @@ def __init__( assert isinstance(self._model, DatetimeRange) def list_timestamps(self) -> list[datetime]: - return list(self._iter_timestamps()) + return self._list_timestamps() # list(self._iter_timestamps()) class DatetimeRangeGeneratorExternalTimeZone(DatetimeRangeGeneratorBase): @@ -118,7 +119,7 @@ def __init__( ) raise InvalidValue(msg) - def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: + def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[datetime]: """return timestamps for a given time_zone expected in the dataframe returned timestamp dtype matches that in the dataframe, i.e. self._model.dtype (e.g., if time_zone is None, return tz-naive timestamps else return tz-aware timestamps) @@ -142,7 +143,7 @@ def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: case _: msg = f"Unsupported combination of start_time_is_tz_naive and dtype: {self._model}" raise InvalidValue(msg) - return list(self._iter_timestamps(start=start)) + return self._list_timestamps(start=start) # ist(self._iter_timestamps(start=start)) def list_timestamps(self) -> list[datetime]: """return ordered tz-naive timestamps across all time zones in the order of the time zones.""" @@ -155,7 +156,7 @@ def list_timestamps_by_time_zone(self) -> dict[str, list[datetime]]: dct = {} for tz in self._model.get_time_zones(): tz_name = get_tzname(tz) - dct[tz_name] = self._list_timestamps(tz) + dct[tz_name] = self._list_timestamps_by_time_zone(time_zone=tz) return dct def list_distinct_timestamps_by_time_zone_from_dataframe( diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index 3d144e9..5d969c2 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -22,8 +22,8 @@ ) -def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: # type: ignore - return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) +def generate_datetime_data(time_config: DatetimeRange) -> pd.DatetimeIndex: # type: ignore + return pd.to_datetime(DatetimeRangeGenerator(time_config).list_timestamps()) def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: diff --git a/tests/test_mapper_representative_time_to_datetime.py b/tests/test_mapper_representative_time_to_datetime.py index 06ead60..2574a8a 100644 --- a/tests/test_mapper_representative_time_to_datetime.py +++ b/tests/test_mapper_representative_time_to_datetime.py @@ -14,8 +14,8 @@ from chronify.datetime_range_generator import DatetimeRangeGenerator -def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: - return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) +def generate_datetime_data(time_config: DatetimeRange) -> pd.DatetimeIndex: + return pd.to_datetime(DatetimeRangeGenerator(time_config).list_timestamps()) def get_datetime_schema(year: int, tzinfo: tzinfo | None) -> TableSchema: From 05c1c20277f34083c4ff79b3205795160f4cc4d5 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 13 Jan 2026 09:50:10 -0700 Subject: [PATCH 06/20] update pytest --- src/chronify/datetime_range_generator.py | 10 +++++++--- src/chronify/time_zone_converter.py | 2 +- src/chronify/time_zone_localizer.py | 2 +- tests/test_time_zone_localizer.py | 8 +++----- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 2c4bb60..7384386 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -126,13 +126,17 @@ def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[dat """ match (self._model.start_time_is_tz_naive(), self._model.dtype): case (True, TimeDataType.TIMESTAMP_NTZ): - # aligned_in_local_time of the time zone, all time zones have the same tz-naive timestamps + # aligned_in_local_standard_time of the time zone, + # all time zones must have the same tz-naive timestamps + # timestamps must represent local standard time zone, not local prevailing time zone with DST start = self._model.start case (True, TimeDataType.TIMESTAMP_TZ): - # aligned_in_local_time of the time zone, all time zones have different tz-aware timestamps that are aligned when adjusted by time zone + # aligned_in_local_standard_time of the time zone, + # all time zones have different tz-aware timestamps that are aligned when adjusted to local standard time zone start = self._model.start.replace(tzinfo=time_zone) case (False, TimeDataType.TIMESTAMP_NTZ): - # aligned_in_absolute_time, all time zones have different tz-naive timestamps that are aligned when localized to the time zone + # aligned_in_absolute_time, + # all time zones have different tz-naive timestamps that are aligned when localized to the time zone if time_zone: start = self._model.start.astimezone(time_zone).replace(tzinfo=None) else: diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 18c8da9..12c53e0 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -304,7 +304,7 @@ class TimeZoneConverterByColumn(TimeZoneConverterBase): - Output time config: DatetimeRangeWithTZColumn with tz-aware start time, Timestamp_NTZ dtype To convert tz-aware timestamps aligned_in_absolute_time to multiple time zones specified in a column - and aligned_in_local_time: + and aligned_in_local_standard_time: - wrap_time_allowed = True - Input time config: DatetimeRange with tz-aware start time, Timestamp_TZ dtype - Output time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_NTZ dtype diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index 1d0da4f..856151d 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -255,7 +255,7 @@ class TimeZoneLocalizerByColumn(TimeZoneLocalizerBase): I/O Time config scenarios: -------------------------------- - To localize tz-naive timestamps aligned_in_local_time to multiple time zones specified in a column: + To localize tz-naive timestamps aligned_in_local_standard_time to multiple time zones specified in a column: - Input time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_NTZ dtype - Output time config: DatetimeRangeWithTZColumn with tz-naive start time, Timestamp_TZ dtype diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py index d33e086..079636d 100644 --- a/tests/test_time_zone_localizer.py +++ b/tests/test_time_zone_localizer.py @@ -14,7 +14,7 @@ localize_time_zone, localize_time_zone_by_column, ) -from chronify.time_configs import DatetimeRange, DatetimeRangeWithTZColumn +from chronify.time_configs import DatetimeRange, DatetimeRangeWithTZColumn, DatetimeRangeBase from chronify.models import TableSchema from chronify.time import TimeDataType, TimeIntervalType from chronify.datetime_range_generator import ( @@ -72,9 +72,8 @@ def get_datetime_schema( resolution = timedelta(hours=1) length = (end - start) / resolution + 1 cols = ["id"] - # cols += ["time_zone"] if has_tz_col else [] if has_tz_col: - time_config = DatetimeRangeWithTZColumn( + time_config: DatetimeRangeBase = DatetimeRangeWithTZColumn( dtype=TimeDataType.TIMESTAMP_NTZ, start=start, resolution=resolution, @@ -86,7 +85,6 @@ def get_datetime_schema( ZoneInfo("US/Eastern"), ZoneInfo("US/Central"), ZoneInfo("US/Mountain"), - # None, ], ) else: @@ -121,7 +119,7 @@ def ingest_data( def get_mapped_dataframe( engine: Engine, table_name: str, - time_config: DatetimeRange, + time_config: DatetimeRangeBase, ) -> pd.DataFrame: with engine.connect() as conn: query = f"select * from {table_name}" From f4608918a04169236bd49292e73de1fd31b1a94f Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:48:36 -0700 Subject: [PATCH 07/20] fix local time test --- src/chronify/time_zone_localizer.py | 32 +++++++++- tests/test_time_zone_localizer.py | 96 ++++++++++++++++++----------- 2 files changed, 90 insertions(+), 38 deletions(-) diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index 856151d..a50a349 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -24,6 +24,7 @@ from chronify.sqlalchemy.functions import read_database from chronify.time import TimeDataType, TimeType from chronify.time_series_mapper import map_time +from chronify.time_utils import get_standard_time_zone, is_standard_time_zone def localize_time_zone( @@ -204,12 +205,19 @@ def _check_from_schema(from_schema: TableSchema) -> None: def generate_to_time_config(self) -> DatetimeRange: assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy + # must localize to standard time zone of the to_time_zone before converting to to_time_zone + # because data must be in local standard time + tz_start = self._from_schema.time_config.start + if self._to_time_zone: + to_std_tz = get_standard_time_zone(self._to_time_zone) + tz_start = tz_start.replace(tzinfo=to_std_tz).astimezone(self._to_time_zone) + to_time_config: DatetimeRange = self._from_schema.time_config.model_copy( update={ "dtype": TimeDataType.TIMESTAMP_TZ if self._to_time_zone else TimeDataType.TIMESTAMP_NTZ, - "start": self._from_schema.time_config.start.replace(tzinfo=self._to_time_zone), + "start": tz_start, } ) @@ -284,6 +292,7 @@ def __init__( self._convert_from_time_config_to_datetime_range_with_tz_column() else: self.time_zone_column = self._from_schema.time_config.time_zone_column + self._check_standard_time_zones() self._to_schema = self.generate_to_schema() @staticmethod @@ -317,6 +326,25 @@ def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional msg += f"\n{from_schema}" raise InvalidParameter(msg) + def _check_standard_time_zones(self) -> None: + """Check that all time zones in the time_zone_column are valid standard time zones.""" + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + for tz in self._from_schema.time_config.time_zones: + if tz == "None": + msg = ( + "Chronify does not support None time zone in time_zone_column " + "when localizing time zones by column. " + ) + raise InvalidParameter(msg) + if not is_standard_time_zone(tz): + std_tz = get_standard_time_zone(tz) + msg = ( + f"Time zone {tz} in column {self.time_zone_column} is not a standard time zone. " + f"Please provide standard time zones (without DST) for localization. " + f"Standard time zone for {tz} is {std_tz}." + ) + raise InvalidParameter(msg) + def _convert_from_time_config_to_datetime_range_with_tz_column(self) -> None: """Convert DatetimeRange from_schema time config to DatetimeRangeWithTZColumn time config for the rest of the workflow @@ -421,7 +449,7 @@ def _get_time_zones(self) -> list[tzinfo | None]: ) raise InvalidParameter(msg) - time_zones = [None if tz == "None" else ZoneInfo(tz) for tz in time_zones] + time_zones = [ZoneInfo(tz) for tz in time_zones] return time_zones def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py index 079636d..1d7a30c 100644 --- a/tests/test_time_zone_localizer.py +++ b/tests/test_time_zone_localizer.py @@ -8,6 +8,7 @@ from sqlalchemy import Engine, MetaData from chronify.sqlalchemy.functions import read_database, write_database +from chronify.time_utils import get_standard_time_zone from chronify.time_zone_localizer import ( TimeZoneLocalizer, TimeZoneLocalizerByColumn, @@ -65,37 +66,58 @@ def get_datetime_schema( tzinfo: tzinfo | None, interval_type: TimeIntervalType, name: str, - has_tz_col: bool = False, ) -> TableSchema: - start = datetime(year=year, month=1, day=1, tzinfo=tzinfo) - end = datetime(year=year, month=1, day=2, tzinfo=tzinfo) + start = datetime(year=year, month=3, day=11, tzinfo=tzinfo) + end = datetime(year=year, month=3, day=12, tzinfo=tzinfo) resolution = timedelta(hours=1) length = (end - start) / resolution + 1 cols = ["id"] - if has_tz_col: - time_config: DatetimeRangeBase = DatetimeRangeWithTZColumn( - dtype=TimeDataType.TIMESTAMP_NTZ, - start=start, - resolution=resolution, - length=length, - interval_type=interval_type, - time_column="timestamp", - time_zone_column="time_zone", - time_zones=[ - ZoneInfo("US/Eastern"), - ZoneInfo("US/Central"), - ZoneInfo("US/Mountain"), - ], - ) - else: - time_config = DatetimeRange( - dtype=TimeDataType.TIMESTAMP_TZ if tzinfo else TimeDataType.TIMESTAMP_NTZ, - start=start, - resolution=resolution, - length=length, - interval_type=interval_type, - time_column="timestamp", - ) + time_config = DatetimeRange( + dtype=TimeDataType.TIMESTAMP_TZ if tzinfo else TimeDataType.TIMESTAMP_NTZ, + start=start, + resolution=resolution, + length=length, + interval_type=interval_type, + time_column="timestamp", + ) + schema = TableSchema( + name=name, + time_config=time_config, + time_array_id_columns=cols, + value_column="value", + ) + return schema + + +def get_datetime_with_tz_col_schema( + year: int, + tzinfo: tzinfo | None, + interval_type: TimeIntervalType, + name: str, + standard_tz: bool = False, +) -> TableSchema: + start = datetime(year=year, month=3, day=11, tzinfo=tzinfo) + end = datetime(year=year, month=3, day=12, tzinfo=tzinfo) + resolution = timedelta(hours=1) + length = (end - start) / resolution + 1 + cols = ["id"] + time_zones = [ + ZoneInfo("US/Eastern"), + ZoneInfo("US/Central"), + ZoneInfo("US/Mountain"), + ] + if standard_tz: + time_zones = [get_standard_time_zone(tz) for tz in time_zones] + time_config: DatetimeRangeBase = DatetimeRangeWithTZColumn( + dtype=TimeDataType.TIMESTAMP_NTZ, + start=start, + resolution=resolution, + length=length, + interval_type=interval_type, + time_column="timestamp", + time_zone_column="time_zone", + time_zones=time_zones, + ) schema = TableSchema( name=name, time_config=time_config, @@ -144,10 +166,13 @@ def run_localization( if to_time_zone is None: expected = df["timestamp"] else: - expected = df["timestamp"].dt.tz_localize(to_time_zone) + std_tz = get_standard_time_zone(to_time_zone) + expected = df["timestamp"].dt.tz_localize(std_tz).dt.tz_convert(to_time_zone) + assert (dfo["timestamp"] == expected).prod() == 1 +# TODO: add test for error cases def run_localization_to_column_time_zones( engine: Engine, df: pd.DataFrame, @@ -165,14 +190,15 @@ def run_localization_to_column_time_zones( dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) dfo = dfo[df.columns].sort_values(by="index").reset_index(drop=True) dfo["timestamp"] = pd.to_datetime(dfo["timestamp"]) # needed for engine 2, not sure why - assert df["value"].equals(dfo["value"]) for i in range(len(dfo)): tzn = dfo.loc[i, "time_zone"] if tzn == "None": ts = dfo.loc[i, "timestamp"].replace(tzinfo=None) else: - ts = dfo.loc[i, "timestamp"].tz_convert(ZoneInfo(tzn)).replace(tzinfo=None) + # source data is in local standard time + ts = dfo.loc[i, "timestamp"].tz_convert(tzn).replace(tzinfo=None) + assert df.loc[i, "timestamp"] == ts @@ -210,25 +236,23 @@ def test_src_table_not_tz_naive(iter_engines: Engine) -> None: ) # TODO, support tz-naive to tz-aware conversion -@pytest.mark.parametrize( - "to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("America/Los_Angeles")] -) +@pytest.mark.parametrize("to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("EST")]) def test_time_localization(iter_engines: Engine, to_time_zone: tzinfo | None) -> None: from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema) run_localization(iter_engines, df, from_schema, to_time_zone) -@pytest.mark.parametrize("from_time_tz", [None, ZoneInfo("US/Mountain")]) +@pytest.mark.parametrize("from_time_tz", [None, ZoneInfo("US/Mountain"), ZoneInfo("MST")]) def test_time_localization_to_column_time_zones( iter_engines: Engine, from_time_tz: tzinfo | None ) -> None: - from_schema = get_datetime_schema( + from_schema = get_datetime_with_tz_col_schema( 2018, from_time_tz, TimeIntervalType.PERIOD_BEGINNING, "base_table", - has_tz_col=True, + standard_tz=True, ) df = generate_dataframe_with_tz_col(from_schema) run_localization_to_column_time_zones(iter_engines, df, from_schema) From 3fd76158188f98649131d88ed44158eaff05c9ae Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:13:14 -0700 Subject: [PATCH 08/20] update tests --- src/chronify/time_zone_converter.py | 1 + src/chronify/time_zone_localizer.py | 59 +++++----- tests/test_time_zone_localizer.py | 164 +++++++++++++++++++++++----- 3 files changed, 172 insertions(+), 52 deletions(-) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 12c53e0..a686e80 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -24,6 +24,7 @@ from chronify.time_utils import wrapped_time_timestamps, get_tzname +# TODO - retain original timestamp column def convert_time_zone( engine: Engine, metadata: MetaData, diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index a50a349..721eb56 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -36,7 +36,10 @@ def localize_time_zone( output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, ) -> TableSchema: - """Localize TIMESTAMP_NTZ time column in a table to a specified time zone. + """Localize TIMESTAMP_NTZ time column in a table to a specified standard time zone. + Input data must be in a standard time zone (without DST) because it's ambiguous to localize + tz-naive timestamps with skips and duplicates to a prevailing time zone. + Updates table to TIMESTAMP_TZ time column and returns a new time config. Parameters @@ -48,7 +51,7 @@ def localize_time_zone( src_schema : TableSchema Defines the source table in the database. to_time_zone : tzinfo or None - Time zone to convert to. If None, convert to tz-naive. + Standard time zone to convert to. If None, convert to tz-naive. scratch_dir : pathlib.Path, optional Directory to use for temporary writes. Defaults to the system's tmp filesystem. output_file : pathlib.Path, optional @@ -164,6 +167,7 @@ class TimeZoneLocalizer(TimeZoneLocalizerBase): Input data table must contain tz-naive timestamps. Input time config must be of type DatetimeRange with Timestamp_NTZ dtype and tz-naive start time. + to_time_zone must be a standard time zone (without DST) or None. Output data table will contain tz-aware timestamps. Output time config will be of type DatetimeRange with Timestamp_TZ dtype and tz-aware start time. """ @@ -177,7 +181,7 @@ def __init__( ): self._check_from_schema(from_schema) super().__init__(engine, metadata, from_schema) - self._to_time_zone = to_time_zone + self._to_time_zone = self._check_standard_time_zone(to_time_zone) self._to_schema = self.generate_to_schema() @staticmethod @@ -203,21 +207,27 @@ def _check_from_schema(from_schema: TableSchema) -> None: msg += f"\n{from_schema.time_config}" raise InvalidParameter(msg) + @staticmethod + def _check_standard_time_zone(to_time_zone: tzinfo | None) -> tzinfo | None: + if to_time_zone is None: + return None + if not is_standard_time_zone(to_time_zone): + std_tz = get_standard_time_zone(to_time_zone) + msg = ( + "TimeZoneLocalizer only supports standard time zones (without DST). " + f"{to_time_zone=} is not a standard time zone. Try instead: {std_tz}" + ) + raise InvalidParameter(msg) + return to_time_zone + def generate_to_time_config(self) -> DatetimeRange: assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy - # must localize to standard time zone of the to_time_zone before converting to to_time_zone - # because data must be in local standard time - tz_start = self._from_schema.time_config.start - if self._to_time_zone: - to_std_tz = get_standard_time_zone(self._to_time_zone) - tz_start = tz_start.replace(tzinfo=to_std_tz).astimezone(self._to_time_zone) - to_time_config: DatetimeRange = self._from_schema.time_config.model_copy( update={ "dtype": TimeDataType.TIMESTAMP_TZ if self._to_time_zone else TimeDataType.TIMESTAMP_NTZ, - "start": tz_start, + "start": self._from_schema.time_config.start.replace(tzinfo=self._to_time_zone), } ) @@ -254,6 +264,7 @@ class TimeZoneLocalizerByColumn(TimeZoneLocalizerBase): """Class for time zone localization of tz-naive time series data based on a time zone column. Input data table must contain tz-naive timestamps and a time zone column. + Time zones in the time zone column must be standard time zones (without DST). Input time config must be of type DatetimeRangeWithTZColumn or DatetimeRange with Timestamp_NTZ dtype. - If DatetimeRangeWithTZColumn is used, time_zone_column, if provided, is ignored. - If DatetimeRange is used, time_zone_column must be provided. It is then converted to @@ -273,8 +284,6 @@ class TimeZoneLocalizerByColumn(TimeZoneLocalizerBase): Note: output time config is reduced to DatetimeRange (from DatetimeRangeWithTZColumn) since all timestamps are tz-aware and aligned in absolute time. -------------------------------- - - # TODO: add tests """ def __init__( @@ -312,7 +321,7 @@ def _check_from_schema(from_schema: TableSchema) -> None: def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional[str]) -> None: if ( isinstance(from_schema.time_config, DatetimeRangeWithTZColumn) - and time_zone_column is None + and time_zone_column is not None ): msg = f"Input {time_zone_column=} will be ignored. time_zone_column is already defined in the time_config." raise Warning(msg) @@ -329,21 +338,21 @@ def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional def _check_standard_time_zones(self) -> None: """Check that all time zones in the time_zone_column are valid standard time zones.""" assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy - for tz in self._from_schema.time_config.time_zones: + msg = "" + time_zones = self._from_schema.time_config.time_zones + for tz in time_zones: if tz == "None": - msg = ( - "Chronify does not support None time zone in time_zone_column " - "when localizing time zones by column. " - ) + msg += "\nChronify does not support None time zone in time_zone_column. " raise InvalidParameter(msg) if not is_standard_time_zone(tz): std_tz = get_standard_time_zone(tz) - msg = ( - f"Time zone {tz} in column {self.time_zone_column} is not a standard time zone. " - f"Please provide standard time zones (without DST) for localization. " - f"Standard time zone for {tz} is {std_tz}." - ) - raise InvalidParameter(msg) + msg = f"\n{tz} is not a standard time zone. Try instead: {std_tz}. " + if msg != "": + msg = ( + f"TimeZoneLocalizerByColumn only supports standard time zones (without DST). {time_zones}" + + msg + ) + raise InvalidParameter(msg) def _convert_from_time_config_to_datetime_range_with_tz_column(self) -> None: """Convert DatetimeRange from_schema time config to DatetimeRangeWithTZColumn time config diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py index 1d7a30c..bc2968d 100644 --- a/tests/test_time_zone_localizer.py +++ b/tests/test_time_zone_localizer.py @@ -172,7 +172,6 @@ def run_localization( assert (dfo["timestamp"] == expected).prod() == 1 -# TODO: add test for error cases def run_localization_to_column_time_zones( engine: Engine, df: pd.DataFrame, @@ -184,7 +183,6 @@ def run_localization_to_column_time_zones( engine, metadata, from_schema, - "time_zone", check_mapped_timestamps=True, ) dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) @@ -206,37 +204,37 @@ def run_localization_with_error( engine: Engine, df: pd.DataFrame, from_schema: TableSchema, - use_tz_col: bool, error: tuple[Any, str], ) -> None: metadata = MetaData() ingest_data(engine, metadata, df, from_schema) + with pytest.raises(error[0], match=error[1]): - if use_tz_col: - tzl = TimeZoneLocalizerByColumn( - engine, - metadata, - from_schema, - "time_zone", - ) - tzl.localize_time_zone(check_mapped_timestamps=True) - else: - tzl2 = TimeZoneLocalizer(engine, metadata, from_schema, None) - tzl2.localize_time_zone(check_mapped_timestamps=True) + TimeZoneLocalizer(engine, metadata, from_schema, None).localize_time_zone( + check_mapped_timestamps=True + ) -def test_src_table_not_tz_naive(iter_engines: Engine) -> None: - from_schema = get_datetime_schema( - 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" - ) - df = generate_datetime_dataframe(from_schema) - error = (InvalidParameter, "Source schema time config start time must be tz-naive.") - run_localization_with_error( - iter_engines, df, from_schema, False, error - ) # TODO, support tz-naive to tz-aware conversion +def run_localization_by_column_with_error( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, + error: tuple[Any, str], + time_zone_column: str | None = None, +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + + with pytest.raises(error[0], match=error[1]): + TimeZoneLocalizerByColumn( + engine, + metadata, + from_schema, + time_zone_column=time_zone_column, + ).localize_time_zone(check_mapped_timestamps=True) -@pytest.mark.parametrize("to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("EST")]) +@pytest.mark.parametrize("to_time_zone", [None, ZoneInfo("EST")]) def test_time_localization(iter_engines: Engine, to_time_zone: tzinfo | None) -> None: from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema) @@ -244,9 +242,7 @@ def test_time_localization(iter_engines: Engine, to_time_zone: tzinfo | None) -> @pytest.mark.parametrize("from_time_tz", [None, ZoneInfo("US/Mountain"), ZoneInfo("MST")]) -def test_time_localization_to_column_time_zones( - iter_engines: Engine, from_time_tz: tzinfo | None -) -> None: +def test_time_localization_by_column(iter_engines: Engine, from_time_tz: tzinfo | None) -> None: from_schema = get_datetime_with_tz_col_schema( 2018, from_time_tz, @@ -256,3 +252,117 @@ def test_time_localization_to_column_time_zones( ) df = generate_dataframe_with_tz_col(from_schema) run_localization_to_column_time_zones(iter_engines, df, from_schema) + + +# Error tests for TimeZoneLocalizer +def test_time_localizer_to_dst_time_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizer raises error when to_time_zone is a non standard time zone""" + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + df = generate_datetime_dataframe(from_schema) + to_time_zone = ZoneInfo("US/Mountain") # has DST + metadata = MetaData() + ingest_data(iter_engines, metadata, df, from_schema) + with pytest.raises( + InvalidParameter, match="TimeZoneLocalizer only supports standard time zones" + ): + localize_time_zone( + iter_engines, metadata, from_schema, to_time_zone, check_mapped_timestamps=True + ) + + +def test_time_localizer_with_tz_aware_config_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizer raises error when start time is tz-aware""" + from_schema = get_datetime_schema( + 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" + ) + df = generate_datetime_dataframe(from_schema) + error = (InvalidParameter, "Source schema time config start time must be tz-naive") + run_localization_with_error(iter_engines, df, from_schema, error) + + +def test_time_localizer_with_wrong_dtype_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizer raises error when dtype is not TIMESTAMP_NTZ""" + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + # Manually change dtype to TIMESTAMP_TZ to trigger error + from_schema.time_config = from_schema.time_config.model_copy( + update={"dtype": TimeDataType.TIMESTAMP_TZ} + ) + df = generate_datetime_dataframe(from_schema) + error = (InvalidParameter, "Source schema time config dtype must be TIMESTAMP_NTZ") + run_localization_with_error(iter_engines, df, from_schema, error) + + +def test_time_localizer_with_datetime_range_with_tz_col_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizer raises error when time config is DatetimeRangeWithTZColumn""" + from_schema = get_datetime_with_tz_col_schema( + 2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table", standard_tz=True + ) + df = generate_dataframe_with_tz_col(from_schema) + error = (InvalidParameter, "try using TimeZoneLocalizerByColumn") + run_localization_with_error(iter_engines, df, from_schema, error) + + +# Error tests for TimeZoneLocalizerByColumn +def test_time_localizer_by_column_to_dst_time_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizerByColumn raises error when to_time_zone is a non standard time zone""" + from_schema = get_datetime_with_tz_col_schema( + 2018, + None, + TimeIntervalType.PERIOD_BEGINNING, + "base_table", + standard_tz=False, + ) + df = generate_dataframe_with_tz_col(from_schema) + metadata = MetaData() + ingest_data(iter_engines, metadata, df, from_schema) + with pytest.raises( + InvalidParameter, match="TimeZoneLocalizerByColumn only supports standard time zones" + ): + localize_time_zone_by_column( + iter_engines, metadata, from_schema, check_mapped_timestamps=True + ) + + +def test_time_localizer_by_column_missing_tz_column_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizerByColumn raises error when time_zone_column is missing for DatetimeRange""" + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + df = generate_datetime_dataframe(from_schema) + error = (InvalidParameter, "time_zone_column must be provided") + run_localization_by_column_with_error(iter_engines, df, from_schema, error) + + +def test_time_localizer_by_column_wrong_dtype_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizerByColumn raises error when dtype is not TIMESTAMP_NTZ""" + from_schema = get_datetime_with_tz_col_schema( + 2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table", standard_tz=True + ) + # Change dtype to TIMESTAMP_TZ to trigger error + from_schema.time_config = from_schema.time_config.model_copy( + update={"dtype": TimeDataType.TIMESTAMP_TZ} + ) + df = generate_dataframe_with_tz_col(from_schema) + error = (InvalidParameter, "Source schema time config dtype must be TIMESTAMP_NTZ") + run_localization_by_column_with_error(iter_engines, df, from_schema, error) + + +def test_time_localizer_by_column_non_standard_tz_error(iter_engines: Engine) -> None: + """Test that TimeZoneLocalizerByColumn raises error when time zones are not standard""" + from_schema = get_datetime_with_tz_col_schema( + 2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table", standard_tz=False + ) + df = generate_dataframe_with_tz_col(from_schema) + error = (InvalidParameter, "is not a standard time zone") + run_localization_by_column_with_error(iter_engines, df, from_schema, error) + + +def test_localize_time_zone_by_column_missing_tz_column_error(iter_engines: Engine) -> None: + """Test that localize_time_zone_by_column raises error when time_zone_column is None for DatetimeRange""" + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + df = generate_datetime_dataframe(from_schema) + error = ( + Exception, + "time_zone_column must be provided when source schema time config is of type DatetimeRange", + ) + run_localization_by_column_with_error( + iter_engines, df, from_schema, error, time_zone_column=None + ) From 4f2a3f30cb5db253a370cfdcc2df3e5bcae4c5e9 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 14 Jan 2026 10:51:18 -0700 Subject: [PATCH 09/20] update store with localization funcs --- src/chronify/store.py | 174 ++++++++++++++++++++++++++++++++++++++++++ tests/test_store.py | 152 ++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) diff --git a/src/chronify/store.py b/src/chronify/store.py index f0d4259..a6156ff 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -51,6 +51,7 @@ from chronify.time_series_checker import check_timestamps from chronify.time_series_mapper import map_time from chronify.time_zone_converter import TimeZoneConverter, TimeZoneConverterByColumn +from chronify.time_zone_localizer import TimeZoneLocalizer, TimeZoneLocalizerByColumn from chronify.utils.path_utils import check_overwrite, to_path from chronify.utils.sqlalchemy_view import create_view @@ -1124,6 +1125,179 @@ def convert_time_zone_by_column( return dst_schema + def localize_time_zone( + self, + src_name: str, + time_zone: tzinfo | None, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> TableSchema: + """ + Localize the time zone of the existing table represented by src_name to a specified time zone + + Parameters + ---------- + src_name + Refers to the table name of the source data. + time_zone + Standard time zone to localize to. If None, keep as tz-naive. + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Raises + ------ + TableAlreadyExists + Raised if the dst_schema name already exists. + + Examples + -------- + >>> store = Store() + >>> start = datetime(year=2018, month=1, day=1) # tz-naive + >>> freq = timedelta(hours=1) + >>> hours_per_year = 8760 + >>> num_time_arrays = 1 + >>> df = pd.DataFrame( + ... { + ... "id": np.concatenate( + ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] + ... ), + ... "timestamp": np.tile( + ... pd.date_range(start, periods=hours_per_year, freq="h"), num_time_arrays + ... ), + ... "value": np.random.random(hours_per_year * num_time_arrays), + ... } + ... ) + >>> schema = TableSchema( + ... name="some_data", + ... time_config=DatetimeRange( + ... time_column="timestamp", + ... start=start, + ... length=hours_per_year, + ... resolution=freq, + ... ), + ... time_array_id_columns=["id"], + ... value_column="value", + ... ) + >>> store.ingest_table(df, schema) + >>> to_time_zone = ZoneInfo("EST") + >>> dst_schema = store.localize_time_zone( + ... schema.name, to_time_zone, check_mapped_timestamps=True + ... ) + """ + + src_schema = self._schema_mgr.get_schema(src_name) + tzl = TimeZoneLocalizer(self._engine, self._metadata, src_schema, time_zone) + + dst_schema = tzl.generate_to_schema() + if self.has_table(dst_schema.name): + msg = dst_schema.name + raise TableAlreadyExists(msg) + + tzl.localize_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + with self._engine.begin() as conn: + self._schema_mgr.add_schema(conn, dst_schema) + + return dst_schema + + def localize_time_zone_by_column( + self, + src_name: str, + time_zone_column: str, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> TableSchema: + """ + Localize the time zone of the existing table represented by src_name to time zones defined by a column + + Parameters + ---------- + src_name + Refers to the table name of the source data. + time_zone_column + Name of the time zone column for localization. + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Raises + ------ + TableAlreadyExists + Raised if the dst_schema name already exists. + + Examples + -------- + >>> store = Store() + >>> start = datetime(year=2018, month=1, day=1) # tz-naive + >>> freq = timedelta(hours=1) + >>> hours_per_year = 8760 + >>> num_time_arrays = 3 + >>> df = pd.DataFrame( + ... { + ... "id": np.concatenate( + ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] + ... ), + ... "timestamp": np.tile( + ... pd.date_range(start, periods=hours_per_year, freq="h"), num_time_arrays + ... ), + ... "time_zone": np.repeat(["EST", "CST", "MST"], hours_per_year), + ... "value": np.random.random(hours_per_year * num_time_arrays), + ... } + ... ) + >>> schema = TableSchema( + ... name="some_data", + ... time_config=DatetimeRange( + ... time_column="timestamp", + ... start=start, + ... length=hours_per_year, + ... resolution=freq, + ... ), + ... time_array_id_columns=["id"], + ... value_column="value", + ... ) + >>> store.ingest_table(df, schema) + >>> time_zone_column = "time_zone" + >>> dst_schema = store.localize_time_zone_by_column( + ... schema.name, + ... time_zone_column, + ... check_mapped_timestamps=True, + ... ) + """ + + src_schema = self._schema_mgr.get_schema(src_name) + tzl = TimeZoneLocalizerByColumn(self._engine, self._metadata, src_schema, time_zone_column) + + dst_schema = tzl.generate_to_schema() + if self.has_table(dst_schema.name): + msg = dst_schema.name + raise TableAlreadyExists(msg) + + tzl.localize_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + with self._engine.begin() as conn: + self._schema_mgr.add_schema(conn, dst_schema) + + return dst_schema + def read_query( self, name: str, diff --git a/tests/test_store.py b/tests/test_store.py index e9204a9..11a5ced 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -939,3 +939,155 @@ def test_convert_time_zone_by_column( for tz, expected in expected_dct.items(): actual = sorted(df2.loc[df2["time_zone"] == tz, "timestamp"]) check_timestamp_lists(actual, expected) + + +@pytest.mark.parametrize("to_time_zone", [ZoneInfo("UTC"), ZoneInfo("EST"), None]) +def test_localize_time_zone( + tmp_path, iter_stores_by_engine_no_data_ingestion: Store, to_time_zone: tzinfo | None +): + """Test time zone localization of tz-naive timestamps to a specified time zone.""" + store = iter_stores_by_engine_no_data_ingestion + time_array_len = 8784 + year = 2020 + + # Create tz-naive source time config + src_time_config = DatetimeRange( + start=datetime(year=year, month=1, day=1, hour=0, tzinfo=None), + resolution=timedelta(hours=1), + length=time_array_len, + interval_type=TimeIntervalType.PERIOD_BEGINNING, + time_column="timestamp", + ) + + src_csv_schema = CsvTableSchema( + time_config=src_time_config, + column_dtypes=[ + ColumnDType(name="timestamp", dtype=DateTime(timezone=False)), + ColumnDType(name="gen1", dtype=Double()), + ColumnDType(name="gen2", dtype=Double()), + ColumnDType(name="gen3", dtype=Double()), + ], + value_columns=["gen1", "gen2", "gen3"], + pivoted_dimension_name="generator", + time_array_id_columns=[], + ) + rel = read_csv(GENERATOR_TIME_SERIES_FILE, src_csv_schema) + rel2 = unpivot(rel, ("gen1", "gen2", "gen3"), "generator", "value") # noqa: F841 + + src_schema = TableSchema( + name="generators_pb", + time_config=src_time_config, + time_array_id_columns=["generator"], + value_column="value", + ) + if store.engine.name == "hive": + out_file = tmp_path / "data.parquet" + rel2.to_df().to_parquet(out_file) + store.create_view_from_parquet(out_file, src_schema) + else: + store.ingest_table(rel2, src_schema) + + if to_time_zone is None and store.engine.name != "sqlite": + output_file = tmp_path / "mapped_data" + else: + output_file = None + + dst_schema = store.localize_time_zone( + src_schema.name, + to_time_zone, + output_file=output_file, + check_mapped_timestamps=True, + ) + if output_file is None or store.engine.name == "sqlite": + df2 = store.read_table(dst_schema.name) + else: + df2 = pd.read_parquet(output_file) + df2["timestamp"] = pd.to_datetime(df2["timestamp"]) + assert len(df2) == time_array_len * 3 + actual = sorted(df2["timestamp"].unique()) + assert isinstance(dst_schema.time_config, DatetimeRange) + if to_time_zone: + # Should be localized to the target time zone + assert dst_schema.time_config.start_time_is_tz_naive() is False + else: + # Should remain tz-naive + assert dst_schema.time_config.start_time_is_tz_naive() is True + assert pd.Timestamp(actual[0]) == dst_schema.time_config.start + expected = make_time_range_generator(dst_schema.time_config).list_timestamps() + expected = sorted(set(expected)) + check_timestamp_lists(actual, expected) + + +def test_localize_time_zone_by_column(tmp_path, iter_stores_by_engine_no_data_ingestion: Store): + """Test time zone localization based on a time zone column.""" + store = iter_stores_by_engine_no_data_ingestion + time_array_len = 8784 + year = 2020 + + # Create tz-naive source time config + src_time_config = DatetimeRange( + start=datetime(year=year, month=1, day=1, hour=0, tzinfo=None), + resolution=timedelta(hours=1), + length=time_array_len, + interval_type=TimeIntervalType.PERIOD_BEGINNING, + time_column="timestamp", + ) + + src_csv_schema = CsvTableSchema( + time_config=src_time_config, + column_dtypes=[ + ColumnDType(name="timestamp", dtype=DateTime(timezone=False)), + ColumnDType(name="gen1", dtype=Double()), + ColumnDType(name="gen2", dtype=Double()), + ColumnDType(name="gen3", dtype=Double()), + ], + value_columns=["gen1", "gen2", "gen3"], + pivoted_dimension_name="generator", + time_array_id_columns=[], + ) + rel = read_csv(GENERATOR_TIME_SERIES_FILE, src_csv_schema) + rel2 = unpivot(rel, ("gen1", "gen2", "gen3"), "generator", "value") # noqa: F841 + # add time_zone column with standard time zones (not DST) + stmt = ", ".join(rel2.columns) + tz_col_stmt = "CASE WHEN generator='gen1' THEN 'Etc/GMT+5' WHEN generator='gen2' THEN 'Etc/GMT+6' ELSE 'Etc/GMT+7' END AS time_zone" + stmt += f", {tz_col_stmt}" + rel2 = rel2.project(stmt) + + src_schema = TableSchema( + name="generators_pb", + time_config=src_time_config, + time_array_id_columns=["generator", "time_zone"], + value_column="value", + ) + if store.engine.name == "hive": + out_file = tmp_path / "data.parquet" + rel2.to_df().to_parquet(out_file) + store.create_view_from_parquet(out_file, src_schema) + else: + store.ingest_table(rel2, src_schema) + + if store.engine.name != "sqlite": + output_file = tmp_path / "mapped_data" + else: + output_file = None + + dst_schema = store.localize_time_zone_by_column( + src_schema.name, + "time_zone", + output_file=output_file, + check_mapped_timestamps=True, + ) + if output_file is None or store.engine.name == "sqlite": + df2 = store.read_table(dst_schema.name) + else: + df2 = pd.read_parquet(output_file) + df2["timestamp"] = pd.to_datetime(df2["timestamp"]) + df_stats = df2.groupby(["time_zone"])["timestamp"].agg(["min", "max", "count"]) + assert set(df_stats["count"]) == {time_array_len} + assert isinstance(dst_schema.time_config, DatetimeRangeWithTZColumn) + # Verify that each time zone has tz-aware localized timestamps + assert df2["timestamp"].dt.tz is not None + expected_dct = make_time_range_generator(dst_schema.time_config).list_timestamps_by_time_zone() + for tz, expected in expected_dct.items(): + actual = sorted(df2.loc[df2["time_zone"] == tz, "timestamp"]) + check_timestamp_lists(actual, expected) From eec43dce12d1594354814d0aee2f086faaa72ccc Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 23 Jan 2026 09:39:44 -0700 Subject: [PATCH 10/20] add timedatatype to init --- src/chronify/__init__.py | 3 ++- src/chronify/store.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/chronify/__init__.py b/src/chronify/__init__.py index 4637d5d..8a0c1a9 100644 --- a/src/chronify/__init__.py +++ b/src/chronify/__init__.py @@ -20,7 +20,7 @@ TableSchema, ) from chronify.store import Store -from chronify.time import RepresentativePeriodFormat +from chronify.time import RepresentativePeriodFormat, TimeDataType from chronify.time_configs import ( AnnualTimeRange, DatetimeRange, @@ -57,6 +57,7 @@ "TableSchema", "TimeBaseModel", "TimeBasedDataAdjustment", + "TimeDataType", ) __version__ = metadata.metadata("chronify")["Version"] diff --git a/src/chronify/store.py b/src/chronify/store.py index a6156ff..3006654 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -1155,6 +1155,11 @@ def localize_time_zone( TableAlreadyExists Raised if the dst_schema name already exists. + Returns + ------- + TableSchema + The schema of the newly created table. + Examples -------- >>> store = Store() @@ -1240,6 +1245,11 @@ def localize_time_zone_by_column( TableAlreadyExists Raised if the dst_schema name already exists. + Returns + ------- + TableSchema + The schema of the newly created table. + Examples -------- >>> store = Store() From 829493a8f1e02aed422166bd83dcbd3bdb1caa1c Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:32:44 -0700 Subject: [PATCH 11/20] update --- src/chronify/datetime_range_generator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 7384386..5c036a6 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -78,7 +78,7 @@ def list_time_columns(self) -> list[str]: def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: result = sorted(df[self._model.time_column].unique()) - if not isinstance(result[0], datetime): + if len(result) > 0 and not isinstance(result[0], datetime): result = [pd.Timestamp(x) for x in result] return result @@ -176,7 +176,5 @@ def list_distinct_timestamps_by_time_zone_from_dataframe( dct = {} for tz_name in sorted(df2[tz_col].unique()): timestamps = sorted(df2.loc[df2[tz_col] == tz_name, t_col].tolist()) - # if timestamps[0].tzinfo: - # timestamps = [x.astimezone(tz_name).replace(tzinfo=None) for x in timestamps] dct[tz_name] = timestamps return dct From 817d2393d483da4d401311817b207450513217f0 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:50:59 -0700 Subject: [PATCH 12/20] minor update --- src/chronify/store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/chronify/store.py b/src/chronify/store.py index 3006654..407b7a6 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -1218,7 +1218,7 @@ def localize_time_zone( def localize_time_zone_by_column( self, src_name: str, - time_zone_column: str, + time_zone_column: Optional[str] = None, scratch_dir: Optional[Path] = None, output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, @@ -1231,7 +1231,7 @@ def localize_time_zone_by_column( src_name Refers to the table name of the source data. time_zone_column - Name of the time zone column for localization. + Name of the time zone column for localization, default to None scratch_dir Directory to use for temporary writes. Default to the system's tmp filesystem. output_file From 7c957b5c3c320acc5e29bb7b81c50a85e053fe6a Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 27 Jan 2026 13:23:05 -0700 Subject: [PATCH 13/20] fix hive sql --- src/chronify/sqlalchemy/functions.py | 2 +- src/chronify/time_series_checker.py | 6 +++++- src/chronify/time_series_mapper_base.py | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 1e882f0..2ac301e 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -194,7 +194,7 @@ def _read_from_hive( if ( isinstance(config, DatetimeRangeBase) and config.time_column in df.columns - and not config.start_time_is_tz_naive() + and config.dtype == TimeDataType.TIMESTAMP_TZ ): # This is tied to the fact that we set the Spark session to UTC. # Otherwise, there is confusion with the computer's local time zone. diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index afaef59..53c36f1 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -78,7 +78,9 @@ def _check_expected_timestamps_datetime(self) -> int: return len(expected) def _check_expected_timestamps_with_external_time_zone(self) -> int: - """For tz-naive time with external time zone column""" + """For tz-naive or tz-aware time with external time zone column + tz-awareness is based on self._schema.time_config.dtype + """ assert isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone) # for mypy expected_dct = self._time_generator.list_timestamps_by_time_zone() time_columns = self._time_generator.list_time_columns() @@ -93,6 +95,7 @@ def _check_expected_timestamps_with_external_time_zone(self) -> int: msg = ( "Time zone records do not match between expected and actual from table " f"\nexpected: {sorted(expected_dct.keys())} vs. \nactual: {sorted(actual_dct.keys())}" + f"\ntime_config: {self._schema.time_config}" ) raise InvalidTable(msg) @@ -195,6 +198,7 @@ def _check_expected_timestamps_by_time_array(self, count: int) -> None: JOIN t2 ON {on_expr} """ + for result in self._conn.execute(text(query)).fetchall(): distinct_count_by_ta = result[0] count_by_ta = result[1] diff --git a/src/chronify/time_series_mapper_base.py b/src/chronify/time_series_mapper_base.py index 127d9db..cdd8cc0 100644 --- a/src/chronify/time_series_mapper_base.py +++ b/src/chronify/time_series_mapper_base.py @@ -146,7 +146,8 @@ def apply_mapping( to_schema.name, ) if output_file is None: - conn.execute(text(f"DROP TABLE {to_schema.name}")) + table_type = "VIEW" if engine.name == "hive" else "TABLE" + conn.execute(text(f"DROP {table_type} {to_schema.name}")) raise finally: with engine.begin() as conn: From 0a0c1398fd04600134bf799545b025ad20492b5c Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:04:45 -0700 Subject: [PATCH 14/20] fix mypy --- src/chronify/csv_time_series_parser.py | 2 +- src/chronify/sqlalchemy/functions.py | 20 +++++++++++++------ src/chronify/time_configs.py | 2 +- src/chronify/time_series_mapper_datetime.py | 7 ++----- src/chronify/time_zone_converter.py | 10 ++++++++-- src/chronify/time_zone_localizer.py | 22 +++++++++++++++++---- src/chronify/utils/sqlalchemy_table.py | 4 ++-- src/chronify/utils/sqlalchemy_view.py | 4 ++-- 8 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/chronify/csv_time_series_parser.py b/src/chronify/csv_time_series_parser.py index 1364b3d..0234461 100644 --- a/src/chronify/csv_time_series_parser.py +++ b/src/chronify/csv_time_series_parser.py @@ -117,7 +117,7 @@ def _check_input_format(data_file: Path) -> None: @staticmethod def _read_data_file(data_file: Path) -> pd.DataFrame: - return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) # type: ignore + return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) def _ingest_data(self, data: pd.DataFrame, table_name: str, year: int, length: int) -> None: csv_fmt = CsvTimeSeriesFormats.from_columns(data.columns) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 2ac301e..db42eca 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -17,11 +17,17 @@ from sqlalchemy import Connection, Engine, Selectable, text from chronify.exceptions import InvalidOperation, InvalidParameter -from chronify.time_configs import DatetimeRangeBase, TimeBaseModel +from chronify.time_configs import ( + DatetimeRangeBase, + TimeBaseModel, + DatetimeRange, + DatetimeRangeWithTZColumn, +) from chronify.utils.path_utils import check_overwrite, delete_if_exists, to_path # Copied from Pandas/Polars DbWriteMode: TypeAlias = Literal["replace", "append", "fail"] +DatetimeRangeWithDtype: TypeAlias = DatetimeRange | DatetimeRangeWithTZColumn def read_database( @@ -36,7 +42,7 @@ def read_database( df = conn.execute(query).cursor.fetch_df() # type: ignore case "sqlite": df = pd.read_sql(query, conn, params=params) - if isinstance(config, DatetimeRangeBase): + if isinstance(config, (DatetimeRange, DatetimeRangeWithTZColumn)): _convert_database_output_for_datetime(df, config) case "hive": df = _read_from_hive(query, conn, config, params) @@ -82,7 +88,7 @@ def _check_one_config_per_datetime_column(configs: Sequence[TimeBaseModel]) -> N def _convert_database_input_for_datetime( - df: pd.DataFrame, config: DatetimeRangeBase, copied: bool + df: pd.DataFrame, config: DatetimeRangeWithDtype, copied: bool ) -> tuple[pd.DataFrame, bool]: if config.dtype == TimeDataType.TIMESTAMP_NTZ: return df, copied @@ -100,7 +106,9 @@ def _convert_database_input_for_datetime( return df2, copied -def _convert_database_output_for_datetime(df: pd.DataFrame, config: DatetimeRangeBase) -> None: +def _convert_database_output_for_datetime( + df: pd.DataFrame, config: DatetimeRangeWithDtype +) -> None: if config.time_column in df.columns: if config.dtype == TimeDataType.TIMESTAMP_TZ: if isinstance(df[config.time_column].dtype, ObjectDType): @@ -192,7 +200,7 @@ def _read_from_hive( ) -> pd.DataFrame: df = pd.read_sql_query(query, conn, params=params) if ( - isinstance(config, DatetimeRangeBase) + isinstance(config, (DatetimeRange, DatetimeRangeWithTZColumn)) and config.time_column in df.columns and config.dtype == TimeDataType.TIMESTAMP_TZ ): @@ -212,7 +220,7 @@ def _write_to_sqlite( _check_one_config_per_datetime_column(configs) copied = False for config in configs: - if isinstance(config, DatetimeRangeBase): + if isinstance(config, (DatetimeRange, DatetimeRangeWithTZColumn)): df, copied = _convert_database_input_for_datetime(df, config, copied) df.to_sql(table_name, conn, if_exists=if_table_exists, index=False) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 2cb9f49..7a5e8b8 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -106,7 +106,7 @@ def check_dtype_start_time_consistency( cls, dtype: Optional[Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]], info: ValidationInfo, - ) -> Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]: + ) -> Optional[Literal[TimeDataType.TIMESTAMP_TZ, TimeDataType.TIMESTAMP_NTZ]]: match (info.data["start"].tzinfo is None, dtype): # assign default dtype if not provided case (True, None): diff --git a/src/chronify/time_series_mapper_datetime.py b/src/chronify/time_series_mapper_datetime.py index 108aaff..8971d67 100644 --- a/src/chronify/time_series_mapper_datetime.py +++ b/src/chronify/time_series_mapper_datetime.py @@ -106,16 +106,13 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: # get standard time zone of to_tz to_tz_std = get_standard_time_zone(to_tz) # tz-naive time does not have skips/dups, so always localize in std tz first - # mypy complains with error: "Properties" has no attribute "tz_localize" - # but the type is pandas.core.indexes.accessors.DatetimeProperties and the - # attribute exists. - ser_from = ser_from.dt.tz_localize(to_tz_std).dt.tz_convert(to_tz) # type: ignore + ser_from = ser_from.dt.tz_localize(to_tz_std).dt.tz_convert(to_tz) pass case (False, True): # get standard time zone of fm_tz fm_tz_std = get_standard_time_zone(fm_tz) # convert to standard time zone of fm_tz before making it tz-naive - ser_from = ser_from.dt.tz_convert(fm_tz_std).dt.tz_localize(to_tz) # type: ignore + ser_from = ser_from.dt.tz_convert(fm_tz_std).dt.tz_localize(to_tz) pass match (self._adjust_interval, self._wrap_time_allowed): case (True, _): diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index a686e80..5006f37 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -145,9 +145,15 @@ def _check_from_schema(self, from_schema: TableSchema) -> None: msg = "" if not isinstance(from_schema.time_config, DatetimeRange): msg += "Source schema must have DatetimeRange time config. " - if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_TZ: + if ( + isinstance(from_schema.time_config, DatetimeRange) + and from_schema.time_config.dtype != TimeDataType.TIMESTAMP_TZ + ): msg += "Source schema time config dtype must be Timestamp_TZ. " - if from_schema.time_config.start_time_is_tz_naive(): + if ( + isinstance(from_schema.time_config, DatetimeRange) + and from_schema.time_config.start_time_is_tz_naive() + ): msg += ( "Source schema time config start time must be timezone-aware. " "This converter will convert time zones and return timestamps as tz-naive " diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index 721eb56..4c62152 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -195,9 +195,15 @@ def _check_from_schema(from_schema: TableSchema) -> None: f"try using TimeZoneLocalizerByColumn(). {from_schema.time_config}" ) raise InvalidParameter(msg) - if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ: + if ( + isinstance(from_schema.time_config, DatetimeRange) + and from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ + ): msg += "Source schema time config dtype must be TIMESTAMP_NTZ. " - if from_schema.time_config.start_time_is_tz_naive() is False: + if ( + isinstance(from_schema.time_config, DatetimeRange) + and from_schema.time_config.start_time_is_tz_naive() is False + ): msg += ( "Source schema time config start time must be tz-naive." "To convert between time zones for tz-aware timestamps, " @@ -286,6 +292,8 @@ class TimeZoneLocalizerByColumn(TimeZoneLocalizerBase): -------------------------------- """ + time_zone_column: str + def __init__( self, engine: Engine, @@ -297,9 +305,11 @@ def __init__( self._check_time_zone_column(from_schema, time_zone_column) super().__init__(engine, metadata, from_schema) if isinstance(self._from_schema.time_config, DatetimeRange): + assert time_zone_column is not None # validated by _check_time_zone_column self.time_zone_column = time_zone_column self._convert_from_time_config_to_datetime_range_with_tz_column() else: + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy self.time_zone_column = self._from_schema.time_config.time_zone_column self._check_standard_time_zones() self._to_schema = self.generate_to_schema() @@ -311,7 +321,10 @@ def _check_from_schema(from_schema: TableSchema) -> None: msg += ( "Source schema must have DatetimeRange or DatetimeRangeWithTZColumn time config. " ) - if from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ: + if ( + isinstance(from_schema.time_config, (DatetimeRange, DatetimeRangeWithTZColumn)) + and from_schema.time_config.dtype != TimeDataType.TIMESTAMP_NTZ + ): msg += "Source schema time config dtype must be TIMESTAMP_NTZ. " if msg != "": msg += f"\n{from_schema.time_config}" @@ -341,7 +354,7 @@ def _check_standard_time_zones(self) -> None: msg = "" time_zones = self._from_schema.time_config.time_zones for tz in time_zones: - if tz == "None": + if tz is None: msg += "\nChronify does not support None time zone in time_zone_column. " raise InvalidParameter(msg) if not is_standard_time_zone(tz): @@ -463,6 +476,7 @@ def _get_time_zones(self) -> list[tzinfo | None]: def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for localizing tz-naive datetime to column time zones""" + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_generator = make_time_range_generator(self._from_schema.time_config) diff --git a/src/chronify/utils/sqlalchemy_table.py b/src/chronify/utils/sqlalchemy_table.py index ec7e0b7..8dac7ad 100644 --- a/src/chronify/utils/sqlalchemy_table.py +++ b/src/chronify/utils/sqlalchemy_table.py @@ -21,7 +21,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateTable) +@compiler.compiles(CreateTable) # type: ignore[no-untyped-call,misc] def _create_table(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE TABLE %s AS %s" % ( element.name, @@ -29,7 +29,7 @@ def _create_table(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropTable) +@compiler.compiles(DropTable) # type: ignore[no-untyped-call,misc] def _drop_table(element: Any, compiler: Any, **kw: Any) -> str: return "DROP TABLE %s" % (element.name) diff --git a/src/chronify/utils/sqlalchemy_view.py b/src/chronify/utils/sqlalchemy_view.py index bf4c495..6a191c0 100644 --- a/src/chronify/utils/sqlalchemy_view.py +++ b/src/chronify/utils/sqlalchemy_view.py @@ -20,7 +20,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateView) +@compiler.compiles(CreateView) # type: ignore[no-untyped-call,misc] def _create_view(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE VIEW %s AS %s" % ( element.name, @@ -28,7 +28,7 @@ def _create_view(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropView) +@compiler.compiles(DropView) # type: ignore[no-untyped-call,misc] def _drop_view(element: Any, compiler: Any, **kw: Any) -> str: return "DROP VIEW %s" % (element.name) From a0d8c62e179c49d3dbc31adb5f98df0c47385679 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:16:05 -0700 Subject: [PATCH 15/20] address comments --- src/chronify/time_configs.py | 2 +- src/chronify/time_zone_localizer.py | 8 +++----- tests/test_time_zone_localizer.py | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 7a5e8b8..a096913 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -113,7 +113,7 @@ def check_dtype_start_time_consistency( dtype = TimeDataType.TIMESTAMP_NTZ case (False, None): dtype = TimeDataType.TIMESTAMP_TZ - # validate dype if provided + # validate dtype if provided case (True, TimeDataType.TIMESTAMP_TZ): msg = ( "DatetimeRange with tz-naive start time must have dtype TIMESTAMP_NTZ: " diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index 4c62152..dd0dfb8 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -6,6 +6,7 @@ from pathlib import Path import pandas as pd from pandas import DatetimeTZDtype +from yaml import warnings from chronify.models import TableSchema, MappingTableSchema from chronify.time_configs import ( @@ -337,16 +338,13 @@ def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional and time_zone_column is not None ): msg = f"Input {time_zone_column=} will be ignored. time_zone_column is already defined in the time_config." - raise Warning(msg) + warnings.warn(msg) msg = "" if isinstance(from_schema.time_config, DatetimeRange) and time_zone_column is None: msg += "time_zone_column must be provided when source schema time config is of type DatetimeRange. " - # if time_zone_column not in from_schema.time_array_id_columns: - # msg = f"{time_zone_column=} must be in source schema time_array_id_columns." - if msg != "": msg += f"\n{from_schema}" - raise InvalidParameter(msg) + raise MissingValue(msg) def _check_standard_time_zones(self) -> None: """Check that all time zones in the time_zone_column are valid standard time zones.""" diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py index bc2968d..e7287fb 100644 --- a/tests/test_time_zone_localizer.py +++ b/tests/test_time_zone_localizer.py @@ -22,7 +22,7 @@ DatetimeRangeGenerator, DatetimeRangeGeneratorExternalTimeZone, ) -from chronify.exceptions import InvalidParameter +from chronify.exceptions import InvalidParameter, MissingValue def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: @@ -327,7 +327,7 @@ def test_time_localizer_by_column_missing_tz_column_error(iter_engines: Engine) """Test that TimeZoneLocalizerByColumn raises error when time_zone_column is missing for DatetimeRange""" from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema) - error = (InvalidParameter, "time_zone_column must be provided") + error = (MissingValue, "time_zone_column must be provided") run_localization_by_column_with_error(iter_engines, df, from_schema, error) @@ -360,7 +360,7 @@ def test_localize_time_zone_by_column_missing_tz_column_error(iter_engines: Engi from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema) error = ( - Exception, + MissingValue, "time_zone_column must be provided when source schema time config is of type DatetimeRange", ) run_localization_by_column_with_error( From 1b4eee2927a1356dc4daeb243266192d8d748cf2 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:24:31 -0700 Subject: [PATCH 16/20] mypy v2 --- src/chronify/time_zone_localizer.py | 2 +- src/chronify/utils/sqlalchemy_table.py | 4 ++-- src/chronify/utils/sqlalchemy_view.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index dd0dfb8..5c49312 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -1,4 +1,5 @@ import abc +import warnings from zoneinfo import ZoneInfo from datetime import tzinfo from sqlalchemy import Engine, MetaData, Table, select @@ -6,7 +7,6 @@ from pathlib import Path import pandas as pd from pandas import DatetimeTZDtype -from yaml import warnings from chronify.models import TableSchema, MappingTableSchema from chronify.time_configs import ( diff --git a/src/chronify/utils/sqlalchemy_table.py b/src/chronify/utils/sqlalchemy_table.py index 8dac7ad..cf0200d 100644 --- a/src/chronify/utils/sqlalchemy_table.py +++ b/src/chronify/utils/sqlalchemy_table.py @@ -21,7 +21,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateTable) # type: ignore[no-untyped-call,misc] +@compiler.compiles(CreateTable) # type: ignore[untyped-decorator,no-untyped-call] def _create_table(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE TABLE %s AS %s" % ( element.name, @@ -29,7 +29,7 @@ def _create_table(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropTable) # type: ignore[no-untyped-call,misc] +@compiler.compiles(DropTable) # type: ignore[untyped-decorator,no-untyped-call] def _drop_table(element: Any, compiler: Any, **kw: Any) -> str: return "DROP TABLE %s" % (element.name) diff --git a/src/chronify/utils/sqlalchemy_view.py b/src/chronify/utils/sqlalchemy_view.py index 6a191c0..abea77e 100644 --- a/src/chronify/utils/sqlalchemy_view.py +++ b/src/chronify/utils/sqlalchemy_view.py @@ -20,7 +20,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateView) # type: ignore[no-untyped-call,misc] +@compiler.compiles(CreateView) # type: ignore[untyped-decorator,no-untyped-call] def _create_view(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE VIEW %s AS %s" % ( element.name, @@ -28,7 +28,7 @@ def _create_view(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropView) # type: ignore[no-untyped-call,misc] +@compiler.compiles(DropView) # type: ignore[untyped-decorator,no-untyped-call] def _drop_view(element: Any, compiler: Any, **kw: Any) -> str: return "DROP VIEW %s" % (element.name) From 4ca496a422a524642d197b7c3a1e02ea9a498eb1 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:31:32 -0700 Subject: [PATCH 17/20] mypy v3 --- src/chronify/csv_time_series_parser.py | 2 +- src/chronify/utils/sqlalchemy_table.py | 4 ++-- src/chronify/utils/sqlalchemy_view.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/chronify/csv_time_series_parser.py b/src/chronify/csv_time_series_parser.py index 0234461..1364b3d 100644 --- a/src/chronify/csv_time_series_parser.py +++ b/src/chronify/csv_time_series_parser.py @@ -117,7 +117,7 @@ def _check_input_format(data_file: Path) -> None: @staticmethod def _read_data_file(data_file: Path) -> pd.DataFrame: - return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) + return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) # type: ignore def _ingest_data(self, data: pd.DataFrame, table_name: str, year: int, length: int) -> None: csv_fmt = CsvTimeSeriesFormats.from_columns(data.columns) diff --git a/src/chronify/utils/sqlalchemy_table.py b/src/chronify/utils/sqlalchemy_table.py index cf0200d..ec7e0b7 100644 --- a/src/chronify/utils/sqlalchemy_table.py +++ b/src/chronify/utils/sqlalchemy_table.py @@ -21,7 +21,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateTable) # type: ignore[untyped-decorator,no-untyped-call] +@compiler.compiles(CreateTable) def _create_table(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE TABLE %s AS %s" % ( element.name, @@ -29,7 +29,7 @@ def _create_table(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropTable) # type: ignore[untyped-decorator,no-untyped-call] +@compiler.compiles(DropTable) def _drop_table(element: Any, compiler: Any, **kw: Any) -> str: return "DROP TABLE %s" % (element.name) diff --git a/src/chronify/utils/sqlalchemy_view.py b/src/chronify/utils/sqlalchemy_view.py index abea77e..bf4c495 100644 --- a/src/chronify/utils/sqlalchemy_view.py +++ b/src/chronify/utils/sqlalchemy_view.py @@ -20,7 +20,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateView) # type: ignore[untyped-decorator,no-untyped-call] +@compiler.compiles(CreateView) def _create_view(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE VIEW %s AS %s" % ( element.name, @@ -28,7 +28,7 @@ def _create_view(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropView) # type: ignore[untyped-decorator,no-untyped-call] +@compiler.compiles(DropView) def _drop_view(element: Any, compiler: Any, **kw: Any) -> str: return "DROP VIEW %s" % (element.name) From 96a8902babc106e0969d79b956a27016865656ce Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:56:34 -0700 Subject: [PATCH 18/20] factor out list_timestamps --- src/chronify/datetime_range_generator.py | 59 ++++-------------------- src/chronify/time_utils.py | 39 ++++++++++++++++ 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 5c036a6..352824a 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -1,7 +1,6 @@ from datetime import datetime, tzinfo from typing import Generator, Optional from itertools import chain -from calendar import isleap import pandas as pd @@ -10,7 +9,7 @@ TimeDataType, ) from chronify.time_configs import DatetimeRanges, DatetimeRange, DatetimeRangeWithTZColumn -from chronify.time_utils import get_tzname +from chronify.time_utils import get_tzname, list_timestamps from chronify.time_range_generator_base import TimeRangeGeneratorBase from chronify.exceptions import InvalidValue @@ -26,51 +25,11 @@ def __init__( self._model = model self._adjustment = leap_day_adjustment or LeapDayAdjustmentType.NONE - def _list_timestamps(self, start: Optional[datetime] = None) -> list[datetime]: - """Return all timestamps as a list. - if start is supplied, override self._model.start - """ - if start is None: - start = self._model.start - - timestamps = pd.date_range( - start=start, - periods=self._model.length, - freq=self._model.resolution, - ).tolist() - - match self._adjustment: - case LeapDayAdjustmentType.DROP_FEB29: - timestamps = [ - ts - for ts in timestamps - if not (isleap(ts.year) and ts.month == 2 and ts.day == 29) - ] - case LeapDayAdjustmentType.DROP_DEC31: - timestamps = [ - ts - for ts in timestamps - if not (isleap(ts.year) and ts.month == 12 and ts.day == 31) - ] - case LeapDayAdjustmentType.DROP_JAN1: - timestamps = [ - ts - for ts in timestamps - if not (isleap(ts.year) and ts.month == 1 and ts.day == 1) - ] - case _: - pass - - return timestamps # type: ignore - - def _iter_timestamps( - self, start: Optional[datetime] = None - ) -> Generator[datetime, None, None]: - """Generator from pd.date_range(). - Note: Established time library already handles historical changes in time zone conversion to UTC. - (e.g. Algeria (Africa/Algiers) changed from UTC+0 to UTC+1 on April 25, 1980) - """ - for ts in self._list_timestamps(start=start): + def _iter_timestamps(self) -> Generator[datetime, None, None]: + """Generator from pd.date_range().""" + for ts in list_timestamps( + self._model.start, self._model.length, self._model.resolution, self._adjustment + ): yield ts def list_time_columns(self) -> list[str]: @@ -95,7 +54,9 @@ def __init__( assert isinstance(self._model, DatetimeRange) def list_timestamps(self) -> list[datetime]: - return self._list_timestamps() # list(self._iter_timestamps()) + return list_timestamps( + self._model.start, self._model.length, self._model.resolution, self._adjustment + ) class DatetimeRangeGeneratorExternalTimeZone(DatetimeRangeGeneratorBase): @@ -147,7 +108,7 @@ def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[dat case _: msg = f"Unsupported combination of start_time_is_tz_naive and dtype: {self._model}" raise InvalidValue(msg) - return self._list_timestamps(start=start) # ist(self._iter_timestamps(start=start)) + return list_timestamps(start, self._model.length, self._model.resolution, self._adjustment) def list_timestamps(self) -> list[datetime]: """return ordered tz-naive timestamps across all time zones in the order of the time zones.""" diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index 0744c46..c286488 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -1,5 +1,6 @@ """Functions related to time""" +from calendar import isleap import logging from numpy.typing import NDArray import numpy as np @@ -8,6 +9,7 @@ import pandas as pd from chronify.time import ( + LeapDayAdjustmentType, TimeIntervalType, ) from chronify.exceptions import InvalidParameter @@ -184,3 +186,40 @@ def get_tzname(tz: tzinfo | None) -> str: return tz.key ts = datetime(year=2020, month=1, day=1, tzinfo=tz) return tz.tzname(ts) # type: ignore + + +def list_timestamps( + start: datetime, length: int, resolution: timedelta, adjustment: LeapDayAdjustmentType +) -> list[pd.Timestamp]: + """Generate timestamps with adjustment using pd.date_range(). + Established time library can handle historical changes in time zone conversion to UTC. + (e.g. Algeria (Africa/Algiers) changed from UTC+0 to UTC+1 on April 25, 1980) + """ + pd_timestamps = pd.date_range( + start=start, + periods=length, + freq=resolution, + ) + + match adjustment: + case LeapDayAdjustmentType.DROP_FEB29: + timestamps = [ + ts + for ts in pd_timestamps + if not (isleap(ts.year) and ts.month == 2 and ts.day == 29) + ] + case LeapDayAdjustmentType.DROP_DEC31: + timestamps = [ + ts + for ts in pd_timestamps + if not (isleap(ts.year) and ts.month == 12 and ts.day == 31) + ] + case LeapDayAdjustmentType.DROP_JAN1: + timestamps = [ + ts + for ts in pd_timestamps + if not (isleap(ts.year) and ts.month == 1 and ts.day == 1) + ] + case _: + timestamps = pd_timestamps.tolist() + return timestamps From 478f6d5b79e4137b979f968d2e6e23ca965859d9 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:07:37 -0700 Subject: [PATCH 19/20] mypy --- src/chronify/datetime_range_generator.py | 8 ++++---- src/chronify/time_utils.py | 20 ++++++++++---------- src/chronify/time_zone_converter.py | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 352824a..4df13b9 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -53,7 +53,7 @@ def __init__( super().__init__(model, leap_day_adjustment=leap_day_adjustment) assert isinstance(self._model, DatetimeRange) - def list_timestamps(self) -> list[datetime]: + def list_timestamps(self) -> list[pd.Timestamp]: return list_timestamps( self._model.start, self._model.length, self._model.resolution, self._adjustment ) @@ -80,7 +80,7 @@ def __init__( ) raise InvalidValue(msg) - def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[datetime]: + def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[pd.Timestamp]: """return timestamps for a given time_zone expected in the dataframe returned timestamp dtype matches that in the dataframe, i.e. self._model.dtype (e.g., if time_zone is None, return tz-naive timestamps else return tz-aware timestamps) @@ -110,12 +110,12 @@ def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[dat raise InvalidValue(msg) return list_timestamps(start, self._model.length, self._model.resolution, self._adjustment) - def list_timestamps(self) -> list[datetime]: + def list_timestamps(self) -> list[pd.Timestamp]: """return ordered tz-naive timestamps across all time zones in the order of the time zones.""" dct = self.list_timestamps_by_time_zone() return list(chain(*dct.values())) - def list_timestamps_by_time_zone(self) -> dict[str, list[datetime]]: + def list_timestamps_by_time_zone(self) -> dict[str, list[pd.Timestamp]]: """for each time zone, returns full timestamp iteration (duplicates allowed)""" dct = {} diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index c286488..e39c431 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -32,10 +32,10 @@ def adjust_timestamp_by_dst_offset(timestamp: datetime, resolution: timedelta) - def shifted_interval_timestamps( - ts_list: list[datetime], + ts_list: list[pd.Timestamp], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, -) -> list[datetime]: +) -> list[pd.Timestamp]: """Shift ts_list by ONE time interval based on interval type. Example: @@ -75,9 +75,9 @@ def shifted_interval_timestamps( def wrapped_time_timestamps( - ts_list: list[datetime], - to_timestamps: list[datetime], -) -> list[datetime]: + ts_list: list[pd.Timestamp], + to_timestamps: list[pd.Timestamp], +) -> list[pd.Timestamp]: """Returns the replacement timestamps in order to wrap the ts_list into the to_timestamps range. Example: @@ -105,16 +105,16 @@ def wrapped_time_timestamps( upper_cond = arr > tmax if upper_cond.sum() > 0: arr2.loc[upper_cond] -= tdelta - ts_list2 = arr2.tolist() - return ts_list2 # type: ignore + ts_list2: list[pd.Timestamp] = arr2.tolist() + return ts_list2 def rolled_interval_timestamps( - ts_list: list[datetime], + ts_list: list[pd.Timestamp], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, - to_timestamps: list[datetime], -) -> list[datetime]: + to_timestamps: list[pd.Timestamp], +) -> list[pd.Timestamp]: """Roll ts_list by shifting time interval based on interval type and then wrapping timestamps according to to_timestamps. diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 5006f37..543b01f 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -1,6 +1,6 @@ import abc from zoneinfo import ZoneInfo -from datetime import datetime, tzinfo +from datetime import tzinfo from sqlalchemy import Engine, MetaData, Table, select from typing import Optional from pathlib import Path @@ -424,10 +424,10 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: df_tz = [] for tz_name, time_data in to_time_data_dct.items(): - to_time_data: list[datetime] | list[pd.Timestamp] + to_time_data: list[pd.Timestamp] if self._wrap_time_allowed: # assume it is being wrapped based on the tz-naive version of the original time data - final_time_data = [x.replace(tzinfo=None) for x in from_time_data] + final_time_data = [x.tz_localize(None) for x in from_time_data] to_time_data = wrapped_time_timestamps(time_data, final_time_data) else: to_time_data = time_data From 1ad084f2e4f0bdc86a04d857d7a0f6b7a3e6dcf0 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:56:18 -0700 Subject: [PATCH 20/20] remove TimeZone + time zone cleanup --- src/chronify/datetime_range_generator.py | 28 ++-- src/chronify/duckdb/functions.py | 1 - src/chronify/sqlalchemy/functions.py | 1 - src/chronify/store.py | 11 +- src/chronify/time.py | 141 ------------------ src/chronify/time_series_mapper_index_time.py | 2 +- src/chronify/time_zone_converter.py | 10 +- src/chronify/time_zone_localizer.py | 16 +- tests/test_mapper_datetime_to_datetime.py | 2 +- tests/test_store.py | 16 +- tests/test_time.py | 57 ------- tests/test_time_series_checker.py | 10 +- tests/test_time_utils.py | 21 ++- tests/test_time_zone_localizer.py | 2 +- 14 files changed, 62 insertions(+), 256 deletions(-) delete mode 100644 tests/test_time.py diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 4df13b9..e679c06 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -26,7 +26,7 @@ def __init__( self._adjustment = leap_day_adjustment or LeapDayAdjustmentType.NONE def _iter_timestamps(self) -> Generator[datetime, None, None]: - """Generator from pd.date_range().""" + """Generate timestamps from pd.date_range().""" for ts in list_timestamps( self._model.start, self._model.length, self._model.resolution, self._adjustment ): @@ -60,10 +60,11 @@ def list_timestamps(self) -> list[pd.Timestamp]: class DatetimeRangeGeneratorExternalTimeZone(DatetimeRangeGeneratorBase): - """Generates datetime ranges based on a DatetimeRangeWithTZColumn model. - datetime ranges will be tz-naive and can be listed by time_zone name using special class func + """Generate datetime ranges based on a DatetimeRangeWithTZColumn model. + + Datetime ranges will be tz-naive and can be listed by time_zone name using special class functions. These ranges may be localized by the time_zone name. - # TODO: add offset as a column + """ def __init__( @@ -81,9 +82,10 @@ def __init__( raise InvalidValue(msg) def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[pd.Timestamp]: - """return timestamps for a given time_zone expected in the dataframe - returned timestamp dtype matches that in the dataframe, i.e. self._model.dtype - (e.g., if time_zone is None, return tz-naive timestamps else return tz-aware timestamps) + """Return timestamps for a given time_zone expected in the dataframe. + + The returned timestamp dtype matches that in the dataframe (i.e., self._model.dtype). + For example, if time_zone is None, return tz-naive timestamps; otherwise, return tz-aware timestamps. """ match (self._model.start_time_is_tz_naive(), self._model.dtype): case (True, TimeDataType.TIMESTAMP_NTZ): @@ -111,13 +113,15 @@ def _list_timestamps_by_time_zone(self, time_zone: Optional[tzinfo]) -> list[pd. return list_timestamps(start, self._model.length, self._model.resolution, self._adjustment) def list_timestamps(self) -> list[pd.Timestamp]: - """return ordered tz-naive timestamps across all time zones in the order of the time zones.""" + """Return ordered tz-naive timestamps across all time zones in the order of the time zones.""" dct = self.list_timestamps_by_time_zone() return list(chain(*dct.values())) def list_timestamps_by_time_zone(self) -> dict[str, list[pd.Timestamp]]: - """for each time zone, returns full timestamp iteration - (duplicates allowed)""" + """Return full timestamp iteration for each time zone. + + This follows DST if applicable. + """ dct = {} for tz in self._model.get_time_zones(): tz_name = get_tzname(tz) @@ -127,9 +131,7 @@ def list_timestamps_by_time_zone(self) -> dict[str, list[pd.Timestamp]]: def list_distinct_timestamps_by_time_zone_from_dataframe( self, df: pd.DataFrame ) -> dict[str, list[datetime]]: - """ - from the dataframe, for each time zone, returns distinct timestamps - """ + """Return distinct timestamps for each time zone from the dataframe.""" tz_col = self._model.get_time_zone_column() t_col = self._model.time_column df[t_col] = pd.to_datetime(df[t_col]) diff --git a/src/chronify/duckdb/functions.py b/src/chronify/duckdb/functions.py index d37ca46..1762392 100644 --- a/src/chronify/duckdb/functions.py +++ b/src/chronify/duckdb/functions.py @@ -15,7 +15,6 @@ def add_datetime_column( timestamps: list[datetime], ) -> DuckDBPyRelation: """Add a datetime column to the relation.""" - # TODO raise NotImplementedError # values = [] # columns = ",".join(rel.columns) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index db42eca..ff9c42c 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -261,7 +261,6 @@ def write_query_to_parquet( if not overwrite: msg = "write_table_to_parquet with Hive requires overwrite=True" raise InvalidOperation(msg) - # TODO: partition columns if partition_columns: msg = "write_table_to_parquet with Hive doesn't support partition_columns" raise InvalidOperation(msg) diff --git a/src/chronify/store.py b/src/chronify/store.py index 407b7a6..3f9adfe 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -494,7 +494,6 @@ def _ingest_from_csv( columns = set(src_schema.list_columns()) check_columns(rel.columns, columns) - # TODO if isinstance(src_schema.time_config, IndexTimeRangeBase): if isinstance(dst_schema.time_config, DatetimeRange): raise NotImplementedError @@ -978,7 +977,7 @@ def convert_time_zone( Examples -------- >>> store = Store() - >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("EST")) + >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("Etc/GMT+5")) >>> freq = timedelta(hours=1) >>> hours_per_year = 8760 >>> num_time_arrays = 1 @@ -1067,7 +1066,7 @@ def convert_time_zone_by_column( Examples -------- >>> store = Store() - >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("EST")) + >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("Etc/GMT+5")) >>> freq = timedelta(hours=1) >>> hours_per_year = 8760 >>> num_time_arrays = 3 @@ -1190,7 +1189,7 @@ def localize_time_zone( ... value_column="value", ... ) >>> store.ingest_table(df, schema) - >>> to_time_zone = ZoneInfo("EST") + >>> to_time_zone = ZoneInfo("Etc/GMT+5") >>> dst_schema = store.localize_time_zone( ... schema.name, to_time_zone, check_mapped_timestamps=True ... ) @@ -1265,7 +1264,9 @@ def localize_time_zone_by_column( ... "timestamp": np.tile( ... pd.date_range(start, periods=hours_per_year, freq="h"), num_time_arrays ... ), - ... "time_zone": np.repeat(["EST", "CST", "MST"], hours_per_year), + ... "time_zone": np.repeat( + ... ["Etc/GMT+5", "Etc/GMT+6", "Etc/GMT+7"], hours_per_year + ... ), # EST, CST, MST ... "value": np.random.random(hours_per_year * num_time_arrays), ... } ... ) diff --git a/src/chronify/time.py b/src/chronify/time.py index 26d273f..d8b9d14 100644 --- a/src/chronify/time.py +++ b/src/chronify/time.py @@ -2,9 +2,6 @@ from enum import StrEnum from typing import NamedTuple, Union -from zoneinfo import ZoneInfo - -from chronify.exceptions import InvalidParameter class TimeType(StrEnum): @@ -134,141 +131,3 @@ class DisaggregationType(StrEnum): ResamplingOperationType = Union[AggregationType, DisaggregationType] - - -class TimeZone(StrEnum): - """Time zones""" - - UTC = "UTC" - HST = "HawaiiAleutianStandard" - AST = "AlaskaStandard" - APT = "AlaskaPrevailing" - PST = "PacificStandard" - PPT = "PacificPrevailing" - MST = "MountainStandard" - MPT = "MountainPrevailing" - CST = "CentralStandard" - CPT = "CentralPrevailing" - EST = "EasternStandard" - EPT = "EasternPrevailing" - ARIZONA = "USArizona" - - -_TIME_ZONE_TO_ZONE_INFO = { - TimeZone.UTC: ZoneInfo("UTC"), - TimeZone.HST: ZoneInfo("US/Hawaii"), - TimeZone.AST: ZoneInfo("Etc/GMT+9"), - TimeZone.APT: ZoneInfo("US/Alaska"), - TimeZone.PST: ZoneInfo("Etc/GMT+8"), - TimeZone.PPT: ZoneInfo("US/Pacific"), - TimeZone.MST: ZoneInfo("Etc/GMT+7"), - TimeZone.MPT: ZoneInfo("US/Mountain"), - TimeZone.CST: ZoneInfo("Etc/GMT+6"), - TimeZone.CPT: ZoneInfo("US/Central"), - TimeZone.EST: ZoneInfo("Etc/GMT+5"), - TimeZone.EPT: ZoneInfo("US/Eastern"), - TimeZone.ARIZONA: ZoneInfo("US/Arizona"), -} - - -def get_zone_info(tz: TimeZone) -> ZoneInfo: - """Return a ZoneInfo instance for the given time zone.""" - return _TIME_ZONE_TO_ZONE_INFO[tz] - - -_TIME_ZONE_TO_TZ_OFFSET_AS_STR = { - TimeZone.UTC: "UTC", - TimeZone.HST: "-10:00", - TimeZone.AST: "-09:00", - TimeZone.PST: "-08:00", - TimeZone.MST: "-07:00", - TimeZone.CST: "-06:00", - TimeZone.EST: "-05:00", -} - - -def get_time_zone_offset(tz: TimeZone) -> str: - """Return the offset of the time zone from UTC.""" - offset = _TIME_ZONE_TO_TZ_OFFSET_AS_STR.get(tz) - if offset is None: - msg = f"Cannot get time zone offset for {tz=}" - raise InvalidParameter(msg) - return offset - - -def get_standard_time(tz: TimeZone) -> TimeZone: - """Return the equivalent standard time zone.""" - match tz: - case TimeZone.UTC: - return TimeZone.UTC - case TimeZone.HST: - return TimeZone.HST - case TimeZone.AST | TimeZone.APT: - return TimeZone.AST - case TimeZone.PST | TimeZone.PPT: - return TimeZone.PST - case TimeZone.MST | TimeZone.MPT: - return TimeZone.MST - case TimeZone.CST | TimeZone.CPT: - return TimeZone.CST - case TimeZone.EST | TimeZone.EPT: - return TimeZone.EST - case TimeZone.ARIZONA: - return TimeZone.ARIZONA - case _: - msg = f"BUG: case not covered: {tz}" - raise NotImplementedError(msg) - - -def get_prevailing_time(tz: TimeZone) -> TimeZone: - """Return the equivalent prevailing time zone.""" - match tz: - case TimeZone.UTC: - return TimeZone.UTC - case TimeZone.HST: - return TimeZone.HST - case TimeZone.AST | TimeZone.APT: - return TimeZone.APT - case TimeZone.PST | TimeZone.PPT: - return TimeZone.PPT - case TimeZone.MST | TimeZone.MPT: - return TimeZone.MPT - case TimeZone.CST | TimeZone.CPT: - return TimeZone.CPT - case TimeZone.EST | TimeZone.EPT: - return TimeZone.EPT - case TimeZone.ARIZONA: - return TimeZone.ARIZONA - case _: - msg = f"BUG: case not covered: {tz}" - raise NotImplementedError(msg) - - -_STANDARD_TIME_ZONES = { - TimeZone.UTC, - TimeZone.HST, - TimeZone.AST, - TimeZone.PST, - TimeZone.MST, - TimeZone.CST, - TimeZone.EST, - TimeZone.ARIZONA, -} -_PREVAILING_TIME_ZONES = { - TimeZone.APT, - TimeZone.PPT, - TimeZone.MPT, - TimeZone.CPT, - TimeZone.EPT, - TimeZone.ARIZONA, -} - - -def is_standard(tz: TimeZone) -> bool: - """Return True if the time zone is a standard time zone.""" - return tz in _STANDARD_TIME_ZONES - - -def is_prevailing(tz: TimeZone) -> bool: - """Return True if the time zone is a prevailing time zone.""" - return tz in _PREVAILING_TIME_ZONES diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py index 4ef3925..43cd4c3 100644 --- a/src/chronify/time_series_mapper_index_time.py +++ b/src/chronify/time_series_mapper_index_time.py @@ -35,7 +35,7 @@ def __init__( data_adjustment: Optional[TimeBasedDataAdjustment] = None, wrap_time_allowed: bool = False, ) -> None: - # TODO: refactor to use new time configs + # TODO: refactor to use new time configs - Issue #64 super().__init__( engine, metadata, from_schema, to_schema, data_adjustment, wrap_time_allowed ) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 543b01f..0a1414c 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -24,7 +24,7 @@ from chronify.time_utils import wrapped_time_timestamps, get_tzname -# TODO - retain original timestamp column +# TODO - allow option to retain original timestamp column - Issue #64 def convert_time_zone( engine: Engine, metadata: MetaData, @@ -187,8 +187,8 @@ class TimeZoneConverter(TimeZoneConverterBase): Output data table will contain tz-naive timestamps with time zone recorded in a column Output time config will be of type DatetimeRange with Timestamp_NTZ dtype and tz-naive start time. - # TODO: support DatetimeRangeWithTZColumn as input time config - # TODO: support wrap_time_allowed option + # TODO: support DatetimeRangeWithTZColumn as input time config - Issue #64 + # TODO: support wrap_time_allowed option - Issue #64 """ def __init__( @@ -318,9 +318,7 @@ class TimeZoneConverterByColumn(TimeZoneConverterBase): Note: converted time is wrapped within the local time range of the original timestamps. -------------------------------- - # TODO: support DatetimeRangeWithTZColumn as input time config - # TODO: add util func to reduce code duplication with TimeZoneLocalizerByColumn - # TODO: add util func to reduce DatetimeRangeWithTZColumn aligned_in_absolute_time to DatetimeRange + # TODO: support DatetimeRangeWithTZColumn as input time config - Issue #64 """ def __init__( diff --git a/src/chronify/time_zone_localizer.py b/src/chronify/time_zone_localizer.py index 5c49312..7bbcabd 100644 --- a/src/chronify/time_zone_localizer.py +++ b/src/chronify/time_zone_localizer.py @@ -228,7 +228,7 @@ def _check_standard_time_zone(to_time_zone: tzinfo | None) -> tzinfo | None: return to_time_zone def generate_to_time_config(self) -> DatetimeRange: - assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy + assert isinstance(self._from_schema.time_config, DatetimeRange) to_time_config: DatetimeRange = self._from_schema.time_config.model_copy( update={ "dtype": TimeDataType.TIMESTAMP_TZ @@ -310,7 +310,7 @@ def __init__( self.time_zone_column = time_zone_column self._convert_from_time_config_to_datetime_range_with_tz_column() else: - assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) self.time_zone_column = self._from_schema.time_config.time_zone_column self._check_standard_time_zones() self._to_schema = self.generate_to_schema() @@ -348,7 +348,7 @@ def _check_time_zone_column(from_schema: TableSchema, time_zone_column: Optional def _check_standard_time_zones(self) -> None: """Check that all time zones in the time_zone_column are valid standard time zones.""" - assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) msg = "" time_zones = self._from_schema.time_config.time_zones for tz in time_zones: @@ -384,7 +384,7 @@ def _convert_from_time_config_to_datetime_range_with_tz_column(self) -> None: self._from_schema.time_config = DatetimeRangeWithTZColumn(**time_kwargs) def generate_to_time_config(self) -> DatetimeRangeBase: - assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) match self._from_schema.time_config.start_time_is_tz_naive(): case True: # tz-naive start, aligned_in_local_time of the time zones @@ -474,18 +474,18 @@ def _get_time_zones(self) -> list[tzinfo | None]: def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for localizing tz-naive datetime to column time zones""" - assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) # mypy + assert isinstance(self._from_schema.time_config, DatetimeRangeWithTZColumn) time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_generator = make_time_range_generator(self._from_schema.time_config) - assert isinstance(from_time_generator, DatetimeRangeGeneratorExternalTimeZone) # mypy + assert isinstance(from_time_generator, DatetimeRangeGeneratorExternalTimeZone) from_time_data_dct = from_time_generator.list_timestamps_by_time_zone() to_time_generator = make_time_range_generator(self._to_schema.time_config) match to_time_generator: - case DatetimeRangeGeneratorExternalTimeZone(): # mypy + case DatetimeRangeGeneratorExternalTimeZone(): to_time_data_dct = to_time_generator.list_timestamps_by_time_zone() - case DatetimeRangeGenerator(): # mypy + case DatetimeRangeGenerator(): to_time_data = to_time_generator.list_timestamps() to_time_data_dct = {tz_name: to_time_data for tz_name in from_time_data_dct.keys()} case _: diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index 5d969c2..b1009c1 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -194,7 +194,7 @@ def test_time_interval_shift_different_time_ranges( "tzinfo_tuple", [ # (ZoneInfo("US/Eastern"), None), - (None, ZoneInfo("EST")), + (None, ZoneInfo("Etc/GMT+5")), # (ZoneInfo("US/Eastern"), ZoneInfo("US/Mountain")), ], ) diff --git a/tests/test_store.py b/tests/test_store.py index 11a5ced..b33fcec 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -51,7 +51,7 @@ @pytest.fixture def generators_schema(): time_config = DatetimeRange( - start=datetime(year=2020, month=1, day=1, tzinfo=ZoneInfo("EST")), + start=datetime(year=2020, month=1, day=1, tzinfo=ZoneInfo("Etc/GMT+5")), resolution=timedelta(hours=1), length=8784, interval_type=TimeIntervalType.PERIOD_BEGINNING, @@ -117,7 +117,7 @@ def test_ingest_csv(iter_stores_by_engine: Store, tmp_path, generators_schema, u new_src_file = tmp_path / "gen_tz.csv" duckdb.sql( f""" - SELECT timezone('EST', timestamp) as timestamp, gen1, gen2, gen3 + SELECT timezone('Etc/GMT+5', timestamp) as timestamp, gen1, gen2, gen3 FROM read_csv('{src_file}') """ ).to_df().to_csv(new_src_file, index=False) @@ -318,7 +318,7 @@ def test_load_parquet(iter_stores_by_engine_no_data_ingestion: Store, tmp_path): return time_config = DatetimeRange( - start=datetime(year=2020, month=1, day=1, tzinfo=ZoneInfo("EST")), + start=datetime(year=2020, month=1, day=1, tzinfo=ZoneInfo("Etc/GMT+5")), resolution=timedelta(hours=1), length=8784, interval_type=TimeIntervalType.PERIOD_BEGINNING, @@ -435,7 +435,7 @@ def test_map_one_week_per_month_by_hour_to_datetime( store.map_table_time_config(src_schema.name, dst_schema, check_mapped_timestamps=True) -@pytest.mark.parametrize("tzinfo", [ZoneInfo("EST"), None]) +@pytest.mark.parametrize("tzinfo", [ZoneInfo("Etc/GMT+5"), None]) def test_map_datetime_to_datetime( tmp_path, iter_stores_by_engine_no_data_ingestion: Store, tzinfo ): @@ -536,7 +536,7 @@ def test_map_index_time_to_datetime( time_array_id_columns=["generator"], value_column="value", time_config=DatetimeRange( - start=datetime(year=year, month=1, day=1, hour=1, tzinfo=ZoneInfo("EST")), + start=datetime(year=year, month=1, day=1, hour=1, tzinfo=ZoneInfo("Etc/GMT+5")), resolution=timedelta(hours=1), length=time_array_len, interval_type=TimeIntervalType.PERIOD_ENDING, @@ -793,7 +793,7 @@ def test_convert_time_zone( store = iter_stores_by_engine_no_data_ingestion time_array_len = 8784 year = 2020 - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") src_time_config = DatetimeRange( start=datetime(year=year, month=1, day=1, hour=0, tzinfo=tzinfo), @@ -865,7 +865,7 @@ def test_convert_time_zone_by_column( store = iter_stores_by_engine_no_data_ingestion time_array_len = 8784 year = 2020 - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") src_time_config = DatetimeRange( start=datetime(year=year, month=1, day=1, hour=0, tzinfo=tzinfo), @@ -941,7 +941,7 @@ def test_convert_time_zone_by_column( check_timestamp_lists(actual, expected) -@pytest.mark.parametrize("to_time_zone", [ZoneInfo("UTC"), ZoneInfo("EST"), None]) +@pytest.mark.parametrize("to_time_zone", [ZoneInfo("UTC"), ZoneInfo("Etc/GMT+5"), None]) def test_localize_time_zone( tmp_path, iter_stores_by_engine_no_data_ingestion: Store, to_time_zone: tzinfo | None ): diff --git a/tests/test_time.py b/tests/test_time.py deleted file mode 100644 index ed176d1..0000000 --- a/tests/test_time.py +++ /dev/null @@ -1,57 +0,0 @@ -import pytest - -from chronify.exceptions import InvalidParameter -from chronify.time import ( - TimeZone, - get_standard_time, - get_prevailing_time, - get_time_zone_offset, - is_prevailing, - is_standard, -) - - -def test_standard_time(): - assert get_standard_time(TimeZone.UTC) == TimeZone.UTC - assert get_standard_time(TimeZone.HST) == TimeZone.HST - assert get_standard_time(TimeZone.AST) == TimeZone.AST - assert get_standard_time(TimeZone.APT) == TimeZone.AST - assert get_standard_time(TimeZone.PST) == TimeZone.PST - assert get_standard_time(TimeZone.PPT) == TimeZone.PST - assert get_standard_time(TimeZone.MST) == TimeZone.MST - assert get_standard_time(TimeZone.MPT) == TimeZone.MST - assert get_standard_time(TimeZone.CST) == TimeZone.CST - assert get_standard_time(TimeZone.CPT) == TimeZone.CST - assert get_standard_time(TimeZone.EST) == TimeZone.EST - assert get_standard_time(TimeZone.EPT) == TimeZone.EST - assert get_standard_time(TimeZone.ARIZONA) == TimeZone.ARIZONA - - -def test_prevailing_time(): - assert get_prevailing_time(TimeZone.UTC) == TimeZone.UTC - assert get_prevailing_time(TimeZone.HST) == TimeZone.HST - assert get_prevailing_time(TimeZone.AST) == TimeZone.APT - assert get_prevailing_time(TimeZone.APT) == TimeZone.APT - assert get_prevailing_time(TimeZone.PST) == TimeZone.PPT - assert get_prevailing_time(TimeZone.PPT) == TimeZone.PPT - assert get_prevailing_time(TimeZone.MST) == TimeZone.MPT - assert get_prevailing_time(TimeZone.MPT) == TimeZone.MPT - assert get_prevailing_time(TimeZone.CST) == TimeZone.CPT - assert get_prevailing_time(TimeZone.CPT) == TimeZone.CPT - assert get_prevailing_time(TimeZone.EST) == TimeZone.EPT - assert get_prevailing_time(TimeZone.EPT) == TimeZone.EPT - assert get_prevailing_time(TimeZone.ARIZONA) == TimeZone.ARIZONA - - -def test_is_standard(): - assert is_standard(TimeZone.EST) - assert not is_standard(TimeZone.EPT) - assert is_prevailing(TimeZone.EPT) - assert not is_prevailing(TimeZone.EST) - - -def test_get_time_zone_offset(): - assert get_time_zone_offset(TimeZone.EST) == "-05:00" - assert get_time_zone_offset(TimeZone.MST) == "-07:00" - with pytest.raises(InvalidParameter): - assert get_time_zone_offset(TimeZone.EPT) diff --git a/tests/test_time_series_checker.py b/tests/test_time_series_checker.py index cf05c30..e1c32b6 100644 --- a/tests/test_time_series_checker.py +++ b/tests/test_time_series_checker.py @@ -91,7 +91,7 @@ def _run_test( def _get_inputs_for_valid_datetimes_with_tz() -> tuple[pd.DataFrame, ZoneInfo, int, None]: - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") df = pd.DataFrame( { "timestamp": [ @@ -170,7 +170,7 @@ def _get_inputs_for_incorrect_datetime_length() -> tuple[pd.DataFrame, ZoneInfo, def _get_inputs_for_mismatched_time_array_lengths() -> tuple[pd.DataFrame, ZoneInfo, int, str]: - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") df = pd.DataFrame( { "timestamp": [ @@ -189,7 +189,7 @@ def _get_inputs_for_mismatched_time_array_lengths() -> tuple[pd.DataFrame, ZoneI def _get_inputs_for_incorrect_lengths() -> tuple[pd.DataFrame, ZoneInfo, int, str]: - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") df = pd.DataFrame( { "timestamp": [ @@ -210,7 +210,7 @@ def _get_inputs_for_incorrect_lengths() -> tuple[pd.DataFrame, ZoneInfo, int, st def _get_inputs_for_incorrect_time_arrays() -> tuple[pd.DataFrame, ZoneInfo, int, str]: - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") df = pd.DataFrame( { "timestamp": [ @@ -231,7 +231,7 @@ def _get_inputs_for_incorrect_time_arrays() -> tuple[pd.DataFrame, ZoneInfo, int def _get_inputs_for_incorrect_time_arrays_with_duplicates() -> ( tuple[pd.DataFrame, ZoneInfo, int, str] ): - tzinfo = ZoneInfo("EST") + tzinfo = ZoneInfo("Etc/GMT+5") df = pd.DataFrame( { "timestamp": [ diff --git a/tests/test_time_utils.py b/tests/test_time_utils.py index ed52ad1..f3f85f8 100644 --- a/tests/test_time_utils.py +++ b/tests/test_time_utils.py @@ -19,7 +19,7 @@ def test_adjust_timestamp_by_dst_offset() -> None: # DST-aware datetime vs standard time zone - tzs = [ZoneInfo("America/New_York"), ZoneInfo("EST")] + tzs = [ZoneInfo("America/New_York"), ZoneInfo("Etc/GMT+5")] hours = [23, 0] for tz, hour in zip(tzs, hours): dt = datetime(2020, 7, 1, 0, 0, tzinfo=tz) @@ -91,34 +91,39 @@ def test_is_standard_time_zone() -> None: def test_get_standard_time_zone() -> None: tzs = [ ZoneInfo("America/New_York"), + ZoneInfo("US/Eastern"), ZoneInfo("EST"), timezone(timedelta(hours=-5)), None, ] stzs = [ - ZoneInfo("EST"), - ZoneInfo("EST"), - timezone(timedelta(hours=-5)), + ZoneInfo("Etc/GMT+5"), + ZoneInfo("Etc/GMT+5"), + ZoneInfo("Etc/GMT+5"), + ZoneInfo("Etc/GMT+5"), None, ] for tz, stz in zip(tzs, stzs): std_tz = get_standard_time_zone(tz) if tz is None: assert std_tz is None - continue - assert std_tz == stz + else: + assert is_standard_time_zone(stz) is True + assert is_standard_time_zone(std_tz) is True + ts = datetime(2020, 6, 1, 0, 0) + assert stz.utcoffset(ts) == std_tz.utcoffset(ts) def test_get_tzname() -> None: tzs = [ ZoneInfo("America/New_York"), - ZoneInfo("EST"), + ZoneInfo("Etc/GMT+5"), timezone(timedelta(hours=-5)), None, ] etzs = [ "America/New_York", - "EST", + "Etc/GMT+5", "UTC-05:00", "None", ] diff --git a/tests/test_time_zone_localizer.py b/tests/test_time_zone_localizer.py index e7287fb..ce7bb0b 100644 --- a/tests/test_time_zone_localizer.py +++ b/tests/test_time_zone_localizer.py @@ -234,7 +234,7 @@ def run_localization_by_column_with_error( ).localize_time_zone(check_mapped_timestamps=True) -@pytest.mark.parametrize("to_time_zone", [None, ZoneInfo("EST")]) +@pytest.mark.parametrize("to_time_zone", [None, ZoneInfo("Etc/GMT+5")]) def test_time_localization(iter_engines: Engine, to_time_zone: tzinfo | None) -> None: from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") df = generate_datetime_dataframe(from_schema)