From d8f8fcbc2bb8bfb0a0004bf65e6d27b74354fe81 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Tue, 25 Jul 2023 09:05:50 -0500 Subject: [PATCH 1/9] remove leftover references to ORCA from docstrings and now-unused imports --- activitysim/abm/models/tour_mode_choice.py | 1 - activitysim/abm/models/tour_od_choice.py | 12 +++++------- activitysim/abm/models/util/tour_od.py | 1 - activitysim/abm/models/vehicle_allocation.py | 13 ++++++------- activitysim/abm/models/vehicle_type_choice.py | 8 ++++---- activitysim/benchmarking/componentwise.py | 2 +- activitysim/cli/benchmark.py | 3 ++- activitysim/core/test/utils_testing.py | 3 --- activitysim/core/workflow/checkpoint.py | 4 ++-- activitysim/core/workflow/runner.py | 2 +- activitysim/core/workflow/state.py | 6 +++--- .../parking_location_choice_at_university.py | 2 +- .../extensions/stop_frequency_university_parking.py | 2 +- .../extensions/university_location_zone_override.py | 4 ++-- test/cdap/test_cdap.py | 2 +- test/joint_tours/test_joint_tours.py | 2 +- .../test_non_mandatory_tour_frequency.py | 2 +- test/parking_location/test_parking_location.py | 4 +--- test/summarize/test_summarize.py | 4 +++- 19 files changed, 35 insertions(+), 42 deletions(-) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index c56547b26..36e678617 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from orca import orca from activitysim.abm.models.util import annotate, school_escort_tours_trips, trip from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index 41f1593c0..c55af9af0 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -33,17 +33,15 @@ def tour_od_choice( Parameters ---------- - tours : orca.DataFrameWrapper + tours : pd.DataFrame lazy-loaded tours table - persons : orca.DataFrameWrapper + persons : pd.DataFrame lazy-loaded persons table - households : orca.DataFrameWrapper + households : pd.DataFrame lazy-loaded households table - land_use : orca.DataFrameWrapper + land_use : pd.DataFrame lazy-loaded land use data table - stop_frequency_alts : orca.DataFrameWrapper - lazy-loaded table of stop frequency alternatives, e.g. 
"1out2in" - network_los : orca._InjectableFuncWrapper + network_los : los.Network_LOS lazy-loaded activitysim.los.Network_LOS object chunk_size simulation chunk size, set in main settings.yaml diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 26eefd735..ba1208b47 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from orca import orca from activitysim.abm.models.util import logsums as logsum from activitysim.abm.models.util import trip diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index 372ed464d..fe2acd090 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -105,13 +105,12 @@ def vehicle_allocation( Parameters ---------- state : workflow.State - persons : orca.DataFrameWrapper - households : orca.DataFrameWrapper - vehicles : orca.DataFrameWrapper - vehicles_merged : orca.DataFrameWrapper - tours : orca.DataFrameWrapper - tours_merged : orca.DataFrameWrapper - chunk_size : orca.injectable + persons : pd.DataFrame + households : pd.DataFrame + vehicles : pd.DataFrame + tours : pd.DataFrame + tours_merged : pd.DataFrame + network_los : los.Network_LOS """ trace_label = "vehicle_allocation" model_settings_file_name = "vehicle_allocation.yaml" diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index c674ed62b..57273e977 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -316,7 +316,7 @@ def iterate_vehicle_type_choice( locals_dict : dict additional variables available when writing expressions estimator : Estimator object - chunk_size : orca.injectable + chunk_size : int trace_label : str Returns @@ -516,9 +516,9 @@ def vehicle_type_choice( Parameters ---------- - persons : orca.DataFrameWrapper - households : orca.DataFrameWrapper - vehicles : orca.DataFrameWrapper + persons : pd.DataFrame + households : pd.DataFrame + vehicles : pd.DataFrame vehicles_merged : DataFrame """ trace_label = "vehicle_type_choice" diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index ea34d8153..86fecea5c 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -181,7 +181,7 @@ def run_component(state, component_name): def teardown_component(state, component_name): logger.info("teardown_component: %s", component_name) - # use the pipeline module to clear out all the orca tables, so + # use the pipeline module to clear out all the tables, so # the next benchmark run has a clean slate. 
# anything needed should be reloaded from the pipeline checkpoint file pipeline_tables = state.registered_tables() diff --git a/activitysim/cli/benchmark.py b/activitysim/cli/benchmark.py index af46766aa..31b920011 100644 --- a/activitysim/cli/benchmark.py +++ b/activitysim/cli/benchmark.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import os import shutil @@ -43,7 +45,6 @@ "pyyaml": [], "pytables": [], "toolz": [], - "orca": [], "psutil": [], "requests": [], "numba": ["0.54"], diff --git a/activitysim/core/test/utils_testing.py b/activitysim/core/test/utils_testing.py index 223ae4cf2..22cfd20bd 100644 --- a/activitysim/core/test/utils_testing.py +++ b/activitysim/core/test/utils_testing.py @@ -1,6 +1,3 @@ -# Orca -# Copyright (C) 2016 UrbanSim Inc. -# See full license in LICENSE. from __future__ import annotations import numpy as np diff --git a/activitysim/core/workflow/checkpoint.py b/activitysim/core/workflow/checkpoint.py index 3aa4c3250..b553460a2 100644 --- a/activitysim/core/workflow/checkpoint.py +++ b/activitysim/core/workflow/checkpoint.py @@ -817,7 +817,7 @@ def load(self, checkpoint_name: str, store=None): table_name, checkpoint_name=last_checkpoint[table_name], store=store ) logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) - # register it as an orca table + # register it as an workflow table self._obj.add_table(table_name, df) loaded_tables[table_name] = df if table_name == "land_use" and "_original_zone_id" in df.columns: @@ -1144,7 +1144,7 @@ def load_dataframe(self, table_name, checkpoint_name=None): Return pandas dataframe corresponding to table_name if checkpoint_name is None, return the current (most recent) version of the table. - The table can be a checkpointed table or any registered orca table (e.g. function table) + The table can be a checkpointed table or any registered table (e.g. function table) if checkpoint_name is specified, return table as it was at that checkpoint (the most recently checkpointed version of the table at or before checkpoint_name) diff --git a/activitysim/core/workflow/runner.py b/activitysim/core/workflow/runner.py index f4b72cb41..8ba37a314 100644 --- a/activitysim/core/workflow/runner.py +++ b/activitysim/core/workflow/runner.py @@ -314,7 +314,7 @@ def by_name(self, model_name, **kwargs): Parameters ---------- model_name : str - model_name is assumed to be the name of a registered orca step + model_name is assumed to be the name of a registered workflow step """ self.t0 = time.time() try: diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py index 550b33935..22bbbf045 100644 --- a/activitysim/core/workflow/state.py +++ b/activitysim/core/workflow/state.py @@ -986,7 +986,7 @@ def get_table(self, table_name, checkpoint_name=None): Return pandas dataframe corresponding to table_name if checkpoint_name is None, return the current (most recent) version of the table. - The table can be a checkpointed table or any registered orca table (e.g. function table) + The table can be a checkpointed table or any registered table (e.g. 
function table) if checkpoint_name is specified, return table as it was at that checkpoint (the most recently checkpointed version of the table at or before checkpoint_name) @@ -1058,7 +1058,7 @@ def extend_table(self, table_name, df, axis=0): Parameters ---------- table_name : str - orca/inject table name + potentially existing table name df : pandas DataFrame """ assert axis in [0, 1] @@ -1095,7 +1095,7 @@ def extend_table(self, table_name, df, axis=0): def drop_table(self, table_name): if self.is_table(table_name): - logger.debug("drop_table dropping orca table '%s'" % table_name) + logger.debug("drop_table dropping table '%s'" % table_name) self._context.pop(table_name, None) self.existing_table_status.pop(table_name) diff --git a/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py b/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py index ce71108c9..49edf8d37 100644 --- a/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py +++ b/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py @@ -69,7 +69,7 @@ def parking_location_choice_at_university( the tour mode is auto. Parking locations are sampled weighted by the number of parking spots. The main interface to this model is the parking_location_choice_at_university() function. - This function is registered as an orca step in the example Pipeline. + This function is registered as a step in the example Pipeline. """ trace_label = "parking_location_choice_at_university" diff --git a/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py b/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py index 1d6dcda6d..7264fed35 100644 --- a/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py +++ b/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py @@ -24,7 +24,7 @@ def stop_frequency_university_parking( and after groups of trips that are on campus zones. The main interface to this model is the stop_frequency_university_parking() function. - This function is registered as an orca step in the example Pipeline. + This function is registered as a step in the example Pipeline. """ trace_label = "stop_frequency_university_parking" diff --git a/activitysim/examples/production_semcog/extensions/university_location_zone_override.py b/activitysim/examples/production_semcog/extensions/university_location_zone_override.py index cc4354c41..37c1b96e7 100644 --- a/activitysim/examples/production_semcog/extensions/university_location_zone_override.py +++ b/activitysim/examples/production_semcog/extensions/university_location_zone_override.py @@ -108,7 +108,7 @@ def university_location_zone_override( done to replicate the fact that university students can have classes all over campus. The main interface to this model is the university_location_zone_override() function. - This function is registered as an orca step in the example Pipeline. + This function is registered as a step in the example Pipeline. """ trace_label = "university_location_zone_override" @@ -171,7 +171,7 @@ def trip_destination_univ_zone_override( already handled in university_location_zone_override. The main interface to this model is the trip_destination_univ_zone_override() function. - This function is registered as an orca step in the example Pipeline. 
+ This function is registered as a step in the example Pipeline. """ trace_label = "trip_destination_univ_zone_override" diff --git a/test/cdap/test_cdap.py b/test/cdap/test_cdap.py index 8847b591b..c48a4f1ac 100644 --- a/test/cdap/test_cdap.py +++ b/test/cdap/test_cdap.py @@ -11,7 +11,7 @@ from numpy import dot from numpy.linalg import norm -# import models is necessary to initalize the model steps with orca +# import models is necessary to initalize the model steps from activitysim.abm import models # noqa: F401 from activitysim.core import config, tracing, workflow from activitysim.core.util import read_csv, to_csv diff --git a/test/joint_tours/test_joint_tours.py b/test/joint_tours/test_joint_tours.py index fe4ffad2c..40c022240 100644 --- a/test/joint_tours/test_joint_tours.py +++ b/test/joint_tours/test_joint_tours.py @@ -9,7 +9,7 @@ from numpy import dot from numpy.linalg import norm -# import models is necessary to initalize the model steps with orca +# import models is necessary to initalize the model steps from activitysim.abm import models # noqa: F401 from activitysim.core import workflow from activitysim.core.util import read_csv, to_csv diff --git a/test/non_mandatory_tour_frequency/test_non_mandatory_tour_frequency.py b/test/non_mandatory_tour_frequency/test_non_mandatory_tour_frequency.py index b7a7676b2..135ea3454 100644 --- a/test/non_mandatory_tour_frequency/test_non_mandatory_tour_frequency.py +++ b/test/non_mandatory_tour_frequency/test_non_mandatory_tour_frequency.py @@ -11,7 +11,7 @@ from numpy import dot from numpy.linalg import norm -# import models is necessary to initalize the model steps with orca +# import models is necessary to initalize the model steps from activitysim.abm import models # noqa: F401 from activitysim.core import config, tracing, workflow from activitysim.core.util import read_csv, to_csv diff --git a/test/parking_location/test_parking_location.py b/test/parking_location/test_parking_location.py index c6e960b54..3062c3a9d 100644 --- a/test/parking_location/test_parking_location.py +++ b/test/parking_location/test_parking_location.py @@ -6,16 +6,14 @@ from pathlib import Path import numpy as np -import orca import pandas as pd import pytest import yaml from numpy import dot from numpy.linalg import norm -# import models is necessary to initalize the model steps with orca +# import models is necessary to initalize the model steps from activitysim.abm import models -from activitysim.abm.models.util import estimation from activitysim.core import config, simulate, tracing, workflow from activitysim.core.util import read_csv, to_csv diff --git a/test/summarize/test_summarize.py b/test/summarize/test_summarize.py index 6943fe41e..1faf73719 100644 --- a/test/summarize/test_summarize.py +++ b/test/summarize/test_summarize.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import os from pathlib import Path @@ -5,7 +7,7 @@ import pandas as pd import pytest -# import models is necessary to initalize the model steps with orca +# import models is necessary to initalize the model steps from activitysim.abm import models from activitysim.core import los, workflow From 460203c5b54c597e4416bbef30d67fc2a9da3ee8 Mon Sep 17 00:00:00 2001 From: Jeffrey Newman Date: Sun, 12 Nov 2023 12:54:45 -0600 Subject: [PATCH 2/9] Fix memory usage (#751) * clear cache of flow.tree * categorical time period dtype * add pydantic for tests * use time_label_dtype only when available * allow missing taz skim_dict * recover tree when needed * predigitized time 
periods * pass sh_tree back again for tracing * better error message --- .../abm/models/parking_location_choice.py | 2 +- activitysim/abm/models/trip_mode_choice.py | 2 +- activitysim/abm/models/util/logsums.py | 16 +++--- activitysim/abm/models/util/mode.py | 8 ++- .../models/util/vectorize_tour_scheduling.py | 50 +++++++++++++------ activitysim/abm/tables/landuse.py | 12 +++-- activitysim/core/flow.py | 36 ++++++++++--- activitysim/core/interaction_simulate.py | 13 +++-- activitysim/core/los.py | 24 +++++++-- activitysim/core/simulate.py | 6 +-- activitysim/core/skim_dataset.py | 30 +++++++++-- conda-environments/github-actions-tests.yml | 1 + 12 files changed, 147 insertions(+), 53 deletions(-) diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 870f01af9..dbec927be 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -318,7 +318,7 @@ def parking_location( if "trip_period" not in trips_merged_df: # TODO: resolve this to the skim time period index not the label, it will be faster trips_merged_df["trip_period"] = network_los.skim_time_period_label( - trips_merged_df[proposed_trip_departure_period] + trips_merged_df[proposed_trip_departure_period], as_cat=True ) model_settings["TRIP_DEPARTURE_PERIOD"] = "trip_period" diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index 8f3e9f418..907444137 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -73,7 +73,7 @@ def trip_mode_choice( # setup skim keys assert "trip_period" not in trips_merged trips_merged["trip_period"] = network_los.skim_time_period_label( - trips_merged.depart + trips_merged.depart, as_cat=True ) orig_col = "origin" diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py index c48586a86..fff541e92 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -75,10 +75,10 @@ def compute_logsums( # FIXME - are we ok with altering choosers (so caller doesn't have to set these)? 
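# Why the as_cat=True variants below matter for memory: a skim period label
# repeated across every chooser row costs far less as an ordered categorical
# (small integer codes plus one lookup table) than as a column of Python
# strings. A self-contained sketch of the effect, using toy labels rather
# than real model data:
import numpy as np
import pandas as pd

labels = np.random.choice(["EA", "AM", "MD", "PM", "EV"], size=1_000_000)
as_str = pd.Series(labels).astype(object)  # one Python string object per row
as_cat = pd.Series(labels).astype(
    pd.CategoricalDtype(["EA", "AM", "MD", "PM", "EV"], ordered=True)
)
print(as_str.memory_usage(deep=True))  # tens of megabytes
print(as_cat.memory_usage(deep=True))  # about one megabyte of int8 codes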
if (in_period_col is not None) and (out_period_col is not None): choosers["in_period"] = network_los.skim_time_period_label( - choosers[in_period_col] + choosers[in_period_col], as_cat=True ) choosers["out_period"] = network_los.skim_time_period_label( - choosers[out_period_col] + choosers[out_period_col], as_cat=True ) elif ("in_period" not in choosers.columns) and ( "out_period" not in choosers.columns @@ -92,17 +92,21 @@ def compute_logsums( and tour_purpose in model_settings["OUT_PERIOD"] ): choosers["in_period"] = network_los.skim_time_period_label( - model_settings["IN_PERIOD"][tour_purpose] + model_settings["IN_PERIOD"][tour_purpose], + as_cat=True, + broadcast_to=choosers.index, ) choosers["out_period"] = network_los.skim_time_period_label( - model_settings["OUT_PERIOD"][tour_purpose] + model_settings["OUT_PERIOD"][tour_purpose], + as_cat=True, + broadcast_to=choosers.index, ) else: choosers["in_period"] = network_los.skim_time_period_label( - model_settings["IN_PERIOD"] + model_settings["IN_PERIOD"], as_cat=True, broadcast_to=choosers.index ) choosers["out_period"] = network_los.skim_time_period_label( - model_settings["OUT_PERIOD"] + model_settings["OUT_PERIOD"], as_cat=True, broadcast_to=choosers.index ) else: logger.error("Choosers table already has columns 'in_period' and 'out_period'.") diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index 8a75ae8b6..3c0d2a5ed 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -131,8 +131,12 @@ def run_tour_mode_choice_simulate( assert ("in_period" not in choosers) and ("out_period" not in choosers) in_time = skims["in_time_col_name"] out_time = skims["out_time_col_name"] - choosers["in_period"] = network_los.skim_time_period_label(choosers[in_time]) - choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time]) + choosers["in_period"] = network_los.skim_time_period_label( + choosers[in_time], as_cat=True + ) + choosers["out_period"] = network_los.skim_time_period_label( + choosers[out_time], as_cat=True + ) expressions.annotate_preprocessors( state, choosers, locals_dict, skims, model_settings, trace_label diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 775d84b7b..297a61e33 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -185,6 +185,12 @@ def dedupe_alt_tdd(state: workflow.State, alt_tdd, tour_purpose, trace_label): logger.info("tdd_alt_segments specified for representative logsums") + if tdd_segments is not None: + # apply categorical dtypes + tdd_segments["time_period"] = tdd_segments["time_period"].astype( + alt_tdd["out_period"].dtype + ) + with chunk.chunk_log( state, tracing.extend_trace_label(trace_label, "dedupe_alt_tdd") ) as chunk_sizer: @@ -328,11 +334,12 @@ def compute_tour_scheduling_logsums( assert "out_period" not in alt_tdd assert "in_period" not in alt_tdd - # FIXME:MEMORY - # These two lines each generate a massive array of strings, - # using a bunch of RAM and slowing things down. 
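# The dedupe/redupe pattern used in this function, in miniature: compute an
# expensive value once per unique combination of the redupe columns, then
# broadcast it back to every original row. The columns and values below are
# toy stand-ins, not the real alt_tdd schema:
import pandas as pd

alts = pd.DataFrame(
    {"duration": [2, 2, 3, 2], "out_period": ["AM", "AM", "PM", "AM"]},
    index=pd.Index([10, 11, 12, 13], name="alt_id"),
)
deduped = alts.drop_duplicates().copy()
deduped["logsums"] = [1.5, 2.5]  # stand-in for the costly logsum computation
redupe_columns = ["duration", "out_period"]
logsums = (
    alts.reset_index()
    .merge(deduped, on=redupe_columns, how="left")
    .set_index("alt_id")["logsums"]
)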
- alt_tdd["out_period"] = network_los.skim_time_period_label(alt_tdd["start"]) - alt_tdd["in_period"] = network_los.skim_time_period_label(alt_tdd["end"]) + alt_tdd["out_period"] = network_los.skim_time_period_label( + alt_tdd["start"], as_cat=True + ) + alt_tdd["in_period"] = network_los.skim_time_period_label( + alt_tdd["end"], as_cat=True + ) alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"] @@ -383,17 +390,28 @@ def compute_tour_scheduling_logsums( # tracing.log_runtime(model_name=trace_label, start_time=t0) - # redupe - join the alt_tdd_period logsums to alt_tdd to get logsums for alt_tdd - logsums = ( - pd.merge( - alt_tdd.reset_index(), - deduped_alt_tdds.reset_index(), - on=[index_name] + redupe_columns, - how="left", - ) - .set_index(index_name) - .logsums - ) + logsums = pd.Series(data=0, index=alt_tdd.index, dtype=np.float64) + left_on = [alt_tdd.index] + right_on = [deduped_alt_tdds.index] + for i in redupe_columns: + if ( + alt_tdd[i].dtype == "category" + and alt_tdd[i].dtype.ordered + and alt_tdd[i].dtype == deduped_alt_tdds[i].dtype + ): + left_on += [alt_tdd[i].cat.codes] + right_on += [deduped_alt_tdds[i].cat.codes] + else: + left_on += [alt_tdd[i].to_numpy()] + right_on += [deduped_alt_tdds[i].to_numpy()] + + logsums.iloc[:] = pd.merge( + pd.DataFrame(index=alt_tdd.index), + deduped_alt_tdds.logsums, + left_on=left_on, + right_on=right_on, + how="left", + ).logsums.to_numpy() chunk_sizer.log_df(trace_label, "logsums", logsums) del deduped_alt_tdds diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py index 8d9376b75..9abc0c2e7 100644 --- a/activitysim/abm/tables/landuse.py +++ b/activitysim/abm/tables/landuse.py @@ -23,12 +23,16 @@ def land_use(state: workflow.State): sharrow_enabled = state.settings.sharrow if sharrow_enabled: + err_msg = ( + "a zero-based land_use index is required for sharrow,\n" + "try adding `recode_pipeline_columns: true` to your settings file." + ) # when using sharrow, the land use file must be organized (either in raw # form or via recoding) so that the index is zero-based and contiguous - assert df.index.is_monotonic_increasing - assert df.index[0] == 0 - assert df.index[-1] == len(df.index) - 1 - assert df.index.dtype.kind == "i" + assert df.index.is_monotonic_increasing, err_msg + assert df.index[0] == 0, err_msg + assert df.index[-1] == len(df.index) - 1, err_msg + assert df.index.dtype.kind == "i", err_msg # try to make life easy for everybody by keeping everything in canonical order # but as long as coalesce_pipeline doesn't sort tables it coalesces, it might not stay in order diff --git a/activitysim/core/flow.py b/activitysim/core/flow.py index 6d1e8e257..92429e7d4 100644 --- a/activitysim/core/flow.py +++ b/activitysim/core/flow.py @@ -267,6 +267,7 @@ def skims_mapping( parking_col_name=None, zone_layer=None, primary_origin_col_name=None, + predigitized_time_periods=False, ): logger.info("loading skims_mapping") logger.info(f"- orig_col_name: {orig_col_name}") @@ -337,6 +338,10 @@ def skims_mapping( ), ) else: + if predigitized_time_periods: + time_rel = "_code ->" + else: + time_rel = " @" return dict( # TODO:SHARROW: organize dimensions. 
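# What the "_code ->" relationship buys over " @": when out_period/in_period
# are ordered categoricals whose categories match the skim's time_period
# index, .cat.codes are already the integer positions, so sharrow can join by
# pointer instead of matching label strings at runtime. A tiny illustration;
# the label list is illustrative, not the skim dataset's actual schema:
import pandas as pd

time_periods = ["EA", "AM", "MD", "PM", "EV"]
out_period = pd.Series(["AM", "PM", "AM"]).astype(
    pd.CategoricalDtype(time_periods, ordered=True)
)
assert list(out_period.cat.codes) == [1, 3, 1]  # positions into time_periods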
odt_skims=skim_dataset, @@ -347,16 +352,16 @@ def skims_mapping( relationships=( f"df._orig_col_name -> odt_skims.{odim}", f"df._dest_col_name -> odt_skims.{ddim}", - "df.out_period @ odt_skims.time_period", + f"df.out_period{time_rel} odt_skims.time_period", f"df._dest_col_name -> dot_skims.{odim}", f"df._orig_col_name -> dot_skims.{ddim}", - "df.in_period @ dot_skims.time_period", + f"df.in_period{time_rel} dot_skims.time_period", f"df._orig_col_name -> odr_skims.{odim}", f"df._dest_col_name -> odr_skims.{ddim}", - "df.in_period @ odr_skims.time_period", + f"df.in_period{time_rel} odr_skims.time_period", f"df._dest_col_name -> dor_skims.{odim}", f"df._orig_col_name -> dor_skims.{ddim}", - "df.out_period @ dor_skims.time_period", + f"df.out_period{time_rel} dor_skims.time_period", f"df._orig_col_name -> od_skims.{odim}", f"df._dest_col_name -> od_skims.{ddim}", ), @@ -525,6 +530,15 @@ def new_flow( cache_dir = state.filesystem.get_sharrow_cache_dir() logger.debug(f"flow.cache_dir: {cache_dir}") + predigitized_time_periods = False + if "out_period" in choosers and "in_period" in choosers: + if ( + choosers["out_period"].dtype == "category" + and choosers["in_period"].dtype == "category" + ): + choosers["out_period_code"] = choosers["out_period"].cat.codes + choosers["in_period_code"] = choosers["in_period"].cat.codes + predigitized_time_periods = True skims_mapping_ = skims_mapping( state, orig_col_name, @@ -534,6 +548,7 @@ def new_flow( parking_col_name=parking_col_name, zone_layer=zone_layer, primary_origin_col_name=primary_origin_col_name, + predigitized_time_periods=predigitized_time_periods, ) if size_term_mapping is None: size_term_mapping = {} @@ -774,6 +789,9 @@ def apply_flow( it ever again, but having a reference to it available later can be useful in debugging and tracing. Flows are cached and reused anyway, so it is generally not important to delete this at any point to free resources. + tree : sharrow.DataTree + The tree data used to compute the flow result. It is seperate from the + flow to prevent it from being cached with the flow. """ if sh is None: return None, None @@ -800,7 +818,7 @@ def apply_flow( logger.error(f"error in apply_flow: {err!s}") if required: raise - return None, None + return None, None, None else: raise with logtime(f"{flow.name}.load", trace_label or ""): @@ -822,7 +840,9 @@ def apply_flow( logger.error(f"error in apply_flow: {err!s}") if required: raise - return None, flow + tree = flow.tree + flow.tree = None + return None, flow, tree raise except Exception as err: logger.error(f"error in apply_flow: {err!s}") @@ -833,4 +853,6 @@ def apply_flow( # Detecting compilation activity when in production mode is a bug # that should be investigated. 
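# Note the revised calling convention: apply_flow now returns three values,
# and flow.tree is detached before returning so that the cached flow no
# longer pins the (potentially large) chooser DataTree in memory. A schematic
# caller is sketched below; the names (apply_flow, load_dataarray,
# replace_datasets) come from this patch, but the argument list is
# abbreviated and illustrative:
utils, flow, tree = apply_flow(state, spec, choosers, locals_dict, trace_label)
if tree is not None:
    # re-supply the tree explicitly when tracing, since the flow no longer owns it
    traced = flow.load_dataarray(tree.replace_datasets(df=choosers))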
tracing.timing_notes.add(f"compiled:{flow.name}") - return flow_result, flow + tree = flow.tree + flow.tree = None + return flow_result, flow, tree diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index 88dbfc73d..780bd53e7 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -171,7 +171,7 @@ def replace_in_index_level(mi, level, *repls): timelogger.mark("sharrow preamble", True, logger, trace_label) - sh_util, sh_flow = apply_flow( + sh_util, sh_flow, sh_tree = apply_flow( state, spec_sh, df, @@ -187,10 +187,13 @@ def replace_in_index_level(mi, level, *repls): index=df.index if extra_data is None else None, ) chunk_sizer.log_df(trace_label, "sh_util", None) # hand off to caller + if sharrow_enabled != "test": + # if not testing sharrow, we are done with this object now. + del sh_util timelogger.mark("sharrow flow", True, logger, trace_label) else: - sh_util, sh_flow = None, None + sh_util, sh_flow, sh_tree = None, None, None timelogger.mark("sharrow flow", False) if ( @@ -404,9 +407,9 @@ def to_series(x): if sh_flow is not None and trace_rows is not None and trace_rows.any(): assert type(trace_rows) == np.ndarray sh_utility_fat = sh_flow.load_dataarray( - # sh_flow.tree.replace_datasets( - # df=df.iloc[trace_rows], - # ), + sh_tree.replace_datasets( + df=df.iloc[trace_rows], + ), dtype=np.float32, ) sh_utility_fat = sh_utility_fat[trace_rows, :] diff --git a/activitysim/core/los.py b/activitysim/core/los.py index 9d2136098..d0cf66a3b 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -845,7 +845,9 @@ def get_tappairs3d(self, otap, dtap, dim3, key): return s.values - def skim_time_period_label(self, time_period, fillna=None): + def skim_time_period_label( + self, time_period, fillna=None, as_cat=False, broadcast_to=None + ): """ convert time period times to skim time period labels (e.g. 9 -> 'AM') @@ -873,6 +875,14 @@ def skim_time_period_label(self, time_period, fillna=None): assert 0 == model_time_window_min % period_minutes total_periods = model_time_window_min / period_minutes + try: + time_label_dtype = self.skim_dicts["taz"].time_label_dtype + except (KeyError, AttributeError): + # if the "taz" skim_dict is missing, or if using old SkimDict + # instead of SkimDataset, this labeling shortcut is unavailable. + time_label_dtype = str + as_cat = False + # FIXME - eventually test and use np version always? 
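# The vector branch below in miniature: departure times are binned into skim
# periods with pd.cut, and with as_cat=True the result stays categorical
# instead of being cast back to plain strings. The breakpoints and labels
# here are toy values, not a model configuration:
import pandas as pd

periods = [0, 6, 11, 16, 21, 24]         # hour breakpoints
labels = ["EA", "AM", "MD", "PM", "EV"]  # one label per bin
depart = pd.Series([5, 8, 13, 18, 23])
binned = pd.cut(depart, periods, labels=labels)  # categorical result
as_str = binned.astype(str)                      # the pre-patch behavior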
if np.isscalar(time_period): bin = ( @@ -888,6 +898,12 @@ def skim_time_period_label(self, time_period, fillna=None): result = self.skim_time_periods["labels"].get(bin, default=default) else: result = self.skim_time_periods["labels"][bin] + if broadcast_to is not None: + result = pd.Series( + data=result, + index=broadcast_to, + dtype=time_label_dtype if as_cat else str, + ) else: result = pd.cut( time_period, @@ -898,8 +914,10 @@ def skim_time_period_label(self, time_period, fillna=None): if fillna is not None: default = self.skim_time_periods["labels"][fillna] result = result.fillna(default) - result = result.astype(str) - + if as_cat: + result = result.astype(time_label_dtype) + else: + result = result.astype(str) return result def get_tazs(self, state): diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 1763d17bf..9dda2a0b2 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -536,7 +536,7 @@ def eval_utilities( locals_dict.update(state.get_global_constants()) if locals_d is not None: locals_dict.update(locals_d) - sh_util, sh_flow = apply_flow( + sh_util, sh_flow, sh_tree = apply_flow( state, spec_sh, choosers, @@ -652,7 +652,7 @@ def eval_utilities( if sh_flow is not None: try: data_sh = sh_flow.load( - sh_flow.tree.replace_datasets( + sh_tree.replace_datasets( df=choosers.iloc[offsets], ), dtype=np.float32, @@ -731,7 +731,7 @@ def eval_utilities( ) print(f"{sh_util.shape=}") print(misses) - _sh_flow_load = sh_flow.load() + _sh_flow_load = sh_flow.load(sh_tree) print("possible problematic expressions:") for expr_n, expr in enumerate(exprs): closeness = np.isclose( diff --git a/activitysim/core/skim_dataset.py b/activitysim/core/skim_dataset.py index 759ecdead..e6528f1ea 100644 --- a/activitysim/core/skim_dataset.py +++ b/activitysim/core/skim_dataset.py @@ -33,6 +33,10 @@ def __init__(self, dataset): self.time_map = { j: i for i, j in enumerate(self.dataset.indexes["time_period"]) } + self.time_label_dtype = pd.api.types.CategoricalDtype( + self.dataset.indexes["time_period"], + ordered=True, + ) self.usage = set() # track keys of skims looked up @property @@ -184,6 +188,10 @@ def __init__(self, dataset, orig_key, dest_key, time_key=None, *, time_map=None) } else: self.time_map = time_map + self.time_label_dtype = pd.api.types.CategoricalDtype( + self.dataset.indexes["time_period"], + ordered=True, + ) @property def odim(self): @@ -246,6 +254,11 @@ def set_df(self, df): ): logger.info(f"natural use for time_period={self.time_key}") positions["time_period"] = df[self.time_key] + elif ( + df[self.time_key].dtype == "category" + and df[self.time_key].dtype == self.time_label_dtype + ): + positions["time_period"] = df[self.time_key].cat.codes else: logger.info(f"vectorize lookup for time_period={self.time_key}") positions["time_period"] = pd.Series( @@ -257,11 +270,18 @@ def set_df(self, df): self.positions = {} for k, v in positions.items(): try: - self.positions[k] = v.astype(int) - except TypeError: - # possibly some missing values that are not relevant, - # fill with zeros to continue. - self.positions[k] = v.fillna(0).astype(int) + is_int = np.issubdtype(v.dtype, np.integer) + except Exception: + is_int = False + if is_int: + self.positions[k] = v + else: + try: + self.positions[k] = v.astype(int) + except TypeError: + # possibly some missing values that are not relevant, + # fill with zeros to continue. 
+ self.positions[k] = v.fillna(0).astype(int) else: self.positions = pd.DataFrame(positions).astype(int) diff --git a/conda-environments/github-actions-tests.yml b/conda-environments/github-actions-tests.yml index e4d44e8d7..c7cfd39e5 100644 --- a/conda-environments/github-actions-tests.yml +++ b/conda-environments/github-actions-tests.yml @@ -21,6 +21,7 @@ dependencies: - platformdirs = 3.2.* - psutil = 5.9.* - pyarrow = 11.* +- pydantic = 1.10.* - pypyr = 5.8.* - pytables >= 3.5.1,<3.7 # orca's constraint - pytest = 7.2.* From 9bc575ced907c5bfb84e0befa4ac942056261593 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 1 Dec 2023 11:58:43 -0600 Subject: [PATCH 3/9] Fixes #681 --- activitysim/core/los.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/activitysim/core/los.py b/activitysim/core/los.py index d0cf66a3b..7089466d7 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -787,7 +787,14 @@ def get_mazpairs(self, omaz, dmaz, attribute): # how="left")[attribute] # synthetic index method i : omaz_dmaz - i = np.asanyarray(omaz) * self.maz_ceiling + np.asanyarray(dmaz) + if self.maz_ceiling > 32767: + # too many MAZs, or un-recoded MAZ ID's that are too large + # will overflow a 32-bit index, so upgrade to 64bit. + i = np.asanyarray(omaz, dtype=np.int64) * np.int64( + self.maz_ceiling + ) + np.asanyarray(dmaz, dtype=np.int64) + else: + i = np.asanyarray(omaz) * self.maz_ceiling + np.asanyarray(dmaz) s = util.quick_loc_df(i, self.maz_to_maz_df, attribute) # FIXME - no point in returning series? From a015dddb674f04916b086f8f819b773f5b9a07a8 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 4 Dec 2023 11:10:40 -0600 Subject: [PATCH 4/9] two stage test env install --- .github/workflows/core_tests.yml | 96 +++++++++++++++++++-- conda-environments/github-actions-tests.yml | 56 ++++++------ 2 files changed, 118 insertions(+), 34 deletions(-) diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml index a72406dd2..385ce052e 100644 --- a/.github/workflows/core_tests.yml +++ b/.github/workflows/core_tests.yml @@ -45,7 +45,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -131,7 +145,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -215,7 +243,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f 
conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -298,7 +340,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -351,7 +407,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -403,7 +473,21 @@ jobs: id: cache - name: Update environment - run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + run: | + mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + mamba install --yes \ + "psutil = 5.9.5" \ + "pydantic = 1.10.13" \ + "pypyr = 5.8.0" \ + "pytables = 3.6.1" \ + "pytest-cov" \ + "pytest-regressions = 2.5.0" \ + "scikit-learn = 1.2.2" \ + "sharrow >= 2.6.0" \ + "simwrapper = 1.8.5" \ + "xarray = 2023.2.0" \ + "zarr = 2.14.2" \ + "zstandard = 0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install Larch diff --git a/conda-environments/github-actions-tests.yml b/conda-environments/github-actions-tests.yml index c7cfd39e5..eeb822183 100644 --- a/conda-environments/github-actions-tests.yml +++ b/conda-environments/github-actions-tests.yml @@ -7,32 +7,32 @@ channels: - conda-forge dependencies: - pip -- black >= 22.0,<23 -- coveralls -- cytoolz = 0.12.* -- dask = 2023.3.* -- isort -- nbmake -- numba = 0.56.* -- numpy = 1.23.* -- openmatrix = 0.3.* +- black = 22.12.0 +- coveralls = 3.3.1 +- cytoolz = 0.12.2 +- dask = 2023.3.2 +- isort = 5.12.0 +- nbmake = 1.4.6 +- numba = 0.56.4 +- numpy = 1.23.5 +- openmatrix = 0.3.5.0 - orca = 1.8 -- pandas = 1.4.* -- platformdirs = 3.2.* -- psutil = 5.9.* -- pyarrow = 11.* -- pydantic = 1.10.* -- pypyr = 5.8.* -- pytables >= 3.5.1,<3.7 # orca's constraint -- pytest = 7.2.* -- pytest-cov -- pytest-regressions -- pyyaml = 6.* -- requests = 2.28.* -- ruff -- scikit-learn = 1.2.* -- sharrow >= 2.6.0 -- simwrapper > 1.7 -- xarray = 2023.2.* -- zarr = 2.14.* -- zstandard +- pandas = 1.4.4 +#- platformdirs = 3.2.0 ## +#- psutil = 5.9.5 +#- pyarrow = 11.0.0 ## +#- pydantic = 1.10.13 +#- pypyr = 5.8.0 +#- pytables = 3.6.1 # orca's constraint ## +#- pytest = 7.2.2 ## +#- pytest-cov = 4.1.0 +#- pytest-regressions = 2.5.0 +#- pyyaml = 6.0.1 #ok +#- requests = 
2.28.2 +#- ruff = 0.1.1 +#- scikit-learn = 1.2.2 +#- sharrow >= 2.6.0 +#- simwrapper = 1.8.5 +#- xarray = 2023.2.0 +#- zarr = 2.14.2 +#- zstandard = 0.21.0 From dfd6e76bf56d3df8ed05395d159ee7c8024227d6 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 4 Dec 2023 11:55:58 -0600 Subject: [PATCH 5/9] drop buggy test for now --- .github/workflows/core_tests.yml | 132 +++++++++--------- .../estimation/test/test_larch_estimation.py | 2 +- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml index 385ce052e..61c55299c 100644 --- a/.github/workflows/core_tests.yml +++ b/.github/workflows/core_tests.yml @@ -48,18 +48,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -148,18 +148,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -246,18 +246,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -343,18 +343,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit 
!= 'true' - name: Install activitysim @@ -410,18 +410,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install activitysim @@ -476,18 +476,18 @@ jobs: run: | mamba env update -n asim-test -f conda-environments/github-actions-tests.yml mamba install --yes \ - "psutil = 5.9.5" \ - "pydantic = 1.10.13" \ - "pypyr = 5.8.0" \ - "pytables = 3.6.1" \ + "psutil=5.9.5" \ + "pydantic=1.10.13" \ + "pypyr=5.8.0" \ + "pytables=3.6.1" \ "pytest-cov" \ - "pytest-regressions = 2.5.0" \ - "scikit-learn = 1.2.2" \ - "sharrow >= 2.6.0" \ - "simwrapper = 1.8.5" \ - "xarray = 2023.2.0" \ - "zarr = 2.14.2" \ - "zstandard = 0.21.0" + "pytest-regressions=2.5.0" \ + "scikit-learn=1.2.2" \ + "sharrow>=2.6.0" \ + "simwrapper=1.8.5" \ + "xarray=2023.2.0" \ + "zarr=2.14.2" \ + "zstandard=0.21.0" if: steps.cache.outputs.cache-hit != 'true' - name: Install Larch diff --git a/activitysim/estimation/test/test_larch_estimation.py b/activitysim/estimation/test/test_larch_estimation.py index ec38c2a01..1c2904b06 100644 --- a/activitysim/estimation/test/test_larch_estimation.py +++ b/activitysim/estimation/test/test_larch_estimation.py @@ -131,7 +131,7 @@ def test_location_model( [ ("non_mandatory_tour_scheduling", "SLSQP"), ("joint_tour_scheduling", "SLSQP"), - ("atwork_subtour_scheduling", "SLSQP"), + # ("atwork_subtour_scheduling", "SLSQP"), # TODO: needs a fix, this test is unstable, probably the test data is poor ("mandatory_tour_scheduling_work", "SLSQP"), ("mandatory_tour_scheduling_school", "SLSQP"), ], From cf19e3a7e58b7bc359534c827aa36ba037675fd6 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Wed, 13 Dec 2023 11:21:13 -0600 Subject: [PATCH 6/9] overflow protection --- activitysim/core/interaction_sample.py | 1 + .../core/interaction_sample_simulate.py | 29 +++++++---- activitysim/core/logit.py | 50 ++++++++++++++++++- activitysim/core/pathbuilder.py | 1 + activitysim/core/simulate.py | 1 + activitysim/core/test/test_logit.py | 10 +++- .../test/regress/final_1_zone_tours_sh.csv | 2 +- .../test/regress/final_1_zone_trips_sh.csv | 2 +- 8 files changed, 79 insertions(+), 17 deletions(-) diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 966f4a7d8..b2cdcd0b5 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -404,6 +404,7 @@ def _interaction_sample( allow_zero_probs=allow_zero_probs, trace_label=trace_label, trace_choosers=choosers, + overflow_protection=not allow_zero_probs, ) chunk_sizer.log_df(trace_label, "probs", probs) diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 274940b30..c69880c7b 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -248,20 +248,27 @@ def _interaction_sample_simulate( # convert to probabilities (utilities exponentiated and normalized 
to probs) # probs is same shape as utilities, one row per chooser and one column for alternative - probs = logit.utils_to_probs( - state, - utilities_df, - allow_zero_probs=allow_zero_probs, - trace_label=trace_label, - trace_choosers=choosers, - ) - chunk_sizer.log_df(trace_label, "probs", probs) - if want_logsums: - logsums = logit.utils_to_logsums( - utilities_df, allow_zero_probs=allow_zero_probs + probs, logsums = logit.utils_to_probs( + state, + utilities_df, + allow_zero_probs=allow_zero_probs, + trace_label=trace_label, + trace_choosers=choosers, + overflow_protection=not allow_zero_probs, + return_logsums=True, ) chunk_sizer.log_df(trace_label, "logsums", logsums) + else: + probs = logit.utils_to_probs( + state, + utilities_df, + allow_zero_probs=allow_zero_probs, + trace_label=trace_label, + trace_choosers=choosers, + overflow_protection=not allow_zero_probs, + ) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities_df chunk_sizer.log_df(trace_label, "utilities_df", None) diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index 273e17c2d..a20701a15 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import warnings from builtins import object import numpy as np @@ -130,6 +131,8 @@ def utils_to_probs( exponentiated=False, allow_zero_probs=False, trace_choosers=None, + overflow_protection: bool = True, + return_logsums: bool = False, ): """ Convert a table of utilities to probabilities. @@ -155,6 +158,20 @@ def utils_to_probs( by report_bad_choices because it can't deduce hh_id from the interaction_dataset which is indexed on index values from alternatives df + overflow_protection : bool, default True + Always shift utility values such that the maximum utility in each row is + zero. This constant per-row shift should not fundamentally alter the + computed probabilities, but will ensure that an overflow does not occur + that will create infinite or NaN values. This will also provide effective + protection against underflow; extremely rare probabilities will round to + zero, but by definition they are extremely rare and losing them entirely + should not impact the simulation in a measureable fashion, and at least one + (and sometimes only one) alternative is guaranteed to have non-zero + probability, as long as at least one alternative has a finite utility value. + If utility values are certain to be well-behaved and non-extreme, enabling + overflow_protection will have no benefit but impose a modest computational + overhead cost. + Returns ------- probs : pandas.DataFrame @@ -167,9 +184,27 @@ def utils_to_probs( # utils_arr = utils.values.astype('float') utils_arr = utils.values - if utils_arr.dtype == np.float32 and utils_arr.max() > 85: + if allow_zero_probs: + if overflow_protection: + warnings.warn( + "cannot set overflow_protection with allow_zero_probs", stacklevel=2 + ) + overflow_protection = utils_arr.dtype == np.float32 and utils_arr.max() > 85 + if overflow_protection: + raise ValueError( + "cannot prevent expected overflow with allow_zero_probs" + ) + else: + overflow_protection = overflow_protection or ( + utils_arr.dtype == np.float32 and utils_arr.max() > 85 + ) + + if overflow_protection: # exponentiated utils will overflow, downshift them - utils_arr -= utils_arr.max(1, keepdims=True) + shifts = utils_arr.max(1, keepdims=True) + utils_arr -= shifts + else: + shifts = None if not exponentiated: # TODO: reduce memory usage by exponentiating in-place. 
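# The overflow protection added above is the standard log-sum-exp shift:
# subtracting each row's maximum utility leaves the probabilities unchanged
# and keeps exp() finite even for float32 utilities above ~88, and the shift
# is added back when logsums are requested. A self-contained check:
import numpy as np

utils = np.array([[100.0, 101.0, 103.0]], dtype=np.float32)
shifts = utils.max(1, keepdims=True)
ex = np.exp(utils - shifts)                 # finite: the max term becomes exp(0) = 1
probs = ex / ex.sum(1, keepdims=True)       # identical to the unshifted probabilities
logsums = np.log(ex.sum(1)) + np.squeeze(shifts, 1)  # shift added back, as in utils_to_probs
# np.exp(utils) without the shift would overflow float32 to inf here.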
@@ -185,6 +220,15 @@ def utils_to_probs( arr_sum = utils_arr.sum(axis=1) + if return_logsums: + with np.errstate(divide="ignore" if allow_zero_probs else "warn"): + logsums = np.log(arr_sum) + if shifts is not None: + logsums += np.squeeze(shifts, 1) + logsums = pd.Series(logsums, index=utils.index) + else: + logsums = None + if not allow_zero_probs: zero_probs = arr_sum == 0.0 if zero_probs.any(): @@ -222,6 +266,8 @@ def utils_to_probs( probs = pd.DataFrame(utils_arr, columns=utils.columns, index=utils.index) + if return_logsums: + return probs, logsums return probs diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index 01635b0ed..266a99832 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -994,6 +994,7 @@ def build_virtual_path( utilities_df, allow_zero_probs=True, trace_label=trace_label, + overflow_protection=False, ) chunk_sizer.log_df(trace_label, "probs", probs) diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 9dda2a0b2..ad72dc468 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -1032,6 +1032,7 @@ def compute_nested_probabilities( trace_label=trace_label, exponentiated=True, allow_zero_probs=True, + overflow_protection=False, ) nested_probabilities = pd.concat([nested_probabilities, probs], axis=1) diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index c07c74650..1761fd260 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -81,13 +81,19 @@ def test_utils_to_probs_raises(): idx = pd.Index(name="household_id", data=[1]) with pytest.raises(RuntimeError) as excinfo: logit.utils_to_probs( - state, pd.DataFrame([[1, 2, np.inf, 3]], index=idx), trace_label=None + state, + pd.DataFrame([[1, 2, np.inf, 3]], index=idx), + trace_label=None, + overflow_protection=False, ) assert "infinite exponentiated utilities" in str(excinfo.value) with pytest.raises(RuntimeError) as excinfo: logit.utils_to_probs( - state, pd.DataFrame([[-999, -999, -999, -999]], index=idx), trace_label=None + state, + pd.DataFrame([[-999, -999, -999, -999]], index=idx), + trace_label=None, + overflow_protection=False, ) assert "all probabilities are zero" in str(excinfo.value) diff --git a/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_tours_sh.csv b/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_tours_sh.csv index c5f84186b..317430a5b 100644 --- a/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_tours_sh.csv +++ b/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_tours_sh.csv @@ -20,7 +20,7 @@ tour_id,person_id,tour_type,tour_type_count,tour_type_num,tour_num,tour_count,to 2373898,57899,work,1,1,1,1,mandatory,1,3402.0,3746.0,20552,47.0,7.0,17.0,10.0,,,WALK,1.0388895039783694,no_subtours,,0out_0in,work 2373980,57901,work,2,1,1,2,mandatory,1,3115.0,3746.0,20552,25.0,6.0,12.0,6.0,,,SHARED3FREE,0.6022315390131013,no_subtours,,0out_0in,work 2373981,57901,work,2,2,2,2,mandatory,1,3115.0,3746.0,20552,150.0,15.0,20.0,5.0,,,SHARED2FREE,0.6232767878249469,no_subtours,,1out_0in,work -2563802,62531,school,1,1,1,1,mandatory,1,3460.0,3316.0,21869,180.0,20.0,20.0,0.0,,,SHARED3FREE,-0.7094603590463964,,,0out_0in,school +2563802,62531,school,1,1,1,1,mandatory,1,3460.0,3316.0,21869,181.0,20.0,21.0,1.0,,,SHARED3FREE,-0.7094603590463964,,,0out_0in,school 
2563821,62532,escort,1,1,1,1,non_mandatory,1,3398.0,3316.0,21869,20.0,6.0,7.0,1.0,,12.499268454965652,SHARED2FREE,-1.4604154628072699,,,0out_0in,escort 2563862,62533,escort,3,1,1,4,non_mandatory,1,3402.0,3316.0,21869,1.0,5.0,6.0,1.0,,12.534424209198946,SHARED3FREE,-1.2940574569954848,,,0out_3in,escort 2563863,62533,escort,3,2,2,4,non_mandatory,1,3519.0,3316.0,21869,99.0,11.0,11.0,0.0,,12.466623656700463,SHARED2FREE,-0.9326373013150777,,,0out_0in,escort diff --git a/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_trips_sh.csv b/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_trips_sh.csv index 3e4eecce6..adc796c13 100644 --- a/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_trips_sh.csv +++ b/activitysim/examples/placeholder_sandag/test/regress/final_1_zone_trips_sh.csv @@ -55,7 +55,7 @@ trip_id,person_id,household_id,primary_purpose,trip_num,outbound,trip_count,dest 18991850,57901,20552,work,2,True,2,3115,3460,2373981,work,,16,DRIVEALONEFREE,0.10597046751418379 18991853,57901,20552,work,1,False,1,3746,3115,2373981,home,,20,SHARED2FREE,0.23660752783217825 20510417,62531,21869,school,1,True,1,3460,3316,2563802,school,,20,SHARED3FREE,-1.4448137456466916 -20510421,62531,21869,school,1,False,1,3316,3460,2563802,home,,20,WALK,-1.5207459403958272 +20510421,62531,21869,school,1,False,1,3316,3460,2563802,home,,21,WALK,-1.5207459403958272 20510569,62532,21869,escort,1,True,1,3398,3316,2563821,escort,,6,SHARED2FREE,0.17869598454022895 20510573,62532,21869,escort,1,False,1,3316,3398,2563821,home,,7,DRIVEALONEFREE,0.20045149458253975 20510897,62533,21869,escort,1,True,1,3402,3316,2563862,escort,,5,SHARED3FREE,0.7112775892674524 From ca15754af2d82175f428d09d1e9d981831a6e2d1 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Wed, 13 Dec 2023 11:43:41 -0600 Subject: [PATCH 7/9] more testing --- activitysim/core/test/test_logit.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index 1761fd260..e249475de 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -88,6 +88,15 @@ def test_utils_to_probs_raises(): ) assert "infinite exponentiated utilities" in str(excinfo.value) + with pytest.raises(RuntimeError) as excinfo: + logit.utils_to_probs( + state, + pd.DataFrame([[1, 2, 9999, 3]], index=idx), + trace_label=None, + overflow_protection=False, + ) + assert "infinite exponentiated utilities" in str(excinfo.value) + with pytest.raises(RuntimeError) as excinfo: logit.utils_to_probs( state, @@ -97,6 +106,15 @@ def test_utils_to_probs_raises(): ) assert "all probabilities are zero" in str(excinfo.value) + # test that overflow protection works + z = logit.utils_to_probs( + state, + pd.DataFrame([[1, 2, 9999, 3]], index=idx), + trace_label=None, + overflow_protection=True, + ) + assert np.asarray(z).ravel() == pytest.approx(np.asarray([0.0, 0.0, 1.0, 0.0])) + def test_make_choices_only_one(): state = workflow.State().default_settings() From 9348a0c2de2046c294792061e0107b4a864171eb Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 15 Dec 2023 09:40:12 -0600 Subject: [PATCH 8/9] disable unstable test --- activitysim/estimation/test/test_larch_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activitysim/estimation/test/test_larch_estimation.py b/activitysim/estimation/test/test_larch_estimation.py index ec38c2a01..09355f846 100644 --- a/activitysim/estimation/test/test_larch_estimation.py +++ 
b/activitysim/estimation/test/test_larch_estimation.py @@ -131,7 +131,7 @@ def test_location_model( [ ("non_mandatory_tour_scheduling", "SLSQP"), ("joint_tour_scheduling", "SLSQP"), - ("atwork_subtour_scheduling", "SLSQP"), + # ("atwork_subtour_scheduling", "SLSQP"), # TODO this test is unstable, needs to be updated with better data ("mandatory_tour_scheduling_work", "SLSQP"), ("mandatory_tour_scheduling_school", "SLSQP"), ], From 73713c6187928b3088eb45e00e3e0610e8cf2ba7 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Tue, 2 Jan 2024 11:55:15 -0600 Subject: [PATCH 9/9] update repo pointers --- activitysim/examples/external_example_manifest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activitysim/examples/external_example_manifest.yaml b/activitysim/examples/external_example_manifest.yaml index 0be270636..5face8e8e 100644 --- a/activitysim/examples/external_example_manifest.yaml +++ b/activitysim/examples/external_example_manifest.yaml @@ -13,11 +13,11 @@ # prototype_mtc: - url: https://github.com/jpn--/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz + url: https://github.com/ActivitySim/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz sha256: ec53c6e72da1444bd5808de8c644cea75db284dfcc419b776575ba532b3ccb87 assets: test/prototype_mtc_reference_pipeline.zip: - url: https://github.com/jpn--/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip + url: https://github.com/ActivitySim/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip sha256: 394e5b403d4c61d5214493cefe161432db840ba4967c23c999d914178d43a1f0 estimation_example:
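
A closing note on the synthetic-index fix in patch 3 (Fixes #681): with enough
MAZs, omaz * maz_ceiling + dmaz can exceed 2**31 - 1 and silently wrap in
32-bit arithmetic, which is why the patch upgrades to int64 whenever
maz_ceiling > 32767. A minimal demonstration of the failure mode, with toy
values rather than model data:

    import numpy as np

    maz_ceiling = 70_000
    omaz = np.asanyarray([65_000], dtype=np.int32)
    dmaz = np.asanyarray([64_000], dtype=np.int32)
    wrapped = omaz * np.int32(maz_ceiling) + dmaz  # 32-bit product wraps past 2**31 - 1
    correct = np.asanyarray(omaz, dtype=np.int64) * np.int64(
        maz_ceiling
    ) + np.asanyarray(dmaz, dtype=np.int64)  # 4_550_064_000, as intended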