Merge branch 'develop' into input_checker

ActivitySim · Jan 28, 2024 · 359b265 · 359b265
2 parents b4761cd + 10208d0
commit 359b265
Show file tree

Hide file tree

Showing 40 changed files with 387 additions and 129 deletions.
diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml
@@ -45,7 +45,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install activitysim
@@ -131,7 +145,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install activitysim
@@ -215,7 +243,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install activitysim
@@ -298,7 +340,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install activitysim
@@ -351,7 +407,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install activitysim
@@ -403,7 +473,21 @@ jobs:
         id: cache
 
       - name: Update environment
-        run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+        run: |
+          mamba env update -n asim-test -f conda-environments/github-actions-tests.yml
+          mamba install --yes \
+          	"psutil=5.9.5" \
+          	"pydantic=1.10.13" \
+          	"pypyr=5.8.0" \
+          	"pytables=3.6.1" \
+          	"pytest-cov" \
+          	"pytest-regressions=2.5.0" \
+          	"scikit-learn=1.2.2" \
+          	"sharrow>=2.6.0" \
+          	"simwrapper=1.8.5" \
+          	"xarray=2023.2.0" \
+          	"zarr=2.14.2" \
+          	"zstandard=0.21.0"
         if: steps.cache.outputs.cache-hit != 'true'
 
       - name: Install Larch

diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py
@@ -318,7 +318,7 @@ def parking_location(
         if "trip_period" not in trips_merged_df:
             # TODO: resolve this to the skim time period index rather than the label; it will be faster
             trips_merged_df["trip_period"] = network_los.skim_time_period_label(
-                trips_merged_df[proposed_trip_departure_period]
+                trips_merged_df[proposed_trip_departure_period], as_cat=True
             )
         model_settings["TRIP_DEPARTURE_PERIOD"] = "trip_period"
 

diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 import pandas as pd
-from orca import orca
 
 from activitysim.abm.models.util import annotate, school_escort_tours_trips, trip
 from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate

diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py
@@ -33,17 +33,15 @@ def tour_od_choice(
 
     Parameters
     ----------
-    tours : orca.DataFrameWrapper
+    tours : pd.DataFrame
         lazy-loaded tours table
-    persons : orca.DataFrameWrapper
+    persons : pd.DataFrame
         lazy-loaded persons table
-    households : orca.DataFrameWrapper
+    households : pd.DataFrame
         lazy-loaded households table
-    land_use : orca.DataFrameWrapper
+    land_use : pd.DataFrame
         lazy-loaded land use data table
-    stop_frequency_alts : orca.DataFrameWrapper
-        lazy-loaded table of stop frequency alternatives, e.g. "1out2in"
-    network_los : orca._InjectableFuncWrapper
+    network_los : los.Network_LOS
         lazy-loaded activitysim.los.Network_LOS object
     chunk_size
         simulation chunk size, set in main settings.yaml

diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py
@@ -73,7 +73,7 @@ def trip_mode_choice(
     # setup skim keys
     assert "trip_period" not in trips_merged
     trips_merged["trip_period"] = network_los.skim_time_period_label(
-        trips_merged.depart
+        trips_merged.depart, as_cat=True
     )
 
     orig_col = "origin"

diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py
@@ -75,10 +75,10 @@ def compute_logsums(
     # FIXME - are we ok with altering choosers (so caller doesn't have to set these)?
     if (in_period_col is not None) and (out_period_col is not None):
         choosers["in_period"] = network_los.skim_time_period_label(
-            choosers[in_period_col]
+            choosers[in_period_col], as_cat=True
         )
         choosers["out_period"] = network_los.skim_time_period_label(
-            choosers[out_period_col]
+            choosers[out_period_col], as_cat=True
         )
     elif ("in_period" not in choosers.columns) and (
         "out_period" not in choosers.columns
@@ -92,17 +92,21 @@ def compute_logsums(
                 and tour_purpose in model_settings["OUT_PERIOD"]
             ):
                 choosers["in_period"] = network_los.skim_time_period_label(
-                    model_settings["IN_PERIOD"][tour_purpose]
+                    model_settings["IN_PERIOD"][tour_purpose],
+                    as_cat=True,
+                    broadcast_to=choosers.index,
                 )
                 choosers["out_period"] = network_los.skim_time_period_label(
-                    model_settings["OUT_PERIOD"][tour_purpose]
+                    model_settings["OUT_PERIOD"][tour_purpose],
+                    as_cat=True,
+                    broadcast_to=choosers.index,
                 )
         else:
             choosers["in_period"] = network_los.skim_time_period_label(
-                model_settings["IN_PERIOD"]
+                model_settings["IN_PERIOD"], as_cat=True, broadcast_to=choosers.index
             )
             choosers["out_period"] = network_los.skim_time_period_label(
-                model_settings["OUT_PERIOD"]
+                model_settings["OUT_PERIOD"], as_cat=True, broadcast_to=choosers.index
             )
     else:
         logger.error("Choosers table already has columns 'in_period' and 'out_period'.")

diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py
@@ -131,8 +131,12 @@ def run_tour_mode_choice_simulate(
     assert ("in_period" not in choosers) and ("out_period" not in choosers)
     in_time = skims["in_time_col_name"]
     out_time = skims["out_time_col_name"]
-    choosers["in_period"] = network_los.skim_time_period_label(choosers[in_time])
-    choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time])
+    choosers["in_period"] = network_los.skim_time_period_label(
+        choosers[in_time], as_cat=True
+    )
+    choosers["out_period"] = network_los.skim_time_period_label(
+        choosers[out_time], as_cat=True
+    )
 
     expressions.annotate_preprocessors(
         state, choosers, locals_dict, skims, model_settings, trace_label

diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 import pandas as pd
-from orca import orca
 
 from activitysim.abm.models.util import logsums as logsum
 from activitysim.abm.models.util import trip

diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py
@@ -185,6 +185,12 @@ def dedupe_alt_tdd(state: workflow.State, alt_tdd, tour_purpose, trace_label):
 
     logger.info("tdd_alt_segments specified for representative logsums")
 
+    if tdd_segments is not None:
+        # apply categorical dtypes
+        tdd_segments["time_period"] = tdd_segments["time_period"].astype(
+            alt_tdd["out_period"].dtype
+        )
+
     with chunk.chunk_log(
         state, tracing.extend_trace_label(trace_label, "dedupe_alt_tdd")
     ) as chunk_sizer:
@@ -328,11 +334,12 @@ def compute_tour_scheduling_logsums(
     assert "out_period" not in alt_tdd
     assert "in_period" not in alt_tdd
 
-    # FIXME:MEMORY
-    #  These two lines each generate a massive array of strings,
-    #  using a bunch of RAM and slowing things down.
-    alt_tdd["out_period"] = network_los.skim_time_period_label(alt_tdd["start"])
-    alt_tdd["in_period"] = network_los.skim_time_period_label(alt_tdd["end"])
+    alt_tdd["out_period"] = network_los.skim_time_period_label(
+        alt_tdd["start"], as_cat=True
+    )
+    alt_tdd["in_period"] = network_los.skim_time_period_label(
+        alt_tdd["end"], as_cat=True
+    )
 
     alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"]
 
@@ -383,17 +390,28 @@ def compute_tour_scheduling_logsums(
 
         # tracing.log_runtime(model_name=trace_label, start_time=t0)
 
-        # redupe - join the alt_tdd_period logsums to alt_tdd to get logsums for alt_tdd
-        logsums = (
-            pd.merge(
-                alt_tdd.reset_index(),
-                deduped_alt_tdds.reset_index(),
-                on=[index_name] + redupe_columns,
-                how="left",
-            )
-            .set_index(index_name)
-            .logsums
-        )
+        logsums = pd.Series(data=0, index=alt_tdd.index, dtype=np.float64)
+        left_on = [alt_tdd.index]
+        right_on = [deduped_alt_tdds.index]
+        for i in redupe_columns:
+            if (
+                alt_tdd[i].dtype == "category"
+                and alt_tdd[i].dtype.ordered
+                and alt_tdd[i].dtype == deduped_alt_tdds[i].dtype
+            ):
+                left_on += [alt_tdd[i].cat.codes]
+                right_on += [deduped_alt_tdds[i].cat.codes]
+            else:
+                left_on += [alt_tdd[i].to_numpy()]
+                right_on += [deduped_alt_tdds[i].to_numpy()]
+
+        logsums.iloc[:] = pd.merge(
+            pd.DataFrame(index=alt_tdd.index),
+            deduped_alt_tdds.logsums,
+            left_on=left_on,
+            right_on=right_on,
+            how="left",
+        ).logsums.to_numpy()
         chunk_sizer.log_df(trace_label, "logsums", logsums)
 
         del deduped_alt_tdds

diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py
@@ -105,13 +105,12 @@ def vehicle_allocation(
     Parameters
     ----------
     state : workflow.State
-    persons : orca.DataFrameWrapper
-    households : orca.DataFrameWrapper
-    vehicles : orca.DataFrameWrapper
-    vehicles_merged : orca.DataFrameWrapper
-    tours : orca.DataFrameWrapper
-    tours_merged : orca.DataFrameWrapper
-    chunk_size : orca.injectable
+    persons : pd.DataFrame
+    households : pd.DataFrame
+    vehicles : pd.DataFrame
+    tours : pd.DataFrame
+    tours_merged : pd.DataFrame
+    network_los : los.Network_LOS
     """
     trace_label = "vehicle_allocation"
     model_settings_file_name = "vehicle_allocation.yaml"

diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py
@@ -316,7 +316,7 @@ def iterate_vehicle_type_choice(
     locals_dict : dict
         additional variables available when writing expressions
     estimator : Estimator object
-    chunk_size : orca.injectable
+    chunk_size : int
     trace_label : str
 
     Returns
@@ -516,9 +516,9 @@ def vehicle_type_choice(
 
     Parameters
     ----------
-    persons : orca.DataFrameWrapper
-    households : orca.DataFrameWrapper
-    vehicles : orca.DataFrameWrapper
+    persons : pd.DataFrame
+    households : pd.DataFrame
+    vehicles : pd.DataFrame
     vehicles_merged : pd.DataFrame
     """
     trace_label = "vehicle_type_choice"

diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py
@@ -23,12 +23,16 @@ def land_use(state: workflow.State):
 
     sharrow_enabled = state.settings.sharrow
     if sharrow_enabled:
+        err_msg = (
+            "a zero-based land_use index is required for sharrow,\n"
+            "try adding `recode_pipeline_columns: true` to your settings file."
+        )
         # when using sharrow, the land use file must be organized (either in raw
         # form or via recoding) so that the index is zero-based and contiguous
-        assert df.index.is_monotonic_increasing
-        assert df.index[0] == 0
-        assert df.index[-1] == len(df.index) - 1
-        assert df.index.dtype.kind == "i"
+        assert df.index.is_monotonic_increasing, err_msg
+        assert df.index[0] == 0, err_msg
+        assert df.index[-1] == len(df.index) - 1, err_msg
+        assert df.index.dtype.kind == "i", err_msg
 
     # try to make life easy for everybody by keeping everything in canonical order
     # but as long as coalesce_pipeline doesn't sort tables it coalesces, it might not stay in order

diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py
@@ -181,7 +181,7 @@ def run_component(state, component_name):
 def teardown_component(state, component_name):
     logger.info("teardown_component: %s", component_name)
 
-    # use the pipeline module to clear out all the orca tables, so
+    # use the pipeline module to clear out all the tables, so
     # the next benchmark run has a clean slate.
     # anything needed should be reloaded from the pipeline checkpoint file
     pipeline_tables = state.registered_tables()