
Estimation Enhancements #917

Draft pull request: wants to merge 36 commits into base: main

Commits (36)
44e3c21  multiprocess initial commit (dhensle, Aug 17, 2024)
9b29350  blacken (dhensle, Aug 17, 2024)
3434c95  parquet format for EDBs (dhensle, Sep 6, 2024)
914b9ca  adding pkl, fixing edb concat and write (dhensle, Sep 20, 2024)
d2e181f  fixing double naming of coefficient files (dhensle, Sep 23, 2024)
c138f0f  blacken (dhensle, Sep 23, 2024)
6d35f9f  fixing missing cdap coefficients file, write pickle function (dhensle, Sep 23, 2024)
27c4ce4  combact edb writing, index duplication, parquet datatypes (dhensle, Sep 24, 2024)
cd3d07e  sorting dest choice bundles (dhensle, Sep 25, 2024)
8a1fa3c  adding coalesce edbs as its own step (dhensle, Sep 25, 2024)
e8c03e6  CI testing initial commit (dhensle, Sep 27, 2024)
fe625e2  Merge pull request #1 from dhensle/estimation_enhancements (dhensle, Sep 30, 2024)
8d80e2e  infer.py CI testing (dhensle, Oct 7, 2024)
1459e48  estimation sampling for non-mandatory and joint tours (dhensle, Oct 8, 2024)
3fd7851  adding survey choice to choices_df in interaction_sample (dhensle, Oct 12, 2024)
23ba662  adding option to delete the mp edb subdirs (dhensle, Oct 15, 2024)
0a1bd5c  changes supporting sandag abm3 estimation mode (dhensle, Oct 21, 2024)
8a4b281  running test sandag example through trip dest sample (dhensle, Nov 7, 2024)
6a50abb  Estimation Pydantic (#2) (jpn--, Nov 7, 2024)
45ee4e8  Estimation settings pydantic update (dhensle, Nov 8, 2024)
4af3fa9  new compact formatting (dhensle, Nov 12, 2024)
36dfb45  handling multiple columns for parquet write (dhensle, Nov 12, 2024)
e4eb045  dropping duplicate columns (dhensle, Nov 22, 2024)
b2972cc  actually removing duplicate columns (dhensle, Nov 22, 2024)
8d4dd37  dfs with correct indexes and correct mp sorting (dhensle, Nov 23, 2024)
1fb41a8  ignore index on sort for mp coalesce edbs (dhensle, Nov 23, 2024)
87b414f  updating estimation checks to allow for non-zero household_sample_size (dhensle, Dec 5, 2024)
3b4974c  Re-estimation (#3) (jpn--, Dec 9, 2024)
aa874f6  Removing estimation.yaml settings that are no longer needed (dhensle, Dec 14, 2024)
a5e137b  Merge remote-tracking branch 'upstream/main' into estimation_enhancem… (dhensle, Dec 14, 2024)
af7e67e  fixing unit tests, setting parquet edb default (dhensle, Dec 14, 2024)
99822ca  one more missed estimation.yaml (dhensle, Dec 14, 2024)
1777637  using df.items for pandas 2 compatibility (dhensle, Dec 14, 2024)
420ed8e  tidy doc (jpn--, Dec 17, 2024)
44bf037  updating edb file name for NMTF (dhensle, Dec 21, 2024)
8bccf2f  updating numba and pandas in the conda env files (dhensle, Dec 26, 2024)
116 changes: 114 additions & 2 deletions .github/workflows/core_tests.yml
@@ -408,8 +408,8 @@ jobs:
conda env update -n asim-test -f conda-environments/github-actions-tests.yml
if: steps.cache.outputs.cache-hit != 'true'

- name: Install Larch
run: mamba install "larch>=5.7.1"
- name: Install Larch v6
run: python -m pip install larch6

- name: Install activitysim
# installing without dependencies is faster, we trust that all needed dependencies
@@ -427,6 +427,118 @@ jobs:
run: |
python -m pytest activitysim/estimation/test/test_larch_estimation.py --durations=0

estimation_notebooks:
needs: foundation
env:
python-version: "3.10"
label: linux-64
defaults:
run:
shell: bash -l {0}
name: Estimation Notebooks Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Setup Miniforge
uses: conda-incubator/setup-miniconda@v3
with:
miniforge-version: latest
activate-environment: asim-test
python-version: ${{ env.python-version }}

- name: Set cache date for year and month
run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV

- uses: actions/cache@v4
with:
path: ${{ env.CONDA }}/envs
key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
id: cache

- name: Update environment
run: |
conda env update -n asim-test -f conda-environments/github-actions-tests.yml
if: steps.cache.outputs.cache-hit != 'true'

- name: Install Graphviz
uses: ts-graphviz/setup-graphviz@v2

- name: Install Larch v6
run: python -m pip install larch6 "pandas<2" pydot

- name: Install activitysim
# installing without dependencies is faster, we trust that all needed dependencies
# are in the conda environment defined above. Also, this avoids pip getting
# confused and reinstalling tables (pytables).
run: |
python -m pip install -e . --no-deps

- name: Conda checkup
run: |
conda info -a
conda list

- name: Create Estimation Data
run: |
python activitysim/examples/example_estimation/notebooks/est_mode_setup.py --household_sample_size 5000

- name: Test Estimation Notebooks
run: |
python -m pytest activitysim/examples/example_estimation/notebooks/*.ipynb \
--nbmake-timeout=3000 \
--ignore=activitysim/examples/example_estimation/notebooks/01_estimation_mode.ipynb

estimation_edb_creation:
needs: foundation
env:
python-version: "3.10"
label: linux-64
defaults:
run:
shell: bash -l {0}
name: estimation_edb_creation_test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Setup Miniforge
uses: conda-incubator/setup-miniconda@v3
with:
miniforge-version: latest
activate-environment: asim-test
python-version: ${{ env.python-version }}

- name: Set cache date for year and month
run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV

- uses: actions/cache@v4
with:
path: ${{ env.CONDA }}/envs
key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
id: cache

- name: Update environment
run: |
conda env update -n asim-test -f conda-environments/github-actions-tests.yml
if: steps.cache.outputs.cache-hit != 'true'

- name: Install activitysim
# installing without dependencies is faster, we trust that all needed dependencies
# are in the conda environment defined above. Also, this avoids pip getting
# confused and reinstalling tables (pytables).
run: |
python -m pip install -e . --no-deps

- name: Conda checkup
run: |
conda info -a
conda list

- name: Test Estimation EDB Creation
run: |
python -m pytest activitysim/estimation/test/test_edb_creation/test_edb_formation.py --durations=0

develop-docbuild:
needs: foundation
if: github.ref_name == 'main'
4 changes: 2 additions & 2 deletions activitysim/abm/models/cdap.py
@@ -180,7 +180,7 @@ def cdap_simulate(
estimator.write_coefficients(coefficients_df, model_settings)
estimator.write_table(
cdap_interaction_coefficients,
"interaction_coefficients",
"cdap_interaction_coefficients",
index=False,
append=False,
)
@@ -189,7 +189,7 @@ def cdap_simulate(
spec = cdap.get_cached_spec(state, hhsize)
estimator.write_table(spec, "spec_%s" % hhsize, append=False)
if add_joint_tour_utility:
joint_spec = cdap.get_cached_joint_spec(hhsize)
joint_spec = cdap.get_cached_joint_spec(state, hhsize)
estimator.write_table(
joint_spec, "joint_spec_%s" % hhsize, append=False
)
11 changes: 6 additions & 5 deletions activitysim/abm/models/disaggregate_accessibility.py
@@ -753,11 +753,12 @@ def get_disaggregate_logsums(
state.filesystem, model_name + ".yaml"
)
model_settings.SAMPLE_SIZE = disagg_model_settings.DESTINATION_SAMPLE_SIZE
estimator = estimation.manager.begin_estimation(state, trace_label)
if estimator:
location_choice.write_estimation_specs(
state, estimator, model_settings, model_name + ".yaml"
)
# estimator = estimation.manager.begin_estimation(state, trace_label)
# if estimator:
# location_choice.write_estimation_specs(
# state, estimator, model_settings, model_name + ".yaml"
# )
estimator = None

# Append table references in settings with "proto_"
# This avoids having to make duplicate copies of config files for disagg accessibilities
Expand Down
5 changes: 4 additions & 1 deletion activitysim/abm/models/joint_tour_frequency.py
@@ -192,16 +192,19 @@ def joint_tour_frequency(
print(f"len(joint_tours) {len(joint_tours)}")

different = False
# need to check households as well because the full survey sample may not be used
# (e.g. if we set household_sample_size in settings.yaml)
survey_tours_not_in_tours = survey_tours[
~survey_tours.index.isin(joint_tours.index)
& survey_tours.household_id.isin(households.index)
]
if len(survey_tours_not_in_tours) > 0:
print(f"survey_tours_not_in_tours\n{survey_tours_not_in_tours}")
different = True
tours_not_in_survey_tours = joint_tours[
~joint_tours.index.isin(survey_tours.index)
]
if len(survey_tours_not_in_tours) > 0:
if len(tours_not_in_survey_tours) > 0:
print(f"tours_not_in_survey_tours\n{tours_not_in_survey_tours}")
different = True
assert not different
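The two-way comparison above (survey tours missing from the model, model tours missing from the survey, with the household filter applied only on the survey side) can be captured in a small helper. A minimal sketch with toy data; the `symmetric_index_diff` name and the sample values are mine, not from the PR:

```python
import pandas as pd

def symmetric_index_diff(survey, model, households):
    """Rows present in one table but not the other.

    Survey rows are also restricted to sampled households, because with a
    non-zero household_sample_size the model only sees a subset of households.
    """
    missing_from_model = survey[
        ~survey.index.isin(model.index)
        & survey.household_id.isin(households)
    ]
    extra_in_model = model[~model.index.isin(survey.index)]
    return missing_from_model, extra_in_model

# toy data: survey tour 3 belongs to an unsampled household, model has extra tour 4
survey = pd.DataFrame(
    {"household_id": [1, 1, 9]}, index=pd.Index([1, 2, 3], name="tour_id")
)
model = pd.DataFrame(
    {"household_id": [1, 1, 1]}, index=pd.Index([1, 2, 4], name="tour_id")
)
missing, extra = symmetric_index_diff(survey, model, pd.Index([1]))
# tour 3 is excluded (household 9 not sampled); tour 4 is flagged as extra
```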
37 changes: 2 additions & 35 deletions activitysim/abm/models/location_choice.py
@@ -19,7 +19,6 @@
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
from activitysim.core.util import reindex


"""
The school/workplace location model predicts the zones in which various people will
work or attend school.
@@ -140,7 +139,7 @@ def _location_sample(

sample_size = model_settings.SAMPLE_SIZE

if estimator:
if estimator and model_settings.ESTIMATION_SAMPLE_SIZE >= 0:
sample_size = model_settings.ESTIMATION_SAMPLE_SIZE
logger.info(
f"Estimation mode for {trace_label} using sample size of {sample_size}"
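The guard added in this hunk makes a negative `ESTIMATION_SAMPLE_SIZE` mean "keep the model's normal sample size." A condensed sketch of the branch; the `resolve_sample_size` helper is my own framing of the logic, not code from the PR, and the reading of `0` as "do not sample" is an assumption based on ActivitySim's usual sample-size convention:

```python
def resolve_sample_size(estimator, sample_size: int, estimation_sample_size: int) -> int:
    # In estimation mode, a non-negative ESTIMATION_SAMPLE_SIZE overrides the
    # model's SAMPLE_SIZE (0 assumed to mean "do not sample; keep all
    # alternatives"); a negative value leaves the normal sample size in place.
    if estimator and estimation_sample_size >= 0:
        return estimation_sample_size
    return sample_size
```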
@@ -402,7 +401,7 @@ def location_presample(

# choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total
maz_choices = tour_destination.choose_MAZ_for_TAZ(
state, taz_sample, MAZ_size_terms, trace_label
state, taz_sample, MAZ_size_terms, trace_label, model_settings
)

assert DEST_MAZ in maz_choices
@@ -491,38 +490,6 @@ def run_location_sample(
trace_label=trace_label,
)

# adding observed choice to alt set when running in estimation mode
if estimator:
# grabbing survey values
survey_persons = estimation.manager.get_survey_table("persons")
if "school_location" in trace_label:
survey_choices = survey_persons["school_zone_id"].reset_index()
elif ("workplace_location" in trace_label) and ("external" not in trace_label):
survey_choices = survey_persons["workplace_zone_id"].reset_index()
else:
return choices
survey_choices.columns = ["person_id", "alt_dest"]
survey_choices = survey_choices[
survey_choices["person_id"].isin(choices.index)
& (survey_choices.alt_dest > 0)
]
# merging survey destination into table if not available
joined_data = survey_choices.merge(
choices, on=["person_id", "alt_dest"], how="left", indicator=True
)
missing_rows = joined_data[joined_data["_merge"] == "left_only"]
missing_rows["pick_count"] = 1
if len(missing_rows) > 0:
new_choices = missing_rows[
["person_id", "alt_dest", "prob", "pick_count"]
].set_index("person_id")
choices = choices.append(new_choices, ignore_index=False).sort_index()
# making probability the mean of all other sampled destinations by person
# FIXME is there a better way to do this? Does this even matter for estimation?
choices["prob"] = choices["prob"].fillna(
choices.groupby("person_id")["prob"].transform("mean")
)

return choices


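The block deleted above hinged on two pandas idioms: `merge(..., indicator=True)` to find survey choices absent from the sampled alternative set, and a grouped `transform("mean")` to backfill their unknown probabilities. A standalone sketch of that pattern with toy values (the data here is illustrative, and `pd.concat` stands in for `DataFrame.append`, which pandas 2 removed):

```python
import pandas as pd

# sampled alternatives per person: alt_dest, sampling probability, pick count
choices = pd.DataFrame(
    {"alt_dest": [10, 20], "prob": [0.6, 0.4], "pick_count": [3, 2]},
    index=pd.Index([1, 1], name="person_id"),
)
# the observed survey choice, which may not be in the sampled set
survey_choices = pd.DataFrame({"person_id": [1], "alt_dest": [30]})

# flag survey choices missing from the sampled alternatives
joined = survey_choices.merge(
    choices.reset_index(), on=["person_id", "alt_dest"], how="left", indicator=True
)
missing = joined[joined["_merge"] == "left_only"].copy()
missing["pick_count"] = 1

new_rows = missing[["person_id", "alt_dest", "prob", "pick_count"]].set_index("person_id")
choices = pd.concat([choices, new_rows]).sort_index()

# backfill the unknown probability with the mean over each person's sampled alternatives
choices["prob"] = choices["prob"].fillna(
    choices.groupby("person_id")["prob"].transform("mean")
)
```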
18 changes: 14 additions & 4 deletions activitysim/abm/models/non_mandatory_tour_frequency.py
@@ -288,14 +288,22 @@ def non_mandatory_tour_frequency(
)

if estimator:
estimator.write_spec(model_settings, bundle_directory=True)
bundle_directory = True
# writing to separate subdirectory for each segment if multiprocessing
if state.settings.multiprocess:
bundle_directory = False
estimator.write_spec(model_settings, bundle_directory=bundle_directory)
estimator.write_model_settings(
model_settings, model_settings_file_name, bundle_directory=True
model_settings,
model_settings_file_name,
bundle_directory=bundle_directory,
)
# preserving coefficients file name makes bringing back updated coefficients more straightforward
estimator.write_coefficients(coefficients_df, segment_settings)
estimator.write_choosers(chooser_segment)
estimator.write_alternatives(alternatives, bundle_directory=True)
estimator.write_alternatives(
alternatives, bundle_directory=bundle_directory
)

# FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
# should we do it here or have interaction_simulate do it?
@@ -434,8 +442,10 @@ def non_mandatory_tour_frequency(
if estimator:
# make sure they created the right tours
survey_tours = estimation.manager.get_survey_table("tours").sort_index()
# need the household_id check below in case household_sample_size != 0
non_mandatory_survey_tours = survey_tours[
survey_tours.tour_category == "non_mandatory"
(survey_tours.tour_category == "non_mandatory")
& survey_tours.household_id.isin(persons.household_id)
]
# need to remove the pure-escort tours from the survey tours table for comparison below
if state.is_table("school_escort_tours"):
5 changes: 4 additions & 1 deletion activitysim/abm/models/school_escorting.py
@@ -493,7 +493,10 @@ def school_escorting(
coefficients_df, file_name=stage.upper() + "_COEFFICIENTS"
)
estimator.write_choosers(choosers)
estimator.write_alternatives(alts, bundle_directory=True)
if state.settings.multiprocess:
estimator.write_alternatives(alts, bundle_directory=False)
else:
estimator.write_alternatives(alts, bundle_directory=True)

# FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
# should we do it here or have interaction_simulate do it?
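The multiprocess-dependent `bundle_directory` toggle that this diff adds to several models reduces to a single boolean expression. A sketch with stand-in stub objects; the `_Settings`/`_State` classes are mine, and only the `state.settings.multiprocess` attribute mirrors ActivitySim:

```python
class _Settings:
    # stand-in for ActivitySim's run settings
    multiprocess = True

class _State:
    settings = _Settings()

state = _State()

# Under multiprocessing each segment writes its own EDB subdirectory, so the
# spec/settings/alternatives files must not go into a shared bundle directory;
# single-process runs keep the original bundled layout.
bundle_directory = not state.settings.multiprocess
```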
18 changes: 14 additions & 4 deletions activitysim/abm/models/stop_frequency.py
@@ -197,9 +197,15 @@ def stop_frequency(

if estimator:
estimator.write_spec(segment_settings, bundle_directory=False)
estimator.write_model_settings(
model_settings, model_settings_file_name, bundle_directory=True
)
# writing to separate subdirectory for each segment if multiprocessing
if state.settings.multiprocess:
estimator.write_model_settings(
model_settings, model_settings_file_name, bundle_directory=False
)
else:
estimator.write_model_settings(
model_settings, model_settings_file_name, bundle_directory=True
)
estimator.write_coefficients(coefficients_df, segment_settings)
estimator.write_choosers(chooser_segment)

@@ -271,7 +277,11 @@ def stop_frequency(

survey_trips = estimation.manager.get_survey_table(table_name="trips")
different = False
survey_trips_not_in_trips = survey_trips[~survey_trips.index.isin(trips.index)]
# need the check below on household_id in case household_sample_size != 0
survey_trips_not_in_trips = survey_trips[
~survey_trips.index.isin(trips.index)
& survey_trips.household_id.isin(trips.household_id)
]
if len(survey_trips_not_in_trips) > 0:
print(f"survey_trips_not_in_trips\n{survey_trips_not_in_trips}")
different = True