Merged data filtering with group-by category selection

implemented various changes discussed with relevant CIERA staff; implemented data multiselect in category space; renormalized financial year
CIERA-Northwestern · Jul 23, 2024 · 5ae0402 · 5ae0402
1 parent 71906e1
commit 5ae0402
Show file tree

Hide file tree

Showing 7 changed files with 61 additions and 129 deletions.
diff --git a/press_dash_lib/dash_builder.py b/press_dash_lib/dash_builder.py
@@ -143,9 +143,7 @@ def recategorize_data(
     def filter_data(
         _self,
         recategorized_df: pd.DataFrame,
-        text_filters: dict[str, str] = {},
         categorical_filters: dict[str, list] = {},
-        numerical_filters: dict[str, tuple] = {},
     ) -> pd.DataFrame:
         '''Filter what data shows up in the dashboard.
 
@@ -163,9 +161,7 @@ def filter_data(
         with st.spinner(msg):
             return _self.data_handler.filter_data(
                 recategorized_df=recategorized_df,
-                text_filters=text_filters,
                 categorical_filters=categorical_filters,
-                numerical_filters=numerical_filters
             )
 
     @st.cache_data
@@ -178,7 +174,7 @@ def aggregate(
         aggregation_method: str = 'count',
     ) -> Union[pd.Series, pd.DataFrame]:
         '''Aggregate stats.
-
+        
         Args:
             df: The dataframe containing the selected data.
             x_column: The column containing the year or other time bin value.
@@ -192,6 +188,8 @@ def aggregate(
                 or
             totals: The series containing the counts per year
         '''
+
+
         msg = 'Aggregating...'
         print(msg)
         with st.spinner(msg):

diff --git a/press_dash_lib/data_handler.py b/press_dash_lib/data_handler.py
@@ -226,17 +226,13 @@ def recategorize_data(
     def filter_data(
         self,
         recategorized_df: pd.DataFrame,
-        text_filters: dict[str, str] = {},
         categorical_filters: dict[str, list] = {},
-        numerical_filters: dict[str, tuple] = {},
     ) -> pd.DataFrame:
         '''Filter what data shows up in the dashboard.
 
         Args:
             recategorized_df: The dataframe containing the data.
-            text_filters (dict): Search fields for text.
             categorical_filters (dict): How categories are filtered.
-            numerical_filters (dict): Ranges for numerical data filters
 
         Returns:
             selected_df: The dataframe containing the selected data.
@@ -245,22 +241,10 @@ def filter_data(
         # Initialized
         is_included = np.ones(len(recategorized_df), dtype=bool)
 
-        # Text filter
-        for text_filter_col, search_str in text_filters.items():
-            is_matching = recategorized_df[text_filter_col].str.extract('(' + search_str + ')', flags=re.IGNORECASE).notna().values[:,0]
-            is_included = is_included & is_matching
-
         # Categories filter
         for cat_filter_col, selected_cats in categorical_filters.items():
             is_included = is_included & recategorized_df[cat_filter_col].isin(selected_cats)
 
-        # Range filters
-        for num_filter_col, column_range in numerical_filters.items():
-            is_included = is_included & (
-                (column_range[0] <= recategorized_df[num_filter_col]) &
-                (recategorized_df[num_filter_col] <= column_range[1])
-            )
-
         selected_df = recategorized_df.loc[is_included]
 
         return selected_df
diff --git a/press_dash_lib/data_viewer.py b/press_dash_lib/data_viewer.py
@@ -129,7 +129,7 @@ def lineplot(
         Returns:
             fig: The figure containing the plot.
         '''
-        df.index = df.index + 1
+        #df.index = df.index + 1
 
         # Modify data if cumulative
         if cumulative:

diff --git a/press_dash_lib/interface.py b/press_dash_lib/interface.py
@@ -112,12 +112,13 @@ def request_data_axes(
         if key in ask_for:
             value, ind = selectbox(
                 st_loc,
-                'What do you want to group the data by?',
-                display_options.get('groupby_column', self.config['categorical_columns']),
+                'What do you want to categorize the data by?',
+                options=display_options.get('groupby_column', self.config['categorical_columns']),
                 index=display_defaults.get(key + '_ind', 0),
             )
-            selected_settings[key] = value
-            selected_settings[key + '_ind'] = ind
+
+        selected_settings[key] = value
+        selected_settings[key + '_ind'] = ind
 
         return selected_settings
 
@@ -191,14 +192,14 @@ def request_data_settings(
 
         return selected_settings
 
-    def request_filter_settings(
+    def process_filter_settings(
             self,
             st_loc,
             df: pd.DataFrame,
             ask_for: list[str] = ['text', 'categorical', 'numerical'],
             local_key: str = None,
             display_defaults: dict = {},
-            display_options: dict = {},
+            value: str = None,
             selected_settings: dict = None,
             tag: str = None,
     ) -> dict:
@@ -233,88 +234,25 @@ def request_filter_settings(
             tag = ''
         else:
             tag += ':'
-
-        key = 'text'
-        if key in ask_for:
-            current = selected_settings.setdefault(key, {})
-            # Select which columns to filter on
-            if len(current) == 0:
-                multiselect_default = []
-            else:
-                multiselect_default = list(current)
-            filter_columns = st_loc.multiselect(
-                'What columns do you want to search? (case insensitive; not a smart search)',
-                options=display_options.get(key, self.config['text_columns']),
-                default=multiselect_default,
-                key=tag + key
-            )
-            for col in filter_columns:
-                # Check the current values then the passed-in defaults
-                # for a default
-                default = current.get(col,'')
-                default = display_defaults.get(key, {}).get(col, default)
-                selected_settings[key][col] = st_loc.text_input(
-                    '"{}" column: What do you want to search for?'.format(col),
-                    value=default,
-                    key=tag + key + ':' + col
-                )
-
+
         key = 'categorical'
         if key in ask_for:
             current = selected_settings.setdefault(key, {})
-            # Select which columns to filter on
-            if len(current) == 0:
-                multiselect_default = []
-            else:
-                multiselect_default = list(current)
-            filter_columns = st_loc.multiselect(
-                'What categorical columns do you want to filter on?',
-                options=display_options.get(key, self.config['categorical_columns']),
-                default=multiselect_default,
-                key=tag + key
-            )
-            for col in filter_columns:
-                possible_columns = pd.unique(df[col])
-                # Check the current values then the passed-in defaults
-                # for a default
-                default = current.get(col, possible_columns)
-                default = display_defaults.get(key, {}).get(col, default)
-                selected_settings[key][col] = st_loc.multiselect(
-                    '"{}" column: What groups to include?'.format(col),
-                    possible_columns,
-                    default=default,
-                    key=tag + key + ':' + col
-                )
-
-        key = 'numerical'
-        if key in ask_for:
-            current = selected_settings.setdefault(key, {})
-            # Select which columns to filter on
-            if len(current) == 0:
-                multiselect_default = []
-            else:
-                multiselect_default = list(current)
-            filter_columns = st_loc.multiselect(
-                'What numerical columns do you want to filter on?',
-                options=display_options.get(key, self.config['numerical_columns']),
-                default=multiselect_default,
-                key=tag + key
+            key=tag + key
+
+
+            possible_columns = pd.unique(df[value])
+            # Check the current values then the passed-in defaults
+            # for a default
+            default = current.get(value, possible_columns)
+            default = display_defaults.get(key, {}).get(value, default)
+            selected_settings[key][value] = st_loc.multiselect(
+                '"{}" column: What groups to include?'.format(value),
+                possible_columns,
+                default=default,
+                key=tag + key + ':' + value
             )
-            for col in filter_columns:
-                value_min = df[col].min()
-                value_max = df[col].max()
-                # Check the current values then the passed-in defaults
-                # for a default
-                default = current.get(col, (value_min, value_max))
-                default = display_defaults.get(key, {}).get(col, default)
-                selected_settings[key][col] = st_loc.slider(
-                    '"{}" column: What range to include?'.format(col),
-                    min_value=default[0],
-                    max_value=default[1],
-                    value=default,
-                    key=tag + key + ':' + col
-                )
-
+
         return selected_settings
 
     def request_view_settings(
@@ -381,7 +319,10 @@ def request_view_settings(
             'include_annotations',
             'annotations_ha',
             'font',
-            'color_palette'
+            'color_palette',
+            'category_colors',
+            'totals',
+            'kwargs'
        ]
         if ask_for == 'all':
             ask_for = available_settings
@@ -579,7 +520,7 @@ def request_view_settings(
                     'legend scale',
                     0.1,
                     2.,
-                    value=display_defaults.get(key, 1.),
+                    value=display_defaults.get(key, 1.32),
                     key=tag + key,
                 )
             key = 'legend_x'
@@ -588,7 +529,7 @@ def request_view_settings(
                     'legend x',
                     0.,
                     1.5,
-                    value=display_defaults.get(key, 1.),
+                    value=display_defaults.get(key, 0.),
                     key=tag + key,
                 )
             key = 'legend_y'
@@ -597,7 +538,7 @@ def request_view_settings(
                     'legend y',
                     0.,
                     1.5,
-                    value=display_defaults.get(key, 1.),
+                    value=display_defaults.get(key, 1.4),
                     key=tag + key,
                 )
             key = 'legend_ha'

diff --git a/press_dash_lib/pages/base_page.py b/press_dash_lib/pages/base_page.py
@@ -32,7 +32,7 @@ def main(config_fp: str, user_utils: types.ModuleType = None):
 
     # Set the title that shows up at the top of the dashboard
     st.title(builder.config.get('page_title','Dashboard'))
-
+    
     # Prep data
     data, config = builder.prep_data(builder.config)
     builder.config.update(config)
@@ -60,26 +60,27 @@ def main(config_fp: str, user_utils: types.ModuleType = None):
         ),
     )
 
-    # Data filter settings
-    with st.expander('Data Filters'):
-        st.subheader('Data Filters')
-        builder.interface.request_filter_settings(
-            st,
-            data['recategorized'],
-        )
+    # Data axes
+    # entered search category passed down to filter settings for further specification
+    st.subheader('Data Axes')
+    builder.interface.request_data_axes(st)
 
+    # catches specified groupby category
+    category_specific = builder.settings.get_settings(common_to_include=['data'])
+
+    # filters data as per specs
+    builder.interface.process_filter_settings(
+        st,
+        data['recategorized'],
+        value=category_specific['groupby_column']
+    )
+
     # Apply data filters
     data['selected'] = builder.filter_data(
         data['recategorized'],
-        builder.settings.common['filters']['text'],
         builder.settings.common['filters']['categorical'],
-        builder.settings.common['filters']['numerical'],
     )
 
-    # Data axes
-    st.subheader('Data Axes')
-    builder.interface.request_data_axes(st)
-
     # Aggregate data
     data['aggregated'] = builder.aggregate(
         data['selected'],
@@ -99,6 +100,7 @@ def main(config_fp: str, user_utils: types.ModuleType = None):
     # Lineplot
     local_key = 'lineplot'
     st.header(config.get('lineplot_header', 'Lineplot'))
+    st.text("Note: some data entries may correspond to multiple categories, and so may be contribute to dataset of each.\n As such, the all categories combined may exceed the total, which only counts each entry once***")
     with st.expander('Lineplot settings'):
         local_opt_keys, common_opt_keys, unset_opt_keys = builder.settings.get_local_global_and_unset(
             function=builder.data_viewer.lineplot,

diff --git a/press_dash_lib/user_utils.py b/press_dash_lib/user_utils.py
@@ -61,11 +61,18 @@ def load_data(config):
 
     # Website data
     os.chdir(os.path.dirname(os.path.abspath(__file__)))
-    website_df = pd.read_csv('report.csv')
+    website_df = pd.read_csv('News_Report_Main.csv', encoding_errors='ignore')
     website_df.set_index('id', inplace=True)
+
+
+
+    for i in ['Press Mentions', 'Top Outlets', 'People Reached']:
+        if i not in website_df.columns:
+            website_df[i] = None
+
     # website_df = pd.read_csv(data_fp, parse_dates=['Date',])
     # website_df.set_index('id', inplace=True)
-
+    
     # # Load press data
     # press_df = pd.read_excel(press_office_data_fp)
     # press_df.set_index('id', inplace=True)
@@ -99,7 +106,7 @@ def clean_data(raw_df, config):
 
     # Drop rows where 'Date' year is 1970
     cleaned_df = raw_df[raw_df['Date'].dt.year != 1970]
-
+    
     # # Drop drafts
     # cleaned_df = raw_df.drop(
     #     raw_df.index[raw_df['Date'].dt.year == 1970],

diff --git a/src/config.yml b/src/config.yml
@@ -13,7 +13,7 @@ figure_dir: ../data/figures
 start_of_year: September 1
 
 # Aesthetic options
-page_title: 'Press Data'
+page_title: 'Press Data Test'
 # Seaborn color palette to use. More options at https://seaborn.pydata.org/tutorial/color_palettes.html
 color_palette: deep
 
@@ -41,8 +41,8 @@ groupings:
 # Anything that's in a given grouping can be analyzed similarly.
 primary_id_column: index
 id_columns: # Unique identifiers
-  - id
   - Title
+  - id
 numerical_columns: # Numeric columns that can be summed
   - Press Mentions
   - People Reached