Skip to content

Commit

Permalink
merged data filtration groupby category selection
Browse files Browse the repository at this point in the history
implemented various changes discussed with relevant CIERA staff; implemented data multiselect in category space; renormalized financial year
  • Loading branch information
llippeatt committed Jul 23, 2024
1 parent 71906e1 commit 5ae0402
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 129 deletions.
8 changes: 3 additions & 5 deletions press_dash_lib/dash_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,7 @@ def recategorize_data(
def filter_data(
_self,
recategorized_df: pd.DataFrame,
text_filters: dict[str, str] = {},
categorical_filters: dict[str, list] = {},
numerical_filters: dict[str, tuple] = {},
) -> pd.DataFrame:
'''Filter what data shows up in the dashboard.
Expand All @@ -163,9 +161,7 @@ def filter_data(
with st.spinner(msg):
return _self.data_handler.filter_data(
recategorized_df=recategorized_df,
text_filters=text_filters,
categorical_filters=categorical_filters,
numerical_filters=numerical_filters
)

@st.cache_data
Expand All @@ -178,7 +174,7 @@ def aggregate(
aggregation_method: str = 'count',
) -> Union[pd.Series, pd.DataFrame]:
'''Aggregate stats.
Args:
df: The dataframe containing the selected data.
x_column: The column containing the year or other time bin value.
Expand All @@ -192,6 +188,8 @@ def aggregate(
or
totals: The series containing the counts per year
'''


msg = 'Aggregating...'
print(msg)
with st.spinner(msg):
Expand Down
16 changes: 0 additions & 16 deletions press_dash_lib/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,17 +226,13 @@ def recategorize_data(
def filter_data(
self,
recategorized_df: pd.DataFrame,
text_filters: dict[str, str] = {},
categorical_filters: dict[str, list] = {},
numerical_filters: dict[str, tuple] = {},
) -> pd.DataFrame:
'''Filter what data shows up in the dashboard.
Args:
recategorized_df: The dataframe containing the data.
text_filters (dict): Search fields for text.
categorical_filters (dict): How categories are filtered.
numerical_filters (dict): Ranges for numerical data filters
Returns:
selected_df: The dataframe containing the selected data.
Expand All @@ -245,22 +241,10 @@ def filter_data(
# Initialized
is_included = np.ones(len(recategorized_df), dtype=bool)

# Text filter
for text_filter_col, search_str in text_filters.items():
is_matching = recategorized_df[text_filter_col].str.extract('(' + search_str + ')', flags=re.IGNORECASE).notna().values[:,0]
is_included = is_included & is_matching

# Categories filter
for cat_filter_col, selected_cats in categorical_filters.items():
is_included = is_included & recategorized_df[cat_filter_col].isin(selected_cats)

# Range filters
for num_filter_col, column_range in numerical_filters.items():
is_included = is_included & (
(column_range[0] <= recategorized_df[num_filter_col]) &
(recategorized_df[num_filter_col] <= column_range[1])
)

selected_df = recategorized_df.loc[is_included]

return selected_df
2 changes: 1 addition & 1 deletion press_dash_lib/data_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def lineplot(
Returns:
fig: The figure containing the plot.
'''
df.index = df.index + 1
#df.index = df.index + 1

# Modify data if cumulative
if cumulative:
Expand Down
117 changes: 29 additions & 88 deletions press_dash_lib/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ def request_data_axes(
if key in ask_for:
value, ind = selectbox(
st_loc,
'What do you want to group the data by?',
display_options.get('groupby_column', self.config['categorical_columns']),
'What do you want to categorize the data by?',
options=display_options.get('groupby_column', self.config['categorical_columns']),
index=display_defaults.get(key + '_ind', 0),
)
selected_settings[key] = value
selected_settings[key + '_ind'] = ind

selected_settings[key] = value
selected_settings[key + '_ind'] = ind

return selected_settings

Expand Down Expand Up @@ -191,14 +192,14 @@ def request_data_settings(

return selected_settings

def request_filter_settings(
def process_filter_settings(
self,
st_loc,
df: pd.DataFrame,
ask_for: list[str] = ['text', 'categorical', 'numerical'],
local_key: str = None,
display_defaults: dict = {},
display_options: dict = {},
value: str = None,
selected_settings: dict = None,
tag: str = None,
) -> dict:
Expand Down Expand Up @@ -233,88 +234,25 @@ def request_filter_settings(
tag = ''
else:
tag += ':'

key = 'text'
if key in ask_for:
current = selected_settings.setdefault(key, {})
# Select which columns to filter on
if len(current) == 0:
multiselect_default = []
else:
multiselect_default = list(current)
filter_columns = st_loc.multiselect(
'What columns do you want to search? (case insensitive; not a smart search)',
options=display_options.get(key, self.config['text_columns']),
default=multiselect_default,
key=tag + key
)
for col in filter_columns:
# Check the current values then the passed-in defaults
# for a default
default = current.get(col,'')
default = display_defaults.get(key, {}).get(col, default)
selected_settings[key][col] = st_loc.text_input(
'"{}" column: What do you want to search for?'.format(col),
value=default,
key=tag + key + ':' + col
)


key = 'categorical'
if key in ask_for:
current = selected_settings.setdefault(key, {})
# Select which columns to filter on
if len(current) == 0:
multiselect_default = []
else:
multiselect_default = list(current)
filter_columns = st_loc.multiselect(
'What categorical columns do you want to filter on?',
options=display_options.get(key, self.config['categorical_columns']),
default=multiselect_default,
key=tag + key
)
for col in filter_columns:
possible_columns = pd.unique(df[col])
# Check the current values then the passed-in defaults
# for a default
default = current.get(col, possible_columns)
default = display_defaults.get(key, {}).get(col, default)
selected_settings[key][col] = st_loc.multiselect(
'"{}" column: What groups to include?'.format(col),
possible_columns,
default=default,
key=tag + key + ':' + col
)

key = 'numerical'
if key in ask_for:
current = selected_settings.setdefault(key, {})
# Select which columns to filter on
if len(current) == 0:
multiselect_default = []
else:
multiselect_default = list(current)
filter_columns = st_loc.multiselect(
'What numerical columns do you want to filter on?',
options=display_options.get(key, self.config['numerical_columns']),
default=multiselect_default,
key=tag + key
key=tag + key


possible_columns = pd.unique(df[value])
# Check the current values then the passed-in defaults
# for a default
default = current.get(value, possible_columns)
default = display_defaults.get(key, {}).get(value, default)
selected_settings[key][value] = st_loc.multiselect(
'"{}" column: What groups to include?'.format(value),
possible_columns,
default=default,
key=tag + key + ':' + value
)
for col in filter_columns:
value_min = df[col].min()
value_max = df[col].max()
# Check the current values then the passed-in defaults
# for a default
default = current.get(col, (value_min, value_max))
default = display_defaults.get(key, {}).get(col, default)
selected_settings[key][col] = st_loc.slider(
'"{}" column: What range to include?'.format(col),
min_value=default[0],
max_value=default[1],
value=default,
key=tag + key + ':' + col
)


return selected_settings

def request_view_settings(
Expand Down Expand Up @@ -381,7 +319,10 @@ def request_view_settings(
'include_annotations',
'annotations_ha',
'font',
'color_palette'
'color_palette',
'category_colors',
'totals',
'kwargs'
]
if ask_for == 'all':
ask_for = available_settings
Expand Down Expand Up @@ -579,7 +520,7 @@ def request_view_settings(
'legend scale',
0.1,
2.,
value=display_defaults.get(key, 1.),
value=display_defaults.get(key, 1.32),
key=tag + key,
)
key = 'legend_x'
Expand All @@ -588,7 +529,7 @@ def request_view_settings(
'legend x',
0.,
1.5,
value=display_defaults.get(key, 1.),
value=display_defaults.get(key, 0.),
key=tag + key,
)
key = 'legend_y'
Expand All @@ -597,7 +538,7 @@ def request_view_settings(
'legend y',
0.,
1.5,
value=display_defaults.get(key, 1.),
value=display_defaults.get(key, 1.4),
key=tag + key,
)
key = 'legend_ha'
Expand Down
30 changes: 16 additions & 14 deletions press_dash_lib/pages/base_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def main(config_fp: str, user_utils: types.ModuleType = None):

# Set the title that shows up at the top of the dashboard
st.title(builder.config.get('page_title','Dashboard'))

# Prep data
data, config = builder.prep_data(builder.config)
builder.config.update(config)
Expand Down Expand Up @@ -60,26 +60,27 @@ def main(config_fp: str, user_utils: types.ModuleType = None):
),
)

# Data filter settings
with st.expander('Data Filters'):
st.subheader('Data Filters')
builder.interface.request_filter_settings(
st,
data['recategorized'],
)
# Data axes
# The groupby category selected here is passed down to the filter settings for further specification
st.subheader('Data Axes')
builder.interface.request_data_axes(st)

# catches specified groupby category
category_specific = builder.settings.get_settings(common_to_include=['data'])

# filters data as per specs
builder.interface.process_filter_settings(
st,
data['recategorized'],
value=category_specific['groupby_column']
)

# Apply data filters
data['selected'] = builder.filter_data(
data['recategorized'],
builder.settings.common['filters']['text'],
builder.settings.common['filters']['categorical'],
builder.settings.common['filters']['numerical'],
)

# Data axes
st.subheader('Data Axes')
builder.interface.request_data_axes(st)

# Aggregate data
data['aggregated'] = builder.aggregate(
data['selected'],
Expand All @@ -99,6 +100,7 @@ def main(config_fp: str, user_utils: types.ModuleType = None):
# Lineplot
local_key = 'lineplot'
st.header(config.get('lineplot_header', 'Lineplot'))
st.text("Note: some data entries may correspond to multiple categories, and so may be contribute to dataset of each.\n As such, the all categories combined may exceed the total, which only counts each entry once***")
with st.expander('Lineplot settings'):
local_opt_keys, common_opt_keys, unset_opt_keys = builder.settings.get_local_global_and_unset(
function=builder.data_viewer.lineplot,
Expand Down
13 changes: 10 additions & 3 deletions press_dash_lib/user_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,18 @@ def load_data(config):

# Website data
os.chdir(os.path.dirname(os.path.abspath(__file__)))
website_df = pd.read_csv('report.csv')
website_df = pd.read_csv('News_Report_Main.csv', encoding_errors='ignore')
website_df.set_index('id', inplace=True)



for i in ['Press Mentions', 'Top Outlets', 'People Reached']:
if i not in website_df.columns:
website_df[i] = None

# website_df = pd.read_csv(data_fp, parse_dates=['Date',])
# website_df.set_index('id', inplace=True)

# # Load press data
# press_df = pd.read_excel(press_office_data_fp)
# press_df.set_index('id', inplace=True)
Expand Down Expand Up @@ -99,7 +106,7 @@ def clean_data(raw_df, config):

# Drop rows where 'Date' year is 1970
cleaned_df = raw_df[raw_df['Date'].dt.year != 1970]

# # Drop drafts
# cleaned_df = raw_df.drop(
# raw_df.index[raw_df['Date'].dt.year == 1970],
Expand Down
4 changes: 2 additions & 2 deletions src/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ figure_dir: ../data/figures
start_of_year: September 1

# Aesthetic options
page_title: 'Press Data'
page_title: 'Press Data Test'
# Seaborn color palette to use. More options at https://seaborn.pydata.org/tutorial/color_palettes.html
color_palette: deep

Expand Down Expand Up @@ -41,8 +41,8 @@ groupings:
# Anything that's in a given grouping can be analyzed similarly.
primary_id_column: index
id_columns: # Unique identifiers
- id
- Title
- id
numerical_columns: # Numeric columns that can be summed
- Press Mentions
- People Reached
Expand Down

0 comments on commit 5ae0402

Please sign in to comment.