-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
353 lines (299 loc) · 11.6 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
import marimo
# Version string that marimo records in the notebook file.
__generated_with = "0.10.16"

# Application object that every cell below registers with through @app.cell.
app = marimo.App(app_title="Publishing Interactive Visualizations", width="medium")
@app.cell
def _():
    # Make the marimo API available to the other cells under the alias `mo`.
    import marimo as mo

    return (mo,)
@app.cell
def _(mo):
    # Title and introduction rendered at the top of the page.
    # Fix: "in the word" -> "in the world".
    mo.md(
        r"""
# Publishing Interactive Visualizations
Effectively communicating complex datasets is one of the most rewarding tasks in modern science, but it comes with real challenges.
When you have reached the limits of what can be done with a static manuscript figure, the next option to explore may be an **interactive visualization**.
These displays are generally accessed using web browsers, since browsers are already installed on almost every personal computer in the world.
However, it is often challenging for researchers to publish to the web without either running a server or forgoing interactivity.
A new exciting development in this field is **marimo**, a Python-based framework for deploying interactive visualizations in an entirely serverless manner.
The text and figures that you see on this page were generated using **marimo**, with all of the visualizations and interactivity generated within the browser on your computer.
"""
    )
    return
@app.cell
def _():
    import sys

    # micropip is only loaded when the script is running in WASM (Pyodide is
    # then present in sys.modules); in local development mode it stays None.
    micropip = None
    if "pyodide" in sys.modules:
        import micropip
    return micropip, sys
@app.cell
async def _(micropip, mo):
    # Load the Python dependencies needed to read in the data and to draw the
    # figures, showing a spinner while the imports run.
    with mo.status.spinner("Loading dependencies"):
        import pyarrow
        import pandas as pd

        # micropip is only set when running in WASM; there plotly is installed
        # at runtime (pinned below 6.0.0) before it is imported.
        if micropip is not None:
            await micropip.install("plotly<6.0.0")
        import plotly.express as px
    return pd, px, pyarrow
@app.cell
def _(mo):
    # Section heading and a short note on where data can be loaded from.
    _loading_text = """
## Loading Data
Datasets can be read into memory from a public URL, the code repository which contains the manuscript text, or from any other source which can be loaded using a Python library.
"""
    mo.md(_loading_text)
    return
@app.cell
def _(mo, pd):
    # The dataset is a feather file in the public/ folder next to the notebook.
    _feather_path = mo.notebook_location() / "public" / "accidents_opendata.feather"

    # Read the table while showing a spinner to the reader.
    with mo.status.spinner("Loading data"):
        df = pd.read_feather(_feather_path)
    return (df,)
@app.cell
def _(mo):
    # Provenance and licence of the dataset, with a pointer to its README.
    _provenance_text = """
The dataset that we will use here was obtained from [the Barcelona Traffic Accidents entry in Kaggle](https://www.kaggle.com/datasets/emmanuelfwerr/barcelona-car-accidents) on January 24, 2025 under the [CC0: Public Domain](https://creativecommons.org/publicdomain/zero/1.0/) licence.
A full description of the source and formatting modifications made to this table can be found in the README.md of [this repository's public/ folder](https://github.com/FredHutch/marimo-publication/tree/main/public).
"""
    mo.md(_provenance_text)
    return
@app.cell
def _(df):
    # Display the loaded table: the bare expression below is the cell's output.
    df
    return
@app.cell
def _(mo):
    # Section heading.
    _heading = r"""## Web Browsers are Inherently Interactive"""
    mo.md(_heading)
    return
@app.cell
def _(df, pd):
    def bin_df(df: pd.DataFrame, nbins: int):
        # Snap both UTM coordinates to the midpoint of one of `nbins` bins per
        # axis, then total incidents, vehicles and victims for every
        # (x bin, y bin, district) combination.

        def bin_midpoints(values):
            # Replace each value with the midpoint of the interval pd.cut puts it in.
            return pd.cut(values, nbins).apply(lambda interval: interval.mid).astype(float)

        def totals(rows):
            # One output row per group: the row count plus the summed columns.
            return pd.Series(
                dict(
                    n_incidents=rows.shape[0],
                    n_vehicles=rows["n_vehicles"].sum(),
                    n_victims=rows["n_victims"].sum(),
                )
            )

        gridded = df.assign(
            utm_coordinate_x=bin_midpoints(df["utm_coordinate_x"]),
            utm_coordinate_y=bin_midpoints(df["utm_coordinate_y"]),
        )
        per_bin = gridded.groupby(
            ["utm_coordinate_x", "utm_coordinate_y", "district_name"]
        ).apply(totals, include_groups=False)
        return per_bin.reset_index()

    # Bin the full dataset into 40 bins per axis for the scatter plot.
    binned_df = bin_df(df, 40)
    return bin_df, binned_df
@app.cell
def _(binned_df, px):
    # Plot the binned table: marker size follows the number of vehicles and
    # marker colour follows the district.
    _labels = {
        "utm_coordinate_x": "UTM Coordinate X",
        "utm_coordinate_y": "UTM Coordinate Y",
        "district_name": "District",
        "n_vehicles": "Number of Vehicles",
        "n_victims": "Number of Victims",
        "n_incidents": "Number of Incidents",
    }
    fig = px.scatter(
        binned_df,
        x="utm_coordinate_x",
        y="utm_coordinate_y",
        size="n_vehicles",
        color="district_name",
        hover_data=["n_vehicles", "n_incidents", "n_victims"],
        labels=_labels,
        title="Barcelona Accident Data",
        template="simple_white",
        width=800,
        height=600,
    )

    # Hide the axis lines and tick labels on both axes.
    fig.update_layout(
        xaxis_showline=False,
        xaxis_showticklabels=False,
        yaxis_showline=False,
        yaxis_showticklabels=False,
    )
    fig
    return (fig,)
@app.cell
def _(mo):
    # Commentary on the interactivity a browser provides for the figure above.
    # Fixes the grammar of the second sentence ("point in a complex display",
    # parallel "moving ... or clicking") and "each of which provides".
    mo.md(
        r"""
Compared to a static manuscript, displaying figures in a web browser provides some immediate features that can be useful.
A basic element of interactivity is moving your cursor over a single point in a complex display, or clicking and dragging a box to zoom into a particular region of the display.
For publications using **marimo**, the best options for generating interactive displays are [Plotly](https://plotly.com/python/) and [Altair](https://altair-viz.github.io/), each of which provides an amazing amount of flexibility and power.
"""
    )
    return
@app.cell
def _(df, pd):
    # Make a summary table with one row per district and year-month.

    def _count_and_total(rows):
        # Number of incidents in the group plus the summed victims and vehicles.
        sums = rows[['n_victims', 'n_vehicles']].sum().to_dict()
        return pd.Series(dict(n_incidents=rows.shape[0], **sums))

    _per_district_month = (
        df.groupby(["district_name", "year_month"])
        .apply(_count_and_total, include_groups=False)
        .reset_index()
    )

    # Split the "<year>-<month>" label into separate integer columns.
    summary = _per_district_month.assign(
        year=lambda d: d["year_month"].apply(lambda s: int(s.split("-")[0])),
        month=lambda d: d["year_month"].apply(lambda s: int(s.split("-")[1])),
    )
    return (summary,)
@app.cell
def _(mo):
    # Section heading.
    _heading = """## Customizing Plots with User Input"""
    mo.md(_heading)
    return
@app.cell
def _(px, summary):
    # Line plot of incidents per year-month, one line per district.
    _labels = {
        "utm_coordinate_x": "UTM Coordinate X",
        "utm_coordinate_y": "UTM Coordinate Y",
        "district_name": "District",
        "n_vehicles": "Number of Vehicles",
        "n_victims": "Number of Victims",
        "n_incidents": "Number of Incidents",
        "datetime": "Date / Time",
        "year_month": "Year / Month",
    }
    summary_lineplot = px.line(
        summary,
        x="year_month",
        y="n_incidents",
        color="district_name",
        labels=_labels,
        template="simple_white",
    )
    summary_lineplot
    return (summary_lineplot,)
@app.cell
def _(mo):
    # Motivation for letting the reader customise the plot.
    _motivation_text = r"""
While the summary figure above shows a number of patterns in the data (e.g. differences between districts, variability over months within a year, and a sharp dropoff in March 2020), it may be difficult for the reader to isolate those axes of variability in a focused display without loading the source data and rerunning the entire analysis process locally.
The user input features provided by marimo are extremely open-ended, and can be used to provide the user with the ability to create precisely the plot they want.
"""
    mo.md(_motivation_text)
    return
@app.cell
def _(df, mo):
    # Collect user input on how to summarize the data and format the plot.
    #
    # Fix: the `months` selector was passed to .batch() (and its value is read
    # when the data is filtered) but the template had no {months} placeholder,
    # so the control was never shown. It is now listed after the years.

    # Option lists are computed once and reused as both the available options
    # and the initial selection, so every selector starts with everything
    # selected. Years and months are offered as strings.
    _districts = df['district_name'].unique()
    _neighborhoods = df['neighborhood_name'].unique()
    _years = df['year'].apply(str).unique()
    _months = df['month'].apply(str).unique()

    params = (
        mo.md("""### User Input
- Include Districts: {districts}
- Include Neighborhoods: {neighborhoods}
- Group by {group_by}
- Include Years: {years}
- Include Months: {months}
- X-axis: {x_axis}
- Y-axis: {y_axis}
"""
        )
        .batch(
            districts=mo.ui.multiselect(options=_districts, value=_districts),
            neighborhoods=mo.ui.multiselect(options=_neighborhoods, value=_neighborhoods),
            group_by=mo.ui.dropdown(options=["Districts", "Neighborhoods"], value="Districts"),
            years=mo.ui.multiselect(options=_years, value=_years),
            months=mo.ui.multiselect(options=_months, value=_months),
            x_axis=mo.ui.dropdown(options=["Month", "Year"], value="Month"),
            y_axis=mo.ui.dropdown(options=["Incidents", "Vehicles"], value="Incidents")
        )
    )
    params
    return (params,)
@app.cell
def _(df, params, pd):
    # Subset the data based on the user input and summarize it per group.
    #
    # Changes from the previous version:
    # - Rows are selected with vectorized `isin` masks instead of a Python
    #   lambda evaluated once per row (df.apply(..., axis=1)).
    # - The per-group totals use a named aggregation, so the n_* columns are
    #   part of the result even when the selection matches no rows.

    # "Districts" -> "district_name", "Neighborhoods" -> "neighborhood_name"
    group_by_kw = params.value["group_by"].lower()[:-1] + "_name"
    time_unit = "year_month" if params.value["x_axis"] == "Month" else "year"

    # Years and months are compared as strings because the selectors' options
    # are built with .apply(str) on the same columns.
    _selected_rows = (
        df["district_name"].isin(params.value["districts"])
        & df["neighborhood_name"].isin(params.value["neighborhoods"])
        & df["year"].apply(str).isin(params.value["years"])
        & df["month"].apply(str).isin(params.value["months"])
    )

    subset_summary = (
        df
        .loc[_selected_rows]
        # observed=True keeps only the groups that are present in the selected
        # rows (it only has an effect if a grouping column is categorical).
        .groupby([group_by_kw, time_unit], observed=True)
        .agg(
            # "size" counts rows, matching the previous d.shape[0]
            n_incidents=("n_victims", "size"),
            n_victims=("n_victims", "sum"),
            n_vehicles=("n_vehicles", "sum"),
        )
        .reset_index()
    )

    # When grouping by month, split the "<year>-<month>" label into separate
    # integer columns.
    if time_unit == "year_month":
        subset_summary = subset_summary.assign(
            year=lambda d: d["year_month"].apply(lambda s: int(s.split("-")[0])),
            month=lambda d: d["year_month"].apply(lambda s: int(s.split("-")[1]))
        )
    return group_by_kw, subset_summary, time_unit
@app.cell
def _(group_by_kw, params, px, subset_summary, time_unit):
    # Make the display from the user-selected subset.

    # Column plotted on the y-axis: incidents when chosen, otherwise vehicles.
    if params.value["y_axis"] == "Incidents":
        y_cname = "n_incidents"
    else:
        y_cname = "n_vehicles"

    _labels = {
        "utm_coordinate_x": "UTM Coordinate X",
        "utm_coordinate_y": "UTM Coordinate Y",
        "district_name": "District",
        "neighborhood_name": "Neighborhood",
        "n_vehicles": "Number of Vehicles",
        "n_victims": "Number of Victims",
        "n_incidents": "Number of Incidents",
        "datetime": "Date / Time",
        "year_month": "Year / Month",
    }
    custom_fig = px.line(
        data_frame=subset_summary,
        x=time_unit,
        y=y_cname,
        color=group_by_kw,
        labels=_labels,
        template="simple_white",
    )
    custom_fig
    return custom_fig, y_cname
@app.cell
def _(mo):
    # Closing discussion of the trade-offs, and how to reuse the repository.
    _closing_text = r"""
When the user modifies the inputs, the figure is regenerated from the input data using Python code that runs entirely in the browser.
The drawback of this approach is that code runs a bit slowly, so this is not the place to put long-running tasks.
However, the advantage is that there is no limit to the number of users who can open this publication, and there is effectively no cost to host the website.
## Using this Approach
All of the code needed to build this text and visualization into a website can be found in an open source GitHub repository - [FredHutch/marimo-publication](https://github.com/FredHutch/marimo-publication).
To build something similar, just fork the repository, modify the contents to meet your needs, and then turn on GitHub pages to instantly create a website publishing your findings.
More details can be found in the [Readme](https://github.com/FredHutch/marimo-publication).
"""
    mo.md(_closing_text)
    return
@app.cell
def _():
    # Empty cell: defines nothing and produces no output.
    return
# Run the notebook as an app when this file is executed as a script.
if __name__ == "__main__":
    app.run()