-
Notifications
You must be signed in to change notification settings - Fork 0
/
readc40mobility.py
executable file
·879 lines (845 loc) · 42.2 KB
/
readc40mobility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 1 12:08:51 2020
@author: ghkerr
"""
DIR = '/Users/ghkerr/GW/'
DIR_MOBILITY = DIR+'data/mobility/'
def read_berlin(startdate, enddate):
"""Calculate time series of traffic counts and fraction of heavy-duty
vehicles for traffic counting sites in Berlin. Note that traffic counts are
taken at five air pollution monitoring sites (MC117, MC124, MC174, MC220,
MC143). The naming/coordinates of these traffic count stations are the same
as the air quality monitoring site coordinates.
In the input dataset, there are absolute values for long vehicles (> 7.5
meters, “Lkw”) and short vehicles (< 7.5 meters, “Pkw”), subdivided per
direction (N-North, E – East, W – West, S – South) and per hour.
There are also mean hourly speed values per direction (v). Traffic data
might have some missing data due to instrument failures, road construction,
etc. These hourly data are aggregated to daily sums and by site (i.e.,
east and west are combined to a single sum, north and south as well).
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Site- and hour-summed traffic data in Berlin
"""
import pandas as pd
import numpy as np
# Read pollutant-specific file
df = pd.read_csv(DIR_MOBILITY+'berlin/'+
'traffic_counts_2019_2020_header.csv', delimiter=';', header=0,
engine='python')
# Change Datum column to datetime
df['Datum'] = pd.to_datetime(df['Datum'], format='%d.%m.%Y %H:%M')
df.rename(columns={'Datum':'Date'}, inplace=True)
df.set_index('Date', drop=True, inplace=True)
# Change column types from object to float
for column in df.columns:
df[column] = df[column].apply(pd.to_numeric, args=('coerce',))
# Resample from hourly to daily counts
df = df.resample('D').sum()
# Sum traffic counts going different directions at same sites (i.e., E and W
# or N and S), number of passenger vehicles/freight, and the
# fraction of heavy-duty vehicles
df['MC117'] = (df['MC117 Dir.E Lkw']+df['MC117 Dir.W Lkw']+
df['MC117 Dir.E Pkw']+df['MC117 Dir.W Pkw'])
df['MC117_passenger'] = (df['MC117 Dir.E Pkw']+df['MC117 Dir.W Pkw'])
df['MC117_freight'] = (df['MC117 Dir.E Lkw']+df['MC117 Dir.W Lkw'])
df['MC117_FracHDV'] = (df['MC117 Dir.E Lkw']+df['MC117 Dir.W Lkw']
)/df['MC117']
df['MC124'] = (df['MC124 Dir.N Lkw']+df['MC124 Dir.S Lkw']+
df['MC124 Dir.N Pkw']+df['MC124 Dir.S Pkw'])
df['MC124_passenger'] = (df['MC124 Dir.N Pkw']+df['MC124 Dir.S Pkw'])
df['MC124_freight'] = (df['MC124 Dir.N Lkw']+df['MC124 Dir.S Lkw'])
df['MC124_FracHDV'] = (df['MC124 Dir.N Lkw']+df['MC124 Dir.S Lkw']
)/df['MC124']
df['MC174'] = (df['MC174 Dir.E Lkw']+df['MC174 Dir.W Lkw']+
df['MC174 Dir.E Pkw']+df['MC174 Dir.W Pkw'])
df['MC174_passenger'] = (df['MC174 Dir.E Pkw']+df['MC174 Dir.W Pkw'])
df['MC174_freight'] = (df['MC174 Dir.E Lkw']+df['MC174 Dir.W Lkw'])
df['MC174_FracHDV'] = (df['MC174 Dir.E Lkw']+df['MC174 Dir.W Lkw']
)/df['MC174']
df['MC220'] = (df['MC220 Dir.N Lkw']+df['MC220 Dir.S Lkw']+
df['MC220 Dir.N Pkw']+df['MC220 Dir.S Pkw'])
df['MC220_passenger'] = (df['MC220 Dir.N Pkw']+df['MC220 Dir.S Pkw'])
df['MC220_freight'] = (df['MC220 Dir.N Lkw']+df['MC220 Dir.S Lkw'])
df['MC220_FracHDV'] = (df['MC220 Dir.N Lkw']+df['MC220 Dir.S Lkw']
)/df['MC220']
df['MC143'] = (df['MC143 Dir.E Lkw']+df['MC143 Dir.W Lkw']+
df['MC143 Dir.E Pkw']+df['MC143 Dir.W Pkw'])
df['MC143_passenger'] = (df['MC143 Dir.E Pkw']+df['MC143 Dir.W Pkw'])
df['MC143_freight'] = (df['MC143 Dir.E Lkw']+df['MC143 Dir.W Lkw'])
df['MC143_FracHDV'] = (df['MC143 Dir.E Lkw']+df['MC143 Dir.W Lkw']
)/df['MC143']
# Drop all summed mean hourly speed values per direction and directional
# columns
df = df.drop(['MC117 Dir.E Lkw', 'MC117 Dir.W Lkw', 'MC117 Dir.E Pkw',
'MC117 Dir.W Pkw', 'MC117 Dir.E v', 'MC117 Dir.W v', 'MC124 Dir.N Lkw',
'MC124 Dir.S Lkw', 'MC124 Dir.N Pkw', 'MC124 Dir.S Pkw',
'MC124 Dir.N v', 'MC124 Dir.S v', 'MC174 Dir.E Lkw', 'MC174 Dir.W Lkw',
'MC174 Dir.E Pkw', 'MC174 Dir.W Pkw', 'MC174 Dir.E v', 'MC174 Dir.W v',
'MC220 Dir.N Lkw', 'MC220 Dir.S Lkw', 'MC220 Dir.N Pkw',
'MC220 Dir.S Pkw', 'MC220 Dir.N v', 'MC220 Dir.S v', 'MC143 Dir.E Lkw',
'MC143 Dir.W Lkw', 'MC143 Dir.E Pkw', 'MC143 Dir.W Pkw',
'MC143 Dir.E v', 'MC143 Dir.W v'], axis=1)
# Melt such that different sites are represented in rows, not columns
df = pd.melt(df, var_name='Site', value_name='Count', ignore_index=False)
# 2020-07-01 has an error, so drop this!
df = df.drop(pd.date_range('2020-07-01','2020-07-03'), errors='ignore')
# Create column for fraction of heavy-duty vehicles
df['Frac_HDV'] = np.nan
df['Passenger'] = np.nan
df['Freight'] = np.nan
# Kludgy
for index, row in df.iterrows():
if 'FracHDV' in row['Site']:
frac = row['Count']
df.loc[(df['Site']==row['Site'][:5]) & (df.index==index),
'Frac_HDV'] = frac
for index, row in df.iterrows():
if 'passenger' in row['Site']:
frac = row['Count']
df.loc[(df['Site']==row['Site'][:5]) & (df.index==index),
'Passenger'] = frac
for index, row in df.iterrows():
if 'freight' in row['Site']:
frac = row['Count']
df.loc[(df['Site']==row['Site'][:5]) & (df.index==index),
'Freight'] = frac
# Drop dummy columns
df = df[~df.Site.str.contains('FracHDV')]
df = df[~df.Site.str.contains('passenger')]
df = df[~df.Site.str.contains('freight')]
df = df.loc[startdate:enddate]
return df
def read_auckland(startdate, enddate):
"""fuction reads the (messy) traffic data from Auckland for 2019 and
2020. The extracted metric in the output table is the seven day ADT. Site
identifiers are easting and northings. "Frac_HDV" represents the fraction
of heavy-duty vehicles in the traffic.
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Site- and hour-summed traffic data in Berlin
"""
import numpy as np
import pandas as pd
# For 2019. Note that given the "count_duration" column (=7D), it appears
# that the ADT column is equivalent to the "7Days ADT" in the 2020 COVID
# data
df_2019 = pd.read_excel(DIR_MOBILITY+'auckland/'+
'RAMM_Report_Summary_Final.xlsm', sheet_name='BECA Listing', header=1,
usecols='E,F,H,M,O,P,Q,R,S,T')
df_2019 = df_2019.rename(columns={
'count_date':'Count Start Date', 'ADT':'7Days ADT', 'PcCar':'Car',
'PcLCV':'LCV', 'PcMCV':'MCV', 'PcHCVI':'HCV-I', 'PcHCVII':'HCV-II',
'PcHeavy':'Heavy'})
# Combine easting/northing column into a single column to match COVID-19
# data
df_2019['northing'] = 'N'+df_2019['northing'].astype(str)
df_2019['easting'] = 'E'+df_2019['easting'].astype(str)
df_2019['Site'] = df_2019['easting']+' '+df_2019['northing']
# For 2020. Note that I'm only using columns AI-BG...these columns seem
# to be in a better format (counts summed over multi-direction highways).
# I am considering "Road ID" to be the standard identifier for different
# traffic counters
df_2020 = pd.read_excel(DIR_MOBILITY+'auckland/'+'COVID 19 Traffic Count '+
'Data.xlsx', sheet_name='Sheet1', header=2,
usecols='F,AP,AR,BB,BC,BD,BE,BF,BG')
df_2020 = df_2020.rename(columns={'Location':'Site',
'Unnamed: 41':'Count Start Date',
'Unnamed: 43':'7Days ADT', 'Unnamed: 53':'Car', 'Unnamed: 55':'MCV',
'Unnamed: 56':'HCV-I', 'Unnamed: 57':'HCV-II',
'Unnamed: 58':'HCV Total'})
df_2020['Site'] = df_2020['Site'].astype(str)
# Now for the heavy-duty vehicle contribution. For 2019, it appears that
# MCV + HCV-I + HCV-II = Heavy (this isn't entirely true, but the mean
# of this operature is 0.0008 and max = 0.01 and min = -0.01, so whatever)
# Replace these columns with a single 'Frac_HDV' column
df_2019['Frac_HDV'] = df_2019['Heavy']
df_2019 = df_2019.drop(['northing', 'easting', 'LCV','MCV','HCV-I',
'HCV-II','Car','Heavy'], axis=1)
# For 2020, it appears that Car + LCV + HCV Total = 1. (and MCV + HCV-I +
# HCVII = Heavy)
df_2020['Frac_HDV'] = df_2020['HCV Total']
df_2020 = df_2020.drop(['Car', 'LCV', 'MCV', 'HCV-I', 'HCV-II',
'HCV Total'], axis=1)
# Merge datasets
df = pd.concat([df_2019, df_2020])
# Rename to standarized names
df = df.rename(columns={'Count Start Date':'Date', '7Days ADT':'Count'})
# Set time as index
df.set_index('Date', drop=True, inplace=True)
df = df.loc[startdate:enddate]
return df
def read_milan(startdate, enddate):
"""Traffic data in Milan is a bit strange. Area C (which is the congestion
charge of the historic center of the city; ~9 km^2) is the only part of the
city that has both vehicle counts (total number of vehicles entering
the area and LDV/HDV fraction). Thus, we only consider Area C, even though
it is a much smaller geographic area than the area considered for the
change in NO2 or the air quality observations. Note that the output values
are not site-specific but rather represent the total number of vehicles
entering Area C summed over the hours of the day.
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Traffic counts for Milan (total number of vehicles entering Area C;
see input .xlsx file for a map of Area C
"""
import pandas as pd
df = pd.read_excel(DIR_MOBILITY+'milan/'+
'Traffic data Milan_AMAT_01_01_2019_31_10_2020.xlsx',
sheet_name='AreaC - % LDV_HDV', header=0)
df = df.rename(columns={'date':'Date',
'Area C entering vehicles':'Count', '%HDV':'Frac_HDV',
'HDV':'Freight', 'LDV':'Passenger'})
df = df.drop(['%LDV'], axis=1)
# Since these values only represent one site, make a dummy column to
# represent this one site (to be consistent with standardized format)
df['Site'] = 'AreaC'
# Set time as index
df.set_index('Date', drop=True, inplace=True)
df = df.loc[startdate:enddate]
return df
def read_santiago(startdate, enddate):
"""Read traffic counts from Santiago. Counts are provided for a variety
of sites for different times of the day: punta manana (6-7 am, 7-8 am,
8-9am, 9-10am) and punta tarde (5-6pm, 6-7pm, 7-8pm, 8-9pm). For a given
site and date, these observations are summed to produce a daily, site-
specific sum. Note that there are strange notes in the Excel file for
specific counts that I ignored (all in Spanish).
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Daily summed traffic counts for individual monitoring sites in
Santiago
"""
import numpy as np
import pandas as pd
# PM is for punta de la mañana (morning)
df_pm = pd.read_excel(DIR_MOBILITY+'santiago/'+
'resumen de flujos vehiculares PM disponibles marzo a agosto 2020.xlsx',
sheet_name='PUNTA MAÑANA', header=3)
# Drop information about Ubicación and Comuna - these appear to just be
# location information about the address and neighborhood. Instead we'll
# use Estación for the location identifier
df_pm = df_pm.drop(['Hora','Ubicación','Comuna'], axis=1)
# Drop rows that are only NaN for all columns
df_pm.dropna(inplace=True)
# Replace missing information with NaN. As far as I can tell, there are
# a variety of ways that Santiago indicates missing information
# Falla - instrument failure
# falla - same as above, but with a lowercase f just to fuck with me
# s/i - without information (sine información)
# Reversible - no idea what this means in this context
# ERROR - no explination needed
# desvío - detour
df_pm.replace({'falla':np.nan, 'Falla':np.nan, 's/i':np.nan, 'Reversible':
np.nan, 'ERROR':np.nan, 'desvío':np.nan, 'Falal':np.nan, 'reversible':
np.nan, 'Bajo':np.nan, 'fall':np.nan, 'fala':np.nan, 'rev':np.nan,
'error':np.nan}, regex=True, inplace=True)
# Set station as index and melt
df_pm.set_index('Estación', drop=True, inplace=True)
df_pm = df_pm.melt(ignore_index=False)
# Some of the entries in the date column have the two letter day
# abbreviation in Spanish (e.g., lu for lunes). Remove these.
df_pm['variable'] = df_pm['variable'].astype(str)
df_pm['variable'] = df_pm['variable'].replace(dict.fromkeys(['lu ','vi ',
'ju ','mi ','ma ', 'Ma ', 'lu ', 'jue ', 'ju ', ' ju '], ''),
regex=True)
df_pm.reset_index(inplace=True)
# Dates are a mixture of Y-m-d and d-m-Y format (are you KIDDING me?!) so
# address this
date1 = pd.to_datetime(df_pm['variable'], errors='coerce',
format='%Y-%m-%d')
date2 = pd.to_datetime(df_pm['variable'], errors='coerce',
format='%d-%m-%y')
df_pm['Date'] = date1.fillna(date2)
df_pm.rename(columns={'value':'Count','Estación':'Site'}, inplace=True)
df_pm = df_pm.drop(['variable'], axis=1)
df_pm['Count'] = pd.to_numeric(df_pm['Count'], errors='coerce')
df_pm = df_pm.groupby(['Site','Date'], as_index=False)['Count'].sum()
df_pm.set_index('Date', drop=True, inplace=True)
# Same as above but for PT (punta de la tarde; i.e., afternoon)
df_pt = pd.read_excel(DIR_MOBILITY+'santiago/'+
'resumen de flujos vehiculares PT disponibles marzo a agosto 2020.xlsx',
sheet_name='PUNTA TARDE', usecols='A:EA', header=3)
# Split by different times of the day (e.g., Comparación 17:00 a 18:00
# horas)
df_pt_a = df_pt.iloc[:84]
df_pt_b = df_pt.iloc[91:175]
df_pt_c = df_pt.iloc[182:266]
df_pt_d = df_pt.iloc[273:357]
df_pt = pd.concat([df_pt_a,df_pt_b,df_pt_c,df_pt_d], ignore_index=True,
sort=False)
df_pt = df_pt.drop(['Espira','Ubicación','Comuna'], axis=1)
df_pt.replace({'falla':np.nan, 'Falla':np.nan, 's/i':np.nan, 'Reversible':
np.nan, 'ERROR':np.nan, 'desvío':np.nan, 'Falal':np.nan, 'reversible':
np.nan, 'Bajo':np.nan, 'fall':np.nan, 'fala':np.nan, 'rev':np.nan,
'error':np.nan, 'no hay':np.nan, 'TROF':np.nan, 'NaTType':np.nan},
regex=True, inplace=True)
df_pt.set_index('Estación', drop=True, inplace=True)
df_pt = df_pt.melt(ignore_index=False)
df_pt.reset_index(inplace=True)
df_pt['variable'] = pd.to_datetime(df_pt['variable'], format='%Y-%m-%d')
df_pt.rename(columns={'value':'Count','Estación':'Site','variable':
'Date'}, inplace=True)
df_pt['Count'] = pd.to_numeric(df_pt['Count'], errors='coerce')
df_pt = df_pt.groupby(['Site','Date'], as_index=False)['Count'].sum()
df_pt.set_index('Date', drop=True, inplace=True)
# Marry two datasets
df = pd.concat([df_pm, df_pt], ignore_index=False)
# Calculate a daily sum over morning (PM) and afternoon (PT) sites
df = df.groupby([df.index.get_level_values(0),'Site']).sum()
df.reset_index(inplace=True)
df.set_index('Date', drop=True, inplace=True)
df = df.loc[startdate:enddate]
return df
def read_london(startdate, enddate):
"""Traffic data from London is provided as daily vehicle kilometers
traveled on the TLRN in March-October 2019 and 2020. Note that a percent
change is also given for "central," "inner," and "outer" but no baseline
values are included, so these data weren't extracted in the DataFrame. No
information on the paritioning of light- versus heavy-duty is given.
n.b., a map of the TLRN can be found here: https://www.cleanhighways.co.uk/
legislation/london-roads-2
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Vehicle km traveled on TLRN for March-October 2019 and 2020.
"""
import pandas as pd
# Extract daily values of vehicle km travelled on the TLRN in 2020
df_2020 = pd.read_excel(DIR_MOBILITY+'london/'+
'COVID 19 Flow Summary Table for Andy.xlsx', sheet_name='Sheet1',
header=1, usecols='C,H')
# Baseline day 2019 vehicle km travelled on the TLRN
df_2019 = pd.read_excel(DIR_MOBILITY+'london/'+
'COVID 19 Flow Summary Table for Andy.xlsx', sheet_name='Sheet1',
header=1, usecols='C,L')
# Drop NaN columns at the end
df_2019.dropna(inplace=True)
df_2020.dropna(inplace=True)
# Rename
df_2019.rename(columns={'Unnamed: 2':'Date', 'TRLN':'Vehicle km'},
inplace=True)
df_2020.rename(columns={'Unnamed: 2':'Date', 'TLRN.1':'Vehicle km'},
inplace=True)
# Since the column with information about km driven uses the same
# dates as 2020 (MM-DD are the same, just different year), change the
# year to 2019
df_2019['Date'] = df_2019['Date'] - pd.DateOffset(years=1)
# Stack DataFrames into one DataFrame
df = pd.concat([df_2019, df_2020])
df.set_index('Date', drop=True, inplace=True)
df = df.loc[startdate:enddate]
return df
def read_mexicocity_baseline(startdate,enddate):
"""For Mexico City baseline traffic (2016-2019), there are six different
input files labeled C1-C6. The png image in the same folder provides a
legend for which file corresponds to which vehicle type with C1
correpsonding to the smallest type. Thus,
C1 = Autos 2.1-5 meters
C2 = Microbuses and vans 5-9 meters
C3 = Buses 9-14 meters
C4 = Unit truck (2-6 axles) 14-18 meters
C5 = articulated truck (5-9 axles) 18-23 meters
C6 = biarticulated trucks (5-9 axles) > 23 meters
The fraction of heavy-duty vehicles is calculated as the sum of counts from
C4-C6 over the total number. This function isn't the fastest, and could
perhaps be sped up by only reading in C4-C6 and then reading
'BASE_TOTAL_AFORO_2016-2019.csv' as the total number of vehicles (n.b.,
I checked that C1+C2+C3+C4+C5+C6=TOTAL AFORO, and this is indeed the case).
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Traffic counts in Mexico City for the pre-COVID period (2016-2019)
"""
import pandas as pd
# For light-duty vehicles (types C1-C3)
df_light = pd.DataFrame([])
for vtype in ['C1','C2','C3']:
dfty = pd.read_csv(DIR_MOBILITY+'mexicocity/SEDEMA_MOVILIDAD/INFOVIAL/'+
'BASE_%s_2016-2019.csv'%vtype, delimiter=',', header=3,
engine='python')
# Drop row
dfty.drop(dfty.index[[0]], inplace=True)
# Set first column as datetime
dfty['Unnamed: 1'] = pd.to_datetime(dfty['Unnamed: 1'])
dfty = dfty.rename(columns={'Unnamed: 1':'Date'})
dfty = dfty.drop(['3','NOMBRE'], axis=1)
cols=[i for i in dfty.columns if i not in ['Date']]
for col in cols:
dfty[col]=pd.to_numeric(dfty[col])
# Calculate daily sum for each site
dfty = dfty.resample('D', on='Date').sum()
dfty = pd.melt(dfty, var_name='Site', value_name='Count',
ignore_index=False)
df_light = df_light.append(dfty, ignore_index=False)
# Group by index and 'Site' column
df_light = df_light.groupby([df_light.index,'Site'])['Count'].sum()
df_light = df_light.reset_index()
# Same as above but for heavy-duty vehicles (types C4-C6)
df_heavy = pd.DataFrame([])
for vtype in ['C4','C5','C6']:
dfty = pd.read_csv(DIR_MOBILITY+'mexicocity/SEDEMA_MOVILIDAD/INFOVIAL/'+
'BASE_%s_2016-2019.csv'%vtype, delimiter=',', header=3,
engine='python')
dfty.drop(dfty.index[[0]], inplace=True)
dfty['Unnamed: 1'] = pd.to_datetime(dfty['Unnamed: 1'])
dfty = dfty.rename(columns={'Unnamed: 1':'Date'})
dfty = dfty.drop(['3','NOMBRE'], axis=1)
cols=[i for i in dfty.columns if i not in ['Date']]
for col in cols:
dfty[col]=pd.to_numeric(dfty[col])
dfty = dfty.resample('D', on='Date').sum()
dfty = pd.melt(dfty, var_name='Site', value_name='Count',
ignore_index=False)
df_heavy = df_heavy.append(dfty, ignore_index=False)
# Group by index and 'Site' column
df_heavy = df_heavy.groupby([df_heavy.index,'Site'])['Count'].sum()
df_heavy = df_heavy.reset_index()
# Merge DataFrames
df = pd.merge(df_light, df_heavy, how='left', left_on=['Date','Site'],
right_on=['Date','Site'])
# Calculate a total count category
df['Count'] = df['Count_x']+df['Count_y']
df['Frac_HDV'] = df['Count_y']/df['Count']
# Drop unneeded columns
df = df.drop(['Count_x','Count_y'], axis=1)
df.set_index('Date', drop=True, inplace=True)
df = df.loc[startdate:enddate]
return df
def read_mexicocity_lockdown(startdate, enddate):
"""Mexico City provides detailed traffic count information for six
different vehicle types prior to the lockdowns (see function
"read_mexicocity_baseline"), but for the lockdowns only Waze traffic
jam data for each municipality (hourly) is provided in the file
"Conteos_lineas_muni_2019_2020" for pre-COVID and COVID months. Possible
date ranges span 2019-05-01 to 2020-10-18.
Parameters
----------
startdate : str
Start date of period of interest; YYYY-mm-dd format
enddate : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
df : pandas.core.frame.DataFrame
Traffic counts in Mexico City for the COVID period
"""
import pandas as pd
df = pd.read_csv(DIR_MOBILITY+'mexicocity/'+
'Conteo_Lineas_Muni_2019_2020.csv', delimiter=',', engine='python')
# Combine date and hour columns
df['Date'] = pd.to_datetime(df['date'])
# Daily sum at each site
df = df.groupby(['Date','city'])['Total'].sum()
df = df.reset_index()
df.set_index('Date', drop=True, inplace=True)
df = df.rename(columns={'city':'Site', 'Total':'Count'})
df = df.loc[startdate:enddate]
return df
def read_applemobility(start, end):
"""Read Apple mobility data for select cities (major European cities with
EEA monitors and C40 cities) for the time period of interest. If the
specified start date is before the beginning of the Apple mobility dataset
(2020-01-13), and artificial traffic volume dataset is created by
averaging over pre-lockdown weekdays and constructing a weekly timeseries
with these data.
The CSV file and charts on this site show a relative volume of directions
requests per country/region, sub-region or city compared to a baseline
volume on January 13th, 2020. We define our day as midnight-to-midnight,
Pacific time. Cities are defined as the greater metropolitan area and their
geographic boundaries remain constant across the data set. In many
countries/regions, sub-regions, and cities, relative volume has increased
since January 13th, consistent with normal, seasonal usage of Apple Maps.
Day of week effects are important to normalize as you use this data.
Data were downloaded on 13 Dec 2020 (final entry is 12 Dec 2020)
from https://covid19.apple.com/mobility. Note that data for May 11-12 are
not available and will appear as blank columns in the data set.
Parameters
----------
start : str
Start date of period of interest; YYYY-mm-dd format
end : str
End date of period of interest; YYYY-mm-dd format
Returns
-------
mobility : pandas.core.frame.DataFrame
Timeseries of relative traffic volume (with 13 Jan 2020 as the baseline
volume) for cities for the specified time period.
"""
import numpy as np
import pandas as pd
df = pd.read_csv(DIR_MOBILITY+'applemobilitytrends-2020-12-12.csv',
delimiter=',', engine='python')
station_city = ['Amsterdam', 'Athens', 'Auckland', 'Bangkok', 'Barcelona',
'Belgrade', 'Berlin', 'Bratislava', 'Brussels', 'Bucharest',
'Budapest', 'Cologne', 'Copenhagen', 'Dublin', 'Dusseldorf',
'Frankfurt', 'Hamburg', 'Helsinki', 'Krakow', 'Lodz', 'London',
'Los Angeles', 'Luxembourg', 'Madrid', 'Marseille', 'Medellin',
'Mexico City', 'Milan', 'Montreal', 'Munich', 'Naples',
# Removed 'Mumbai' on 22 April
'Oslo', 'Palermo', 'Paris', 'Prague', 'Riga', 'Rome', 'Rotterdam',
'Santiago', 'Saragossa', 'Seville', 'Sofia', 'Stockholm',
'Stuttgart', 'Sydney', 'Taipei City', 'Turin', 'Valencia',
'Vienna', 'Vilnius', 'Warsaw', 'Wroclaw', 'Zagreb', 'Zurich']
mobility = []
# Select region of interest and the driving timeseries; note that the if/
# elif statements are for cities that do not have their own timeseries in
# the Apple mobility dataset or are classified, for example, as a sub-
# region rather than a city
for cityname in station_city:
if cityname=='Sofia':
city = df.loc[(df['geo_type']=='country/region') & (df['region']==
'Bulgaria') & (df['transportation_type']=='driving')]
elif cityname=='Turin':
city = df.loc[(df['geo_type']=='city') & (df['region']=='Torino') &
(df['transportation_type']=='driving')]
elif cityname=='Zagreb':
city = df.loc[(df['geo_type']=='country/region') & (df['region']==
'Croatia') & (df['transportation_type']=='driving')]
elif cityname=='Lodz':
city = df.loc[(df['geo_type']=='sub-region') & (df['region']==
'Lodz Province') & (df['transportation_type']=='driving')]
elif cityname=='Saragossa':
city = df.loc[(df['geo_type']=='sub-region') & (df['region']==
'Aragon') & (df['transportation_type']=='driving')]
elif cityname=='Wroclaw':
city = df.loc[(df['geo_type']=='sub-region') & (df['region']==
'Lower Silesia Province') & (df['transportation_type']==
'driving')]
elif cityname=='Riga':
city = df.loc[(df['geo_type']=='country/region') & (df['region'
]=='Latvia') & (df['transportation_type']=='driving')]
elif cityname=='Vilnius':
city = df.loc[(df['geo_type']=='country/region') & (df['region'
]=='Lithuania') & (df['transportation_type']=='driving')]
elif cityname=='Belgrade':
city = df.loc[(df['geo_type']=='country/region') & (df['region'
]=='Serbia') & (df['transportation_type']=='driving')]
elif cityname=='Bratislava':
city = df.loc[(df['geo_type']=='sub-region') & (df['region'
]=='Bratislava Region') & (df['transportation_type']=='driving')]
elif cityname=='Taipei City':
city = df.loc[(df['geo_type']=='sub-region') & (df['region'
]=='Taipei City') & (df['transportation_type']=='driving')]
elif cityname=='Medellin':
city = df.loc[(df['geo_type']=='country/region') & (df['region'
]=='Colombia') & (df['transportation_type']=='driving')]
elif cityname=='Luxembourg':
city = df.loc[(df['geo_type']=='sub-region') & (df['region'
]=='Luxembourg District') & (df['transportation_type']=='driving')]
elif cityname=='Dublin':
city = df.loc[(df['geo_type']=='city') & (df['region'
]=='Dublin') & (df['transportation_type']=='driving') &
(df['country']=='Ireland')]
elif cityname=='Los Angeles':
# Los Angeles metropolitan area is comprised of Los Angeles and
# Orange counties
city = df.loc[(df['geo_type']=='county') & (df['region'].isin([
'Los Angeles County','Orange County'])) &
(df['sub-region']=='California') &
(df['transportation_type']=='driving')]
city = city.groupby('geo_type').mean()
elif cityname=='Naples':
city = df.loc[(df['geo_type']=='city') & (df['region']=='Naples') &
(df['country']=='Italy') & (df['transportation_type'
]=='driving')]
else:
city = df.loc[(df['geo_type']=='city') & (df['region']==cityname) &
(df['transportation_type']=='driving')]
todrop = ['geo_type', 'region', 'transportation_type', 'alternative_name',
'sub-region', 'country']
city = city.drop(todrop, axis=1, errors='ignore')
city = pd.melt(city, var_name='Date', value_name='Volume')
# If there are > 1 values per day (e.g., Los Angeles and Orange
# Counties for Los Angeles), compute a daily average
if city.shape[0] > np.unique(city['Date'].values).shape[0]:
city.groupby(by='Date').mean().reset_index()
city['Date'] = pd.to_datetime(city['Date'])
city.set_index('Date', drop=True, inplace=True)
if (pd.to_datetime(start) < pd.to_datetime(city.index[0].strftime('%Y-%m-%d')))==True:
# Select quasi-pre-lockdown information
pre = city.loc['2020-01-13':'2020-02-29'].copy()
# Find days of week corresponding to days
pre['day_of_week'] = pre.index.day_name()
# Group by day of week and average
weeklycycle = pre.groupby(by='day_of_week').mean()
# Create dummy DataFrame extending from the specified start date
# to 2020-01-13
dummy = pd.DataFrame([])
dummy['Date'] = pd.date_range(start, '2020-01-12')
dummy['day_of_week'] = dummy['Date'].dt.day_name()
# Map weekly cycle from 2020 to dummy DateFrame
dummy['Volume'] = np.nan
dummy['Volume'] = dummy['day_of_week'].map(weeklycycle['Volume'])
# Merge DataFrames
dummy = dummy.drop(['day_of_week'], axis=1)
dummy.set_index('Date', drop=True, inplace=True)
city = pd.concat([dummy, city])
# Select date range
city = city.loc[start:end]
city['city'] = cityname
mobility.append(city)
mobility = pd.concat(mobility)
# Change city names to be consistent with conventions in code
mobility.loc[mobility.city=='Los Angeles','city'] = 'Los Angeles C40'
mobility.loc[mobility.city=='Mexico City','city'] = 'Mexico City C40'
mobility.loc[mobility.city=='Santiago','city'] = 'Santiago C40'
mobility.loc[mobility.city=='Auckland','city'] = 'Auckland C40'
mobility.loc[mobility.city=='London','city'] = 'London C40'
mobility.loc[mobility.city=='Berlin','city'] = 'Berlin C40'
mobility.loc[mobility.city=='Milan','city'] = 'Milan C40'
mobility.loc[mobility.city=='Tapei City','city'] = 'Tapei'
return mobility
def read_losangeles():
"""FUNCTION NEEDS TO BE DEBUGGED
Returns
-------
None.
"""
import requests
import numpy as np
import pandas as pd
import glob
# # # # Open siting information (files with prefix
# all_text_tmg_locations)
sitecolnames = ['Location ID', # Unique identifier generated by
# rc_location_seq
'Segment ID', # Pointer to route_carriageway_segments
'State Postmile', # State Postmile
'Absolute Postmile', # Absolute Postmile
'Latitude', # Latitude
'Longitude', # Longitude
'Angle', # Used to project icons next to the freeway
'Name', # Free form text description of the location.
'Abbrev', # Brief text description of the location.
'Freeway ID', # Freeway ID
'Freeway Direction', # A string indicating the freeway direction.
'District ID', # A string indicating the district.
'County ID', # A string indicating the county.
'City ID'] # A string indicating the city.
# From a **painstaking** comparison of the truck census file and the
# location file, the two need to be matched on:
# Absolute Postmile
# Freeway Identifier
# Freeway Direction
# County Identifier
# District Identifier
fname = '/Users/ghkerr/Downloads/all_text_tmg_station_configs_2019_01_01/'+\
'all_text_tmg_locations_2019_01_01.txt'
sites = pd.read_csv(fname, names=sitecolnames, header=None)
# Field Specification (with typos fixed and weird repeated columns
# deleted!) from pems.dot.ca.gov for Census Trucks Day. Data from
# pems.dot.ca.gov -> Data Clearinghouse -> Census Trucks Day ->
colnames = ['Timestamp', # The date and time of the beginning
# of the summary interval. For example, a date of 1/1/2012 indicates
# that the aggregate(s) contain measurements collected between 1/1/2012
# 00:00:00 and 1/1/2012 23:59:59. Note that hour, minute, and second
# values are always 0 for daily aggregations. The format is MM/DD/YYYY
# HH24:MI:SS.
'Census Station Identifier', # The unique number that identifies this
# census station within PeMS.
'Census Substation Identifier', # The unique number that identifies this
# census substation within PeMS.
'Freeway Identifier', # The unique number that identifies this
# freeway within PeMS.
'Freeway Direction', # A string indicating the freeway direction.
'City Identifier', # The unique number that identifies the city
# that contains this census station within PeMS.
'County Identifier', # The unique number that identifies the county
# that contains this census station within PeMS.
'District Identifier', # The unique number that identifies the Caltrans
# distrcit that contains this census station within PeMS.
'Absolute Postmile', # The postmile where this census station is
# located.
'Station Type', # A string indicating the type of station. Possible
# values (and their meaning are:
# CD (Coll/Dist)
# CH (Conventional Highway)
# FF (Fwy-Fwy connector)
# FR (Off Ramp)
# HV (HOV)
# ML (Mainline)
# OR (On Ramp)
'Census Station Set ID', # The unique number that identifies the
# census station set within PeMS.
'Lane Number', # The lane number
'Vehicle Class', # The vehicle class
'Vehicle Count', # The vehicle count
'Average Speed',# The average speed observed for this vehicle class
# during the summary interval. MPH
'Violation Count', # The number of violations observed during his
# summary interval.
'Violation Codes', # A set of tuples of the form 'violation code'
# = 'count': ...
'Single Axle Count', # The single axle count
'Tandem Axle Count', # Tandem axle count
'Tridem Axle Count', # The tridem axle count
'Quad Axle Count', # The quad axle count
'Average Gross Weight', # The average gross weight unknown
'Gross Weight Distribution', # The gross weight distribution
'Average Single Weight', # The average single weight
'Average Tandem Weight', # The average tandem weight
'Average Tridem Weight', # The average tridem weight
'Average Quad Weight', # The average quad weight
'Average Vehicle Length', # The average vehicle length
'Vehicle Length Distribution', # The vehicle length distribution
'Average Tandem Spacing',# The average tandem spacing
'Average Tridem Spacing',# The average tridem spacing
'Average Quad Spacing', # The average quad spacing
'Average Wheelbase', # The average wheelbase
'Wheelbase Distribution', #The wheelbase distribution
'Total Flex ESAL 300', # Total Flex ESAL 300
'Total Flex ESAL 285', # Total Flex ESAL 285
'Total Rigid ESAL 300', # Total Rigid ESAL 300
'Total Rigid ESAL 285'] # Total Rigid ESAL 285
# Open files for 2019-2020
path = DIR_MOBILITY+'losangeles/'
all_files = glob.glob(path + '/*.txt')
all_files.sort()
df = []
for fname in all_files:
li = pd.read_csv(fname, names=colnames, header=None)
df.append(li)
df = pd.concat(df, axis=0, ignore_index=True)
# Add empty rows for latitude, longitude
df['Latitude'], df['Longitude'] = np.nan, np.nan
# Information about vehicle size class comes from
# https://en.wikipedia.org/wiki/Vehicle_size_class:
# 1: Motorcycles
# 2: Passenger Cars
# 3: Other Two-Axle Four-Tire Single-Unit Vehicles
# 4: Buses
# 5: Two-Axle, Six-Tire, Single-Unit Trucks
# 6: Three-Axle Single-Unit Trucks
# 7: Four or More Axle Single-Unit Trucks
# 8: Four or Fewer Axle Single-Trailer Trucks
# 9: Five-Axle Single-Trailer Trucks
# 10: Six or More Axle Single-Trailer Trucks
# 11: Five or Fewer Axle Multi-Trailer Trucks
# 12: Six-Axle Multi-Trailer Trucks
# 13: Seven or More Axle Multi-Trailer
# 14: Unused ---- ----
# 15: Unclassified Vehicle
# Check out this infographic too:
# http://onlinemanuals.txdot.gov/txdotmanuals/tri/images/FHWA_Classification_Chart_FINAL.png
# From a quick inspection of "Census Station Identifier" 129000 summed
# over January 2019, here's the distribution:
# vclass = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
# counts = [0.0,12026.0,63916.0,15304.0,190176.0,40634.0,13424.0,
# 29278.0,221582.0,782.0,9430.0,2028.0,152.0,9842.0,12122.0]
# plt.bar(vclass,counts)
# plt.xlim([1,16])
# plt.xticks(np.arange(1,16,1))
# plt.show()
# So the largest contributors: 3 (two-axle four tire vehicles),
# 5 (two axle, six-tire trucks), and 9 (five-axle single-trailer trucks)
losangeles = ['06037','06059']
# Identify a station by the "Census Station Identifier". Note that each
# census station identifier has a couple substation identifiers associated
# with it (corresponding to a North and South Directions). Additionally,
# all station identifiers should have identical County Identifier,
# District Identifier, and Absolute Postmile.
latraf = []
for stationid in np.unique(df['Census Station Identifier'].values):
station = df.loc[df['Census Station Identifier']==stationid].copy()
# Get unique match/join values
abs_post = np.unique(station['Absolute Postmile'].values)
free_id = np.unique(station['Freeway Identifier'].values)
free_direction = np.unique(station['Freeway Direction'].values)
county_id = np.unique(station['County Identifier'].values)
district_id = np.unique(station['District Identifier'].values)
# Find siting information from siteinfo DataFrame using match/join
# values
siteinfo = sites.loc[
(sites['Freeway ID'].isin(free_id)) &
(sites['Freeway Direction'].isin(free_direction)) &
(sites['County ID'].isin(county_id)) &
(sites['District ID'].isin(district_id))]
# Since the absolute postmile parameter is wonky and sometimes doesn't
# match between the traffic data and site information, deal with
# this parameter differently
lattemp, lngtemp = [], []
for ap in abs_post:
siteinfo_ap = siteinfo.iloc[(siteinfo['Absolute Postmile']-ap
).abs().argsort()[:1]]
lattemp.append(siteinfo_ap['Latitude'].values[0])
lngtemp.append(siteinfo_ap['Longitude'].values[0])
# Ensure that stations (if > 1 exists on N/S freeway directions) are
# close to each other. This step is a bit kludgy
# if ((np.ptp(lattemp) > 0.1) or (np.ptp(lngtemp) > 0.01)):
# print('Lat and/or lon for different highway directions at '+
# 'Census Station Identifier %s are > 0.01 deg apart!'%(
# station['Census Station Identifier'].values[0]))
# Fetch FIPS census block number based on census station
# latitude/longitude
response = requests.get('https://geo.fcc.gov/api/census/block/'+
'find?latitude=%s&longitude=%s&format=json'%(
np.nanmean(lattemp), np.nanmean(lngtemp)))
fips = response.json()['Block']['FIPS']
# Lob off state + county code (first five digits) from FIPS
fips = fips[:5]
if fips in losangeles:
# Group by date and vehicle class (i.e., combine over lane
# direction and lane number)
station = station.groupby(by=['Timestamp','Vehicle Class']).agg(
{'Vehicle Count':'sum'})
station = station.reset_index()
# Pivot table
station = station.pivot(index='Timestamp', columns='Vehicle Class',
values='Vehicle Count')
station.columns = station.columns.map(str)
# Sum over total vehicle count, passenger vehicles, and freight
station['Count'] = station.loc[:,'0':'15'].sum(axis=1)
station['Passenger'] = station.loc[:,'0':'5'].sum(axis=1)
station['Freight'] = station.loc[:,'6':'15'].sum(axis=1)
# Drop unneeded columns
station = station[['Count','Passenger','Freight']]
station['Site'] = stationid
station['Latitude'] = np.nanmean(lattemp)
station['Longitude'] = np.nanmean(lngtemp)
station.index = pd.to_datetime(station.index)
latraf.append(station)
latraf = pd.concat(latraf, axis=0)
df.rename(columns={'Datum':'Date'}, inplace=True)