1
1
2
2
path <- file.path(" D:/OneDrive - University of Leeds/Data/CREDS Data/MOT anoymised/clean" )
3
3
4
- years <- 2005 : 2018
4
+ years <- 2005 : 2020
5
5
6
6
library(dplyr )
7
7
library(data.table )
@@ -10,6 +10,8 @@ library(purrr)
10
10
library(future )
11
11
library(furrr )
12
12
library(progressr )
13
+ library(tidyr )
14
+ library(zoo )
13
15
14
16
mot <- list ()
15
17
@@ -24,11 +26,14 @@ for(i in 1:length(years)){
24
26
mot [[i ]] <- mot_sub
25
27
}
26
28
27
- mot <- bind_rows(mot )
28
- saveRDS(mot ,paste0(path ," test_result_2005_2018.Rds" ))
29
+ # mot <- bind_rows(mot)
30
+ rm(mot_sub )
31
+ gc()
32
+ mot <- data.table :: rbindlist(mot )
33
+ saveRDS(mot ,paste0(path ," test_result_2005_2020.Rds" ))
29
34
gc()
30
35
31
- mot <- readRDS(paste0(path ," test_result_2005_2018 .Rds" ))
36
+ mot <- readRDS(paste0(path ," test_result_2005_2020 .Rds" ))
32
37
car_data <- mot [,c(" vehicle_id" ," make" ," model" , " colour" , " fuel_type" ," cylinder_capacity" ," first_use_date" )]
33
38
34
39
car_data $ make <- as.factor(car_data $ make )
@@ -54,65 +59,104 @@ summary(duplicated(car_data$vehicle_id))
54
59
55
60
car_data <- car_data [! duplicated(car_data $ vehicle_id ),]
56
61
57
- saveRDS(car_data ,paste0(path ," car_data_2005_2018 .Rds" ))
62
+ saveRDS(car_data ,paste0(path ," car_data_2005_2020 .Rds" ))
58
63
59
64
rm(car_data , foo , bar , car_data_dup )
60
65
61
- test_data <- readRDS(paste0(path ," test_result_2005_2018 .Rds" ))
66
+ test_data <- readRDS(paste0(path ," test_result_2005_2020 .Rds" ))
62
67
nrow(test_data ) / 1e6
63
68
64
69
test_data <- as.data.table(test_data )
65
70
test_data <- test_data [order(test_data $ vehicle_id ),]
66
71
test_data <- test_data [,c(" vehicle_id" ," test_date" ," test_mileage" ," postcode_area" ," year" )]
67
- saveRDS(test_data ,paste0(path ," test_data_2005_2018 .Rds" ))
72
+ saveRDS(test_data ,paste0(path ," test_data_2005_2020 .Rds" ))
68
73
69
74
test_data_wide <- tidyr :: pivot_wider(test_data , id_cols = " vehicle_id" ,
70
75
names_from = " year" , values_from = c(" test_mileage" ," postcode_area" ))
71
76
72
- saveRDS(test_data_wide ,paste0(path ," test_data_wide_2005_2018.Rds" ))
77
+ saveRDS(test_data_wide ,paste0(path ," test_data_wide_2005_2020.Rds" ), compress = FALSE )
78
+
79
+ test_data_wide <- readRDS(paste0(path ," test_data_wide_2005_2020.Rds" ))
80
+ test_data_wide <- test_data_wide [,c(" vehicle_id" ," test_mileage_2005" ," test_mileage_2006" , " test_mileage_2007" , " test_mileage_2008" ,
81
+ " test_mileage_2009" , " test_mileage_2010" , " test_mileage_2011" , " test_mileage_2012" ," test_mileage_2013" ," test_mileage_2014" ,
82
+ " test_mileage_2015" , " test_mileage_2016" , " test_mileage_2017" , " test_mileage_2018" , " test_mileage_2019" , " test_mileage_2020" ,
83
+ " postcode_area_2005" ," postcode_area_2006" , " postcode_area_2007" , " postcode_area_2008" , " postcode_area_2009" , " postcode_area_2010" ,
84
+ " postcode_area_2011" , " postcode_area_2012" , " postcode_area_2013" , " postcode_area_2014" , " postcode_area_2015" , " postcode_area_2016" ,
85
+ " postcode_area_2017" , " postcode_area_2018" , " postcode_area_2019" , " postcode_area_2020" )]
86
+
87
+ # Check for year to year constancy
88
+
89
+ test_mileage <- test_data_wide [, c(" test_mileage_2005" ," test_mileage_2006" , " test_mileage_2007" , " test_mileage_2008" ,
90
+ " test_mileage_2009" , " test_mileage_2010" , " test_mileage_2011" , " test_mileage_2012" ," test_mileage_2013" ," test_mileage_2014" ,
91
+ " test_mileage_2015" , " test_mileage_2016" , " test_mileage_2017" , " test_mileage_2018" , " test_mileage_2019" , " test_mileage_2020" ," vehicle_id" )]
92
+ test_postcode <- test_data_wide [, c(" postcode_area_2005" ," postcode_area_2006" , " postcode_area_2007" , " postcode_area_2008" ,
93
+ " postcode_area_2009" , " postcode_area_2010" , " postcode_area_2011" , " postcode_area_2012" ," postcode_area_2013" ," postcode_area_2014" ,
94
+ " postcode_area_2015" , " postcode_area_2016" , " postcode_area_2017" , " postcode_area_2018" , " postcode_area_2019" , " postcode_area_2020" ," vehicle_id" )]
95
+ # summary(duplicated(test_data_wide$vehicle_id))
96
+
97
+ rm(test_data , test_data_wide )
98
+ test_mileage $ first_year <- names(test_mileage [1 : 16 ])[max.col(! is.na(test_mileage [1 : 16 ]), " first" )]
99
+ test_mileage $ first_year <- gsub(" test_mileage_" ," " ,test_mileage $ first_year )
100
+ test_mileage $ first_year <- as.numeric(test_mileage $ first_year )
101
+
102
+ test_mileage $ last_year <- names(test_mileage [1 : 16 ])[max.col(! is.na(test_mileage [1 : 16 ]), " last" )]
103
+ test_mileage $ last_year <- gsub(" test_mileage_" ," " ,test_mileage $ last_year )
104
+ test_mileage $ last_year <- as.numeric(test_mileage $ last_year )
105
+
106
+ test_mileage $ total_years <- rowSums(! is.na(test_mileage [1 : 16 ]))
107
+ test_mileage $ year_range <- test_mileage $ last_year - test_mileage $ first_year + 1
108
+ test_mileage $ missing_years <- test_mileage $ year_range - test_mileage $ total_years
109
+ test_mileage $ zero_year <- test_mileage $ first_year - 3
110
+ # summary(test_mileage$missing_years)
111
+ # hist(test_mileage$missing_years)
112
+
113
+
114
+ update_col <- function (orig , zy , yr ){
115
+ ifelse(zy == yr & is.na(orig ), 0 , orig )
116
+ }
73
117
74
- test_data_wide <- readRDS(paste0(path ," test_data_wide_2005_2018.Rds" ))
75
- # car_data <- readRDS(paste0(path,"car_data_2005_2018.Rds"))
76
- # car_data <- car_data[,c("vehicle_id","first_use_date")]
118
+ test_mileage $ test_mileage_2005 = update_col(test_mileage $ test_mileage_2005 , test_mileage $ zero_year , 2005 )
119
+ test_mileage $ test_mileage_2006 = update_col(test_mileage $ test_mileage_2006 , test_mileage $ zero_year , 2006 )
120
+ test_mileage $ test_mileage_2007 = update_col(test_mileage $ test_mileage_2007 , test_mileage $ zero_year , 2007 )
121
+ test_mileage $ test_mileage_2008 = update_col(test_mileage $ test_mileage_2008 , test_mileage $ zero_year , 2008 )
122
+ test_mileage $ test_mileage_2009 = update_col(test_mileage $ test_mileage_2009 , test_mileage $ zero_year , 2009 )
123
+ test_mileage $ test_mileage_2010 = update_col(test_mileage $ test_mileage_2010 , test_mileage $ zero_year , 2010 )
124
+ test_mileage $ test_mileage_2011 = update_col(test_mileage $ test_mileage_2011 , test_mileage $ zero_year , 2011 )
125
+ test_mileage $ test_mileage_2012 = update_col(test_mileage $ test_mileage_2012 , test_mileage $ zero_year , 2012 )
126
+ test_mileage $ test_mileage_2013 = update_col(test_mileage $ test_mileage_2013 , test_mileage $ zero_year , 2013 )
127
+ test_mileage $ test_mileage_2014 = update_col(test_mileage $ test_mileage_2014 , test_mileage $ zero_year , 2014 )
128
+ test_mileage $ test_mileage_2015 = update_col(test_mileage $ test_mileage_2015 , test_mileage $ zero_year , 2015 )
129
+ test_mileage $ test_mileage_2016 = update_col(test_mileage $ test_mileage_2016 , test_mileage $ zero_year , 2016 )
130
+ test_mileage $ test_mileage_2017 = update_col(test_mileage $ test_mileage_2017 , test_mileage $ zero_year , 2017 )
131
+ test_mileage $ test_mileage_2018 = update_col(test_mileage $ test_mileage_2018 , test_mileage $ zero_year , 2018 )
132
+ test_mileage $ test_mileage_2019 = update_col(test_mileage $ test_mileage_2019 , test_mileage $ zero_year , 2019 )
133
+ test_mileage $ test_mileage_2020 = update_col(test_mileage $ test_mileage_2020 , test_mileage $ zero_year , 2020 )
134
+
135
+ test_mileage <- test_mileage [,c(" vehicle_id" ,paste0(" test_mileage_" ,2005 : 2020 ))]
136
+ test_mileage <- pivot_longer(test_mileage ,
137
+ cols = test_mileage_2005 : test_mileage_2020 ,
138
+ names_to = " year" ,
139
+ names_prefix = " test_mileage_" ,
140
+ values_to = " mileage" ,
141
+ values_drop_na = FALSE )
142
+ test_mileage $ year <- as.integer(test_mileage $ year )
143
+
144
+ # clear up memory
145
+ test_mileage $ mileage <- as.integer(test_mileage $ mileage )
146
+ gc()
77
147
78
- # test_data_wide <- left_join(test_data_wide, car_data, by = c("vehicle_id"))
79
- # x = test_data_wide[1:1000000,]
80
- #
81
- # summarise_history <- function(vehicle_id,
82
- # test_mileage_2005,
83
- # test_mileage_2006,
84
- # test_mileage_2007,
85
- # test_mileage_2008,
86
- # test_mileage_2009,
87
- # test_mileage_2010,
88
- # test_mileage_2011,
89
- # test_mileage_2012,
90
- # test_mileage_2013,
91
- # test_mileage_2014,
92
- # test_mileage_2015,
93
- # test_mileage_2016,
94
- # test_mileage_2017,
95
- # test_mileage_2018){
96
- #
97
- # res <- data.frame(vehicle_id,
98
- # km_2006 = as.integer(round((test_mileage_2006 - test_mileage_2005) * 1.60934, 0)),
99
- # km_2007 = as.integer(round((test_mileage_2007 - test_mileage_2006) * 1.60934, 0)),
100
- # km_2008 = as.integer(round((test_mileage_2008 - test_mileage_2007) * 1.60934, 0)),
101
- # km_2009 = as.integer(round((test_mileage_2009 - test_mileage_2008) * 1.60934, 0)),
102
- # km_2010 = as.integer(round((test_mileage_2010 - test_mileage_2009) * 1.60934, 0)),
103
- # km_2011 = as.integer(round((test_mileage_2011 - test_mileage_2010) * 1.60934, 0)),
104
- # km_2012 = as.integer(round((test_mileage_2012 - test_mileage_2011) * 1.60934, 0)),
105
- # km_2013 = as.integer(round((test_mileage_2013 - test_mileage_2012) * 1.60934, 0)),
106
- # km_2014 = as.integer(round((test_mileage_2014 - test_mileage_2013) * 1.60934, 0)),
107
- # km_2015 = as.integer(round((test_mileage_2015 - test_mileage_2014) * 1.60934, 0)),
108
- # km_2016 = as.integer(round((test_mileage_2016 - test_mileage_2015) * 1.60934, 0)),
109
- # km_2017 = as.integer(round((test_mileage_2017 - test_mileage_2016) * 1.60934, 0)),
110
- # km_2018 = as.integer(round((test_mileage_2018 - test_mileage_2017) * 1.60934, 0)))
111
- #
112
- #
113
- # return(res)
114
- #
115
- # }
148
+ test_mileage_aprox <- test_mileage %> %
149
+ group_by(vehicle_id ) %> %
150
+ mutate(approx = as.integer(round(na.approx(mileage , na.rm = FALSE ),0 )))
151
+
152
+ test_mileage_aprox $ mileage <- NULL
153
+
154
+ test_data_wide <- pivot_wider(test_mileage_aprox ,
155
+ names_from = " year" ,
156
+ names_prefix = " test_mileage_" ,
157
+ values_from = " approx" )
158
+
159
+ rm(test_mileage_aprox , test_mileage )
116
160
117
161
test_data_wide $ km_2006 = as.integer(round((test_data_wide $ test_mileage_2006 - test_data_wide $ test_mileage_2005 ) * 1.60934 , 0 ))
118
162
test_data_wide $ km_2007 = as.integer(round((test_data_wide $ test_mileage_2007 - test_data_wide $ test_mileage_2006 ) * 1.60934 , 0 ))
@@ -127,7 +171,35 @@ test_data_wide$km_2015 = as.integer(round((test_data_wide$test_mileage_2015 - te
127
171
test_data_wide $ km_2016 = as.integer(round((test_data_wide $ test_mileage_2016 - test_data_wide $ test_mileage_2015 ) * 1.60934 , 0 ))
128
172
test_data_wide $ km_2017 = as.integer(round((test_data_wide $ test_mileage_2017 - test_data_wide $ test_mileage_2016 ) * 1.60934 , 0 ))
129
173
test_data_wide $ km_2018 = as.integer(round((test_data_wide $ test_mileage_2018 - test_data_wide $ test_mileage_2017 ) * 1.60934 , 0 ))
174
+ test_data_wide $ km_2019 = as.integer(round((test_data_wide $ test_mileage_2019 - test_data_wide $ test_mileage_2018 ) * 1.60934 , 0 ))
175
+ test_data_wide $ km_2020 = as.integer(round((test_data_wide $ test_mileage_2020 - test_data_wide $ test_mileage_2019 ) * 1.60934 , 0 ))
176
+
177
+
178
+ saveRDS(test_data_wide ,paste0(path ," test_data_wide_2005_2020_with_neg.Rds" ), compress = FALSE )
130
179
180
+ # Some cars have -ve millage due to error so set to 0
181
+ summary(test_data_wide $ km_2018 )
182
+ foo <- test_data_wide [test_data_wide $ km_2018 > 160000 | test_data_wide $ km_2018 < - 160000 , ]
183
+ foo <- foo [! is.na(foo $ km_2018 ),]
184
+
185
+ test_data_wide $ km_2006 [test_data_wide $ km_2006 < 0 ] <- 0
186
+ test_data_wide $ km_2007 [test_data_wide $ km_2007 < 0 ] <- 0
187
+ test_data_wide $ km_2008 [test_data_wide $ km_2008 < 0 ] <- 0
188
+ test_data_wide $ km_2009 [test_data_wide $ km_2009 < 0 ] <- 0
189
+ test_data_wide $ km_2010 [test_data_wide $ km_2010 < 0 ] <- 0
190
+ test_data_wide $ km_2011 [test_data_wide $ km_2011 < 0 ] <- 0
191
+ test_data_wide $ km_2012 [test_data_wide $ km_2012 < 0 ] <- 0
192
+ test_data_wide $ km_2013 [test_data_wide $ km_2013 < 0 ] <- 0
193
+ test_data_wide $ km_2014 [test_data_wide $ km_2014 < 0 ] <- 0
194
+ test_data_wide $ km_2015 [test_data_wide $ km_2015 < 0 ] <- 0
195
+ test_data_wide $ km_2016 [test_data_wide $ km_2016 < 0 ] <- 0
196
+ test_data_wide $ km_2017 [test_data_wide $ km_2017 < 0 ] <- 0
197
+ test_data_wide $ km_2018 [test_data_wide $ km_2018 < 0 ] <- 0
198
+ test_data_wide $ km_2019 [test_data_wide $ km_2019 < 0 ] <- 0
199
+ test_data_wide $ km_2020 [test_data_wide $ km_2020 < 0 ] <- 0
200
+
201
+
202
+ test_data_wide <- left_join(test_data_wide , test_postcode , by = " vehicle_id" )
131
203
132
204
test_data_wide <- test_data_wide [,c(" vehicle_id" ,
133
205
" km_2006" ,
@@ -143,6 +215,8 @@ test_data_wide <- test_data_wide[,c("vehicle_id",
143
215
" km_2016" ,
144
216
" km_2017" ,
145
217
" km_2018" ,
218
+ " km_2019" ,
219
+ " km_2020" ,
146
220
" postcode_area_2006" ,
147
221
" postcode_area_2007" ,
148
222
" postcode_area_2008" ,
@@ -155,15 +229,18 @@ test_data_wide <- test_data_wide[,c("vehicle_id",
155
229
" postcode_area_2015" ,
156
230
" postcode_area_2016" ,
157
231
" postcode_area_2017" ,
158
- " postcode_area_2018" )]
232
+ " postcode_area_2018" ,
233
+ " postcode_area_2019" ,
234
+ " postcode_area_2020"
235
+ )]
159
236
160
237
saveRDS(test_data_wide , paste0(path ," vehicle_km_per_year.Rds" ))
161
238
162
239
library(tidyr )
163
240
164
241
names(test_data_wide ) <- gsub(" postcode_area" ," postcodearea" , names(test_data_wide ))
165
242
vehicle_km_long <- pivot_longer(test_data_wide ,
166
- cols = km_2006 : postcodearea_2018 ,
243
+ cols = km_2006 : postcodearea_2020 ,
167
244
names_to = c(" .value" , " year" ),
168
245
names_pattern = " (.+)_(.+)" ,
169
246
values_drop_na = TRUE )
@@ -177,7 +254,9 @@ postcode_summary <- vehicle_km_long %>%
177
254
library(ggplot2 )
178
255
library(ggrepel )
179
256
180
- postcode_summary [postcode_summary $ year > 2006 ,] %> %
257
+ postcode_summary %> %
258
+ filter(year > 2006 ) %> %
259
+ filter(! is.na(postcodearea )) %> %
181
260
mutate(label = if_else(year == max(year ), as.character(postcodearea ), NA_character_ )) %> %
182
261
ggplot(aes(x = year , y = total_km , group = postcodearea )) +
183
262
geom_line(aes(colour = postcodearea )) +
@@ -188,16 +267,17 @@ postcode_summary[postcode_summary$year > 2006,] %>%
188
267
na.rm = TRUE ,
189
268
max.overlaps = 10 )
190
269
270
+ postcode_summary <- pivot_wider(postcode_summary ,
271
+ id_cols = " postcodearea" ,
272
+ names_from = " year" ,
273
+ values_from = " total_km" )
274
+
191
275
192
276
write.csv(postcode_summary , " ../CarbonCalculator/data/postocde_driving_summary.csv" )
193
277
library(sf )
194
278
195
279
postcodes <- readRDS(" ../CarbonCalculator/data/bounds/postcode_areas.Rds" )
196
280
197
- postcode_summary <- pivot_wider(postcode_summary ,
198
- id_cols = " postcodearea" ,
199
- names_from = " year" ,
200
- values_from = " total_km" )
201
281
202
282
postcodes <- left_join(postcodes , postcode_summary , by = c(" PC_AREA" = " postcodearea" ))
203
283
@@ -209,23 +289,7 @@ tm_shape(postcodes) +
209
289
style = " quantile" ) +
210
290
tm_borders()
211
291
212
- # pb <- progress_estimated(length(file_list))
213
- # system.time(res <- pmap_dfr(.l = x[,c("vehicle_id",
214
- # "test_mileage_2005",
215
- # "test_mileage_2006",
216
- # "test_mileage_2007",
217
- # "test_mileage_2008",
218
- # "test_mileage_2009",
219
- # "test_mileage_2010",
220
- # "test_mileage_2011",
221
- # "test_mileage_2012",
222
- # "test_mileage_2013",
223
- # "test_mileage_2014",
224
- # "test_mileage_2015",
225
- # "test_mileage_2016",
226
- # "test_mileage_2017",
227
- # "test_mileage_2018")],
228
- # .f = summarise_history))
292
+
229
293
230
294
km_summary <- list ()
231
295
@@ -269,3 +333,64 @@ saveRDS(km_summary, paste0(path,"vehicle_km_per_year.Rds"))
269
333
#
270
334
# car_data$make_model <- paste0(car_data$make," ",car_data$model)
271
335
# summary(car_data$make_model %in% classif$GenModel)
336
+
337
+
338
+ # car_data <- readRDS(paste0(path,"car_data_2005_2018.Rds"))
339
+ # car_data <- car_data[,c("vehicle_id","first_use_date")]
340
+
341
+ # test_data_wide <- left_join(test_data_wide, car_data, by = c("vehicle_id"))
342
+ # x = test_data_wide[1:1000000,]
343
+ #
344
+ # summarise_history <- function(vehicle_id,
345
+ # test_mileage_2005,
346
+ # test_mileage_2006,
347
+ # test_mileage_2007,
348
+ # test_mileage_2008,
349
+ # test_mileage_2009,
350
+ # test_mileage_2010,
351
+ # test_mileage_2011,
352
+ # test_mileage_2012,
353
+ # test_mileage_2013,
354
+ # test_mileage_2014,
355
+ # test_mileage_2015,
356
+ # test_mileage_2016,
357
+ # test_mileage_2017,
358
+ # test_mileage_2018){
359
+ #
360
+ # res <- data.frame(vehicle_id,
361
+ # km_2006 = as.integer(round((test_mileage_2006 - test_mileage_2005) * 1.60934, 0)),
362
+ # km_2007 = as.integer(round((test_mileage_2007 - test_mileage_2006) * 1.60934, 0)),
363
+ # km_2008 = as.integer(round((test_mileage_2008 - test_mileage_2007) * 1.60934, 0)),
364
+ # km_2009 = as.integer(round((test_mileage_2009 - test_mileage_2008) * 1.60934, 0)),
365
+ # km_2010 = as.integer(round((test_mileage_2010 - test_mileage_2009) * 1.60934, 0)),
366
+ # km_2011 = as.integer(round((test_mileage_2011 - test_mileage_2010) * 1.60934, 0)),
367
+ # km_2012 = as.integer(round((test_mileage_2012 - test_mileage_2011) * 1.60934, 0)),
368
+ # km_2013 = as.integer(round((test_mileage_2013 - test_mileage_2012) * 1.60934, 0)),
369
+ # km_2014 = as.integer(round((test_mileage_2014 - test_mileage_2013) * 1.60934, 0)),
370
+ # km_2015 = as.integer(round((test_mileage_2015 - test_mileage_2014) * 1.60934, 0)),
371
+ # km_2016 = as.integer(round((test_mileage_2016 - test_mileage_2015) * 1.60934, 0)),
372
+ # km_2017 = as.integer(round((test_mileage_2017 - test_mileage_2016) * 1.60934, 0)),
373
+ # km_2018 = as.integer(round((test_mileage_2018 - test_mileage_2017) * 1.60934, 0)))
374
+ #
375
+ #
376
+ # return(res)
377
+ #
378
+ # }
379
+
380
+ # pb <- progress_estimated(length(file_list))
381
+ # system.time(res <- pmap_dfr(.l = x[,c("vehicle_id",
382
+ # "test_mileage_2005",
383
+ # "test_mileage_2006",
384
+ # "test_mileage_2007",
385
+ # "test_mileage_2008",
386
+ # "test_mileage_2009",
387
+ # "test_mileage_2010",
388
+ # "test_mileage_2011",
389
+ # "test_mileage_2012",
390
+ # "test_mileage_2013",
391
+ # "test_mileage_2014",
392
+ # "test_mileage_2015",
393
+ # "test_mileage_2016",
394
+ # "test_mileage_2017",
395
+ # "test_mileage_2018")],
396
+ # .f = summarise_history))
0 commit comments