Skip to content

Commit 5df8fa9

Browse files
committed
fix the car data
1 parent eca80c5 commit 5df8fa9

File tree

5 files changed

+481
-73
lines changed

5 files changed

+481
-73
lines changed

compare_years_anon.R

Lines changed: 198 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
path <- file.path("D:/OneDrive - University of Leeds/Data/CREDS Data/MOT anoymised/clean")
33

4-
years <- 2005:2018
4+
years <- 2005:2020
55

66
library(dplyr)
77
library(data.table)
@@ -10,6 +10,8 @@ library(purrr)
1010
library(future)
1111
library(furrr)
1212
library(progressr)
13+
library(tidyr)
14+
library(zoo)
1315

1416
mot <- list()
1517

@@ -24,11 +26,14 @@ for(i in 1:length(years)){
2426
mot[[i]] <- mot_sub
2527
}
2628

27-
mot <- bind_rows(mot)
28-
saveRDS(mot,paste0(path,"test_result_2005_2018.Rds"))
29+
#mot <- bind_rows(mot)
30+
rm(mot_sub)
31+
gc()
32+
mot <- data.table::rbindlist(mot)
33+
saveRDS(mot,paste0(path,"test_result_2005_2020.Rds"))
2934
gc()
3035

31-
mot <- readRDS(paste0(path,"test_result_2005_2018.Rds"))
36+
mot <- readRDS(paste0(path,"test_result_2005_2020.Rds"))
3237
car_data <- mot[,c("vehicle_id","make","model", "colour", "fuel_type","cylinder_capacity","first_use_date")]
3338

3439
car_data$make <- as.factor(car_data$make)
@@ -54,65 +59,104 @@ summary(duplicated(car_data$vehicle_id))
5459

5560
car_data <- car_data[!duplicated(car_data$vehicle_id),]
5661

57-
saveRDS(car_data,paste0(path,"car_data_2005_2018.Rds"))
62+
saveRDS(car_data,paste0(path,"car_data_2005_2020.Rds"))
5863

5964
rm(car_data, foo, bar, car_data_dup)
6065

61-
test_data <- readRDS(paste0(path,"test_result_2005_2018.Rds"))
66+
test_data <- readRDS(paste0(path,"test_result_2005_2020.Rds"))
6267
nrow(test_data) / 1e6
6368

6469
test_data <- as.data.table(test_data)
6570
test_data <- test_data[order(test_data$vehicle_id),]
6671
test_data <- test_data[,c("vehicle_id","test_date","test_mileage","postcode_area","year")]
67-
saveRDS(test_data,paste0(path,"test_data_2005_2018.Rds"))
72+
saveRDS(test_data,paste0(path,"test_data_2005_2020.Rds"))
6873

6974
test_data_wide <- tidyr::pivot_wider(test_data, id_cols = "vehicle_id",
7075
names_from = "year", values_from = c("test_mileage","postcode_area"))
7176

72-
saveRDS(test_data_wide,paste0(path,"test_data_wide_2005_2018.Rds"))
77+
saveRDS(test_data_wide,paste0(path,"test_data_wide_2005_2020.Rds"), compress = FALSE)
78+
79+
test_data_wide <- readRDS(paste0(path,"test_data_wide_2005_2020.Rds"))
80+
test_data_wide <- test_data_wide[,c("vehicle_id","test_mileage_2005","test_mileage_2006" , "test_mileage_2007" , "test_mileage_2008",
81+
"test_mileage_2009" , "test_mileage_2010" , "test_mileage_2011" , "test_mileage_2012","test_mileage_2013","test_mileage_2014",
82+
"test_mileage_2015" , "test_mileage_2016" , "test_mileage_2017" , "test_mileage_2018", "test_mileage_2019", "test_mileage_2020",
83+
"postcode_area_2005","postcode_area_2006", "postcode_area_2007", "postcode_area_2008", "postcode_area_2009", "postcode_area_2010",
84+
"postcode_area_2011", "postcode_area_2012", "postcode_area_2013", "postcode_area_2014", "postcode_area_2015", "postcode_area_2016",
85+
"postcode_area_2017", "postcode_area_2018", "postcode_area_2019", "postcode_area_2020")]
86+
87+
# Check for year to year constancy
88+
89+
test_mileage <- test_data_wide[, c("test_mileage_2005","test_mileage_2006" , "test_mileage_2007" , "test_mileage_2008",
90+
"test_mileage_2009" , "test_mileage_2010" , "test_mileage_2011" , "test_mileage_2012","test_mileage_2013","test_mileage_2014",
91+
"test_mileage_2015" , "test_mileage_2016" , "test_mileage_2017" , "test_mileage_2018", "test_mileage_2019", "test_mileage_2020","vehicle_id")]
92+
test_postcode <- test_data_wide[, c("postcode_area_2005","postcode_area_2006" , "postcode_area_2007" , "postcode_area_2008",
93+
"postcode_area_2009" , "postcode_area_2010" , "postcode_area_2011" , "postcode_area_2012","postcode_area_2013","postcode_area_2014",
94+
"postcode_area_2015" , "postcode_area_2016" , "postcode_area_2017" , "postcode_area_2018", "postcode_area_2019", "postcode_area_2020","vehicle_id")]
95+
#summary(duplicated(test_data_wide$vehicle_id))
96+
97+
rm(test_data, test_data_wide)
98+
test_mileage$first_year <- names(test_mileage[1:16])[max.col(!is.na(test_mileage[1:16]), "first")]
99+
test_mileage$first_year <- gsub("test_mileage_","",test_mileage$first_year)
100+
test_mileage$first_year <- as.numeric(test_mileage$first_year)
101+
102+
test_mileage$last_year <- names(test_mileage[1:16])[max.col(!is.na(test_mileage[1:16]), "last")]
103+
test_mileage$last_year <- gsub("test_mileage_","",test_mileage$last_year)
104+
test_mileage$last_year <- as.numeric(test_mileage$last_year)
105+
106+
test_mileage$total_years <- rowSums(!is.na(test_mileage[1:16]))
107+
test_mileage$year_range <- test_mileage$last_year - test_mileage$first_year + 1
108+
test_mileage$missing_years <- test_mileage$year_range - test_mileage$total_years
109+
test_mileage$zero_year <- test_mileage$first_year - 3
110+
#summary(test_mileage$missing_years)
111+
#hist(test_mileage$missing_years)
112+
113+
114+
update_col <- function(orig, zy, yr){
115+
ifelse(zy == yr & is.na(orig), 0, orig)
116+
}
73117

74-
test_data_wide <- readRDS(paste0(path,"test_data_wide_2005_2018.Rds"))
75-
# car_data <- readRDS(paste0(path,"car_data_2005_2018.Rds"))
76-
# car_data <- car_data[,c("vehicle_id","first_use_date")]
118+
test_mileage$test_mileage_2005 = update_col(test_mileage$test_mileage_2005, test_mileage$zero_year, 2005)
119+
test_mileage$test_mileage_2006 = update_col(test_mileage$test_mileage_2006, test_mileage$zero_year, 2006)
120+
test_mileage$test_mileage_2007 = update_col(test_mileage$test_mileage_2007, test_mileage$zero_year, 2007)
121+
test_mileage$test_mileage_2008 = update_col(test_mileage$test_mileage_2008, test_mileage$zero_year, 2008)
122+
test_mileage$test_mileage_2009 = update_col(test_mileage$test_mileage_2009, test_mileage$zero_year, 2009)
123+
test_mileage$test_mileage_2010 = update_col(test_mileage$test_mileage_2010, test_mileage$zero_year, 2010)
124+
test_mileage$test_mileage_2011 = update_col(test_mileage$test_mileage_2011, test_mileage$zero_year, 2011)
125+
test_mileage$test_mileage_2012 = update_col(test_mileage$test_mileage_2012, test_mileage$zero_year, 2012)
126+
test_mileage$test_mileage_2013 = update_col(test_mileage$test_mileage_2013, test_mileage$zero_year, 2013)
127+
test_mileage$test_mileage_2014 = update_col(test_mileage$test_mileage_2014, test_mileage$zero_year, 2014)
128+
test_mileage$test_mileage_2015 = update_col(test_mileage$test_mileage_2015, test_mileage$zero_year, 2015)
129+
test_mileage$test_mileage_2016 = update_col(test_mileage$test_mileage_2016, test_mileage$zero_year, 2016)
130+
test_mileage$test_mileage_2017 = update_col(test_mileage$test_mileage_2017, test_mileage$zero_year, 2017)
131+
test_mileage$test_mileage_2018 = update_col(test_mileage$test_mileage_2018, test_mileage$zero_year, 2018)
132+
test_mileage$test_mileage_2019 = update_col(test_mileage$test_mileage_2019, test_mileage$zero_year, 2019)
133+
test_mileage$test_mileage_2020 = update_col(test_mileage$test_mileage_2020, test_mileage$zero_year, 2020)
134+
135+
test_mileage <- test_mileage[,c("vehicle_id",paste0("test_mileage_",2005:2020))]
136+
test_mileage <- pivot_longer(test_mileage,
137+
cols = test_mileage_2005:test_mileage_2020,
138+
names_to = "year",
139+
names_prefix = "test_mileage_",
140+
values_to = "mileage",
141+
values_drop_na = FALSE)
142+
test_mileage$year <- as.integer(test_mileage$year)
143+
144+
#clear up memory
145+
test_mileage$mileage <- as.integer(test_mileage$mileage)
146+
gc()
77147

78-
# test_data_wide <- left_join(test_data_wide, car_data, by = c("vehicle_id"))
79-
# x = test_data_wide[1:1000000,]
80-
#
81-
# summarise_history <- function(vehicle_id,
82-
# test_mileage_2005,
83-
# test_mileage_2006,
84-
# test_mileage_2007,
85-
# test_mileage_2008,
86-
# test_mileage_2009,
87-
# test_mileage_2010,
88-
# test_mileage_2011,
89-
# test_mileage_2012,
90-
# test_mileage_2013,
91-
# test_mileage_2014,
92-
# test_mileage_2015,
93-
# test_mileage_2016,
94-
# test_mileage_2017,
95-
# test_mileage_2018){
96-
#
97-
# res <- data.frame(vehicle_id,
98-
# km_2006 = as.integer(round((test_mileage_2006 - test_mileage_2005) * 1.60934, 0)),
99-
# km_2007 = as.integer(round((test_mileage_2007 - test_mileage_2006) * 1.60934, 0)),
100-
# km_2008 = as.integer(round((test_mileage_2008 - test_mileage_2007) * 1.60934, 0)),
101-
# km_2009 = as.integer(round((test_mileage_2009 - test_mileage_2008) * 1.60934, 0)),
102-
# km_2010 = as.integer(round((test_mileage_2010 - test_mileage_2009) * 1.60934, 0)),
103-
# km_2011 = as.integer(round((test_mileage_2011 - test_mileage_2010) * 1.60934, 0)),
104-
# km_2012 = as.integer(round((test_mileage_2012 - test_mileage_2011) * 1.60934, 0)),
105-
# km_2013 = as.integer(round((test_mileage_2013 - test_mileage_2012) * 1.60934, 0)),
106-
# km_2014 = as.integer(round((test_mileage_2014 - test_mileage_2013) * 1.60934, 0)),
107-
# km_2015 = as.integer(round((test_mileage_2015 - test_mileage_2014) * 1.60934, 0)),
108-
# km_2016 = as.integer(round((test_mileage_2016 - test_mileage_2015) * 1.60934, 0)),
109-
# km_2017 = as.integer(round((test_mileage_2017 - test_mileage_2016) * 1.60934, 0)),
110-
# km_2018 = as.integer(round((test_mileage_2018 - test_mileage_2017) * 1.60934, 0)))
111-
#
112-
#
113-
# return(res)
114-
#
115-
# }
148+
test_mileage_aprox <- test_mileage %>%
149+
group_by(vehicle_id) %>%
150+
mutate(approx = as.integer(round(na.approx(mileage, na.rm = FALSE),0)))
151+
152+
test_mileage_aprox$mileage <- NULL
153+
154+
test_data_wide <- pivot_wider(test_mileage_aprox,
155+
names_from = "year",
156+
names_prefix = "test_mileage_",
157+
values_from = "approx")
158+
159+
rm(test_mileage_aprox, test_mileage)
116160

117161
test_data_wide$km_2006 = as.integer(round((test_data_wide$test_mileage_2006 - test_data_wide$test_mileage_2005) * 1.60934, 0))
118162
test_data_wide$km_2007 = as.integer(round((test_data_wide$test_mileage_2007 - test_data_wide$test_mileage_2006) * 1.60934, 0))
@@ -127,7 +171,35 @@ test_data_wide$km_2015 = as.integer(round((test_data_wide$test_mileage_2015 - te
127171
test_data_wide$km_2016 = as.integer(round((test_data_wide$test_mileage_2016 - test_data_wide$test_mileage_2015) * 1.60934, 0))
128172
test_data_wide$km_2017 = as.integer(round((test_data_wide$test_mileage_2017 - test_data_wide$test_mileage_2016) * 1.60934, 0))
129173
test_data_wide$km_2018 = as.integer(round((test_data_wide$test_mileage_2018 - test_data_wide$test_mileage_2017) * 1.60934, 0))
174+
test_data_wide$km_2019 = as.integer(round((test_data_wide$test_mileage_2019 - test_data_wide$test_mileage_2018) * 1.60934, 0))
175+
test_data_wide$km_2020 = as.integer(round((test_data_wide$test_mileage_2020 - test_data_wide$test_mileage_2019) * 1.60934, 0))
176+
177+
178+
saveRDS(test_data_wide,paste0(path,"test_data_wide_2005_2020_with_neg.Rds"), compress = FALSE)
130179

180+
# Some cars have -ve millage due to error so set to 0
181+
summary(test_data_wide$km_2018)
182+
foo <- test_data_wide[test_data_wide$km_2018 > 160000 | test_data_wide$km_2018 < -160000, ]
183+
foo <- foo[!is.na(foo$km_2018),]
184+
185+
test_data_wide$km_2006[test_data_wide$km_2006 < 0] <- 0
186+
test_data_wide$km_2007[test_data_wide$km_2007 < 0] <- 0
187+
test_data_wide$km_2008[test_data_wide$km_2008 < 0] <- 0
188+
test_data_wide$km_2009[test_data_wide$km_2009 < 0] <- 0
189+
test_data_wide$km_2010[test_data_wide$km_2010 < 0] <- 0
190+
test_data_wide$km_2011[test_data_wide$km_2011 < 0] <- 0
191+
test_data_wide$km_2012[test_data_wide$km_2012 < 0] <- 0
192+
test_data_wide$km_2013[test_data_wide$km_2013 < 0] <- 0
193+
test_data_wide$km_2014[test_data_wide$km_2014 < 0] <- 0
194+
test_data_wide$km_2015[test_data_wide$km_2015 < 0] <- 0
195+
test_data_wide$km_2016[test_data_wide$km_2016 < 0] <- 0
196+
test_data_wide$km_2017[test_data_wide$km_2017 < 0] <- 0
197+
test_data_wide$km_2018[test_data_wide$km_2018 < 0] <- 0
198+
test_data_wide$km_2019[test_data_wide$km_2019 < 0] <- 0
199+
test_data_wide$km_2020[test_data_wide$km_2020 < 0] <- 0
200+
201+
202+
test_data_wide <- left_join(test_data_wide, test_postcode, by = "vehicle_id")
131203

132204
test_data_wide <- test_data_wide[,c("vehicle_id",
133205
"km_2006",
@@ -143,6 +215,8 @@ test_data_wide <- test_data_wide[,c("vehicle_id",
143215
"km_2016",
144216
"km_2017",
145217
"km_2018",
218+
"km_2019",
219+
"km_2020",
146220
"postcode_area_2006",
147221
"postcode_area_2007",
148222
"postcode_area_2008",
@@ -155,15 +229,18 @@ test_data_wide <- test_data_wide[,c("vehicle_id",
155229
"postcode_area_2015",
156230
"postcode_area_2016",
157231
"postcode_area_2017",
158-
"postcode_area_2018")]
232+
"postcode_area_2018",
233+
"postcode_area_2019",
234+
"postcode_area_2020"
235+
)]
159236

160237
saveRDS(test_data_wide, paste0(path,"vehicle_km_per_year.Rds"))
161238

162239
library(tidyr)
163240

164241
names(test_data_wide) <- gsub("postcode_area","postcodearea", names(test_data_wide))
165242
vehicle_km_long <- pivot_longer(test_data_wide,
166-
cols = km_2006:postcodearea_2018,
243+
cols = km_2006:postcodearea_2020,
167244
names_to = c(".value", "year"),
168245
names_pattern = "(.+)_(.+)",
169246
values_drop_na = TRUE)
@@ -177,7 +254,9 @@ postcode_summary <- vehicle_km_long %>%
177254
library(ggplot2)
178255
library(ggrepel)
179256

180-
postcode_summary[postcode_summary$year > 2006,] %>%
257+
postcode_summary %>%
258+
filter(year > 2006) %>%
259+
filter(!is.na(postcodearea)) %>%
181260
mutate(label = if_else(year == max(year), as.character(postcodearea), NA_character_)) %>%
182261
ggplot(aes(x = year, y = total_km, group = postcodearea)) +
183262
geom_line(aes(colour = postcodearea)) +
@@ -188,16 +267,17 @@ postcode_summary[postcode_summary$year > 2006,] %>%
188267
na.rm = TRUE,
189268
max.overlaps = 10)
190269

270+
postcode_summary <- pivot_wider(postcode_summary,
271+
id_cols = "postcodearea",
272+
names_from = "year",
273+
values_from = "total_km")
274+
191275

192276
write.csv(postcode_summary, "../CarbonCalculator/data/postocde_driving_summary.csv")
193277
library(sf)
194278

195279
postcodes <- readRDS("../CarbonCalculator/data/bounds/postcode_areas.Rds")
196280

197-
postcode_summary <- pivot_wider(postcode_summary,
198-
id_cols = "postcodearea",
199-
names_from = "year",
200-
values_from = "total_km")
201281

202282
postcodes <- left_join(postcodes, postcode_summary, by = c("PC_AREA" = "postcodearea"))
203283

@@ -209,23 +289,7 @@ tm_shape(postcodes) +
209289
style = "quantile") +
210290
tm_borders()
211291

212-
# pb <- progress_estimated(length(file_list))
213-
# system.time(res <- pmap_dfr(.l = x[,c("vehicle_id",
214-
# "test_mileage_2005",
215-
# "test_mileage_2006",
216-
# "test_mileage_2007",
217-
# "test_mileage_2008",
218-
# "test_mileage_2009",
219-
# "test_mileage_2010",
220-
# "test_mileage_2011",
221-
# "test_mileage_2012",
222-
# "test_mileage_2013",
223-
# "test_mileage_2014",
224-
# "test_mileage_2015",
225-
# "test_mileage_2016",
226-
# "test_mileage_2017",
227-
# "test_mileage_2018")],
228-
# .f = summarise_history))
292+
229293

230294
km_summary <- list()
231295

@@ -269,3 +333,64 @@ saveRDS(km_summary, paste0(path,"vehicle_km_per_year.Rds"))
269333
#
270334
# car_data$make_model <- paste0(car_data$make," ",car_data$model)
271335
# summary(car_data$make_model %in% classif$GenModel)
336+
337+
338+
# car_data <- readRDS(paste0(path,"car_data_2005_2018.Rds"))
339+
# car_data <- car_data[,c("vehicle_id","first_use_date")]
340+
341+
# test_data_wide <- left_join(test_data_wide, car_data, by = c("vehicle_id"))
342+
# x = test_data_wide[1:1000000,]
343+
#
344+
# summarise_history <- function(vehicle_id,
345+
# test_mileage_2005,
346+
# test_mileage_2006,
347+
# test_mileage_2007,
348+
# test_mileage_2008,
349+
# test_mileage_2009,
350+
# test_mileage_2010,
351+
# test_mileage_2011,
352+
# test_mileage_2012,
353+
# test_mileage_2013,
354+
# test_mileage_2014,
355+
# test_mileage_2015,
356+
# test_mileage_2016,
357+
# test_mileage_2017,
358+
# test_mileage_2018){
359+
#
360+
# res <- data.frame(vehicle_id,
361+
# km_2006 = as.integer(round((test_mileage_2006 - test_mileage_2005) * 1.60934, 0)),
362+
# km_2007 = as.integer(round((test_mileage_2007 - test_mileage_2006) * 1.60934, 0)),
363+
# km_2008 = as.integer(round((test_mileage_2008 - test_mileage_2007) * 1.60934, 0)),
364+
# km_2009 = as.integer(round((test_mileage_2009 - test_mileage_2008) * 1.60934, 0)),
365+
# km_2010 = as.integer(round((test_mileage_2010 - test_mileage_2009) * 1.60934, 0)),
366+
# km_2011 = as.integer(round((test_mileage_2011 - test_mileage_2010) * 1.60934, 0)),
367+
# km_2012 = as.integer(round((test_mileage_2012 - test_mileage_2011) * 1.60934, 0)),
368+
# km_2013 = as.integer(round((test_mileage_2013 - test_mileage_2012) * 1.60934, 0)),
369+
# km_2014 = as.integer(round((test_mileage_2014 - test_mileage_2013) * 1.60934, 0)),
370+
# km_2015 = as.integer(round((test_mileage_2015 - test_mileage_2014) * 1.60934, 0)),
371+
# km_2016 = as.integer(round((test_mileage_2016 - test_mileage_2015) * 1.60934, 0)),
372+
# km_2017 = as.integer(round((test_mileage_2017 - test_mileage_2016) * 1.60934, 0)),
373+
# km_2018 = as.integer(round((test_mileage_2018 - test_mileage_2017) * 1.60934, 0)))
374+
#
375+
#
376+
# return(res)
377+
#
378+
# }
379+
380+
# pb <- progress_estimated(length(file_list))
381+
# system.time(res <- pmap_dfr(.l = x[,c("vehicle_id",
382+
# "test_mileage_2005",
383+
# "test_mileage_2006",
384+
# "test_mileage_2007",
385+
# "test_mileage_2008",
386+
# "test_mileage_2009",
387+
# "test_mileage_2010",
388+
# "test_mileage_2011",
389+
# "test_mileage_2012",
390+
# "test_mileage_2013",
391+
# "test_mileage_2014",
392+
# "test_mileage_2015",
393+
# "test_mileage_2016",
394+
# "test_mileage_2017",
395+
# "test_mileage_2018")],
396+
# .f = summarise_history))

postcode_trends_2007_2018.png

19.6 KB
Loading

0 commit comments

Comments
 (0)