Skip to content

Commit 78e1af5

Browse files
committed
playing with data
1 parent d68af3d commit 78e1af5

File tree

8 files changed

+470
-65
lines changed

8 files changed

+470
-65
lines changed

annual_km.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Summarise the annual km driven by a vehicle taking into account test times
2+
3+
# Work on the first 1000 test records and attach each vehicle's first-use date
# (looked up by registration in the main vehicle table).
ms <- main[, c("registration", "firstUsedDate")]
ts <- dplyr::left_join(tests[1:1000, ], ms, by = "registration")
6+
7+
#' Interpolate a vehicle's cumulative odometer reading (km) at each year end
#'
#' Readings are de-duplicated, converted to km, ordered by test date and
#' anchored at 0 km on the vehicle's first-use date. The cumulative distance is
#' then linearly interpolated at 31 December of every year requested.
#'
#' @param ts Data frame of tests for ONE vehicle. Must contain the columns
#'   `completedDate`, `odometerValue`, `odometerUnit` and `firstUsedDate`.
#' @param ystart,yend First and last year to report.
#' @return Named numeric vector (names are the years) of cumulative km at each
#'   year end; `NA` for year ends outside the interval covered by the data.
#' @details Stops with an error if a required column is missing or if the
#'   readings do not increase with test date.
annual_km_internal <- function(ts, ystart = 2005, yend = 2019) {
  required <- c("completedDate", "odometerValue", "odometerUnit",
                "firstUsedDate")
  absent <- setdiff(required, names(ts))
  if (length(absent) > 0) {
    stop("ts is missing column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # exclude duplicated values
  ts <- ts[!duplicated(ts$odometerValue), ]
  # convert to km
  ts$km <- ifelse(ts$odometerUnit == "mi",
                  as.integer(round(ts$odometerValue * 1.60934)),
                  ts$odometerValue)
  # order by date
  ts <- ts[order(ts$completedDate), ]
  # check distance increases with each test
  if (is.unsorted(ts$km)) {
    stop("readings do not increase with each test")
  }

  # Anchor the series at 0 km on the first-use date. as.Date() on both parts
  # keeps the combined vector a Date whatever date class the columns carry.
  ts_dates <- c(as.Date(ts$firstUsedDate[1]), as.Date(ts$completedDate))
  ts_km <- c(0L, ts$km)

  # Base R dates are used here so the function does not rely on lubridate
  # being attached by the calling script.
  year_ends <- seq(as.Date(paste0(ystart, "-12-31")),
                   as.Date(paste0(yend, "-12-31")),
                   by = "year")

  km_at_year_end <- stats::approxfun(x = ts_dates, y = ts_km)(year_ends)
  names(km_at_year_end) <- seq(ystart, yend)

  # NOTE(review): the values returned are cumulative km at each year end, not
  # year-on-year differences - confirm this is what callers expect.
  km_at_year_end
}
33+
34+
# Keep only the columns annual_km_internal() reads. firstUsedDate has to be
# retained: the function uses it as the 0 km starting point of each series.
ts <- ts[, c("registration", "completedDate", "odometerValue", "odometerUnit",
             "firstUsedDate")]
# One data frame of tests per vehicle
ts <- split(ts, ts$registration)
# Interpolated year-end readings for every vehicle (with a progress bar)
kms <- pbapply::pblapply(ts, annual_km_internal)

compare_years_anon.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Load the cleaned anonymised test results for 2014 and 2015
anon_2014 <- readRDS("E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2014.Rds")
anon_2015 <- readRDS("E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2015.Rds")

# Convert the 2015 vehicle ids to numeric before matching against 2014
anon_2015$vehicle_id <- as.numeric(anon_2015$vehicle_id)

# First five 2015 records, and every 2014 record for those same vehicles
sub_15 <- anon_2015[seq_len(5), ]
sub_14 <- anon_2014[is.element(anon_2014$vehicle_id, sub_15$vehicle_id), ]

get_data.R

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,19 @@ res_tests <- list()
66
res_comments <- list()
77
# page fails, 4318, 4319, 4321, 4322, 4323, 10590 - 10596, 10849
88
# 11867, 11868, 20308 - 20313
9-
fails = c(4318, 4319, 4321, 4322, 4323, 10590:10596, 10849,
10-
11867, 11868, 20308:20313)
9+
fails = c(73002, 73004, 73053, 73055, 73153, 73155, 81587, 81588, 81593, 81608, 82121, 82127, 82133, 82312, 82324,
10+
82782, 82794, 82795, 82893, 82903, 82905, 83090, 83100)
1111
pb = txtProgressBar(min = 1, max = npages, initial = 1) # Make progress bar
1212
#gc()
13-
for(i in seq(21729, npages)){
14-
setTxtProgressBar(pb,i)
15-
#message(Sys.time()," ", i)
13+
#for(i in seq(70632, npages)){
14+
15+
16+
17+
18+
19+
for(i in fails){
20+
#setTxtProgressBar(pb,i)
21+
message(Sys.time()," ", i)
1622
# Request page
1723
req <- try(GET(
1824
url = "https://beta.check-mot.service.gov.uk/trade/vehicles/mot-tests",
@@ -64,7 +70,7 @@ for(i in seq(21729, npages)){
6470
}
6571

6672

67-
if(i %% 2000 == 0){
73+
if(i %% 5000 == 0){
6874
message(paste0(Sys.time()," starting to save a backup for requests 1:",i))
6975
saveRDS(res_main,"F:/MOT_data/download_data_main.Rds")
7076
saveRDS(res_tests,"F:/MOT_data/download_data_test.Rds")
@@ -89,7 +95,9 @@ tests$testResult <- as.factor(tests$testResult)
8995
tests$odometerUnit <- as.factor(tests$odometerUnit)
9096
tests$odometerValue <- as.numeric(tests$odometerValue)
9197

98+
#foo <- lengths(res_main[70632:98065])
99+
#summary(foo)
92100

93-
saveRDS(main,"F:/MOT_data/mot_history_main_21729-xx.Rds")
94-
saveRDS(tests,"F:/MOT_data/mot_history_tests_21729-xx.Rds")
95-
saveRDS(comments,"F:/MOT_data/mot_history_comments_21729-xx.Rds")
101+
# Save the three parsed tables. All file names carry the same 70632-98065
# suffix (presumably the page range downloaded) so they can be matched up.
saveRDS(main, "F:/MOT_data/mot_history_main_70632-98065.Rds")
saveRDS(tests, "F:/MOT_data/mot_history_tests_70632-98065.Rds")
saveRDS(comments, "F:/MOT_data/mot_history_comments_70632-98065.Rds")

merge_datasets.R

Lines changed: 52 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,60 @@
1-
# merge datasets
1+
library(plyr)
2+
library(dplyr)
23

4+
# load data
35
path = "E:/OneDrive - University of Leeds/CREDS Data/"
6+
path = "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"
47

5-
main_api = readRDS(paste0(path,"MOT API/mot_history_main_1-21728.Rds"))
6-
main_anon = readr::read_delim(paste0(path,"MOT anoymised/test_result_2017.zip"), n_max = 10, delim = ",")
7-
file = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv"
8-
import_mot = function(file){
9-
data <- readLines(file, n = 100000)
10-
data <- strsplit(data,",")
11-
lths <- lengths(data)
12-
13-
data_good <- data[lths == 14]
14-
data_bad <- data[lths != 14]
15-
#rm(data)
16-
17-
# format up the good data
18-
data_good <- data.frame(matrix(unlist(data_good), ncol=14, byrow=T),stringsAsFactors=FALSE)
19-
names(data_good) <- as.character(data_good[1,])
20-
data_good <- data_good[2:nrow(data_good),]
21-
22-
#handel the bad data
23-
fix_mot <- function(sub){
24-
#cols are
25-
#test_id number
26-
#vehicle_id number
27-
#test_date date
28-
#test_class_id number 1 digit
29-
#test_type character 2 letters
30-
#test_result character 1-3 letters
31-
#test_mileage number
32-
#postcode_area character 2 letters
33-
#make character
34-
#model character
35-
#colour character
36-
#fuel_type character 2 letters
37-
#cylinder_capacity number
38-
# first_use_date date
39-
40-
is_int <- !is.na(as.integer(sub))
41-
is_date <- !is.na(lubridate::ymd(sub))
42-
n_char <- nchar(sub)
43-
44-
if()
45-
46-
47-
48-
}
49-
50-
51-
}
8+
main_api <- readRDS(paste0(path,"MOT API/mot_history_main_1-21728.Rds"))
9+
main_annon <- readRDS(paste0(path,"MOT anoymised/clean/test_result_2006.Rds"))
10+
main_annon <- main_annon[!duplicated(main_annon$vehicle_id),]
11+
tests_api <- readRDS(paste0(path,"MOT API/mot_history_tests_1-21728.Rds"))
5212

5313

14+
# filter tests to 2006: one test per vehicle, dated within calendar year 2006
tests_2006 <- tests_api
# Derive the calendar date once and filter on it, so dates are compared with
# dates whatever class completedDate has.
tests_2006$test_date <- lubridate::date(tests_2006$completedDate)
# which() drops rows with a missing date instead of turning them into all-NA
# rows, which would later be joined onto vehicles with a missing registration.
tests_2006 <- tests_2006[which(tests_2006$test_date > as.Date("2005-12-31") &
                                 tests_2006$test_date <= as.Date("2006-12-31")), ]
# keep the first test listed for each registration
tests_2006 <- tests_2006[!duplicated(tests_2006$registration), ]
# express every reading in miles
tests_2006$miles <- ifelse(tests_2006$odometerUnit == "mi",
                           tests_2006$odometerValue,
                           round(tests_2006$odometerValue * 0.621371, 0))
tests_2006 <- tests_2006[, c("registration", "miles", "test_date")]
21+
#summary(duplicated(tests_2006[,c("miles","test_date")]))
5422

55-
# problem reading the data
56-
foo <- (1:nrow(main_anon))[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")]
57-
foo2 <- main_anon[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST"),]
58-
main_anon2 <- readLines(con = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv", n = 1)
59-
main_anon2[13833:13836]
60-
main_anon2[13834]
23+
# about 5% of date and miles are duplicated so join to main first
24+
durp_reg = main_api$registration[duplicated(main_api$registration)]
25+
foo = main_api[main_api$registration %in% durp_reg,]
6126

62-
# Mathc formats
63-
main_api$firstUsedDate <- lubridate::ymd(main_api$firstUsedDate)
27+
main_api <- left_join(main_api, tests_2006, by = c("registration"))
6428

29+
30+
# match formatting
#table(main_api$fuelType)
#table(main_annon$fuel_type)

# Expand the two-letter fuel codes of the anonymised data to full labels
# (presumably the labels used in main_api$fuelType - see the join below).
main_annon$fuel_type <- plyr::revalue(main_annon$fuel_type,
                                      c("CN" = "CNG",
                                        "DI" = "Diesel",
                                        "ED" = "Electric Diesel",
                                        "EL" = "Electric",
                                        "FC" = "Fuel Cells",
                                        "GA" = "Gas",
                                        "GB" = "Gas Bi-Fuel",
                                        "GD" = "Gas Diesel",
                                        "HY" = "Hybrid Electric (Clean)",
                                        "LN" = "LNG",
                                        "LP" = "LPG",
                                        "OT" = "Other",
                                        "PE" = "Petrol",
                                        "ST" = "Steam"))

# The join below matches main_api$primaryColour against the anonymised
# `colour`, so that is the column to upper-case.
# NOTE(review): assumes the anonymised colours are upper case - confirm.
main_api$primaryColour <- toupper(main_api$primaryColour)

# Small sample for trying out the join; head() avoids all-NA rows when the
# table has fewer than 100 rows.
tmp <- head(main_api, 100)

# Vehicle attributes alone are not unique, so test date and mileage will be
# needed as well to identify a vehicle.
foo <- dplyr::left_join(tmp, main_annon, by = c("make" = "make",
                                                "model" = "model",
                                                "primaryColour" = "colour",
                                                "fuelType" = "fuel_type",
                                                "firstUsedDate" = "first_use_date"))

prep_annon.R

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
library(readr)
2+
3+
path <- "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"

# Raw pipe-delimited test result files
files <- list.files(paste0(path,"/MOT anoymised/raw"), pattern = "result")
#dir.create(paste0(path,"MOT anoymised/clean"))

# Convert each raw file to .Rds with explicit column types.
# NOTE(review): the loop starts at the third file, presumably because the
# first two were already converted - confirm before re-running from scratch.
# Indexing seq_along() keeps the loop empty when fewer than three files exist.
for (i in seq_along(files)[-(1:2)]) {
  message(files[i])
  file <- readr::read_delim(paste0(path,"MOT anoymised/raw/",files[i]),
                            delim = "|",
                            escape_backslash = FALSE,
                            escape_double = FALSE,
                            col_types = readr::cols(
                              test_id = col_double(),
                              vehicle_id = col_double(),
                              test_date = col_date(format = ""),
                              test_class_id = col_double(),
                              test_type = col_factor(),
                              test_result = col_factor(),
                              test_mileage = col_double(),
                              postcode_area = col_factor(),
                              make = col_factor(),
                              model = col_factor(),
                              colour = col_factor(),
                              fuel_type = col_factor(),
                              cylinder_capacity = col_double(),
                              first_use_date = col_date(format = "")
                            ))

  # Report the in-memory size of the table that was just read
  message(format(object.size(file), units = "Mb"))

  # Same file name with its three-character extension replaced by "Rds"
  saveRDS(file, paste0(path,"MOT anoymised/clean/",substr(files[i],1,nchar(files[i]) - 3),"Rds"))
}
35+
36+
37+
38+
# Raw pipe-delimited test item files
files <- list.files(paste0(path,"/MOT anoymised/raw"), pattern = "item")

# Convert each raw item file to .Rds; seq_along() makes the loop a no-op when
# no files are found.
for (i in seq_along(files)) {
  message(files[i])

  file <- readr::read_delim(paste0(path,"MOT anoymised/raw/",files[i]),
                            delim = "|",
                            escape_backslash = FALSE,
                            escape_double = FALSE,
                            col_types = readr::cols(
                              test_id = col_double(),
                              rfr_id = col_double(),
                              rfr_type_code = col_character(),
                              location_id = col_double(),
                              dangerous_mark = col_character()
                            ),
                            # NOTE(review): only the first 100 rows of each
                            # file are read, so the saved .Rds is a sample -
                            # looks like a debugging limit; confirm.
                            n_max = 100
  )

  # Same file name with its three-character extension replaced by "Rds"
  saveRDS(file, paste0(path,"MOT anoymised/clean/",substr(files[i],1,nchar(files[i]) - 3),"Rds"))
}
61+

prep_annon_2015.R

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Clean the raw anonymised 2015 MOT test results (repair rows with the wrong number of fields)
2+
3+
# path <- "E:/OneDrive - University of Leeds/CREDS Data/"  # alternative location
path <- "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"

file <- paste0(path,"MOT anoymised/raw/test_result_2015.txt")

# Read every line and split it on the pipe delimiter
data <- readLines(file)
data <- strsplit(data,"|", fixed = TRUE)
lths <- lengths(data)

# A well-formed record has 14 fields
data_good <- data[lths == 14]
data_bad <- data[lths != 14]
#data_bad2 <- data[c(34549796, 34549795, 34549797)]

# format up the good data: one row per record; the first row holds the header
data_good <- data.frame(matrix(unlist(data_good), ncol = 14, byrow = TRUE),
                        stringsAsFactors = FALSE)
names(data_good) <- as.character(data_good[1,])
data_good <- data_good[-1, ]

# Only rows with 13 or 16 fields are repaired; other lengths are dropped
data_bad_13 <- data_bad[lengths(data_bad) == 13]
#data_bad_15 = data_bad[lengths(data_bad) == 15]
data_bad_16 <- data_bad[lengths(data_bad) == 16]
#data_bad_17 = data_bad[lengths(data_bad) == 17]

# 16-field rows: drop fields 11 and 12 to get back to 14 fields
# NOTE(review): presumably stray delimiters inside a text field - confirm that
# fields 11-12 are the ones to discard.
data_bad_16 <- lapply(data_bad_16, function(x) c(x[1:10], x[13:16]))
data_bad_16 <- matrix(as.character(unlist(data_bad_16)), ncol = 14, byrow = TRUE)

# 13-field rows: append an empty 14th field (strsplit() drops a trailing
# empty field, so these are presumably rows whose last field is blank)
data_bad_13 <- lapply(data_bad_13, function(x) c(x[1:13], ""))
data_bad_13 <- matrix(as.character(unlist(data_bad_13)), ncol = 14, byrow = TRUE)

# Building the matrices from character vectors keeps empty sets as 0 x 14
# matrices, so the binds below still work when a category has no rows.
data_fixed <- as.data.frame(rbind(data_bad_13, data_bad_16),
                            stringsAsFactors = FALSE)
names(data_fixed) <- names(data_good)
data_final <- rbind(data_good, data_fixed)
saveRDS(data_final, "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2015.Rds")
42+
43+
# import_mot = function(file){
44+
# data <- readLines(file, n = 100000)
45+
# data <- strsplit(data,",")
46+
# lths <- lengths(data)
47+
#
48+
# data_good <- data[lths == 14]
49+
# data_bad <- data[lths != 14]
50+
# #rm(data)
51+
#
52+
#
53+
#
54+
# #handle the bad data
55+
# fix_mot <- function(sub){
56+
# #cols are
57+
# #test_id number
58+
# #vehicle_id number
59+
# #test_date date
60+
# #test_class_id number 1 digit
61+
# #test_type character 2 letters
62+
# #test_result character 1-3 letters
63+
# #test_mileage number
64+
# #postcode_area character 2 letters
65+
# #make character
66+
# #model character
67+
# #colour character
68+
# #fuel_type character 2 letters
69+
# #cylinder_capacity number
70+
# # first_use_date date
71+
#
72+
# is_int <- !is.na(as.integer(sub))
73+
# is_date <- !is.na(lubridate::ymd(sub))
74+
# n_char <- nchar(sub)
75+
#
76+
# if()
77+
#
78+
#
79+
#
80+
# }
81+
#
82+
#
83+
# }
84+
85+
86+
87+
# problem reading the data: locate records whose fuel_type is not one of the
# known two-letter codes (row positions first, then the rows themselves)
foo <- which(!(main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")))
foo2 <- main_anon[!(main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")), ]

# Peek at the raw text of the 2017 file around the suspect lines
main_anon2 <- readLines(con = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv", n = 1)
main_anon2[13833:13836]
main_anon2[13834]

# Match formats: parse the API first-use date
main_api$firstUsedDate <- lubridate::ymd(main_api$firstUsedDate)
96+

0 commit comments

Comments
 (0)