playing with data

mem48 · mem48 · commit 78e1af543305 · 2019-10-31T17:21:31.000Z
diff --git a/annual_km.R b/annual_km.R
@@ -0,0 +1,36 @@
+# Summarise the annual km driven by a vehicle taking into account test times
+
+ts <- tests[1:1000,]  # work on the first 1000 rows of tests only
+ms <- main[,c("registration","firstUsedDate")]  # tests/main are not created in this script; presumably built by get_data.R
+ts <- dplyr::left_join(ts, ms, by = "registration")  # attach firstUsedDate to every test row
+
+annual_km_internal <- function(ts, ystart = 2005, yend = 2019){ # interpolate the odometer reading (km) at 31 Dec of each year ystart..yend
+  # drop rows whose raw odometerValue repeats an earlier row (NOTE(review): compared before unit conversion)
+  ts <- ts[!duplicated(ts$odometerValue),]
+  # convert readings flagged "mi" to km (x 1.60934, rounded to integer); any other unit is kept as-is
+  ts$km <- ifelse(ts$odometerUnit == "mi", as.integer(round(ts$odometerValue * 1.60934)), ts$odometerValue)
+  # order by test completion date
+  ts <- ts[order(ts$completedDate),]
+  # check that the date-ordered readings are also in ascending km order; stop otherwise
+  if(any(ts$km != ts$km[order(ts$km)])){
+    stop("readings do not increase with each test")
+  }
+  ts_dates <- c(ts$firstUsedDate[1], as.Date(ts$completedDate)) # assumes ts has a firstUsedDate column compatible with Date — TODO confirm
+  ts_km <- c(0L, ts$km) # 0 km is paired with the first-use date
+  #ts_dkm <- ts_km[seq(2, length(ts_km))] - ts_km[seq(1, length(ts_km)-1)]
+  #ts_dkm <- c(0,ts_dkm)
+  ApproxFun <- approxfun(x = ts_dates, y = ts_km) # linear interpolation between readings
+  Dates <- seq.Date(ymd(paste0(ystart,"-12-31")), ymd(paste0(yend,"-12-31")), by = "year") # NOTE(review): ymd() is not namespaced; presumably lubridate::ymd — confirm lubridate is attached
+  LinearFit <- ApproxFun(Dates)
+  names(LinearFit) <- seq(ystart, yend) # label each value with its year
+  lf_off <- c(0, LinearFit[seq(1, length(LinearFit) - 1)]) # previous year's value, shifted by one position
+  lf_off[is.na(lf_off)] <- 0
+  akm <- LinearFit - lf_off # year-on-year difference
+  # NOTE(review): akm is computed above but never used; the interpolated readings (LinearFit) are returned
+  return(LinearFit)
+  
+}
+
+ts <- ts[,c("registration","completedDate","odometerValue","odometerUnit")]  # NOTE(review): drops firstUsedDate, which annual_km_internal reads as ts$firstUsedDate — confirm intended
+ts <- split(ts, ts$registration)  # one data frame per registration
+kms <- pbapply::pblapply(ts, annual_km_internal)  # run the per-vehicle summary over every group
diff --git a/compare_years_anon.R b/compare_years_anon.R
@@ -0,0 +1,7 @@
+anon_2015 <- readRDS("E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2015.Rds")
+anon_2014 <- readRDS("E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2014.Rds")
+
+anon_2015$vehicle_id <- as.numeric(anon_2015$vehicle_id)  # NOTE(review): presumably the 2015 ids were saved as character — confirm against the 2014 column type
+
+sub_15 <- anon_2015[1:5,]  # first five 2015 rows
+sub_14 <- anon_2014[anon_2014$vehicle_id %in% sub_15$vehicle_id,]  # 2014 rows whose vehicle_id appears in that sample
diff --git a/get_data.R b/get_data.R
@@ -6,13 +6,19 @@ res_tests <- list()
 res_comments <- list()
 # page fails, 4318, 4319, 4321, 4322, 4323, 10590 - 10596, 10849
 # 11867, 11868, 20308 - 20313
-fails = c(4318, 4319, 4321, 4322, 4323, 10590:10596, 10849,
-          11867, 11868, 20308:20313)
+fails = c(73002, 73004, 73053, 73055, 73153, 73155, 81587, 81588, 81593, 81608, 82121, 82127, 82133, 82312, 82324, 
+          82782, 82794, 82795, 82893, 82903, 82905, 83090, 83100)
 pb = txtProgressBar(min = 1, max = npages, initial = 1) # Make progress bar
 #gc()
-for(i in seq(21729, npages)){
-  setTxtProgressBar(pb,i)
-  #message(Sys.time()," ", i)
+#for(i in seq(70632, npages)){
+
+
+
+
+
+for(i in fails){
+  #setTxtProgressBar(pb,i)
+  message(Sys.time()," ", i)
   # Request page
   req <- try(GET(
     url = "https://beta.check-mot.service.gov.uk/trade/vehicles/mot-tests",
@@ -64,7 +70,7 @@ for(i in seq(21729, npages)){
   }
   
   
-  if(i %% 2000 == 0){
+  if(i %% 5000 == 0){
     message(paste0(Sys.time()," starting to save a backup for requests 1:",i))
     saveRDS(res_main,"F:/MOT_data/download_data_main.Rds")
     saveRDS(res_tests,"F:/MOT_data/download_data_test.Rds")
@@ -89,7 +95,9 @@ tests$testResult <- as.factor(tests$testResult)
 tests$odometerUnit <- as.factor(tests$odometerUnit)
 tests$odometerValue <- as.numeric(tests$odometerValue)
 
+#foo <- lengths(res_main[70632:98065])
+#summary(foo)
 
-saveRDS(main,"F:/MOT_data/mot_history_main_21729-xx.Rds")
-saveRDS(tests,"F:/MOT_data/mot_history_tests_21729-xx.Rds")
-saveRDS(comments,"F:/MOT_data/mot_history_comments_21729-xx.Rds")
+saveRDS(main,"F:/MOT_data/mot_history_main_70632-98065.Rds")
+saveRDS(tests,"F:/MOT_data/mot_history_tests_706312-98065.Rds")  # NOTE(review): "706312" differs from the "70632" used in the main/comments file names — likely a typo
+saveRDS(comments,"F:/MOT_data/mot_history_comments_70632-98065.Rds")
diff --git a/merge_datasets.R b/merge_datasets.R
@@ -1,64 +1,60 @@
-# merge datasets 
+library(plyr)
+library(dplyr)
 
+# load data
 path = "E:/OneDrive - University of Leeds/CREDS Data/"
+path = "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"
 
-main_api = readRDS(paste0(path,"MOT API/mot_history_main_1-21728.Rds"))
-main_anon = readr::read_delim(paste0(path,"MOT anoymised/test_result_2017.zip"), n_max = 10, delim = ",")
-file = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv"
-import_mot = function(file){
-  data <- readLines(file, n = 100000)
-  data <- strsplit(data,",")
-  lths <- lengths(data)
-  
-  data_good <- data[lths == 14]
-  data_bad <- data[lths != 14]
-  #rm(data)
-  
-  # format up the good data
-  data_good <- data.frame(matrix(unlist(data_good), ncol=14, byrow=T),stringsAsFactors=FALSE)
-  names(data_good) <- as.character(data_good[1,])
-  data_good <- data_good[2:nrow(data_good),]
-  
-  #handel the bad data
-  fix_mot <- function(sub){
-    #cols are
-    #test_id number 
-    #vehicle_id  number
-    #test_date date
-    #test_class_id number 1 digit
-    #test_type character 2 letters
-    #test_result character 1-3 letters
-    #test_mileage number
-    #postcode_area  character 2 letters        
-    #make character    
-    #model character
-    #colour character
-    #fuel_type character 2 letters
-    #cylinder_capacity number
-    # first_use_date date
-    
-    is_int <- !is.na(as.integer(sub))
-    is_date <- !is.na(lubridate::ymd(sub))
-    n_char <- nchar(sub)
-    
-    if()
-    
-    
-    
-  }
-  
-  
-}
+main_api <- readRDS(paste0(path,"MOT API/mot_history_main_1-21728.Rds"))
+main_annon <- readRDS(paste0(path,"MOT anoymised/clean/test_result_2006.Rds"))
+main_annon <- main_annon[!duplicated(main_annon$vehicle_id),]
+tests_api <- readRDS(paste0(path,"MOT API/mot_history_tests_1-21728.Rds"))
 
 
+# keep tests completed after 2005-12-31 and up to 2006-12-31
+tests_2006 <- tests_api[tests_api$completedDate <= lubridate::ymd("2006-12-31"),]
+tests_2006 <- tests_2006[tests_2006$completedDate > lubridate::ymd("2005-12-31"),]
+tests_2006 <- tests_2006[!duplicated(tests_2006$registration),]  # keep the first row per registration
+tests_2006$miles <- ifelse(tests_2006$odometerUnit == "mi", tests_2006$odometerValue, round(tests_2006$odometerValue * 0.621371,0))  # units other than "mi" are converted as km
+tests_2006$test_date <- lubridate::date(tests_2006$completedDate)  # date part only
+tests_2006 <- tests_2006[,c("registration","miles","test_date")]
+#summary(duplicated(tests_2006[,c("miles","test_date")]))
 
-# problem reading the data
-foo <- (1:nrow(main_anon))[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")]
-foo2 <- main_anon[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST"),]
-main_anon2 <- readLines(con = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv", n = 1)
-main_anon2[13833:13836]
-main_anon2[13834]
+# about 5% of date and miles are duplicated so join to main first
+durp_reg = main_api$registration[duplicated(main_api$registration)]
+foo = main_api[main_api$registration %in% durp_reg,]
 
-# Mathc formats
-main_api$firstUsedDate <- lubridate::ymd(main_api$firstUsedDate)
+main_api <- left_join(main_api, tests_2006, by = c("registration"))
 
+
+# match formatting
+#table(main_api$fuelType)
+#table(main_annon$fuel_type)
+
+main_annon$fuel_type <- revalue(main_annon$fuel_type, 
+           c("CN" = "CNG",
+             "DI" = "Diesel",
+             "ED" = "Electric Diesel",
+             "EL" = "Electric",
+             "FC" = "Fuel Cells",
+             "GA" = "Gas",
+             "GB" = "Gas Bi-Fuel",
+             "GD" = "Gas Diesel",
+             "HY" = "Hybrid Electric (Clean)",
+             "LN" = "LNG",
+             "LP" = "LPG",
+             "OT" = "Other",
+             "PE" = "Petrol",
+             "ST" = "Steam"))
+
+main_api$colour <- toupper(main_api$colour)  # NOTE(review): the join below keys on primaryColour, not colour — confirm which column is meant
+
+tmp <- main_api[1:100,]  # first 100 rows only
+#tmp$primaryColour <- toupper(tmp$primaryColour)
+
+# these keys are non-unique, so test date and mileage are also needed to match vehicles
+foo <- dplyr::left_join(tmp, main_annon, by = c("make" = "make", 
+                                                "model" = "model",
+                                                "primaryColour" = "colour",
+                                                "fuelType" = "fuel_type",
+                                                "firstUsedDate" = "first_use_date"))
diff --git a/prep_annon.R b/prep_annon.R
@@ -0,0 +1,61 @@
+library(readr)
+
+path = "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"
+
+files = list.files(paste0(path,"/MOT anoymised/raw"), pattern = "result")
+#dir.create(paste0(path,"MOT anoymised/clean"))
+
+for(i in 3:length(files)){  # NOTE(review): starts at the third file, so files[1:2] are skipped — confirm intended
+  message(files[i])
+  format(object.size(file), units = "Mb")  # NOTE(review): value is discarded, and `file` is only assigned on the next line
+  file = readr::read_delim(paste0(path,"MOT anoymised/raw/",files[i]), 
+                           delim = "|",
+                           escape_backslash = FALSE,
+                           escape_double = FALSE,
+                           col_types = readr::cols(
+                             test_id = col_double(),
+                             vehicle_id = col_double(),
+                             test_date = col_date(format = ""),
+                             test_class_id = col_double(),
+                             test_type = col_factor(),
+                             test_result = col_factor(),
+                             test_mileage = col_double(),
+                             postcode_area = col_factor(),
+                             make = col_factor(),
+                             model = col_factor(),
+                             colour = col_factor(),
+                             fuel_type = col_factor(),
+                             cylinder_capacity = col_double(),
+                             first_use_date = col_date(format = "")
+                           ))
+  
+  # replace the last three characters of the file name with "Rds" and save under clean/
+  saveRDS(file, paste0(path,"MOT anoymised/clean/",substr(files[i],1,nchar(files[i]) - 3),"Rds"))
+}
+
+
+
+files = list.files(paste0(path,"/MOT anoymised/raw"), pattern = "item")
+
+
+for(i in 1:length(files)){  # one pass per file whose name matches "item"
+  message(files[i])
+  
+  file = readr::read_delim(paste0(path,"MOT anoymised/raw/",files[i]), 
+                           delim = "|",
+                           escape_backslash = FALSE,
+                           escape_double = FALSE,
+                           col_types = readr::cols(
+                              test_id = col_double(),
+                              rfr_id = col_double(),
+                              rfr_type_code = col_character(),
+                              location_id = col_double(),
+                              dangerous_mark = col_character()
+                           ),
+                           n_max = 100  # NOTE(review): only the first 100 rows are read, so the saved Rds is truncated — confirm intended
+                           )
+  
+  # replace the last three characters of the file name with "Rds" and save under clean/
+  saveRDS(file, paste0(path,"MOT anoymised/clean/",substr(files[i],1,nchar(files[i]) - 3),"Rds"))
+}
+
diff --git a/prep_annon_2015.R b/prep_annon_2015.R
@@ -0,0 +1,96 @@
+# merge datasets 
+
+path = "E:/OneDrive - University of Leeds/CREDS Data/"
+path = "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/"
+
+file = paste0(path,"MOT anoymised/raw/test_result_2015.txt")
+
+data <- readLines(file)
+data <- strsplit(data,"|", fixed = TRUE)  # split each line on a literal "|"
+lths <- lengths(data)
+# lines that split into exactly 14 fields are treated as good; all others go to data_bad
+data_good <- data[lths == 14]
+data_bad <- data[lths != 14]
+#data_bad2 <- data[c(34549796, 34549795, 34549797)]
+
+# format up the good data
+data_good <- data.frame(matrix(unlist(data_good), ncol=14, byrow=T),stringsAsFactors=FALSE)
+names(data_good) <- as.character(data_good[1,])  # first row supplies the column names
+data_good <- data_good[2:nrow(data_good),]
+
+# bucket the bad lines by field count; only the 13- and 16-field cases are active below
+data_bad_13 = data_bad[lengths(data_bad) == 13]
+#data_bad_15 = data_bad[lengths(data_bad) == 15]
+hea = data_bad[lengths(data_bad) == 16]  # NOTE(review): assigned to `hea`, but the code below reads data_bad_16 — likely meant data_bad_16
+#data_bad_17 = data_bad[lengths(data_bad) == 17]
+
+# 16-field rows: keep fields 1-10 and 13-16 (fields 11-12 are dropped)
+data_bad_16 = lapply(data_bad_16, function(x){c(x[1:10],x[13:16])})
+data_bad_16 = t(as.data.frame(data_bad_16))
+rownames(data_bad_16) = 1:nrow(data_bad_16)
+# 13-field rows: append an empty 14th field
+data_bad_13 = lapply(data_bad_13, function(x){c(x[1:13],"")})
+data_bad_13 = t(as.data.frame(data_bad_13))
+rownames(data_bad_13) = 1:nrow(data_bad_13)
+# stack the repaired rows, convert every column to character, and append them to the good rows
+data_fixed <- rbind(data_bad_13, data_bad_16)
+data_fixed <- as.data.frame(data_fixed)
+names(data_fixed) <- names(data_good)
+data_fixed[] <- lapply(data_fixed, as.character)
+data_final <- rbind(data_good, data_fixed)
+saveRDS(data_final, "E:/Users/earmmor/OneDrive - University of Leeds/CREDS Data/MOT anoymised/clean/test_result_2015.Rds")
+
+# import_mot = function(file){
+#   data <- readLines(file, n = 100000)
+#   data <- strsplit(data,",")
+#   lths <- lengths(data)
+#   
+#   data_good <- data[lths == 14]
+#   data_bad <- data[lths != 14]
+#   #rm(data)
+#   
+#   
+#   
+#   #handle the bad data
+#   fix_mot <- function(sub){
+#     #cols are
+#     #test_id number 
+#     #vehicle_id  number
+#     #test_date date
+#     #test_class_id number 1 digit
+#     #test_type character 2 letters
+#     #test_result character 1-3 letters
+#     #test_mileage number
+#     #postcode_area  character 2 letters        
+#     #make character    
+#     #model character
+#     #colour character
+#     #fuel_type character 2 letters
+#     #cylinder_capacity number
+#     # first_use_date date
+#     
+#     is_int <- !is.na(as.integer(sub))
+#     is_date <- !is.na(lubridate::ymd(sub))
+#     n_char <- nchar(sub)
+#     
+#     if()
+#     
+#     
+#     
+#   }
+#   
+#   
+# }
+
+
+
+# problem reading the data (NOTE(review): main_anon and main_api are not defined in this script; presumably leftovers from merge_datasets.R)
+foo <- (1:nrow(main_anon))[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")]
+foo2 <- main_anon[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST"),]
+main_anon2 <- readLines(con = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv", n = 1)  # NOTE(review): n = 1 reads a single line, so the indexing below is out of range
+main_anon2[13833:13836]
+main_anon2[13834]
+
+# Match formats
+main_api$firstUsedDate <- lubridate::ymd(main_api$firstUsedDate)
+
diff --git a/prep_annon_2016.R b/prep_annon_2016.R
diff --git a/prep_annon_2017.R b/prep_annon_2017.R