Skip to content

Commit

Permalink
issue 86: add script to sandbox/Vestcor Data Cleaning for review
Browse files Browse the repository at this point in the history
  • Loading branch information
spinnj committed Apr 2, 2022
1 parent bd9b0b5 commit 48097f7
Showing 1 changed file with 44 additions and 0 deletions.
44 changes: 44 additions & 0 deletions sandbox/Vestcor Data Cleaning/spinnj_issue86.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# spinnj_issue86.R
#
# factorsSPGMI and stocksCRSP do not use official GICS naming conventions in
# sector names. In addition, several securities have obvious misspellings in
# Sector assignment, resulting in incorrect groupings. This appears to be a
# result of human intervention/manual data manipulations after the data
# were obtained from CRSP and SPGMI.
#

# load data & create tmp copies
# (paths are relative, so the script must be run from the package root)
load("./data/factorsSPGMI.rda")
load("./data/stocksCRSP.rda")
factorsSPGMI_tmp <- factorsSPGMI
stocksCRSP_tmp <- stocksCRSP

# confirm factorsSPGMI and stocksCRSP have incorrect sectors (misspelled)
unique(factorsSPGMI_tmp$Sector) # contains 16 sectors with dupes/misspellings
unique(stocksCRSP_tmp$Sector) # contains 16 sectors with dupes/misspellings

# replacement data as per issue #86 description
#
# NOTE: good_sectors is paired with bad_sectors purely by position, i.e. the
# i-th corrected name replaces the i-th value returned by unique() on
# factorsSPGMI$Sector. The pairing is therefore only valid for the data this
# script was written against; the checks below stop the script instead of
# silently mislabelling sectors if the data no longer has the expected shape.
# as.character() keeps the labels (not integer codes) should Sector be a factor.
bad_sectors <- as.character(unique(factorsSPGMI_tmp$Sector))
good_sectors <- c(
  "Information Technology", "Industrials", "Health Care",
  "Consumer Staples", "Energy", "Materials",
  "Consumer Discretionary", "Communication Services", "Utilities",
  "Real Estate", "Health Care", "Financials",
  "Consumer Discretionary", "Information Technology",
  "Consumer Staples", "Communication Services"
)

# a length mismatch would otherwise be recycled (or only warned about)
if (length(bad_sectors) != length(good_sectors)) {
  stop(
    "Expected ", length(good_sectors), " distinct Sector values in ",
    "factorsSPGMI but found ", length(bad_sectors),
    "; the positional sector mapping no longer applies.",
    call. = FALSE
  )
}

# build the lookup table directly (no character-matrix detour through cbind)
sector_table <- data.frame(
  BadSectors = bad_sectors,
  GoodSectors = good_sectors,
  stringsAsFactors = FALSE
)
sector_table

# stocksCRSP is recoded with a table derived from factorsSPGMI, so any
# spelling that only occurs in stocksCRSP would be turned into NA by match().
# Missing (NA) sectors are left alone here; they stay NA after the replacement.
unmapped_sectors <- setdiff(
  as.character(unique(stocksCRSP_tmp$Sector)),
  sector_table$BadSectors
)
unmapped_sectors <- unmapped_sectors[!is.na(unmapped_sectors)]
if (length(unmapped_sectors) > 0) {
  stop(
    "stocksCRSP contains Sector values with no replacement: ",
    paste(unmapped_sectors, collapse = ", "),
    call. = FALSE
  )
}

# replacements
factors_idx <- match(factorsSPGMI_tmp$Sector, sector_table$BadSectors)
factorsSPGMI_tmp$Sector <- sector_table$GoodSectors[factors_idx]
stocks_idx <- match(stocksCRSP_tmp$Sector, sector_table$BadSectors)
stocksCRSP_tmp$Sector <- sector_table$GoodSectors[stocks_idx]

# confirm factorsSPGMI and stocksCRSP have correct sectors
unique(factorsSPGMI_tmp$Sector) # contains 11 sectors with no dupes
unique(stocksCRSP_tmp$Sector) # contains 11 sectors with no dupes

# save data (not yet run)
#factorsSPGMI <- factorsSPGMI_tmp
#stocksCRSP <- stocksCRSP_tmp
#save(factorsSPGMI,file="./data/factorsSPGMI.rda")
#save(stocksCRSP,file="./data/stocksCRSP.rda")

0 comments on commit 48097f7

Please sign in to comment.