-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
note that after this merge request, capitalization and names will no longer match SPGMI data, so merging future data will be impossible without modification. Original merge request: factorsSPGMI and stocksCRSP had the following issues: - certain sectors incorrectly named (misspelled), - 7 securities in the sample of 300 were in sectors that were to be removed (as per Doug Martin, requested no securities in Financials, Utilities, or Real Estate sectors), - per Doug Martin, the CapGroup assignments were incorrectly mapped (our comparison suggested about 75% accuracy) - we remapped CapGroup assignments on a point in time basis using an outside data set and the CRSP percentile breakpoints of 0.7, 0.85, 0.98 to divide Large, Mid, Small, and Micro cap stocks and also created a new variable "CapGroupL" which is the ending group assignment on 2015-12-31 for each stock, Additionally, certain other changes were made at the request of Doug Martin: - The sector names were modified slightly for brevity, - "TickerLast" was renamed to "TickerL" - Incorporating this pull request would: - Modify factorsSPGMI and stocksCRSP as described (80868 rows vs. previously 82800) with new CapGroupL variable (1 extra column). - Add a folder within the Sandbox directory titled "Vestcor Data Cleaning" that has an R file with some rough code used to clean the data, and the MarketCapPercentiles.csv file historical data used to reclassify stocks CapGroup assignments. We can be reached at [email protected] and [email protected] for comments/questions/concerns.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# to do: clean data in stocksCRSP and factorsSPGMI | ||
### 1. clean sector names | ||
### 2. delete 7 securities in financials, real estate, and utilities sectors | ||
### 3. fix cap group assignments and create CapGroup and CapGroupL variables | ||
### 4. rename "TickerLast" as "TickerL" | ||
|
||
# Load the raw factorsSPGMI data set. The path is relative to the working
# directory (expected to be the FactorAnalytics package root) so the script
# does not depend on one machine's C:/ layout.
load(file.path("data", "factorsSPGMI.rda"))
This comment has been minimized.
Sorry, something went wrong. |
||
# Load the raw stocksCRSP data set. The path is relative to the working
# directory (expected to be the FactorAnalytics package root) so the script
# does not depend on one machine's C:/ layout.
load(file.path("data", "stocksCRSP.rda"))
This comment has been minimized.
Sorry, something went wrong.
braverock
Owner
|
||
|
||
# Work on scratch copies; the loaded objects are only overwritten in the
# final save step.
stocksCRSP_tmp <- stocksCRSP
factorsSPGMI_tmp <- factorsSPGMI
|
||
### 1. clean up list of sectors for factorsSPGMI & stocksCRSP | ||
|
||
# Inspect the raw sector labels in both data sets before cleaning; each is
# expected to contain 16 labels, some of them misspelled duplicates.
unique(factorsSPGMI_tmp[["Sector"]])
unique(stocksCRSP_tmp[["Sector"]])
|
||
# Lookup table from the raw sector labels to the cleaned sector names.
# `good_sectors` is positional: entry i replaces the i-th value returned by
# unique(factorsSPGMI_tmp$Sector), so the mapping is only valid for the
# original data in its original row order.
bad_sectors <- unique(factorsSPGMI_tmp$Sector)
good_sectors <- c("InfoTech", "Industrials", "Healthcare", "ConsumStap",
                  "Energy", "Materials", "ConsumDisc", "ComServices",
                  "Utilities", "RealEstate", "Healthcare", "Financials",
                  "ConsumDisc", "InfoTech", "ConsumStap", "ComServices")
# Fail fast if the data no longer has one raw label per replacement; cbind()
# would otherwise recycle the shorter vector and build a wrong mapping.
stopifnot(length(bad_sectors) == length(good_sectors))
# Build the columns directly instead of through cbind(), which coerces both
# vectors into one character matrix (a factor would become integer codes).
sector_table <- data.frame(BadSectors = bad_sectors,
                           GoodSectors = good_sectors,
                           stringsAsFactors = FALSE)
|
||
# Swap every raw sector label for its cleaned name via the lookup table,
# first in factorsSPGMI and then in stocksCRSP.
factorsSPGMI_tmp[["Sector"]] <- with(
  sector_table,
  GoodSectors[match(factorsSPGMI_tmp[["Sector"]], BadSectors)]
)
stocksCRSP_tmp[["Sector"]] <- with(
  sector_table,
  GoodSectors[match(stocksCRSP_tmp[["Sector"]], BadSectors)]
)
|
||
# Inspect the sector labels again after the replacement; each data set is
# expected to contain 11 distinct sectors with no duplicates.
unique(factorsSPGMI_tmp[["Sector"]])
unique(stocksCRSP_tmp[["Sector"]])
|
||
### 2. delete 7 securities to leave temp sample of 293 stocks in 8 sectors (no | ||
### financials, utilities, or real estate) | ||
|
||
# Collect the tickers of every security in a sector that is being dropped.
excluded_sectors <- c("Financials", "RealEstate", "Utilities")
to_remove <- unique(
  factorsSPGMI_tmp$TickerLast[factorsSPGMI_tmp$Sector %in% excluded_sectors]
)
to_remove_crsp <- unique(
  stocksCRSP_tmp$TickerLast[stocksCRSP_tmp$Sector %in% excluded_sectors]
)
# Both data sets must drop the same securities. `==` compares position by
# position and recycles on unequal lengths, so it can report a mismatch for
# equal sets in a different order; setequal() compares membership instead.
stopifnot(setequal(to_remove, to_remove_crsp))
|
||
# Row counts before the removal; 276 * 300 = 82800 rows expected in each.
nrow(factorsSPGMI_tmp)
nrow(stocksCRSP_tmp)
|
||
# Keep only the rows whose ticker has no match in `to_remove`; the same
# ticker list is applied to stocksCRSP and to factorsSPGMI.
stocksCRSP_tmp <- stocksCRSP_tmp[
  is.na(match(stocksCRSP_tmp$TickerLast, to_remove)),
]
factorsSPGMI_tmp <- factorsSPGMI_tmp[
  is.na(match(factorsSPGMI_tmp$TickerLast, to_remove)),
]
|
||
# Row counts after the removal; 276 * 293 = 80868 rows expected in each.
nrow(factorsSPGMI_tmp)
nrow(stocksCRSP_tmp)
# Both data sets must now hold exactly the same tickers. setequal() compares
# membership, whereas an element-wise `==` recycles when the two vectors
# differ in length and only yields a logical vector to eyeball.
stopifnot(setequal(stocksCRSP_tmp$TickerLast, factorsSPGMI_tmp$TickerLast))
|
||
### 3. fix cap group assignments and create CapGroup and CapGroupL variables | ||
# cap group membership deemed incorrect, to be replaced with new assignments. | ||
# Suggest a point-in-time replacement based on CRSP 70/85/98 percentile splits | ||
# into LargeCap (up to 70% mkt coverage), MidCap (70% to 85%), SmallCap (85% to | ||
# 98%), and MicroCap (above 98%), replace current CapGroup with reassigned data | ||
# at each point in time (e.g. membership changes); create new variable CapGroupL | ||
|
||
# Read the market-cap percentile breakpoints (data from CompuStat via
# Vestcor), then recover each row's market cap from its logged value.
# NOTE(review): the CSV path is relative to the working directory; confirm
# the file is present there under this exact name.
MktCapPercentiles <- read.csv(file = "MktCapPercentiles.csv")
factorsSPGMI_tmp[["MktCap"]] <- exp(factorsSPGMI_tmp[["LogMktCap"]])
|
||
# Attach to every row the LC, MC and SC breakpoints of MktCapPercentiles for
# that row's date. The date lookup is computed once and reused for all three
# columns instead of being repeated per column.
pct_row <- match(as.Date(factorsSPGMI_tmp$Date),
                 as.Date(MktCapPercentiles$Date))
# A date with no breakpoints yields NA thresholds; report it here rather than
# letting the NA values pass silently into later steps.
if (anyNA(pct_row)) {
  warning(sum(is.na(pct_row)),
          " rows have no matching date in MktCapPercentiles",
          call. = FALSE)
}
factorsSPGMI_tmp$CapGroupLC <- MktCapPercentiles$LC[pct_row]
factorsSPGMI_tmp$CapGroupMC <- MktCapPercentiles$MC[pct_row]
factorsSPGMI_tmp$CapGroupSC <- MktCapPercentiles$SC[pct_row]
|
||
# Classify each row by comparing its market cap with that row's breakpoints,
# tested from the smallest tier upwards: below SC is MicroCap, below MC is
# SmallCap, below LC is MidCap, anything else is LargeCap.
factorsSPGMI_tmp$CapGroup <- with(
  factorsSPGMI_tmp,
  ifelse(
    MktCap < CapGroupSC, "MicroCap",
    ifelse(
      MktCap < CapGroupMC, "SmallCap",
      ifelse(MktCap < CapGroupLC, "MidCap", "LargeCap")
    )
  )
)
|
||
# CapGroupL: the cap group each stock has on 2015-12-31, copied onto every
# row of that stock (matched on TickerLast).
# which() drops rows whose Date comparison is NA, so they do not show up as
# all-NA rows in the end-date slice.
end_dat <- factorsSPGMI_tmp[which(factorsSPGMI_tmp$Date == "2015-12-31"), ]
# With no rows on that date every CapGroupL would silently become NA.
stopifnot(nrow(end_dat) > 0)
factorsSPGMI_tmp$CapGroupL <-
  end_dat$CapGroup[match(factorsSPGMI_tmp$TickerLast, end_dat$TickerLast)]
|
||
|
||
# Carry CapGroup and CapGroupL over from factorsSPGMI to stocksCRSP. Rows are
# paired through a "Date-Ticker" key stored in a MatchVar column on both
# data frames; stocksCRSP rows without a partner receive NA.
factorsSPGMI_tmp$MatchVar <- with(factorsSPGMI_tmp,
                                  paste(Date, Ticker, sep = "-"))
stocksCRSP_tmp$MatchVar <- with(stocksCRSP_tmp,
                                paste(Date, Ticker, sep = "-"))
stocksCRSP_tmp[c("CapGroup", "CapGroupL")] <- factorsSPGMI_tmp[
  match(stocksCRSP_tmp$MatchVar, factorsSPGMI_tmp$MatchVar),
  c("CapGroup", "CapGroupL")
]
|
||
# reorder columns | ||
This comment has been minimized.
Sorry, something went wrong.
braverock
Owner
|
||
# Keep only the columns listed below, in this order; any other column on the
# data frame is dropped.
factorsSPGMI_tmp <- factorsSPGMI_tmp[c(
  "Date", "Ticker", "TickerLast", "Company",
  "CapGroup", "CapGroupL", "GICS", "Sector",
  "AnnVol12M", "Beta60M", "BP", "EP", "LogMktCap", "PM12M1M",
  "AccrualRatioCF", "AstAdjChg1YOCF", "CFROIC", "Chg1YAstTo",
  "EBITDAEV", "FCFP", "PM1M", "SEV"
)]
|
||
# Keep only the columns listed below, in this order; any other column on the
# data frame is dropped.
stocksCRSP_tmp <- stocksCRSP_tmp[c(
  "Date", "Ticker", "TickerLast", "Company",
  "CapGroup", "CapGroupL", "GICS", "Sector",
  "Return", "RetExDiv", "Price", "PrcSplitAdj",
  "Ret4WkBill", "Ret13WkBill", "Ret1YrBill", "mktIndexCRSP"
)]
|
||
### 4. rename TickerLast as TickerL
# Only the column named exactly "TickerLast" is relabelled; every other
# column name in factorsSPGMI is left as it is.
names(factorsSPGMI_tmp) <- replace(
  names(factorsSPGMI_tmp),
  names(factorsSPGMI_tmp) == "TickerLast",
  "TickerL"
)
This comment has been minimized.
Sorry, something went wrong.
braverock
Owner
|
||
# Relabel the "TickerLast" column of stocksCRSP as "TickerL"; every other
# column name is left as it is.
names(stocksCRSP_tmp) <- replace(
  names(stocksCRSP_tmp),
  names(stocksCRSP_tmp) == "TickerLast",
  "TickerL"
)
|
||
|
||
### 5. save data
# Put the cleaned copies back under the original object names, then write
# each object to its own .rda file in the current working directory.
#setwd("C:/FA/FactorAnalytics/data")
stocksCRSP <- stocksCRSP_tmp
factorsSPGMI <- factorsSPGMI_tmp
save(list = "factorsSPGMI", file = "factorsSPGMI.rda")
save(list = "stocksCRSP", file = "stocksCRSP.rda")
these references should be relative references, or they should use system.file so they will work relative to the working directory on any machine