-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
130 lines (93 loc) · 4.63 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Getting and Cleaning Data: Course project for Amit Wadhwa, April 2014
# See README.md for further instructions and CodeBook.md for documentation of the final tidy data
# The dataset is assumed to have been extracted to the folder named below.
# The functions in this script read their inputs (features.txt,
# activity_labels.txt, train/..., test/...) and write tidydata.txt using
# paths relative to the working directory, so it is switched here once.
setwd("data/UCI HAR Dataset")
# Build the training data set from its three source files.
#
# Reads the measurement table (train/X_train.txt), labels its columns with
# the names listed in features.txt, and appends two columns: ActivityID
# (from train/y_train.txt) and SubjectID (from train/subject_train.txt).
# All paths are relative to the current working directory.
#
# Returns: a data frame with one column per feature followed by ActivityID
# and SubjectID.
getTrainingData <- function() {
  # Feature names double as the column headers of the measurement table.
  # read.table applies its default name checking to them.
  featureTable <- read.table(
    "features.txt",
    header = FALSE,
    as.is = TRUE,
    col.names = c("MeasureID", "MeasureName")
  )

  # One activity code per measurement row
  activities <- read.table(
    file.path("train", "y_train.txt"),
    header = FALSE,
    col.names = "ActivityID"
  )

  # One subject code per measurement row
  subjects <- read.table(
    file.path("train", "subject_train.txt"),
    header = FALSE,
    col.names = "SubjectID"
  )

  # The measurements themselves
  trainingData <- read.table(
    file.path("train", "X_train.txt"),
    header = FALSE,
    col.names = featureTable$MeasureName
  )

  # Attach the identifiers after the measurement columns
  trainingData$ActivityID <- activities$ActivityID
  trainingData$SubjectID <- subjects$SubjectID

  trainingData
}
# Build the test data set from its three source files.
#
# Reads the measurement table (test/X_test.txt), labels its columns with
# the names listed in features.txt, and appends two columns: ActivityID
# (from test/y_test.txt) and SubjectID (from test/subject_test.txt).
# All paths are relative to the current working directory.
#
# Returns: a data frame with one column per feature followed by ActivityID
# and SubjectID.
getTestData <- function() {
  # Feature names double as the column headers of the measurement table.
  # read.table applies its default name checking to them.
  featureTable <- read.table(
    "features.txt",
    header = FALSE,
    as.is = TRUE,
    col.names = c("MeasureID", "MeasureName")
  )

  # One activity code per measurement row
  activities <- read.table(
    file.path("test", "y_test.txt"),
    header = FALSE,
    col.names = "ActivityID"
  )

  # One subject code per measurement row
  subjects <- read.table(
    file.path("test", "subject_test.txt"),
    header = FALSE,
    col.names = "SubjectID"
  )

  # The measurements themselves
  testData <- read.table(
    file.path("test", "X_test.txt"),
    header = FALSE,
    col.names = featureTable$MeasureName
  )

  # Attach the identifiers after the measurement columns
  testData$ActivityID <- activities$ActivityID
  testData$SubjectID <- subjects$SubjectID

  testData
}
# Step 1: combine the training and test sets into a single data set.
#
# Loads both sets and stacks them row-wise, training rows first.
#
# Returns: one data frame holding the rows of both sets.
mergeData <- function() {
  trainRows <- getTrainingData()
  testRows <- getTestData()
  rbind(trainRows, testRows)
}
# Step 2: keep only the mean and standard-deviation measurements.
#
# data: a data frame that contains ActivityID and SubjectID columns in
#       addition to the measurement columns.
#
# Returns: a data frame with ActivityID and SubjectID first, followed by
# every column whose name contains "mean" or "std" (case-insensitive), in
# their original order.
extractData <- function(data) {
  allColumns <- colnames(data)

  # Flag the measurement columns of interest by name
  isMeanOrStd <- grepl("mean|std", allColumns, ignore.case = TRUE)

  # The identifier columns are needed by later steps, so they lead the list
  selected <- c("ActivityID", "SubjectID", allColumns[isMeanOrStd])

  data[, selected]
}
# Steps 3 and 4: attach descriptive activity names to the data set.
#
# Reads the ActivityID -> ActivityName lookup from activity_labels.txt
# (relative to the current working directory) and joins it onto `data`.
#
# data: a data frame containing an ActivityID column.
#
# Returns: the merged data frame; ActivityName is a factor.
addActivityLabel <- function(data) {
  labelLookup <- read.table(
    "activity_labels.txt",
    header = FALSE,
    as.is = TRUE,
    col.names = c("ActivityID", "ActivityName")
  )
  labelLookup$ActivityName <- as.factor(labelLookup$ActivityName)

  # merge() joins on the column names the two tables share; with the
  # expected input that is ActivityID only, so the default is sufficient.
  merge(x = labelLookup, y = data)
}
# Step 5: build a second, independent tidy data set holding the average of
# each variable for each activity and each subject, and save it.
#
# The input is melted to long form and then recast with mean() as the
# aggregate, giving one row per SubjectID / ActivityID / ActivityName
# combination. The result is also written out through exportTinyData().
#
# data: a data frame with ActivityID, ActivityName and SubjectID columns;
#       every other column is treated as a measurement.
#
# Returns: the tidy data frame.
createTidyData <- function(data) {
  library(reshape2)

  # Columns that identify an observation; everything else is a measurement
  idColumns <- c("ActivityID", "ActivityName", "SubjectID")
  valueColumns <- setdiff(colnames(data), idColumns)

  # Long form: one row per (identifiers, variable, value)
  longData <- melt(data, id.vars = idColumns, measure.vars = valueColumns)

  # Wide form again, averaging the values within each subject/activity group
  tidyData <- dcast(
    longData,
    SubjectID + ActivityID + ActivityName ~ variable,
    fun.aggregate = mean
  )

  # Persist the result before handing it back to the caller
  exportTinyData(tidyData)
  tidyData
}
# Save the tidy data set to disk.
#
# Writes `data` with write.table()'s default settings to tidydata.txt in
# the current working directory.
#
# data: the data frame to write.
exportTinyData <- function(data) {
  outputFile <- "tidydata.txt"
  write.table(data, file = outputFile)
}