-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
146 lines (109 loc) · 5.24 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
## Getting and Cleaning Data - Course Project
#
## Script name: run_analysis.R
#
## Description: This file contains code that prepares tidy data which can be
# used for subsequent analysis. Specifically, the script does the following:
#
# 1) Merges the training and the test sets to create one data set.
#
# 2) Extracts only the measurements on the mean and standard
# deviation for each measurement.
#
# 3) Uses descriptive activity names to name the activities
# in the data set
#
# 4) Appropriately labels the data set with descriptive
# variable names.
#
# 5) From the data set in step 4, creates a second, independent
# tidy data set with the average of each variable for
# each activity and each subject.
#
## Data: The data for the project was obtained from:
# https://d396qusza40orc.cloudfront.net/_
# getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
#
## Assumptions:
# - Data files are located in your working directory
# - The plyr, dplyr, and reshape2 packages are attached with library()
#   (melt() and dcast() come from reshape2; tidyr is not used by this script)
############################################################################
## Step 1) Merge the training and test data sets
# Step 1 loads every input file and stacks the test rows on top of the train
# rows. The same order (test first, then train) is used for the X, y, and
# subject files so that row i of XData, yData, and subjectData always refers
# to the same observation.

# Feature catalogue: "description" supplies the column names for XData
features <- read.table("features.txt", header = FALSE)
names(features) <- c("index", "description")

# Activity lookup table; "index" is the key joined against yData in Step 3
activities <- read.table("activity_labels.txt", header = FALSE)
names(activities) <- c("index", "description")

# Measurements (test data then train data)
XTestData <- read.table("X_test.txt", header = FALSE)
XTrainData <- read.table("X_train.txt", header = FALSE)
# as_tibble() (re-exported by dplyr) replaces the deprecated tbl_df()
XData <- as_tibble(rbind(XTestData, XTrainData))
# NOTE(review): keep the renaming after the tibble conversion; as_tibble()
# rejects duplicated column names by default and the feature descriptions
# are not guaranteed to be unique.
names(XData) <- features$description

# Activity codes (test data then train data); the column is called "index"
# so it can be joined with the activities data frame
yTestData <- read.table("y_test.txt", header = FALSE)
yTrainData <- read.table("y_train.txt", header = FALSE)
yData <- as_tibble(rbind(yTestData, yTrainData))
names(yData) <- "index"

# Subject identifiers (test data then train data); "subjectNumber" is used
# as an ID variable when the data is melted in Step 5
subjectTestData <- read.table("subject_test.txt", header = FALSE)
subjectTrainData <- read.table("subject_train.txt", header = FALSE)
subjectData <- as_tibble(rbind(subjectTestData, subjectTrainData))
names(subjectData) <- "subjectNumber"
############################################################################
## Step 2) Extracts only measurements on the mean and standard deviation
# for each measurement
# Find the columns whose name contains the literal text "mean()" or "std()".
# fixed = TRUE turns off regular-expression matching, so the parentheses are
# matched as-is (a name such as "meanFreq()" would not be selected).
meanVariables <- grep("mean()", names(XData), value = TRUE, fixed = TRUE)
stdVariables <- grep("std()", names(XData), value = TRUE, fixed = TRUE)
# Keep the mean columns first, followed by the standard deviation columns
relevantVariables <- c(meanVariables, stdVariables)
# Subset the measurements to the relevant columns; as_tibble() (re-exported
# by dplyr) replaces the deprecated tbl_df()
XDataSubset <- as_tibble(XData[, relevantVariables])
############################################################################
## Step 3) Use descriptive activity names to name the activities
# in the data set
# Attach the activity description to every activity code in yData.
# plyr::join() is used rather than merge() because it keeps the rows of its
# first argument in their original order; Step 5 binds yData to the
# measurements column-wise and relies on that row alignment.
yData <- join(yData, activities, by = "index", type = "left")
# Place data into easier to view format; as_tibble() (re-exported by dplyr)
# replaces the deprecated tbl_df()
yData <- as_tibble(yData)
############################################################################
## Step 4) Appropriately labels the data set with descriptive
# variable names.
# Note that this step of the process was performed during Step (1)
# with the following lines of code.
# # Add column names based on variable descriptions
# names(XData) <- features$description
############################################################################
## Step 5) Create a second, independent tidy data set with the
# average of each variable for each activity and each subject.
# Bind the selected measurements, the subject numbers, and the activity
# descriptions column-wise. The activity column is named directly in cbind()
# so no positional rename of the last column is needed afterwards.
mergedData <- cbind(
  XDataSubset,
  subjectData,
  activityDescription = yData$description
)
# Melt to long format: one row per (subject, activity, variable) observation.
# The argument is spelled out as id.vars to avoid partial argument matching.
mergedMelt <- melt(
  mergedData,
  id.vars = c("subjectNumber", "activityDescription")
)
# Cast back to wide format, averaging every variable within each
# subjectNumber / activityDescription combination
castMergedMelt <- dcast(
  mergedMelt,
  subjectNumber + activityDescription ~ variable,
  fun.aggregate = mean
)
# Place data into easier to view format; as_tibble() (re-exported by dplyr)
# replaces the deprecated tbl_df()
castMergedMelt <- as_tibble(castMergedMelt)
# Write data frame to a .txt file
write.table(
  castMergedMelt,
  file = "run_analysis_castMergedMelt_Knigge.txt",
  row.names = FALSE
)
# Write data frame to a .csv file
# write.table(castMergedMelt, file = "run_analysis_castMergedMelt_Knigge.csv",
# sep = ",", row.names = FALSE)
# Print resulting data frame to console
print(castMergedMelt)
## End of script