---
output: html_document
---
Read the data into R and view a summary of it.
# Point R at the local checkout of the assignment and load the activity data.
# NOTE(review): the absolute path is machine-specific; anyone re-running this
# on another machine has to adjust it.
setwd("C:/Users/u6027576/Documents/Coursera/reproducibleResearch/RepData_PeerAssessment1")
activity <- read.csv("activity.csv")
# Print per-column summary statistics of the loaded data frame.
summary(activity)
## steps date interval
## Min. : 0.00 2012-10-01: 288 Min. : 0.0
## 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
## Median : 0.00 2012-10-03: 288 Median :1177.5
## Mean : 37.38 2012-10-04: 288 Mean :1177.5
## 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
## Max. :806.00 2012-10-06: 288 Max. :2355.0
## NA's :2304 (Other) :15840
Either remove the rows of the data frame where the number of steps is missing (NA), or keep all rows and ignore the NAs when calculating the statistics.
Optional: remove the NA entries from the data set before processing.
## Warning: package 'ggplot2' was built under R version 3.1.3
# Copy of the data with every row that contains an NA dropped.
activityNAR <- na.omit(activity)
## Sum
# Total number of steps per date, computed on the unfiltered data.
activitySum <- with(activity, tapply(steps, date, sum))
df.actSum <- data.frame(date = names(activitySum), sum.steps = activitySum)
## Create histogram using ggplot
ggplot(data = df.actSum, mapping = aes(x = sum.steps)) +
  geom_histogram(alpha = 0.4, fill = "red")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Calculating the mean steps per day:
# Daily step totals from the NA-free data (despite its name, activityMeans
# holds per-day sums); their average is the mean number of steps per day.
activityMeans <- with(activityNAR, tapply(steps, date, sum))
AvStepsPerDay <- mean(x = activityMeans, na.rm = TRUE)
AvStepsPerDay
## [1] 10766.19
Calculating the median steps per day:
# Median of the same per-day step totals.
MedianStepsPerDay <- stats::median(x = activityMeans, na.rm = TRUE)
MedianStepsPerDay
## [1] 10765
The subject took a mean of 10766.19 and a median of 10765 steps per day.
# Mean number of steps for each 5-minute interval, averaged over all days of
# the NA-free data.
TimeSeries <- tapply(activityNAR$steps, activityNAR$interval, mean)
df.timeSeries <- data.frame(
  Time = as.numeric(names(TimeSeries)),
  mean.steps = TimeSeries
)
# The interval labels are read as clock times in HHMM form; pad them to four
# digits so that they parse with "%H%M".
dtt <- sprintf("%04d", df.timeSeries$Time)
# Store the parsed time as POSIXct (strptime() returns the list-based POSIXlt,
# which is awkward as a data-frame column) and parse in UTC so that no clock
# time can fall into a local daylight-saving gap.
df.timeSeries$Time2 <- as.POSIXct(strptime(dtt, format = "%H%M", tz = "UTC"))
# Minutes elapsed since midnight: the last two digits are minutes, the leading
# digits are hours.
df.timeSeries$IntervalMinutes <-
  (df.timeSeries$Time %% 100) + (df.timeSeries$Time %/% 100) * 60
q <- ggplot() +
  geom_line(
    data = df.timeSeries,
    aes(x = IntervalMinutes, y = mean.steps, group = 1),
    colour = "#000099"
  ) +
  labs(
    x = "Time in minutes",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )
q
Alternatively: time could be added to axis
# Same interval averages, plotted against the parsed clock time (Time2)
# instead of minutes since midnight.
p <- ggplot() +
  geom_line(
    data = df.timeSeries,
    aes(x = Time2, y = mean.steps, group = 1),
    color = "red"
  ) +
  labs(
    x = "Time",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )
p
Which 5-minute interval contains the highest average for steps?
# Highest per-interval average and the interval that attains it.
maxStepAv <- max(df.timeSeries$mean.steps)
# Locate the maximum numerically. Matching the number as a regular expression
# (grep) only works by accident: the value is coerced to text, "." matches any
# character and substrings of other values can match too. which.max() returns
# the first position if the maximum is tied.
maxStepAvIndex <- which.max(df.timeSeries$mean.steps)
fiveMInterval <- df.timeSeries$Time[maxStepAvIndex]
df2 <- data.frame(Interval = fiveMInterval, Steps = maxStepAv)
df2
## Interval Steps
## 1 835 206.1698
The maximum average number of steps occurs in interval 835 (the 5-minute interval starting at 08:35), with an average of about 206 steps.
Count the rows that contain NA. `complete.cases()` returns FALSE for every row with a missing value,
so the FALSE count below is the number of rows with missing step entries.
# Tabulate incomplete (FALSE) versus complete (TRUE) rows of the raw data.
table(stats::complete.cases(activity))
##
## FALSE TRUE
## 2304 15264
For every row where the step count is NA, replace the missing value with the average number of steps for that 5-minute interval.
# Impute missing step counts: every NA is replaced by the mean number of steps
# of its 5-minute interval, looked up in df.timeSeries (Time -> mean.steps).
# match() performs the per-row lookup in one vectorised step, so no loop over
# the intervals is needed; an interval without a known mean simply stays NA
# instead of aborting with a zero-length replacement error.
missing_rows <- which(is.na(activity$steps))
lookup <- match(activity$interval[missing_rows], df.timeSeries$Time)
activity$steps[missing_rows] <- as.numeric(df.timeSeries$mean.steps[lookup])
Complete cases is run again to show all cases are complete. No false cases means that all NAs have been replaced with mean data for that interval.
# Recompute the per-row completeness flags and tabulate them; the variable
# name A is what labels the printed table.
A <- stats::complete.cases(activity)
table(A)
## A
## TRUE
## 17568
Making the histograms with the new data
# Recompute the total number of steps per date from the current `activity`
# data and redraw the histogram.
activitySum <- with(activity, tapply(steps, date, sum))
df.actSum <- data.frame(date = names(activitySum), sum.steps = activitySum)
##plot
ggplot(data = df.actSum, mapping = aes(x = sum.steps)) +
  geom_histogram(alpha = 0.4, fill = "red")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Mean
# The daily totals must come from `activity` (which holds the imputed values),
# not from the NA-stripped copy `activityNAR`; otherwise this merely repeats
# the pre-imputation statistic.
activityMeans <- tapply(activity$steps, activity$date, sum)
AvStepsPerDay <- mean(activityMeans, na.rm = TRUE)
AvStepsPerDay
## [1] 10766.19
## Median
# Recomputed on the same totals so that the mean/median comparison in the text
# refers to the imputed data.
MedianStepsPerDay <- median(activityMeans, na.rm = TRUE)
MedianStepsPerDay
After imputing the missing values with the interval means, the mean and the median of the total daily steps differ only fractionally. From these observations, the impact of imputing missing values on the total number of daily steps appears to be negligible.
When creating the new factor column, Monday through Friday are labelled as weekdays, and Saturday and Sunday are labelled as weekend days.
# Parse the date column once and keep the (locale-dependent) day name.
obs_date <- as.Date(activity$date, "%Y-%m-%d")
activity$day <- weekdays(obs_date)
# Classify by day number rather than by day name: POSIXlt's wday is 0 for
# Sunday and 6 for Saturday in every locale, whereas comparing weekdays() text
# with English names would label every day a weekday under a non-English
# locale.
is_weekend <- as.POSIXlt(obs_date)$wday %in% c(0, 6)
# Level order (weekend first, weekday second) is kept as before.
activity$weekday <- factor(
  is_weekend,
  levels = c(TRUE, FALSE),
  labels = c("weekend", "weekday")
)
To plot weekdays Vs. weekends
# Split the observations by the weekend/weekday factor and pick the groups by
# name, so the code does not depend on the order of the factor levels.
DayEnd <- split(activity, activity$weekday)
weekendData <- DayEnd[["weekend"]]
weekdayData <- DayEnd[["weekday"]]

# Build the per-interval mean-steps table for one group of days.
#   day_data: data frame with `steps` and `interval` columns
#   label:    value stored in the `weekday` column of the result
# Returns a data frame with columns Time, mean.steps, weekday and
# IntervalMinutes (minutes since midnight derived from the HHMM-style label).
interval_profile <- function(day_data, label) {
  interval_means <- tapply(day_data$steps, day_data$interval, mean)
  clock <- as.numeric(names(interval_means))
  data.frame(
    Time = clock,
    mean.steps = interval_means,
    weekday = label,
    IntervalMinutes = (clock %% 100) + (clock %/% 100) * 60
  )
}

#weekend df
df.timeSeries <- interval_profile(weekendData, "weekend")
#weekday df
df.timeSeries2 <- interval_profile(weekdayData, "weekday")
df.full <- rbind(df.timeSeries2, df.timeSeries)
## panel plot of weekend and weekday
# ggplot() + geom_line() replaces qplot(), which is deprecated in current
# ggplot2 releases; the mapping and the facets are the same.
ggplot(df.full, aes(x = IntervalMinutes, y = mean.steps, colour = weekday)) +
  geom_line() +
  facet_grid(~weekday, scales = "free", space = "free") +
  labs(
    x = "Time in minutes",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )



