Skip to content

Latest commit

 

History

History
252 lines (176 loc) · 6.81 KB

File metadata and controls

252 lines (176 loc) · 6.81 KB
output html_document

Reproducible Research: Peer Assessment 1

Read the data into R Studio and view the summary of it.

# Load the activity data. The path is relative to the project directory (the
# folder that holds this document and activity.csv), so the analysis runs on
# any machine without a hard-coded setwd() to one user's home directory.
activity <- read.csv("activity.csv")
summary(activity)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 12.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##  NA's   :2304     (Other)   :15840

Either remove the rows of the data frame where the number of steps is not given (NA), or keep them and ignore the NA values when calculating the means.

What is mean total number of steps taken per day?

Optional: remove the entries containing NA from the data set before processing.

## Warning: package 'ggplot2' was built under R version 3.1.3
# Copy of the data without the rows whose step count is missing.
activityNAR <- na.omit(activity)

# Total steps per date, computed from the unfiltered data.
activitySum <- with(activity, tapply(steps, date, sum))
df.actSum <- data.frame(
  date = names(activitySum),
  sum.steps = activitySum
)

# Histogram of the daily totals.
ggplot(df.actSum, aes(x = sum.steps)) +
  geom_histogram(fill = "red", alpha = 0.4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-4

Calculating the mean steps per day:

# Despite its name, activityMeans holds the per-date step totals of the
# NA-free data; the mean of those totals is the average steps per day.
activityMeans <- with(activityNAR, tapply(steps, date, sum))
AvStepsPerDay <- mean(activityMeans, na.rm = TRUE)
print(AvStepsPerDay)
## [1] 10766.19

Calculating the median steps per day:

# Median of the per-date totals; NA entries are ignored.
MedianStepsPerDay <- median(activityMeans, na.rm = TRUE)
print(MedianStepsPerDay)
## [1] 10765

The subject took a mean of 10766.19 and a median of 10765 steps per day.

What is the average daily activity pattern?

# Mean number of steps in each 5-minute interval, averaged over all dates of
# the NA-free data.
TimeSeries <- tapply(activityNAR$steps, activityNAR$interval, mean)
df.timeSeries <- data.frame(
  Time = as.numeric(names(TimeSeries)),
  mean.steps = TimeSeries
)

# Interval labels are parsed as clock times (hours then minutes); pad them to
# four digits first so that e.g. 835 becomes "0835".
dtt <- sprintf("%04d", df.timeSeries$Time)

# Parse in a fixed time zone and store as POSIXct. strptime() returns POSIXlt,
# which is a poor fit for a data frame column, and parsing in the local time
# zone can yield NA for clock times skipped by a daylight-saving change on the
# day the script is run.
df.timeSeries$Time2 <- as.POSIXct(strptime(dtt, format = "%H%M", tz = "UTC"))

# Minutes since midnight, giving an evenly spaced x axis (the raw labels jump
# from 55 to 100 at every hour).
df.timeSeries$IntervalMinutes <-
  (df.timeSeries$Time %% 100) + (df.timeSeries$Time %/% 100) * 60

q <- ggplot() +
  geom_line(
    data = df.timeSeries,
    aes(x = IntervalMinutes, y = mean.steps, group = 1),
    colour = "#000099"
  ) +
  labs(
    x = "Time in minutes",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )
q

plot of chunk unnamed-chunk-8

Alternatively, the time of day can be shown on the x-axis:

# Mean steps per interval drawn against the clock time column (Time2).
p <- ggplot() +
  geom_line(
    data = df.timeSeries,
    aes(x = Time2, y = mean.steps, group = 1),
    color = "red"
  ) +
  labs(
    x = "Time",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )
p

plot of chunk unnamed-chunk-9

Which 5-minute interval contains the highest average for steps?

# Find the 5-minute interval with the highest mean step count.
maxStepAv <- max(df.timeSeries$mean.steps)

# Compare numerically instead of using grep(): grep() coerces the number to a
# regular expression, in which "." matches any character and a short pattern
# such as "2" would also match 12.5 or 0.2. Exact equality is safe here
# because maxStepAv is itself an element of the vector; ties are all reported.
maxStepAvIndex <- which(df.timeSeries$mean.steps == maxStepAv)
fiveMInterval <- df.timeSeries$Time[maxStepAvIndex]

df2 <- data.frame(Interval = fiveMInterval, Steps = maxStepAv)
df2
##   Interval    Steps
## 1      835 206.1698

The maximum average steps per period occur at interval 835, with an average of 206 steps per 5 minutes.

Imputing missing values

Count the rows that contain NA. `complete.cases()` returns FALSE for every row that contains an NA.

False cases represent rows missing step entries.

# Tally rows by completeness: FALSE counts the rows with at least one NA.
table(complete.cases(activity))
## 
## FALSE  TRUE 
##  2304 15264

Loop over time intervals where step data is NA and replace the value with the average step value for that interval.

# Replace every missing step count with the mean for the same 5-minute
# interval. df.timeSeries has one row per interval (Time) with its mean
# (mean.steps), so a single match() lookup does the work of a loop that would
# rescan the whole data set once per interval.
a <- which(is.na(activity$steps))
intervalPosition <- match(activity$interval[a], df.timeSeries$Time)

# Fail loudly if some interval has no mean to impute from.
stopifnot(!anyNA(intervalPosition))

activity$steps[a] <- df.timeSeries$mean.steps[intervalPosition]

Complete cases is run again to show all cases are complete. No false cases means that all NAs have been replaced with mean data for that interval.

# Re-check completeness; A is TRUE for each row that contains no NA.
A <- complete.cases(activity)
table(A)
## A
##  TRUE 
## 17568

Making the histograms with the new data

# Total steps per date, recomputed from the current contents of activity.
activitySum <- with(activity, tapply(steps, date, sum))
df.actSum <- data.frame(
  date = names(activitySum),
  sum.steps = activitySum
)

# Histogram of the daily totals.
ggplot(df.actSum, aes(x = sum.steps)) +
  geom_histogram(fill = "red", alpha = 0.4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-14

## Mean
# Use the imputed data (activity). The NA-free copy activityNAR was taken
# before the imputation step, so averaging it would only repeat the earlier
# result instead of measuring the effect of the imputed values.
activityMeans <- tapply(activity$steps, activity$date, sum)
AvStepsPerDay <- mean(activityMeans, na.rm = TRUE)
AvStepsPerDay
## [1] 10766.19

After imputing the missing values with the interval means, the mean and the median of the total daily steps differ only by a fraction of a step. From these observations, the impact of imputing missing values on the total number of daily steps appears to be negligible.

Are there differences in activity patterns between weekdays and weekends?

When creating the new column, Monday through Friday are labelled "weekday", and Saturday and Sunday are labelled "weekend".

# Day name of each observation; the text depends on the session locale.
obsDate <- as.Date(activity$date, "%Y-%m-%d")
activity$day <- weekdays(obsDate)

# Classify by weekday number rather than by day name: weekdays() returns
# localized names, so matching against "Saturday"/"Sunday" would label every
# row "weekday" in a non-English locale. POSIXlt's wday field is 0 for Sunday
# and 6 for Saturday regardless of locale.
isWeekend <- as.POSIXlt(obsDate)$wday %in% c(0, 6)
activity$weekday <- factor(
  isWeekend,
  levels = c(TRUE, FALSE),
  labels = c("weekend", "weekday")
)

Plot weekdays vs. weekends:

# Split the data by day type and average each 5-minute interval separately.
# Elements are picked by name so the code does not depend on the factor's
# level order.
DayEnd <- split(activity, activity$weekday)
weekendData <- data.frame(DayEnd[["weekend"]])
weekdayData <- data.frame(DayEnd[["weekday"]])

# NOTE: TimeSeries and df.timeSeries are reused here and overwrite the
# all-days versions created earlier in the script.
TimeSeries <- tapply(weekendData$steps, weekendData$interval, mean)
TimeSeries2 <- tapply(weekdayData$steps, weekdayData$interval, mean)

# weekend df
df.timeSeries <- data.frame(
  Time = as.numeric(names(TimeSeries)),
  mean.steps = TimeSeries,
  weekday = "weekend"
)
df.timeSeries$IntervalMinutes <-
  (df.timeSeries$Time %% 100) + (df.timeSeries$Time %/% 100) * 60

# weekday df
df.timeSeries2 <- data.frame(
  Time = as.numeric(names(TimeSeries2)),
  mean.steps = TimeSeries2,
  weekday = "weekday"
)
df.timeSeries2$IntervalMinutes <-
  (df.timeSeries2$Time %% 100) + (df.timeSeries2$Time %/% 100) * 60

df.full <- rbind(df.timeSeries2, df.timeSeries)

# Panel plot of weekend and weekday. Built with ggplot() + geom_line() because
# qplot() is deprecated in current ggplot2 releases; the mapping, facets and
# labels are the same as in the qplot() call it replaces.
ggplot(df.full, aes(x = IntervalMinutes, y = mean.steps, colour = weekday)) +
  geom_line() +
  facet_grid(~weekday, scales = "free", space = "free") +
  labs(
    x = "Time in minutes",
    y = "Mean average of steps",
    title = "Average steps through the day"
  )

plot of chunk unnamed-chunk-16