-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathSwrl.R
More file actions
413 lines (358 loc) · 17.1 KB
/
Swrl.R
File metadata and controls
413 lines (358 loc) · 17.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
### Swirl Assignments from Getting and Cleaning Data
# Installation...
# library(swirl)
# install_from_swirl("Getting and Cleaning Data")
library(swirl)
swirl()

## Manipulating Data with dplyr
library(dplyr)

# Report which dplyr release is installed.
packageVersion("dplyr")

# Read the download log into a plain data frame.
# NOTE(review): `path2csv` is not defined anywhere in these notes; presumably
# the swirl lesson supplies it -- confirm before running this standalone.
mydf <- read.csv(path2csv, stringsAsFactors = FALSE)

# View the data frame dimensions.
dim(mydf)

#######################
# Load the data into what the package authors call a 'data frame tbl' or
# 'tbl_df'. as_tibble() (re-exported by dplyr) is used instead of the
# deprecated tbl_df(); the bare `x <- tbl_df()` placeholder was dropped
# because it fails when called without a data argument.
cran <- as_tibble(mydf)
#######################

# Remove the old data frame so that only `cran` is used from here on.
rm("mydf")
### dplyr supplies five 'verbs' that cover most fundamental data
### manipulation tasks: select(), filter(), arrange(), mutate(), summarize()

# select(): keep only the named columns. Columns are referenced by bare
# name, so we don't have to type cran$ip_id, cran$package, and cran$country
# as we normally would when referring to columns of a data frame.
select(cran, ip_id, package, country)
# Specify a sequence of columns using the `:` operator, just like 5:20.
select(cran, r_arch:country)
# The sequence can also be given in reverse order.
select(cran, country:r_arch)

# filter(): keep only the rows that satisfy the given conditions.
# All rows for which the package variable is equal to "swirl".
filter(cran, package == "swirl")
# You can specify as many conditions as you want, separated by commas.
filter(cran, r_version == "3.1.1", country == "US")
# Rows for which EITHER one condition OR another condition is TRUE.
filter(cran, country == "US" | country == "IN")
# All rows of cran for which r_version is NOT NA.
filter(cran, !is.na(r_version))

# arrange(): order the rows of a dataset according to the values of one or
# more variables.
# NOTE(review): cran2 is never created in these notes; presumably the swirl
# lesson built it as a column subset of cran. It is only defined here when
# missing, with the columns the calls below need -- confirm the exact subset.
if (!exists("cran2")) {
  cran2 <- select(cran, ip_id, package, country, r_version)
}
# ip_id in ascending order (from small to large).
arrange(cran2, ip_id)
# ip_id in descending order.
arrange(cran2, desc(ip_id))
# Arrange according to the values of multiple variables.
arrange(cran2, package, ip_id)
# arrange() works on columns in the order they are given.
arrange(cran2, country, desc(r_version), ip_id)

# mutate(): create a new variable based on the value of one or more
# variables already in a dataset.
# NOTE(review): cran3 is likewise never created in these notes; it is only
# defined here when missing -- confirm the exact subset used by the lesson.
if (!exists("cran3")) {
  cran3 <- select(cran, ip_id, package, size)
}
# Add a column called size_mb that contains the download size in megabytes.
mutate(cran3, size_mb = size / 2^20)
# mutate() can use a newly created column later in the same call.
mutate(cran3, size_mb = size / 2^20, size_gb = size_mb / 2^10)

# summarize(): collapse the dataset to a single row.
# The average download size.
summarize(cran, avg_bytes = mean(size))

# group_by(): break a dataset into groups of rows based on the values of one
# or more variables.
# Group cran by the package variable; any operation applied to by_package
# now takes place on a per-package basis.
by_package <- group_by(cran, package)
# Instead of a single value, summarize() now returns the mean size for EACH
# package in the dataset.
summarize(by_package, mean(size))
# Compute four values, in the following order, from the grouped data and
# store the result in pack_sum (for 'package summary'):
#
#   1. count     = n()
#   2. unique    = n_distinct(ip_id)
#   3. countries = n_distinct(country)
#   4. avg_bytes = mean(size)
#
# A few things to be careful of: separate arguments by commas, make sure
# there is a closing parenthesis, and check your spelling.
#
# You should also take a look at ?n and ?n_distinct, so that you really
# understand what is going on.
pack_sum <- summarize(
  by_package,
  count = n(),
  unique = n_distinct(ip_id),
  countries = n_distinct(country),
  avg_bytes = mean(size)
)

# The value of 'count' that splits the data into the top 1% and bottom 99%.
count_cutoff <- quantile(pack_sum$count, probs = 0.99)
count_cutoff

# Select all rows from pack_sum for which 'count' is strictly greater (>)
# than that cutoff. These notes originally hard-coded 679 here (presumably
# read off the quantile() output above); using the computed value keeps the
# filter in step with whatever data is loaded. unname() drops the "99%" label
# that quantile() attaches to its result.
top_counts <- filter(pack_sum, count > unname(count_cutoff))

# Full table view of a variable.
View(top_counts)

# Largest download counts first.
top_counts_sorted <- arrange(top_counts, desc(count))
## Chaining ##
# Chaining allows you to string together multiple function calls in a way
# that is compact and readable.
# NOTE(review): these notes originally opened ?chain; that help topic does
# not appear to exist in current dplyr releases (confirm for the installed
# version), so the help page of the pipe operator itself is opened instead.
?`%>%`

# Read the code below pronouncing the %>% operator as the word 'then':
# take cran, then group it by package, then summarize each group, then keep
# the packages downloaded from more than 60 countries, then sort them by
# number of countries (descending) and average download size.
result3 <-
  cran %>%
  group_by(package) %>%
  summarize(
    count = n(),
    unique = n_distinct(ip_id),
    countries = n_distinct(country),
    avg_bytes = mean(size)
  ) %>%
  filter(countries > 60) %>%
  arrange(desc(countries), avg_bytes)

# Print result to console
print(result3)
####### STEP1 #######
# Pipe cran into select() and keep the following columns, in this order:
#
#   1. ip_id
#   2. country
#   3. package
#   4. size
#
# Because the data tbl arrives through the chaining operator, it is not
# named again inside select().
#
# The call to print() at the end of the chain is optional, but necessary if
# you want the result printed to the console. Since print() takes no
# additional arguments here, %>% would also accept the bare function name
# (print) without parentheses.
cran %>%
  select(ip_id, country, package, size) %>%
  print()
####### STEP2 #######
# Extend the chain with mutate() to add a column called size_mb holding the
# size of each download in megabytes (i.e. size / 2^20).
#
# The trailing print() shows the result on the console.
cran %>%
  select(
    ip_id,
    country,
    package,
    size
  ) %>%
  mutate(size_mb = size / 2^20) %>%
  print()
####### STEP3 #######
# Extend the chain with filter() to keep only the rows for which size_mb is
# less than or equal to (<=) 0.5.
#
# The trailing print() shows the result on the console.
cran %>%
  select(
    ip_id,
    country,
    package,
    size
  ) %>%
  mutate(size_mb = size / 2^20) %>%
  filter(size_mb <= 0.5) %>%
  print()
####### STEP4 #######
# Finish the chain with arrange() so the rows are sorted by size_mb in
# descending order.
#
# The trailing print() shows the result on the console.
cran %>%
  select(
    ip_id,
    country,
    package,
    size
  ) %>%
  mutate(size_mb = size / 2^20) %>%
  filter(size_mb <= 0.5) %>%
  arrange(desc(size_mb)) %>%
  print()
### Tidying Data with tidyr
library(tidyr)
# http://vita.had.co.nz/papers/tidy-data.pdf
# Tidy data satisfies three conditions:
# 1) Each variable forms a column
# 2) Each observation forms a row
# 3) Each type of observational unit forms a table

### 1. The first problem is when you have column headers that are values,
###    not variable names.
# gather() is the tool for this case; it needs a dataset, so it is only named
# here rather than called with no arguments (which fails).
# NOTE(review): gather() is superseded by pivot_longer() in newer tidyr; it is
# kept because these notes follow the lesson's API.
#
# We need to have one column for each of these three variables (grade, sex,
# count). The data argument, students, gives the name of the original
# dataset. The key and value arguments -- sex and count, respectively -- give
# the column names for our tidy dataset. The final argument, -grade, says
# that we want to gather all columns EXCEPT the grade column (since grade is
# already a proper column variable).
gather(students, sex, count, -grade)
### 2. The second messy data case we'll look at is when multiple variables
###    are stored in one column.
# students2 suffers from the same messy data problem of having column headers
# that are values (male_1, female_1, etc.).
# Gathering gets us half way to tidy data, but we still have two different
# variables, sex and class, stored together in the sex_class column.
res <- gather(students2, sex_class, count, -grade)

# separate() splits one column into several; it needs a dataset, so it is
# only named here rather than called with no arguments (which fails).
#
# Conveniently, separate() is able to figure out on its own how to separate
# the sex_class column. Unless you request otherwise with the 'sep' argument,
# it splits on non-alphanumeric values. In other words, it assumes that the
# values are separated by something other than a letter or number (in this
# case, an underscore).
separate(res, sex_class, into = c("sex", "class"))

### Just like with dplyr, you can use the %>% operator to chain multiple
### function calls together.
# Repeat the calls to gather() and separate(), but this time use the %>%
# operator to chain the commands together without storing an intermediate
# result.
#
# The main idea is that the result to the left of %>% takes the place of the
# first argument of the function to the right. Therefore, you OMIT THE FIRST
# ARGUMENT to each function. See also the Examples section at the bottom of
# ?gather and ?separate.
students2 %>%
  gather(sex_class, count, -grade) %>%
  separate(sex_class, into = c("sex", "class")) %>%
  print()
### 3. A third symptom of messy data is when variables are stored in both
###    rows and columns
# Gather the columns class1 through class5 of students3 into key/value
# pairs: the 'key' column is called class and the 'value' column is called
# grade.
#
# tidyr lets adjacent columns be referenced as class1:class5, just like a
# sequence of numbers.
#
# Since each student is only enrolled in two of the five possible classes,
# there are lots of missing values (i.e. NAs); na.rm = TRUE omits them from
# the final result.
#
# With %>%, the value on the left is inserted as the first argument of the
# function on the right, so the data argument is not written out.
# Consult ?gather if you get stuck.
students3 %>%
  gather(key = class, value = grade, class1:class5, na.rm = TRUE) %>%
  print()
# Same chain as the previous step, with a call to spread() appended. spread()
# turns the values of the test column (midterm and final) into column headers
# (i.e. variables).
#
# Only two arguments are given to spread() -- the key column (test) and the
# value column (grade) -- because the data argument is supplied by %>%.
students3 %>%
  gather(key = class, value = grade, class1:class5, na.rm = TRUE) %>%
  spread(key = test, value = grade) %>%
  print()
# Lastly, we want the values in the class column to simply be 1, 2, ..., 5
# and not class1, class2, ..., class5. The extract_numeric() function from
# tidyr can accomplish this. To see how it works, try it on a single string:
# NOTE(review): extract_numeric() is marked deprecated in newer tidyr in
# favour of readr::parse_number() -- confirm before modernising.
extract_numeric("class5")

# Now apply it inside the chain: mutate() from dplyr can "overwrite" a column
# by assigning a new value to the existing column name instead of creating a
# new column, so class is replaced by its numeric part.
#
# Check out ?mutate and/or ?extract_numeric if you need a refresher.
students3 %>%
  gather(key = class, value = grade, class1:class5, na.rm = TRUE) %>%
  spread(key = test, value = grade) %>%
  mutate(class = extract_numeric(class)) %>% # the call to mutate() goes here
  print()
### 4. The fourth messy data problem we'll look at occurs when multiple
###    observational units are stored in the same table.
# In students4 each id, name, and sex is repeated twice, which is redundant.
# The solution is to break students4 into two separate tables -- one
# containing basic student information (id, name, and sex) and the other
# containing grades (id, class, midterm, final).

# First pass: select the id, name, and sex columns from students4 and store
# the result in student_info (printing it along the way).
student_info <- students4 %>%
  select(
    id,
    name,
    sex
  ) %>%
  print()

# Second pass: the same selection followed by unique(), which removes the
# duplicate rows from student_info. As with print, %>% would also accept the
# bare function name because no additional arguments are needed.
student_info <- students4 %>%
  select(
    id,
    name,
    sex
  ) %>%
  unique() %>%
  print()

# select() the id, class, midterm, and final columns (in that order) and
# store the result in gradebook.
gradebook <- students4 %>%
  select(
    id,
    class,
    midterm,
    final
  ) %>%
  print()
### 5. The fifth problem is when a single observational unit is stored in
###    multiple tables. It's the opposite of the fourth problem.
# The name of each dataset actually represents the value of a new variable
# that we will call 'status'. Before joining the two tables together, we add
# a new column to each containing this information so that it is not lost
# when we put everything together.
passed <- mutate(passed, status = "passed")
failed <- mutate(failed, status = "failed")

# Stack the two tables into one. The original note called bind_rows() with
# no arguments, which never combined `passed` and `failed`.
bind_rows(passed, failed)
### 6. Practice with a real dataset
# http://research.collegeboard.org/programs/sat/data/cb-seniors-2013
# The dataset `sat` in your workspace contains data on all college-bound
# seniors who took the SAT exam in 2013.
#
# The chain below accomplishes three goals:
#
#   1. select() all columns that do NOT contain the word "total", since with
#      the male and female data the total count can always be recreated in a
#      separate column. contains() is described in the 'Selection' section
#      of ?select.
#
#   2. gather() all columns EXCEPT score_range, using key = part_sex and
#      value = count.
#
#   3. separate() part_sex into two separate variables (columns), called
#      "part" and "sex", respectively. The 'Examples' section of ?separate
#      shows how the 'into' argument should be phrased.
sat %>%
  select(-contains("total")) %>%
  gather(key = part_sex, value = count, -score_range) %>%
  separate(part_sex, into = c("part", "sex")) %>%
  print()
# The same chain with two more function calls appended:
#
#   1. group_by() (from dplyr) groups the data by part and sex, in that
#      order.
#
#   2. mutate() adds two new columns whose values are computed
#      group-by-group:
#
#        * total = sum(count)
#        * prop  = count / total
sat %>%
  select(-contains("total")) %>%
  gather(key = part_sex, value = count, -score_range) %>%
  separate(part_sex, into = c("part", "sex")) %>%
  group_by(part, sex) %>%
  mutate(
    total = sum(count),
    prop = count / total
  ) %>%
  print()
###### Dates and Times with lubridate
library(lubridate)

today()
# There are three components to this date. In order, they are year, month,
# and day. We can extract any of these components using the year(), month(),
# or day() function, respectively.
this_day <- today()

# Get the day of the week from this_day. The original note called wday()
# with no argument, which fails.
wday(this_day)
wday(this_day, label = TRUE)

# now() returns the date-time representing this exact moment in time.
now()
this_moment <- now()
# We can also use hour(), minute(), and second() to extract specific time
# information.
minute(this_moment)

# lubridate offers a variety of functions for parsing date-times:
# ymd(), dmy(), hms(), ymd_hms(), etc., where each letter in the name of the
# function stands for the location of years (y), months (m), days (d),
# hours (h), minutes (m), and/or seconds (s) in the date-time being read in.
my_date <- ymd("1989-05-17")
# Inspect how R stores the parsed value.
# NOTE(review): the original note mentions POSIXct ("one way that R stores
# date-time information internally"); newer lubridate versions may report
# Date here instead -- confirm with the installed version.
class(my_date)
ymd("1989 May 17") # to parse "1989 May 17"
mdy("March 12, 1975") # to parse "March 12, 1975"
ymd("1920/1/2") # to parse "1920/1/2"
# NOTE(review): dt1 and dt2 are not defined in these notes; presumably the
# swirl lesson provides them -- confirm before running this standalone.
ymd_hms(dt1)
hms("03:22:14")
ymd(dt2) # works on vectors
# update() allows us to update one or more components of a date-time. It
# needs a date-time to work on, so it is only named here rather than called
# with no arguments (which fails).
# It's important to recognize that the command below does not alter
# this_moment unless we reassign the result to this_moment.
update(this_moment, hours = 8, minutes = 34, seconds = 55)

# To find the current date-time in New York, we use the now() function
# again, this time with a time zone.
### For a complete list of valid time zones for use with lubridate, check
### out the following Wikipedia page:
# http://en.wikipedia.org/wiki/List_of_tz_database_time_zones
nyc <- now(tzone = "America/New_York")

# Departure: two days from now, at 17:34.
depart <- nyc + days(2)
depart <- update(depart, hours = 17, minutes = 34)

# Arrival: 15 hours and 50 minutes after departure.
arrive <- depart + hours(15) + minutes(50)

# with_tz() returns a date-time as it would appear in another time zone; like
# update(), it cannot be called without arguments.
arrive <- with_tz(arrive, "Asia/Hong_Kong")
## add/subtract dates
# interval() builds the time span between two date-times and as.period()
# expresses such a span in calendar units. Both need arguments, so they are
# only named here rather than called bare (which fails).
# NOTE(review): these notes originally used new_interval(), which newer
# lubridate releases deprecate/remove in favour of interval() -- confirm
# against the installed version.
last_time <- mdy("June 17, 2008", tz = "Singapore")
how_long <- interval(last_time, arrive)
as.period(how_long)

# To see how long you've been working!
# NOTE(review): stopwatch() is not defined in these notes; presumably the
# swirl lesson provides it -- confirm before running this standalone.
stopwatch()

# Further reading: the 2011 Journal of Statistical Software paper titled
# 'Dates and Times Made Easy with lubridate'.