## Name: collect_clorox_ingdisc_data.R
## Author: Katherine A. Phillips
## Date Created: March 2015
## Purpose: Collects and parses HTML files for Clorox's ingredient use information.
library(XML)
library(RCurl)
setwd("C:/Users/kphillip/Documents/DataCollection/Clorox_Ingredient_List")
source("../../CASFunctions.r")
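## Note: UniStr(), used in Part IV below, comes from the sourced
## CASFunctions.r; it is assumed here to clean/normalize character vectors
## (its definition is not shown in this script).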
##----------------------------------------------------------------------------##
## Part I: Get Brand Name Links ##
##----------------------------------------------------------------------------##
## This finds all the links on the Clorox webpage; it is not strictly needed ##
## now since the hrefs have been saved to a file                              ##
##----------------------------------------------------------------------------##
## URL to the Clorox page listing all brands
CloroxURL <- "http://www.thecloroxcompany.com/products/ingredients-inside/"
## Parse the HTML on the page
CloroxPage <- htmlParse(CloroxURL)
## Get all the links on the page
CloroxLinks <- xpathSApply(CloroxPage,"//a/@href")
## Find unique links
CloroxLinks <- unique(CloroxLinks)
## Stop using the page
free(CloroxPage)
## Convert the HTML object to a vector
CloroxLinks <- as.vector(CloroxLinks)
## Open a file for writing
CloroxFile <- file("CloroxBrand_links.txt")
## Write each link to a line in the file
writeLines(CloroxLinks,CloroxFile)
## Close the file
close(CloroxFile)
##----------------------------------------------------------------------------##
## Part II: Get Product Name Links for each Brand ##
##----------------------------------------------------------------------------##
## Sigh! Nothing is easy; Clorox structures their website differently than   ##
## all the others. Now, read in the links from the main page and get all the ##
## links for each brand.                                                      ##
##----------------------------------------------------------------------------##
CloroxURL <- "http://www.thecloroxcompany.com" ## Base URL for the Clorox company site
urls <- readLines("CloroxBrand_links.txt",warn=FALSE) ## Store all children URLs as vector
for (i in 1:length(urls)){
  BrandURL <- paste(CloroxURL,urls[i],sep="")      ## Get the URL for each brand
  BrandName <- paste(tail(strsplit(urls[i],
                     split="/")[[1]],n=1),
                     "_links.txt",sep="")          ## Create a name for the links files
  BrandPage <- htmlParse(BrandURL)                 ## Parse the HTML on the page
  BrandLinks <- xpathSApply(BrandPage,"//a/@href") ## Get all the links on the page
  free(BrandPage)                                  ## Stop using the page
  BrandLinks <- as.vector(BrandLinks)              ## Convert the list of links to a vector
  BrandLinks <- unique(BrandLinks)                 ## Find unique links
  BrandFile <- file(BrandName)                     ## Open a file for writing
  writeLines(BrandLinks,BrandFile)                 ## Write each link to a line in the file
  close(BrandFile)                                 ## Close the file
}
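## For example, a brand link ending in ".../pine-sol" (an illustrative path,
## not verified against the live site) would be written to the file
## "pine-sol_links.txt" by the loop above.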
##----------------------------------------------------------------------------##
## Part III: Download the HTML Files for each Product ##
##----------------------------------------------------------------------------##
## Okay, now all of the product links are in hand, so download the HTML file ##
## for each product. (The downloaded files don't parse correctly, so Part IV ##
## re-reads each page and extracts the information 'on the fly'.)            ##
##----------------------------------------------------------------------------##
## Base URL for the Clorox company site
CloroxURL <- "http://www.thecloroxcompany.com"
## Store all children URLs as vector
CompanyURLs <- readLines("CloroxBrand_links.txt",warn=FALSE)
## Loop over all Brand URLs
for (i in 1:length(CompanyURLs)){
  CURL <- CompanyURLs[i]
  ## Get the URL for each brand
  BrandURL <- paste(CloroxURL,CURL,sep="")
  ## Recreate the name of the links file written in Part II
  BrandName <- paste(tail(strsplit(CURL,
                     split="/")[[1]],n=1),"_links.txt",sep="")
  ## Get the URL for each product
  BrandURLs <- readLines(BrandName,warn=FALSE)
  ## Loop over all product URLs
  for(j in 1:length(BrandURLs)){
    ## Get the URL for a product
    BURL <- BrandURLs[j]
    ProductURL <- paste(CloroxURL,BURL,sep="")
    ## Build a HTML file name from the last piece of the product URL:
    ## drop spaces, replace hyphens with underscores, and append ".html"
    ProdSlug <- tail(strsplit(ProductURL,split="[/]")[[1]],n=1)
    ProdFileName <- paste(gsub("-","_",gsub(" ","",ProdSlug)),".html",sep="")
    ProdFileName <- paste("ProductHTML",ProdFileName,sep="/")
    ## Download the file (download.file() returns 0 on success)
    download.file(url=ProductURL,destfile=ProdFileName)
    ## Don't overload the server
    Sys.sleep(10)
  }
}
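##----------------------------------------------------------------------------##
## Optional: a minimal retry sketch around download.file(), so one failed    ##
## product page does not abort the whole loop. SafeDownload() is             ##
## illustrative and not part of the original script.                         ##
##----------------------------------------------------------------------------##
SafeDownload <- function(url, destfile, tries = 3, wait = 10) {
  for (k in seq_len(tries)) {
    ## download.file() returns 0 on success and may throw an error on failure
    status <- tryCatch(download.file(url = url, destfile = destfile),
                       error = function(e) 1L)
    if (identical(status, 0L)) return(TRUE)
    Sys.sleep(wait) ## back off before retrying
  }
  warning(paste("Could not download:", url))
  FALSE
}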
##----------------------------------------------------------------------------##
## Part IV: Get the Information from the HTML files ##
##----------------------------------------------------------------------------##
## Downloaded files don't parse correctly...so just read the page and store ##
## the necessary information 'on the fly'. This should probably be combined ##
## with Part III at some point. ##
##----------------------------------------------------------------------------##
## Base URL for the Clorox company site
CloroxURL <- "http://www.thecloroxcompany.com"
## Store all children URLs as vector
CompanyURLs <- readLines("CloroxBrand_links.txt",warn=FALSE)
product_list <- list()
## Loop over all Brand URLs
for (i in 1:length(CompanyURLs)){
  CURL <- CompanyURLs[i]
  ## Get the URL for each brand
  BrandURL <- paste(CloroxURL,CURL,sep="")
  ## Recreate the name of the links file written in Part II
  BrandName <- paste(tail(strsplit(CURL,
                     split="/")[[1]],n=1),"_links.txt",sep="")
  ## Get the URL for each product
  BrandURLs <- readLines(BrandName,warn=FALSE)
  ## Loop over all product URLs
  for(j in 1:length(BrandURLs)){
    ## Get the URL for a product
    BURL <- BrandURLs[j]
    ProductURL <- paste(CloroxURL,BURL,"/",sep="")
    ProductName <- tail(strsplit(ProductURL,split='/')[[1]],n=1)
    CompanyName <- strsplit(strsplit(BrandName,
                            split="[.]")[[1]][1],split="_")[[1]][1]
    ProdPage <- getURL(ProductURL)
    ## Parse chemical HTML file
    ProdDoc <- htmlParse(ProdPage)
    ChemName <- xpathSApply(ProdDoc,"//h4/a",xmlValue)
    ChemUse <- xpathSApply(ProdDoc,"//div[@class = 'accordionContent']/p",xmlValue)
    free(ProdDoc)
    ## Replace commas so the final CSV can be written unquoted
    ChemName <- sapply(ChemName,FUN=function(x) {gsub(",",";",x)})
    ChemName <- UniStr(ChemName)
    ChemUse <- sapply(ChemUse,FUN=function(x) {gsub(",",";",x)})
    ChemUse <- UniStr(ChemUse)
    ChemFrame <- data.frame(ChemicalName=ChemName,UseCategory=ChemUse,ProductName=ProductName)
    ## Reset the row names before storing the frame in the list
    rownames(ChemFrame) <- NULL
    product_list[[ProductName]] <- ChemFrame
    print(paste("Data Acquired for",paste(toupper(CompanyName),"'s",sep=""),paste(ProductName,"...",sep="")),quote=FALSE)
    ## Don't overload the server
    Sys.sleep(10)
  }
}
## Combine all per-product data frames (a full outer join; since the frames
## share the same columns, this stacks them and drops exact duplicate rows)
CloroxChem <- Reduce(function(x,y) merge(x,y,all=TRUE),product_list)
## Write data to file
write.csv(CloroxChem,"CloroxChemicalIngredients_RAW.csv",quote=FALSE,row.names=FALSE)
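## Optional sanity checks (these only print summaries and do not change the
## output file)
str(CloroxChem)
print(length(unique(CloroxChem$ProductName))) ## number of products captured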