forked from abhishek-ch/MachineLearning-using-R
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMultivariateRegression.R
More file actions
145 lines (96 loc) · 4.55 KB
/
MultivariateRegression.R
File metadata and controls
145 lines (96 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
library(ggplot2)
library(GGally)
# Reference:
# http://ww2.coastal.edu/kingw/statistics/R-tutorials/multregr.html

# Load the built-in US state data sets; state.x77 is the table of per-state
# statistics analysed in the rest of this script.
data(state)

# View() opens a spreadsheet-style viewer, which can fail when the script is
# run in batch mode, so only call it from an interactive session.
if (interactive()) {
  View(state.x77)
}

st <- as.data.frame(state.x77)

# Replace the spaces in two column names so the columns can be written
# directly in model formulas. Renaming by name instead of by position
# (columns 4 and 6) avoids silently renaming the wrong column.
names(st)[names(st) == "Life Exp"] <- "Life.Exp"
names(st)[names(st) == "HS Grad"] <- "HS.Grad"

# Pairwise correlations between all variables.
cor(st)
# Scatterplot matrix of the first two columns. ggplot2's plotmatrix() is no
# longer available; GGally::ggpairs() is its replacement.
ggpairs(st, columns = 1:2)

# Base-graphics scatterplot matrix of every variable.
pairs(st)

# ggpairs matrix of every variable with semi-transparent points. A bare
# `alpha` argument is not applied by current GGally, so the transparency is
# set on the point panels through wrap().
ggpairs(st, lower = list(continuous = wrap("points", alpha = 0.4)))
## Full model: life expectancy regressed on every other variable.
model1 <- lm(
  Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost + Area,
  data = st
)

# Print the coefficient table that the notes below refer to.
summary(model1)

# Author's reading of the summary: Population and Murder stand out as
# influencing Life.Exp -- a larger population goes with a higher life
# expectancy, while a higher murder rate goes strongly against it.
######################## Basic Data Analysis ########################
# Smoothed scatter plots of the data set, split into two groups by income.

# Flag the states whose Income is below 4000. Adding the column by name
# avoids relying on it being the 9th column.
st$Extra <- st$Income < 4000

# Each plot draws a smoother and the raw points, coloured by the Extra flag.
# Built with ggplot() directly because qplot() is deprecated in ggplot2.
ggplot(st, aes(Life.Exp, Population, colour = factor(Extra))) +
  geom_smooth() +
  geom_point()

ggplot(st, aes(Life.Exp, Area, colour = factor(Extra))) +
  geom_smooth() +
  geom_point()

ggplot(st, aes(Life.Exp, Murder, colour = factor(Extra))) +
  geom_smooth() +
  geom_point()

ggplot(st, aes(Population, Murder, colour = factor(Extra))) +
  geom_smooth() +
  geom_point()
# Plot the residuals of a one-predictor linear model against that predictor.
#
# response:  name of the response column in `data` (character string).
# predictor: name of the predictor column in `data` (character string).
# data:      data frame holding both columns.
#
# Returns a ggplot object with the predictor on the x axis, the residuals of
# lm(response ~ predictor) on the y axis and a horizontal reference line at 0.
plot_residuals <- function(response, predictor, data) {
  # na.exclude pads the residuals with NA for dropped rows, so they stay
  # aligned with the predictor column.
  fit <- lm(
    reformulate(predictor, response = response),
    data = data,
    na.action = na.exclude
  )
  plot_data <- data.frame(x = data[[predictor]], residual = resid(fit))
  ggplot(plot_data, aes(x, residual)) +
    geom_point() +
    geom_hline(yintercept = 0) +
    labs(x = predictor, y = paste("Residuals of", response, "~", predictor))
}

## Author's note: a pattern can clearly be seen, so the data is not that good.
plot_residuals("Life.Exp", "Area", st)

## Residual plot to judge the fit for Murder. Author's note: the points are
## widely spread, which is a good sign.
plot_residuals("Life.Exp", "Murder", st)

# Author's note: better than Area.
plot_residuals("Life.Exp", "Population", st)

# Author's note: the data is somewhat dispersed, so at least some relation
# can be found.
plot_residuals("Population", "Murder", st)
########################### Minimizing Predictors ###########################
# Backward elimination: keep only the significant predictors by dropping the
# less important ones one at a time. Each reduced model is derived from the
# previous one with update(), so every model is fitted exactly once.

# Start by dropping Area.
model2 <- update(model1, . ~ . - Area)
summary(model2)

# Drop Illiteracy.
model3 <- update(model2, . ~ . - Illiteracy)
summary(model3)

## Compare the two nested models.
anova(model2, model3)

# Side check: drop Murder from model3. Author's note: the fit changes
# markedly, which shows that Murder is an important predictor. The
# elimination below therefore continues from model3, not from modelNull.
modelNull <- update(model3, . ~ . - Murder)
summary(modelNull)

## Author's note: summary(model3) shows Income as the least important
## predictor, so drop Income.
model4 <- update(model3, . ~ . - Income)
summary(model4)

# Author's note: HS.Grad now has more influence in the model.
anova(model3, model4)

# Drop Population.
model5 <- update(model4, . ~ . - Population)
summary(model5)
## Author's note: model5 is the minimal adequate model.
########### Plotting the smooth graph ###########
# See NonLinearRegression.R for another example.
#
# Scatter plot with a fitted straight line (method = "lm"), drawn from the
# data model1 was fitted on. model.frame() extracts that data explicitly
# rather than passing the lm object as `data`, and `method` is given only to
# geom_smooth(), the layer that uses it.
# NOTE(review): the y value is the plain arithmetic sum of six predictor
# columns, not the model's fitted value -- confirm this is what was intended.
ggplot(
  model.frame(model1),
  aes(
    Life.Exp,
    Population + Income + Illiteracy + Murder + HS.Grad + Frost
  )
) +
  geom_point() +
  geom_smooth(method = "lm")
## To judge the fit (validity) of the model, plot its residuals against its
## fitted values. Built with ggplot() because qplot() is deprecated.
res <- ggplot(
  data.frame(fitted = fitted(model5), residual = resid(model5)),
  aes(fitted, residual)
) +
  geom_point() +
  labs(x = "Fitted Life.Exp (model5)", y = "Residual")

# Add a horizontal reference line at zero and display the plot.
res + geom_hline(yintercept = 0)
## Author's note: the plot above shows how good model5 is.
###################### Confidence Level ######################
#http://stattrek.com/statistics/dictionary.aspx?definition=Confidence%20interval
#http://www.r-tutor.com/elementary-statistics/simple-linear-regression/confidence-interval-linear-regression

# Confidence intervals for the coefficients: the default 95% level for the
# reduced model, then the 99% level for the full model.
# NOTE(review): the second call uses model1, not model5 -- confirm intended.
confint(model5)
confint(model1, level = 0.99)

newdata <- data.frame(Murder = 20.8, HS.Grad = 80, Frost = 10)

# interval = "confidence": lwr and upr bound the *mean* Life.Exp for these
# predictor values at the default 95% level. This is not the range in which a
# single state's Life.Exp is expected to fall.
predict(model5, newdata, interval = "confidence")

# interval = "prediction": the range for an individual observation with these
# predictor values. This is the interval to use when predicting one outcome.
predict(model5, newdata, interval = "prediction")

# Now try much more extreme predictor values. A linear model extrapolates
# without any warning, so results this far out should not be trusted.
newdata2 <- data.frame(Murder = 120.8, HS.Grad = 180, Frost = 210)
predict(model5, newdata2, interval = "confidence")
predict(model5, newdata2, interval = "prediction")