estimation_data <- data_frame(iteration=numeric(1000), beta_0=numeric(1000),beta_1=numeric(1000),beta_2=numeric(1000))
#Initialise the beta_1 estimate to a value
beta_1 <- 20
for(i in 1:1000){
#Regress the partial residual of Y (with beta_1*x_1 removed) on x_2
a <- Y - beta_1*x_1
beta_2 <- lm(a~x_2)$coef[2]
#Now keep beta_2 fixed and update beta_1
a <- Y - beta_2 * x_2
beta_1 <- lm(a~x_1)$coef[2]
#Store Iteration Estimates
estimation_data$iteration[i] <- i
estimation_data$beta_0[i] <- lm(a~x_1)$coef[1]
estimation_data$beta_1[i] <- beta_1
estimation_data$beta_2[i] <- beta_2
}
estimation_data
estimation_long <- gather(estimation_data, value="value",key="coefficient",-iteration)
estimation_long
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point()
estimation_data <- data_frame(iteration=numeric(1000), beta_0=numeric(1000),beta_1=numeric(1000),beta_2=numeric(1000))
#Initialise the beta_1 estimate to a value
beta_1 <- 20
for(i in 1:1000){
#load previous estimates
if(i>1){
beta_1 <- estimation_data$beta_1[i-1]
beta_2 <- estimation_data$beta_2[i-1]
}
#Regress the partial residual of Y (with beta_1*x_1 removed) on x_2
a <- Y - beta_1*x_1
beta_2 <- lm(a~x_2)$coef[2]
#Now keep beta_2 fixed and update beta_1
a <- Y - beta_2 * x_2
beta_1 <- lm(a~x_1)$coef[2]
#Store Iteration Estimates
estimation_data$iteration[i] <- i
estimation_data$beta_0[i] <- lm(a~x_1)$coef[1]
estimation_data$beta_1[i] <- beta_1
estimation_data$beta_2[i] <- beta_2
}
estimation_long <- gather(estimation_data, value="value",key="coefficient",-iteration)
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point()
#Initialise the beta_1 estimate to a value
beta_1 <- 2523
for(i in 1:1000){
#load previous estimates
if(i>1){
beta_1 <- estimation_data$beta_1[i-1]
beta_2 <- estimation_data$beta_2[i-1]
}
#Regress the partial residual of Y (with beta_1*x_1 removed) on x_2
a <- Y - beta_1*x_1
beta_2 <- lm(a~x_2)$coef[2]
#Now keep beta_2 fixed and update beta_1
a <- Y - beta_2 * x_2
beta_1 <- lm(a~x_1)$coef[2]
#Store Iteration Estimates
estimation_data$iteration[i] <- i
estimation_data$beta_0[i] <- lm(a~x_1)$coef[1]
estimation_data$beta_1[i] <- beta_1
estimation_data$beta_2[i] <- beta_2
}
estimation_long <- gather(estimation_data, value="value",key="coefficient",-iteration)
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point()
#Initialise the beta_1 estimate to a value
beta_1 <- 202
for(i in 1:1000){
#load previous estimates
if(i>1){
beta_1 <- estimation_data$beta_1[i-1]
beta_2 <- estimation_data$beta_2[i-1]
}
#Regress the partial residual of Y (with beta_1*x_1 removed) on x_2
a <- Y - beta_1*x_1
beta_2 <- lm(a~x_2)$coef[2]
#Now keep beta_2 fixed and update beta_1
a <- Y - beta_2 * x_2
beta_1 <- lm(a~x_1)$coef[2]
#Store Iteration Estimates
estimation_data$iteration[i] <- i
estimation_data$beta_0[i] <- lm(a~x_1)$coef[1]
estimation_data$beta_1[i] <- beta_1
estimation_data$beta_2[i] <- beta_2
}
estimation_long <- gather(estimation_data, value="value",key="coefficient",-iteration)
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point()
View(estimation_data)
#Initialise the beta_1 estimate to a value
beta_1 <- 9
estimation_data <- data_frame(iteration=numeric(1000), beta_0=numeric(1000),beta_1=numeric(1000),beta_2=numeric(1000))
for(i in 1:1000){
#load previous estimates
if(i>1){
beta_1 <- estimation_data$beta_1[i-1]
beta_2 <- estimation_data$beta_2[i-1]
}
#Regress the partial residual of Y (with beta_1*x_1 removed) on x_2
a <- Y - beta_1*x_1
beta_2 <- lm(a~x_2)$coef[2]
#Now keep beta_2 fixed and update beta_1
a <- Y - beta_2 * x_2
beta_1 <- lm(a~x_1)$coef[2]
#Store Iteration Estimates
estimation_data$iteration[i] <- i
estimation_data$beta_0[i] <- lm(a~x_1)$coef[1]
estimation_data$beta_1[i] <- beta_1
estimation_data$beta_2[i] <- beta_2
}
estimation_long <- gather(estimation_data, value="value",key="coefficient",-iteration)
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point()
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_smooth()
ggplot(estimation_long,aes(x=factor(iteration),y=value , col = coefficient)) + geom_point(alpha=.2)
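#A quick sanity check (a sketch, assuming Y, x_1 and x_2 from the simulation above still exist):
#the converged backfitting estimates should agree with the full multiple-regression fit
coef(lm(Y ~ x_1 + x_2))
tail(estimation_data, 1) #final backfitting iterate, for comparison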
install.packages("tree")
library(tree)
library(ISLR)
library(tidyverse)
#Classification Trees --------
Carseats$High <- ifelse(Carseats$Sales<=8,"No","Yes")
tree_carseats <- tree(High ~ . - Sales , data = Carseats)
View(Carseats)
summary(tree_carseats)
tree_carseats
library(ISLR)
library(tidyverse)
library(MASS)
library(class)
names(Smarket)
summary(Smarket)
cor(Smarket[,-9])
#You can see strong correlation between year and volume of shares traded
#We fit a logistic GLM to predict Direction categorical variable
glm_model <- glm(data = Smarket,
formula = Direction ~ Lag1 + Lag2 +Lag3 +Lag4 +Lag5 +Volume,
family = "binomial")
summary(glm_model)
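#A minimal follow-up sketch: convert fitted probabilities into class predictions
#("Up"/"Down" are the levels of Smarket$Direction in ISLR)
glm_probs <- predict(glm_model, type = "response")
glm_preds <- ifelse(glm_probs > 0.5, "Up", "Down")
table(glm_preds, Smarket$Direction) #training confusion matrix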
library(tidyverse)
library(ISLR)
library(boot)
library(splines)
library(gam)
library(akima) #Visualising 2D local regression in GAM
#Polynomial Regression and Step Functions
polynomial_fit <- lm(wage ~ poly(age,4) , data = Wage)
#extract coefficients
coefficients(summary(polynomial_fit))
#We can also use raw polynomials as the basis by setting raw = TRUE in poly()
#Remember, if you want to type out the polynomials manually, wrap them within I(age^2) etc., as ^ has a special meaning in R formula notation
#We will recreate figure 7.1
agelims <- range(Wage$age)
age_grid <- seq(agelims[1],agelims[2])
predictions <- predict(polynomial_fit , newdata = list(age=age_grid), se = TRUE)
se_bands <- cbind(predictions$fit - 2*predictions$se.fit ,predictions$fit + 2*predictions$se.fit )
#Now lets plot this
par(mfrow=(c(1,2)), mar =c(4.5,4.5,1,1) , oma = c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim= agelims , cex =0.5 , col = "darkgrey")
title("Degree -4 Polynomial", outer = T)
lines(age_grid, predictions$fit, lwd=2 , col="blue") #remember, the x coordinates are specificed first
matlines(age_grid,se_bands,lwd=1,col='blue',lty=3)
#What degree polynomial will be the best?
fit_1 <- glm(data = Wage , wage ~ age)
fit_2 <- glm(data = Wage , wage ~ poly(age,2))
fit_3 <- glm(data = Wage , wage ~ poly(age,3))
fit_4 <- glm(data = Wage , wage ~ poly(age,4))
fit_5 <- glm(data = Wage , wage ~ poly(age,5))
anova(fit_1,fit_2,fit_3,fit_4,fit_5 ,test = "F")
#degree 3 or 4 seems to be the best
#Anova method is robust and will work if you have other, non-age variables in the model
#Notice this
coefficients(summary(fit_5))
#P-values of the t-statistics are the same!!
#in fact, if you square the t values, they will equal the f-values
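#Sketch of that check: with orthogonal polynomials, the squared t-statistics from fit_5
#line up with the sequential ANOVA F-statistics (t for degree d, squared = F for adding degree d)
coefficients(summary(fit_5))[-1, "t value"]^2
anova(fit_1, fit_2, fit_3, fit_4, fit_5, test = "F")$F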
#Can confirm anova results using cross-validation
cv_results <- map(list(fit_1,fit_2,fit_3,fit_4,fit_5), cv.glm, K = 10, data = Wage) %>% map_dbl(~.x$delta[1])
#Remember that glm logistic is a type of linear model, so lets try a polynomial glm
glm_fit <- glm(I(wage>250)~poly(age,4),data=Wage, family = "binomial")
#note we use the I function as an indicator function which sets values to 1 when wage>250
glm_predictions <- predict(glm_fit, newdata = list(age=age_grid),se = T,type="response")
#standard error bands
se_bands_glm <- cbind(glm_predictions$fit - 2*glm_predictions$se.fit ,glm_predictions$fit + 2*glm_predictions$se.fit )
plot(Wage$age,I(Wage$wage>250),xlim=agelims,type="n",ylim=c(0,0.2))
points(jitter(Wage$age),I((Wage$wage>250)/5), cex=.5,pch="|",col="darkgrey") #plot actual data, need to transform it to keep it in the range
lines(age_grid,glm_predictions$fit,lwd=2,col="blue")
matlines(age_grid,se_bands_glm,lwd=1,col="blue",lty=3)
#Let's create a step function
table(cut(Wage$age,4))
step_lm <- lm(wage~cut(age,4),data = Wage)
coefficients(summary(step_lm))
#Notice how cut creates factors or categorical variables
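#A short sketch: overlay the fitted step function on the age grid (reuses age_grid from above;
#cut()'s breaks depend only on the range of age, which age_grid shares, so predict() is safe here)
step_predictions <- predict(step_lm, newdata = list(age = age_grid), se = TRUE)
plot(Wage$age, Wage$wage, cex = 0.5, col = "darkgrey")
lines(age_grid, step_predictions$fit, lwd = 2, col = "darkgreen")
matlines(age_grid, cbind(step_predictions$fit - 2*step_predictions$se.fit,
step_predictions$fit + 2*step_predictions$se.fit), lwd = 1, col = "darkgreen", lty = 3)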
#Regression Splines
#bs() stands for basis spline: you specify the basis, and it forms a cubic spline by default (note: not a NATURAL cubic spline)
regression_spline <- lm(wage ~bs(age,knots=c(25,40,60)),data = Wage)
summary(regression_spline)
spline_prediction <- predict(regression_spline, newdata= list(age=age_grid) , se = T)
plot(Wage$age,Wage$wage, col ="gray")
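#Add the fitted basis-spline curve and rough 2*SE bands (a sketch, using spline_prediction from above)
lines(age_grid, spline_prediction$fit, lwd = 2, col = "blue")
lines(age_grid, spline_prediction$fit + 2*spline_prediction$se.fit, lty = "dashed", col = "blue")
lines(age_grid, spline_prediction$fit - 2*spline_prediction$se.fit, lty = "dashed", col = "blue")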
#natural spline
natural_spline <- lm(wage~ns(age,df=4), data = Wage) #can specify either the knots or the degrees of FREEDOM; a cubic spline with K knots uses K + 4 degrees of freedom
natural_predictions <- predict(natural_spline , newdata = list(age=age_grid),se=T)
title("natural spline")
lines(age_grid,natural_predictions$fit,col="red")
#Smoothing Spline
plot(Wage$age,Wage$wage, col ="gray")
title("Smoothing Spline")
smoothing_specified <- smooth.spline(Wage$age,Wage$wage,df=16)
smoothing_cv_determined <- smooth.spline(Wage$age, Wage$wage, cv = TRUE)
lines(smoothing_specified,col="green")
lines(smoothing_cv_determined,col="black")
#not much difference in this case
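#Quick check (sketch): see how many effective degrees of freedom cross-validation actually chose
smoothing_cv_determined$df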
#Local Regression
plot(Wage$age,Wage$wage, col ="gray")
title("LOESS ")
loess_fit <- loess(wage~age,span=0.2,data=Wage)
loess_fit_2 <- loess(wage~age,span=0.5,data=Wage)
lines(age_grid,predict(loess_fit,newdata=data_frame(age=age_grid)),col="red")
lines(age_grid,predict(loess_fit_2,newdata=data_frame(age=age_grid)),col="blue")
#The higher the span, the smoother the fit; a span of 0.2 means each local regression uses the nearest 20% of observations around each x_i
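#Small addition (sketch): a legend so the two spans can be told apart on the plot above
legend("topright", legend = c("span = 0.2", "span = 0.5"),
col = c("red", "blue"), lty = 1, lwd = 2, cex = 0.8)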
#GAMs
#Additive model framework: build a separate basis expansion f(x_j) for each predictor, then use OLS to estimate the B_j in the combined model sum of B_j * f(x_j)
#This works because OLS is performed on the basis functions' outputs. Smoothing splines are NOT built from basis functions (you are seeking a general function g(x)), so using them inside a GAM needs special handling
gam_1 <- lm( wage ~ ns(year,4) + ns(age,5) + education , data = Wage)
#Because of the issue stated above, we use the gam library and the gam function instead of the base lm function
gam_2 <- gam(wage~s(year,4) + s(age,5) +education , data = Wage) #s() indicates that we want a smoothing spline with the desired degrees of freedom
par(mfrow=c(1,3))
plot(gam_2,se=T,col="Blue") #very cool!
#plot() invokes plot.gam() automatically for gam objects; we can also call plot.gam() on the "lm" version of the gam (gam_1)
plot.gam(gam_1,se=T,col="red")
#The smoothing spline for year looks linear; let's run an ANOVA on a few models to see whether a linear function of year would suffice
gam_noyear <- gam(wage~ s(age,5) +education , data = Wage)
gam_linear_year <- gam(wage~ year + s(age,5) +education , data = Wage)
anova(gam_noyear,gam_linear_year , gam_2 , test = "F")
#Having linear year is better than not having year. The smoothing spline is not significantly better than a linear function of year!!
#let's look at a summary of one of the models
summary(gam_2)
#TODO: read up on how to understand the output of this
#You can also add in local regression as a building block to a GAM
gam_local_regression_single <- gam(wage ~s(year,df=4) + lo(age,span=.7)+education , data =Wage)
plot(gam_local_regression_single, se =T , col ="blue")
#try interactions with local regression
gam_local_regression_interaction <- gam(wage~lo(year,age,span=.5)+ education , data = Wage)
#Use akima package to visualise this
par(mfrow=c(1,2))
plot(gam_local_regression_interaction , se =T)
#We can also use gams for LOGISTIC REGRESSION
gam_logistic <- gam(I(wage>250) ~year + s(age,df=5) + education , family ="binomial" , data = Wage)
par(mfrow=c(1,3))
plot(gam_logistic , se = T , col = "red")
#Hold on, look at error bars for <HS grad, why are they so big?
table(Wage$education, I(Wage$wage>250))
#there are no people earning above that wage in <HS grad, it is screwing up the model and should be excluded
gam_logistic <- gam(I(wage>250) ~year + s(age,df=5) + education , family ="binomial" , data = Wage, subset =(education!="1. < HS Grad"))
plot(gam_logistic , se = T , col = "purple")
?glmnet
library(tidyverse)
library(ISLR)
library(leaps)# for subset selection
library(glmnet) # for ridge and lasso regression
library(pls) #PCR and PLS regression
#With all of these approaches, perform model-size and variable selection on TRAINING data only (e.g. CV on the training set to decide which model class/size is better)
#Then use the test set only for estimating test MSE
#Finally, refit the chosen model form on ALL of the data
#Best subset --------
Hitters <- na.omit(Hitters)
all_subsets <- regsubsets(Salary ~ . , data = Hitters)
summary(all_subsets)
# What it will do is report the details of the best training model for each model SIZE
#default is max size = 8 so we just pass a new max
all_subsets <- regsubsets(Salary ~ . , data = Hitters , nvmax = 19)
summary_of_subsets <- summary(all_subsets)
summary_of_subsets$cp
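#Rough cross-check (a sketch of what leaps reports): Mallows' Cp for a size-d model is
#RSS/sigma2_hat - n + 2*(d+1), with sigma2_hat taken from the full 19-variable model
n <- nrow(Hitters)
sigma2_hat <- summary_of_subsets$rss[19] / (n - 19 - 1)
summary_of_subsets$rss / sigma2_hat - n + 2 * (2:20) #compare with summary_of_subsets$cp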
#Let's plot this information to see which model size we should try
#(remember how Cp is an unbiased estimator of TEST MSE when the noise-variance estimate is unbiased!)
par(mfrow =c(2,2))
plot(summary_of_subsets$rss , xlab ="No of Variables", ylab="RSS" , type ="l")
plot(summary_of_subsets$adjr2 , xlab ="No of Variables", ylab="Adjusted R-squared" , type ="l")
#The points command works like plot except it plots point on an already existing plot
#instead of creating a new plot
#identify point with highest adjusted r squared
which.max(summary_of_subsets$adjr2 )
points(x = 11 , summary_of_subsets$adjr2[11] , col="red" , cex = 2 , pch = 20)
plot(summary_of_subsets$cp , xlab ="No of Variables", ylab="Cp" , type ="l")
points(which.min(summary_of_subsets$cp) ,summary_of_subsets$cp[which.min(summary_of_subsets$cp)] , col="red", cex = 2 , pch=20 )
plot(summary_of_subsets$bic , xlab ="No of Variables", ylab="BIC" , type ="l")
points(which.min(summary_of_subsets$bic) ,summary_of_subsets$bic[which.min(summary_of_subsets$bic)] , col="red", cex = 2 , pch=20 )
par(mfrow=c(1,1))
#also the package has a plotting function
#Change scale argument to change what is on Y-axis. Look at ?plot.regsubsets
plot(all_subsets , scale ="r2")
#Look at linear coefficients of Model with lowest bic
coef(all_subsets , 6)
#Forward/Backward set selection -----
regfit_forward <- regsubsets(Salary~. , data = Hitters , nvmax=19 ,method = "forward")
summary(regfit_forward)
regfit_backward <- regsubsets(Salary~. , data = Hitters , nvmax=19 ,method = "backward")
summary(regfit_backward)
#How to select best set?
#Validation Set Approach------
#remember, we do subset selection on TRAINING data; we need held-out test data
#to assess the models, otherwise we wouldn't get a good estimate of test MSE
set.seed(1)
train <- sample(c(TRUE,FALSE), nrow(Hitters) , replace = TRUE)
test <- !train
regfit_best <- regsubsets(Salary ~ . , data = Hitters[train,] , nvmax = 19)
#We can look at "best" models for each size by RSS
#Now we create a MODEL MATRIX to be used as a data matrix for the test data
test_mat <- model.matrix(Salary ~ . , data = Hitters[test,])
test_mse <- c(numeric(19))
for( i in 1:19){
coeffi <- coef(regfit_best,id = i)
prediction <- test_mat[,names(coeffi)]%*%coeffi #multiply data matrix by extracted coefficients
test_mse[i] <- mean((Hitters$Salary[test] - prediction)^2)
}
#View errors
test_mse
#find min
which.min(test_mse)
#Model 10 minimises test mse
#let's look at the model specification
coef(regfit_best, id = 10)
#A little tedious because there is no predict method in the leaps package
#Let's write our own
predict.regsubsets <- function(object,newdata,id,...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form,newdata)
coefi <- coef(object , id =id)
xvars <- names(coefi)
#return this now
mat[ , xvars]%*%coefi
}
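#Example use of the helper (sketch): test-set predictions for the size-10 model,
#which should reproduce test_mse[10] from the loop above
pred_10 <- predict(regfit_best, Hitters[test, ], id = 10)
mean((Hitters$Salary[test] - pred_10)^2)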
#Only use test and training split when you are trying to figure out which model size /class to use
# Once you have decided that you will use model of size 10, use ALL data to create those models
#This is because you have already validated it and determined that model size 10 is the most accurate
best_model <- regsubsets(Salary ~ . , data = Hitters , nvmax =19) %>% coef(. ,id=10)
summary(regfit_forward)
regfit_best
x <- c(5,8,10)
for (i in seq_along(x)){print(i)}
for (i in x ){print(i)}
# != is R's inequality operator (there is no <>)
2 == 2/1
library(tree)
library(ISLR)
library(tidyverse)
Carseats$High <- ifelse(Carseats$Sales<=8,"No","Yes")
tree_carseats <- tree(High ~ . - Sales , data = Carseats)
tree_carseats
Carseats
summary(tree_carseats)
tree_carseats <- tree(High ~ . , data = Carseats)
tree_carseats <- tree(High ~ . - Sales , data = Carseats)
tree_carseats
#Classification Trees --------
Carseats$High <- ifelse(Carseats$Sales<=8,"No","Yes") %>% factor()
tree_carseats <- tree(High ~ . - Sales , data = Carseats)
summary(tree_carseats)
#we can see a 9% training error
plot(tree_carseats)
text(tree_carseats, pretty = 0)
#Can see that shelve location is the most important variable, as the tree splits on it first
tree_carseats
tree_carseats$frame
tree_carseats$frame %>% summary()
#Divide data into test and train sets
set.seed(2)
train <- sample(nrow(Carseats),200)
carseats_test <- Carseats[-train,]
tree_carseats <- tree(High ~ . - Sales , Carseats , subset = train)
tree_predictions <- predict(tree_carseats,carseats_test, type = "class")
#construct confusion matrix
table(tree_predictions, carseats_test$High)
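#Sketch: overall hit rate from the confusion matrix
conf_mat <- table(tree_predictions, carseats_test$High)
sum(diag(conf_mat)) / sum(conf_mat)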
#Let's try pruning these trees
set.seed(3)
cv_carseats <- cv.tree(tree_carseats, FUN = prune.misclass)
cv_carseats
#The dev object refers to the number of misclassifications
par(mfrow = c(1,2))
plot(cv_carseats$size, cv_carseats$dev, type = "b", ylab = "No of misclassifications", xlab = "No of terminal nodes")
plot(cv_carseats$k, cv_carseats$dev, type = "b", ylab = "No of misclassifications", xlab = "value of reg. param")
prune_carseats <- prune.misclass(tree_carseats , best = 9)
par(mfrow=c(1,1))
plot(prune_carseats)
text(prune_carseats, pretty = 0)
tree_predictions <- predict(prune_carseats,carseats_test, type = "class")
#construct confusion matrix
table(tree_predictions,carseats_test$High)
# overall hit rate improved to 77%
#Regression Trees ------------------
library(MASS)
set.seed(1)
train <- sample(nrow(Boston), nrow(Boston)/2)
tree_boston <- tree(medv ~ . , Boston , subset = train)
summary(tree_boston)
plot(tree_boston)
text(tree_boston, pretty = 0)
#see if tree should be pruned
cv_boston <- cv.tree(tree_boston)
cv_boston
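#Sketch of the usual next steps: prune (the "best" size here is illustrative)
#and estimate test MSE for the regression tree
prune_boston <- prune.tree(tree_boston, best = 5)
yhat_tree <- predict(tree_boston, newdata = Boston[-train,])
mean((yhat_tree - Boston[-train,]$medv)^2)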
install.packages("ranger")
?ranger::ranger
#bagging is a special case of a random forest with m = p; i.e. consider ALL p predictors at each split
install.packages("randomForest")
library(randomForest)
set.seed(1)
Boston <- MASS::Boston
bag_boston <- randomForest( medv ~ . , data = Boston , subset = train , mtry = 13 , importance = T)
# mtry is the number of predictors considered at each split... here it is set to ALL 13 predictors, i.e. bagging
bag_boston
#calculate test mse
yhat_bag <- predict(bag_boston , newdata = Boston[-train,])
plot(yhat_bag, Boston[-train,]$medv)
abline(0, 1, col = "red")
mean((yhat_bag - Boston[-train,]$medv)^2)
#try a random forest
set.seed(1)
rf_boston <- randomForest(medv ~ . , data = Boston , subset = train , mtry = 13 , ntree = 25)
rf_boston <- randomForest(medv ~ . , data = Boston , subset = train , mtry = 6 , ntree = 25)
rf_boston <- randomForest(medv ~ . , data = Boston , subset = train , mtry = 6, importance = T)
yhat_rf <- predict(rf_boston , newdata = Boston[-train,])
mean((yhat_rf - Boston[-train,]$medv)^2)
importance(rf_boston)
#two importance measures: mean decrease in accuracy on out-of-bag samples (%IncMSE) and total decrease in node impurity (IncNodePurity)
varImpPlot(rf_boston)