library(ISLR2)
?Boston
head(Boston)
dim(Boston)

set.seed(1221)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
boston.test <- Boston[-train, "medv"]

library(randomForest)
set.seed(131)
?randomForest  ## quite a few options

### bagging = RF with m = p (= mtry); ntree = B (the number of bootstrap trees)
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train, mtry = 12,
                           xtest = Boston[-train, -13], ytest = Boston[-train, 13],
                           importance = TRUE, ntree = 1000)
bag.boston
oob.procjene_bag <- bag.boston$mse   ## OOB error estimates, one per number of trees
test_err_bag <- bag.boston$test$mse  ## test error estimates
plot(1:1000, test_err_bag, type = "l", col = "darkgreen")  ## just for comparison
points(1:1000, oob.procjene_bag, type = "l")
## the error estimate has already stabilized by about 200 trees
## alternatively: plot(bag.boston)  ### plots the OOB error estimates

### let's try RF models with m = 2, 4, 6 (and bagging, m = 12, for reference)
fit <- randomForest(medv ~ ., data = Boston, subset = train,
                    mtry = 2, importance = TRUE, ntree = 1000)
plot(1:1000, fit$mse, type = "l", col = 2, ylim = c(12, 20))
for (m in c(4, 6, 12)) {
  fit <- randomForest(medv ~ ., data = Boston, subset = train,
                      mtry = m, importance = TRUE, ntree = 1000)
  points(1:1000, fit$mse, type = "l", col = m)
}
legend("top", legend = paste("m=", c(2, 4, 6, 12)),
       col = c(2, 4, 6, 12), pch = 16, horiz = TRUE)
## m = 6 looks like the best choice; we can then estimate its test error on the test set
set.seed(3278)
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 6, importance = TRUE, ntree = 1000)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 11.88502

###
importance(rf.boston, type = 2)
?importance
###
varImpPlot(rf.boston, type = 2)
?Boston
## rm = average number of rooms per dwelling
## lstat = the percentage of homeowners in the neighborhood considered lower class
## these two covariates appear to be by far the most important
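
### Aside (not in the original lab): randomForest also provides partialPlot(),
### which draws the marginal effect of a single covariate, analogous to the
### gbm partial dependence plots further below. A minimal sketch for the two
### most important variables:
partialPlot(rf.boston, pred.data = Boston[train, ], x.var = "rm")
partialPlot(rf.boston, pred.data = Boston[train, ], x.var = "lstat")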
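
### Aside (not in the original lab): instead of scanning mtry by hand as above,
### tuneRF() searches over mtry using the OOB error directly; the mtryStart,
### stepFactor and improve values below are illustrative choices, not tuned.
set.seed(7)  ## arbitrary seed, only to make this aside reproducible
tune.res <- tuneRF(x = Boston[train, -13], y = Boston[train, 13],
                   mtryStart = 4, ntreeTry = 500,
                   stepFactor = 1.5, improve = 0.01, plot = TRUE)
tune.res  ## matrix of mtry values and their OOB error estimates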
library(gbm)
?gbm
## key arguments: distribution (the loss function), n.trees (= M),
## interaction.depth (= J - 1), shrinkage (= eta)
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 500,
                    interaction.depth = 6, cv.folds = 5)
plot(boost.boston$cv.error, type = "l")
which.min(boost.boston$cv.error)
###
summary(boost.boston)  ## draws the variable importance plot - similar results as with RF
###
?plot.gbm  ## partial dependence plots - we have not covered these
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")

### choosing interaction.depth
set.seed(56)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 500,
                    interaction.depth = 1, cv.folds = 5)
plot(boost.boston$cv.error, type = "l", col = 1, ylim = c(10, 40))
for (d in c(2, 4, 6, 8)) {
  boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                      distribution = "gaussian", n.trees = 500,
                      interaction.depth = d, cv.folds = 5)
  points(boost.boston$cv.error, type = "l", col = d)
}
legend("top", legend = paste("d=", c(2, 4, 6, 8)),
       col = c(2, 4, 6, 8), pch = 16, horiz = TRUE)
## d = 4 minimizes the CV error estimate
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 500,
                    interaction.depth = 4, cv.folds = 5)
plot(boost.boston$cv.error, type = "l", col = 1, ylim = c(10, 40))
M0 <- which.min(boost.boston$cv.error)  ## 127
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = M0)
mean((yhat.boost - boston.test)^2)
## [1] 12.40046 - somewhat worse than RF

### let's try reducing the shrinkage (the default is 0.1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 500,
                    interaction.depth = 4, shrinkage = 0.01, cv.folds = 5)
plot(boost.boston$cv.error, type = "l", col = 1)
## the CV error is still decreasing - we need a much larger number of trees
set.seed(1232)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 5000,
                    interaction.depth = 4, shrinkage = 0.01, cv.folds = 5)
plot(boost.boston$cv.error, type = "l", col = 1, ylim = c(12, 25))
M0 <- which.min(boost.boston$cv.error)  ## 1519
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = M0)
mean((yhat.boost - boston.test)^2)
## [1] 11.61111 -- roughly the same as RF

### Boosting often gives somewhat better results than RF,
### but RF has far fewer tuning parameters to choose.
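
### Aside (not in the original lab): gbm's gbm.perf() plots the training and
### CV error curves and returns the estimated optimal number of trees, so it
### can serve as a cross-check for the manual which.min() selection above.
M0.cv <- gbm.perf(boost.boston, method = "cv")
M0.cv  ## should be close to the M0 chosen above
yhat.cv <- predict(boost.boston, newdata = Boston[-train, ], n.trees = M0.cv)
mean((yhat.cv - boston.test)^2)  ## should be close to the test MSE above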