library(ISLR2) attach(Wage) ?Wage agelims <- range(age)+c(-5,5) age.grid <- seq(from = agelims[1], to = agelims[2]) plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") ## -> nelinearna veza ### Polinomijalna regresija fit <- lm(wage ~ poly(age, 4), data = Wage) ## poly generira kovarijate h_0(age),...,h_4(age) ## gdje je {h_i} ortogonalna baza za prostor 4d polinoma. coef(summary(fit)) ## lin. model kao i svaki drugi, samo nas ne zanimaju koeficijenti ## alternativno: fit2 <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage) coef(fit2) ## ipak, predikcije su iste preds <- predict(fit, newdata = list(age = age.grid), se.fit = TRUE) preds2 <-predict(fit2, newdata = list(age = age.grid), se.fit = TRUE) max(abs(preds$fit-preds2$fit)) ## [1] 7.81597e-11 #### se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit) plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") title("Degree-4 Polynomial", outer = T) lines(age.grid, preds$fit, lwd = 2, col = "blue") matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3) ## -> siri intervali pri krajevima (manje podataka) ############# splajnovi library(splines) ?bs f_matrix=bs(age, knots = c(25, 40, 60)) ## degree=3 je default ## 3 cvora, d=3 -> 3+3+1=7 baznih funkcija ## (bs izostavlja slobodni clan ## -> npr. on se automatski dodaje kod lm) dim(f_matrix) ## ako hocemo slobodni clan f_matrix2=bs(age, knots = c(25, 40, 60), intercept = TRUE) dim(f_matrix2) ind=sort(age, index.return=TRUE, decreasing = FALSE)$ix ## Kako izgledaju funkcije iz "B-spline" baze? ## svaki stupac j sadrzi vrijednosti ## j-te funkcije baze evaluirane u age for (j in 7:1) { plot(age[ind], f_matrix2[ind,j], type="l") abline(v= c(25, 40, 60), lty="dashed", col ="darkred") ## cvorovi } ## regresijski kubicni splajn fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage) ## -> to je opet obican linearan model ## sa kovarijatama izvedenim iz age summary(fit) pred <- predict(fit, newdata = list(age = age.grid), se = T) plot(age, wage,xlim = agelims, cex = .5, col = "darkgrey") lines(age.grid, pred$fit, lwd = 2) lines(age.grid, pred$fit + 2 * pred$se, lty = "dashed") lines(age.grid, pred$fit - 2 * pred$se, lty = "dashed") abline(v= c(25, 40, 60), lty="dashed", col ="darkred") ## cvorovi ##### stupnjevi slobode ## umjesto bs(age, knots = c(25, 40, 60)) ## mogli smo zadati stupnjeve slobode dim(bs(age, df = 6)) as.numeric(attr(bs(age, df = 6), "knots")) ## -> cvorovi postavljeni na kvantile od age ## (mozda prirodnije jer se prilagodava podacima, ## ali nije jasno zasto ## bi to uvijek bilo optimalno) fit <- lm(wage ~ bs(age, df=6), data = Wage) pred <- predict(fit, newdata = list(age = age.grid), se = T) plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") lines(age.grid, pred$fit, lwd = 2) # lines(age.grid, pred$fit + 2 * pred$se, lty = "dashed") # lines(age.grid, pred$fit - 2 * pred$se, lty = "dashed") abline(v= as.numeric(attr(bs(age, df = 6), "knots")), lty="dashed", col ="darkred") ## cvorovi ### povecajmo df fit2 <- lm(wage ~ bs(age, df=30), data = Wage) pred2 <- predict(fit2, newdata = list(age = age.grid), se = T) plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") lines(age.grid, pred$fit, lwd = 2) lines(age.grid, pred2$fit, lwd = 2, col="blue") # abline(v= as.numeric(attr(bs(age, df = 20), "knots")), # lty="dashed", col ="darkred") ## cvorovi # ########### 1-splajnovi i step funkcije # plot(age, wage, cex = .5, col = "darkgrey") # lines(age.grid, pred$fit, lwd = 1) # # # neprekidna po dijelovima linearna aproksimacija # fit1 <- lm(wage ~ bs(age, knots = c(25, 40, 60), degree=1), data = Wage) # pred1 <- predict(fit1, newdata = list(age = age.grid), se = T) # lines(age.grid, pred1$fit, lwd = 1, col="darkblue") # # # step funkcije # fit0 <- lm(wage ~ cut(age, c(agelims[1],25, 40, 60,agelims[2])), data = Wage) # pred0 <- predict(fit0, newdata = list(age = age.grid), se = T) # lines(age.grid, pred0$fit, lwd = 1, col="darkorange") ###### prirodni splajn ###### linearna aproksimacija ###### na rubnim regijama (smanjenje varijance) ?ns f_matrix_ns=ns(age, knots=c(30,50,70), Boundary.knots = c(20,75), intercept = TRUE) ## broj baznih funkcija=broj cvorova dim(f_matrix_ns) ## Kako izgledaju funkcije iz "B-spline" baze za prirodne splajnove? for (i in 5:1) { plot(age[ind], f_matrix_ns[ind,i], xlim = agelims, type="l") abline(v= c(20,30,50,70,75), lty="dashed", col ="darkred") ## cvorovi } ### linearne u rubnim segmentima ### kubicni splajn vs prirodni kubicni splajn ### predikcije izvan raspona podataka fit_bs <- lm(wage ~ bs(age, df = 10), data = Wage) fit_ns <- lm(wage ~ ns(age, df = 10), data = Wage) pred_bs <- predict(fit_bs, newdata = list(age = age.grid), se = T) pred_ns <- predict(fit_ns, newdata = list(age = age.grid), se = T) plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") lines(age.grid, pred_bs$fit, lwd = 2, col="darkorange") # lines(age.grid, pred_bs$fit + 2 * pred$se, lty = "dashed") # lines(age.grid, pred_bs$fit - 2 * pred$se, lty = "dashed") lines(age.grid, pred_ns$fit, col = "darkblue", lwd = 2) # lines(age.grid, pred_ns$fit + 2 * pred2$se, lty = "dashed", col = "darkblue") # lines(age.grid, pred_ns$fit - 2 * pred2$se, lty = "dashed", col = "darkblue") ########### Smoothing spline ### ponovno prirodan kubicni splajn, ### ali postoji samo jedan parametar ### koji kontrolira fleksibilnost ### -> CV, i to tipicno LOOCV ili GCV plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey") title("Smoothing Spline") fit_ss <- smooth.spline(age, wage, df = 20) fit2_ss <- smooth.spline(age, wage, cv = TRUE) ## lambda, ili ekvivalentno df, odabrani LOOCV metodom ## Zasto LOOCv mozda nije najbolja ideja? fit3_ss <- smooth.spline(age, wage, cv = FALSE) ## sada koristimo GCV metodu ### vidi zadnji dio od Details u ?smooth.spline fit2_ss$df # [1] 6.794596 fit3_ss$df # [1] 6.468299 ## kod predict funkcije nema "newdata" pred_ss=predict(fit_ss, x=age.grid)$y pred2_ss=predict(fit2_ss, x=age.grid)$y lines(age.grid, pred_ss, lwd=2) lines(age.grid, pred2_ss, col="darkred", lwd=2) legend("topright", legend = c("16 DF", "6.8 DF"), col = c("black", "darkred"), lty = 1, lwd = 2, cex = .5) ### Intervali pouzdanosti za procjene? ---> Bootstrap (kasnije)