19 Nonparametric Regression

19.0.1 Setup

  • Data: \((x_{i}, y_{i}), i = 1,...,n\)
  • \(y_i = m(x_{i}) + \epsilon_{i}, i =1,...,n\)
  • \(m\) is unknown, usually assumed continuous and smooth
  • For example, \(m(x_{i}) = \beta_{0} + \beta_{1}x_{i} +\beta_{2}x_{i}^2\)
  • \(\epsilon_i\) iid from a continuous distribution, \(E\epsilon_i = 0\)
  • \(E(y_{i} \mid x_{i}) = m(x_{i})\)
# Simulate n noisy observations of m(x) = sin(pi * x) at sorted uniform
# design points on [0, 1].
n <- 30
x <- sort(runif(n, min = 0, max = 1))
y <- sin(x * pi) + rnorm(n, mean = 0, sd = 0.1)

# Scatterplot with the least-squares straight line overlaid.
plot(x, y)
m1 <- lm(y ~ x)
abline(m1)

# Extend the straight-line model m1 with a quadratic term and compare the fits.
x2 <- x^2
m2 <- update(m1, ~ . + x2)

plot(x, y)
abline(m1)
# Use the fitted() extractor: `m2$fit` only works because `$` partially
# matches the component name "fitted.values".
lines(x, fitted(m2), col = 2)

# Summary of the straight-line fit m1 (y ~ x).
summary(m1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54263 -0.10194  0.05317  0.19357  0.44373 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8662     0.1116   7.764 1.86e-08 ***
## x            -0.2812     0.1927  -1.459    0.156    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.271 on 28 degrees of freedom
## Multiple R-squared:  0.07069,    Adjusted R-squared:  0.0375 
## F-statistic:  2.13 on 1 and 28 DF,  p-value: 0.1556
# Summary of the quadratic fit m2 (y ~ x + x2).
summary(m2)
## 
## Call:
## lm(formula = y ~ x + x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14690 -0.05118 -0.02123  0.04942  0.25536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05135    0.07980  -0.643    0.525    
## x            4.12476    0.33616  12.270 1.49e-12 ***
## x2          -4.08321    0.30452 -13.409 1.88e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09973 on 27 degrees of freedom
## Multiple R-squared:  0.8787, Adjusted R-squared:  0.8697 
## F-statistic: 97.76 on 2 and 27 DF,  p-value: 4.305e-13
# Redraw the response from a full sine period, m(x) = sin(2 * pi * x), and
# refit the straight line (drawn dotted).
y <- sin(x * pi * 2) + rnorm(n, mean = 0, sd = 0.1)
plot(x, y)
m1 <- lm(y ~ x)
abline(m1, lty = 3)

# Quadratic fit to the new response, overlaid (dashed red) on the dotted
# straight-line fit.
m2 <- update(m1, ~ . + I(x^2))
plot(x, y)
abline(m1, lty = 3)
# fitted() avoids relying on `$fit` partially matching "fitted.values".
lines(x, fitted(m2), col = 2, lty = 2)

# Range of the design points, then a split of x into five bins.
range(x)
cx5 <- cut(x, breaks = 5)
levels(cx5)
## [1] 0.09806405 0.96024698
## [1] "(0.0972,0.271]" "(0.271,0.443]"  "(0.443,0.615]"  "(0.615,0.788]" 
## [5] "(0.788,0.961]"
# Same five bins, but with the first interval closed on the left.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
cx5 <- cut(x, breaks = 5, include.lowest = TRUE)
levels(cx5)
## [1] "[0.0972,0.271]" "(0.271,0.443]"  "(0.443,0.615]"  "(0.615,0.788]" 
## [5] "(0.788,0.961]"
# Piecewise-constant fit: regress y on the bin factor.
m3 <- lm(y ~ cx5)
# coef() on the summary returns the coefficient table without relying on
# `$coef` partially matching the component name "coefficients".
coef(summary(m3))
##                     Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)       0.79440594  0.1075450   7.3867297 9.744056e-08
## cx5(0.271,0.443] -0.08205625  0.1422687  -0.5767696 5.692526e-01
## cx5(0.443,0.615] -0.95311579  0.1595150  -5.9750845 3.072359e-06
## cx5(0.615,0.788] -1.76686658  0.1595150 -11.0764895 3.918224e-11
## cx5(0.788,0.961] -1.33388708  0.1520916  -8.7702868 4.224418e-09
# Mean of y within each bin of cx5.
tapply(X = y, INDEX = cx5, FUN = mean)
## [0.0972,0.271]  (0.271,0.443]  (0.443,0.615]  (0.615,0.788]  (0.788,0.961] 
##      0.7944059      0.7123497     -0.1587098     -0.9724606     -0.5394811
# Overlay the linear (dotted), quadratic (dashed red) and five-bin (blue)
# fits on the scatterplot.
plot(x, y)
abline(m1, lty = 3)
# fitted() is used instead of `$fit`, which depends on partial matching of
# "fitted.values".
lines(x, fitted(m2), col = 2, lty = 2)
lines(x, fitted(m3), col = "blue", lty = 5, type = "b")

# Refine the step-function fit to ten bins and compare all four fits.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
cx10 <- cut(x, breaks = 10, include.lowest = TRUE)
m4 <- update(m3, ~ cx10)

plot(x, y)
abline(m1, lty = 3)
# fitted() is used instead of `$fit`, which depends on partial matching of
# "fitted.values".
lines(x, fitted(m2), col = 2, lty = 2)
lines(x, fitted(m3), col = "blue", lty = 5, type = "b")
lines(x, fitted(m4), col = "darkgreen", lty = 1, type = "b")

19.0.2 Nearest neighbor method

  • Local averaging
  • Local linear regression

19.0.3 Example

# Attach KernSmooth and load the geyser data set from MASS.
library(KernSmooth)
data(geyser, package = "MASS")
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
# Geyser data: waiting time plotted against eruption duration.
xis <- geyser$duration
yis <- geyser$waiting
plot(xis, yis, xlab = "Duration", ylab = "Waiting", las = 1)

# Kernel regression smooths at increasing bandwidths. The argument name is
# written in full; `band =` only works through partial argument matching.
fit1 <- ksmooth(xis, yis, bandwidth = 0.5)
lines(fit1, col = 2)
fit2 <- ksmooth(xis, yis, bandwidth = 1)
lines(fit2, col = 4)
fit3 <- ksmooth(xis, yis, bandwidth = 5)
lines(fit3, col = 3)

19.0.4 Local linear estimator

19.0.5 Local polynomial estimator

# locpoly() called without a response: used here as a density estimate of the
# durations, next to the default density() estimate.
est <- locpoly(xis, bandwidth = 0.25)
do <- density(xis)

# Histogram on the density scale with both curves overlaid; the x-limits cover
# the data and both evaluation grids. `probability = TRUE` is written in full
# instead of the partially matched `prob = T`.
hist(xis, probability = TRUE, ylim = c(0, 0.6),
     xlim = range(est$x, xis, do$x), las = 1)
lines(est$x, est$y)
lines(do$x, do$y, col = 2)

# Local polynomial regression of waiting time on duration, varying the
# polynomial degree and the bandwidth.
plot(xis, yis, xlab = "Duration", ylab = "Waiting", las = 1)

# `bandwidth` is written in full; `band =` only works through partial
# argument matching.
fit1 <- locpoly(xis, yis, degree = 1, bandwidth = 0.25)
lines(fit1, col = 2)
fit2 <- locpoly(xis, yis, degree = 2, bandwidth = 0.25)
lines(fit2, col = 4)
fit3 <- locpoly(xis, yis, degree = 2, bandwidth = 0.5)
lines(fit3, col = 3)