# Novmeber 17, 2005 # Solutions first exam --- Stat 410 --- Dr. Scott ############################################## #### Problem 1 ############################## ############################################## # read in abalone entree prices (2004 dollars) tmp <- read.table("fish.txt",skip=1) # skip=1 necessary x.fish <- ( as.numeric(tmp[,1]) - 1924 ) / ( 2005-1924 ) y.fish <- as.numeric(tmp[,2]) ### bring in HW3 code for lack-of-fit anova (see homework 3) ### modify the function to only do box-cox on y-variable myreg <- function(x,y) { n <- length(x); xbar <- mean(x); ybar <- mean(y) if(length(y)!=n) stop("lengths do not match") b1 <- sum( (x-xbar)*(y-ybar) ) / sum( (x-xbar)^2 ); b0 <- ybar - b1 * xbar yhat <- b0 + b1*x; attr(yhat,"label") <- "predictions" ehat <- y - yhat; attr(ehat,"label") <- "residuals" ssto <- sum((y-ybar)^2); sse <- sum(ehat^2); mse <- sse/(n-2); ssr <- ssto-sse list(b=c(b0,b1),x=x,y=y,yhat=yhat,ehat=ehat,ssto=ssto,sse=sse,ssr=ssr,mse=mse) } proj3 <- function(x,y,ll=seq(-2,2,length=71),transform=" ") { a <- myreg(x,y); n <- length(x); par(mfrow=c(2,2)) # plot ordinary regression plot(x,y,pch="o",xlab=attr(a$x,"label"),ylab=attr(a$y,"label"),cex=.75, main=paste("Regress",attr(a$y,"label"),"on",attr(a$x,"label"))); abline(a$b) frame(); par(usr=c(0,1,0,1),mar=c(5,0,2,0)+.1); box() # ANOVA table text(.5,.9,"ANOVA TABLE"); abline(h=c(.8,.7,.2)) text(c(.1,.3,.45,.6,.75,.9),rep(.75,6),c("Source","SS","df","MS","F","p-value")) text(rep(.1,3),c(.6,.5,.1),c("Regression","Error ","Total ")) text(rep(.3,3),c(.6,.5,.1),round(c(a$ssr,a$sse,a$ssto),4)) text(rep(.45,3),c(.6,.5,.1),as.integer(c(1,n-2,n-1))) text(rep(.6,2),c(.6,.5),round(c(a$ssr,a$mse),4)); ft <- a$ssr/a$mse text(c(.75,.9),rep(.6,2),round(c(ft,1-pf(ft,1,n-2)),4)) xu <- unique(x); nc <- length(xu); sspe <- 0 # check for repeated values xuu <- NULL # list of duplicate x's if(nc1) { xuu <- c(xuu,xu[i]) } sspe <- sspe + sum( (y[j]-mean(y[j]))^2 ) } sslf <- a$sse-sspe; mslf <- sslf/(nc-2); mspe <- sspe/(n-nc) text(rep(.15,2),c(.4,.3),c("LackFit","PureErr")) text(rep(.3,2),c(.4,.3),round(c(sslf,sspe),4)) text(rep(.45,2),c(.4,.3),as.integer(c(nc-2,n-nc))) text(rep(.6,2),c(.4,.3),round(c(mslf,mspe),4)); ft <- mslf/mspe text(c(.75,.9),rep(.4,2),round(c(ft,1-pf(ft,nc-2,n-nc)),4)) } par(mar=c(5,4,4,2)+.1) xx <- sqrt(a$mse)*qnorm( ((1:n)-.375)/(n+.25) ); yy <- sort(a$ehat); par(pty="s") plot( xx,yy,cex=.75,xlim=range(xx,yy),ylim=range(xx,yy), xlab="expected value",ylab="residuals", main=paste("Normal Probability Plot (", round(cor(xx,yy),3),")")); abline(c(0,1)) text(xx[n/4],yy[3*n/4],paste("corr=",round(cor(xx,yy),4),sep="")); par(pty="m") ysse <- NULL for( lamb in ll ) { yy <- trans(y,lamb); ysse <- c(ysse, myreg(x,yy)$sse) } plot(ll,ysse,type="l",ylim=range(ysse)*c(.99,1),xlab="lambda",ylab="SSE", main=paste("Box-Cox",transform)) text(ll[3],ysse[1],"Y") markmin(ll,ysse,2); return(xuu) } markmin <- function(x,y,lty) {i <- seq(x)[y==min(y)]; print(x[i]); abline(v=x[i],lty=lty)} trans <- function(y,ll) { # Box-Cox k2 <- exp( sum(log(y))/length(y) ) # geometric mean if( abs(ll)<1.e-4 ) { w <- k2 * log(y) } else { w <- (y^ll-1) / ( ll * k2^(ll-1) ) } return(w) } ### (a) plot data lsfit CI's for beta's plot(x.fish,y.fish,pch=16) ls.fish <- lsfit( x.fish, y.fish ); abline(ls.fish) ls.print( ls.fish ) ## Residual Standard Error=6.6795 ## R-Square=0.838 ## F-statistic (df=1, 47)=243.1838 ## p-value=0 ## ## Estimate Std.Err t-value Pr(>|t|) ## Intercept -6.6568 1.9439 -3.4245 0.0013 ## X 56.4458 3.6196 15.5944 0.0000 tt <- qt( .975, 49-2 ) # 2.0117 b0 <- -6.6568 + 2.0117 * c(-1,1) * 1.9439 # ( -10.57, -2.75 ) b1 <- 56.4457 + 2.0117 * c(-1,1) * 3.6196 # ( 49.16, 63.73 ) # or mse <- sum( ls.fish$res^2) / 47 # sqrt( mse * diag( solve( t( cbind(1,x.fish) ) %*% cbind(1,x.fish) ) ) ) # gives the std.errors ### (b) lack-of-fit test proj3(x.fish,y.fish,ll=seq(-2,1.1,l=71),transform="(price only)") -> tmp # WOW p-value = .297 looks like a lousy fit!!! hist( ls.fish$res, 15 ) qqnorm( ls.fish$res ) qqnorm( scale(ls.fish$res ) ); abline(c(0,1)) ### NOTE: r^2 = 0.942 which is less than 0.977 in Table B.6 ==> NOT NORMAL ### (c) polynomial fits > ls.fish2 <- lsfit( cbind(x.fish,x.fish^2 ), y.fish ) > ls.fish3 <- lsfit( cbind(x.fish,x.fish^2,x.fish^3), y.fish ) > par(mfrow=c(1,1)); plot(x.fish,y.fish,pch=16) > xm <- seq(0,1,l=101) > lines( xm, cbind(1,xm,xm^2 ) %*% ls.fish2$coef , col=2, lwd=2) > lines( xm, cbind(1,xm,xm^2,xm^3) %*% ls.fish3$coef , col=3, lwd=2) > ls.print(ls.fish2) Estimate Std.Err t-value Pr(>|t|) Intercept 9.6798 2.0139 4.8064 0.0000 x.fish -23.1177 8.4116 -2.7483 0.0085 x.fish^2 72.4304 7.4180 9.7641 0.0000 > ls.print(ls.fish3) R-Square=0.9482 Estimate Std.Err t-value Pr(>|t|) Intercept 7.8099 2.9471 2.6500 0.0111 x.fish -5.6441 21.7604 -0.2594 0.7965 x.fish^2 31.8113 47.2200 0.6737 0.5040 x.fish^3 25.6022 29.3913 0.8711 0.3883 ## quadratic term significant, but cubic is not ### (d) box-cox transformation # this was plotted in part (b) ll = -.3614 is best choice # ll=0 is the log-transform and the criterion is almost the same (so OK) yt <- trans(y.fish,-.3611); par(mfrow=c(2,2)) plot(x.fish, yt, pch=16); lst.fish <- lsfit(x.fish,yt); abline(lst.fish) ### (e) best box-cox transformation residuals hist( lst.fish$res ) qqnorm( lst.fish$res ) qqnorm( scale(lst.fish$res ) ) -> tmp; abline(c(0,1)) cor( tmp$x, tmp$y) # correlation is not 0.992 so accept normality ### (f) lack-of-fit proj3( x.fish,yt ) ## p-value is 0.36 so don't reject lack-of-fit ############################################## #### Problem 2 ############################## ############################################## # read in rewards data ml <- read.table("mlol.rewards.txt",header=T); par(mfrow=c(2,2)) points <- as.numeric(ml[,1]) # points price <- as.numeric(ml[,2]) # price $$ ship <- as.numeric(ml[,3]) # shipping 1-2-3-4 ### (a) which is the "y" variable ??? plot( price, points, pch=16); a1 <- lsfit(price,points); abline(a1) plot( points, price, pch=16); a2 <- lsfit(points,price); abline(a2) > ls.print(a1) R-Square=0.9762 Estimate Std.Err t-value Pr(>|t|) Intercept 1342.759 3553.1240 0.3779 0.7078 X 121.937 3.2648 37.3487 0.0000 > ls.print(a2) R-Square=0.9762 Estimate Std.Err t-value Pr(>|t|) Intercept 3.9159 28.8428 0.1358 0.8928 X 0.0080 0.0002 37.3487 0.0000 # same significance!!! ### put other line on this graph (very close) abline( -a1$coef[1]/a1$coef[2], 1/a1$coef[2], lty=2 ) ### note: not receiprocal slopes a2$coef[2] * a1$coef[2] = 0.976206 ### (b) which is more appropriate form? ### Q: number of dollars per reward point = .008 ==> price = b . points ### Q: one reward point = ??? dollars ==> points = b . price form a3 <- lsfit( points, price, int=F ) plot( points, price, pch=16); abline(a2); abline(0,a3$coef,col=2) ## a3$coef = ( 0, 0.00802 ) a1$coef = c( 3.91585 0.00801) but look the same a4 <- qqnorm(a3$res); cor(a4$x,a4$y) # = 0.867 much less than 0.969 NOT NORMAL? hist(a3$res); plot( density( a3$res),type="l") ### other way a3a <- lsfit( price, points, int=F ) plot( price, points, pch=16); abline(a1); abline(0,a3a$coef,col=2) ## a3a$coef = ( 0, 122.64 ) a1$coef = c( 1342.76 121.94) but look the same a4a <- qqnorm(a3a$res); cor(a4$x,a4$y) # = 0.867 much less than 0.969 NOT NORMAL? hist(a3a$res); plot( density( a3a$res),type="l") ### (c) include shipping indicators X0 <- cbind( ml[,1], ifelse(ml[,3]==2,1,0), ifelse(ml[,3]==3,1,0), ifelse(ml[,3]==4,1,0) ) ### choice of leaving intercept in or out .... either makes ### sense.... just need to have proper degrees of freedom a5 <- lsfit(X0,price) # can be compared to a2 sseF <- sum(a5$res^2); sseR <- sum(a2$res^2) Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-5) ) # 0.702 (obviously not sign) pval <- 1-pf(Ftest,3,32) # 0.558 a6 <- lsfit(X0,price,int=F) # can be compared to a3 sseF <- sum(a6$res^2); sseR <- sum(a3$res^2) Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-4) ) # 0.716 (obviously not sign) pval <- 1-pf(Ftest,3,32) # 0.550 ### (d) log10 transform (no intercept form) X0t <- cbind( log10(X0[,1]), X0[,2:4] ) # log10 points prct <- log10(price) # log10 price plot(X0t[,1],prct,pch=16,xlim=range(0,X0t[,1]),ylim=range(0,prct),main="no intercept") a7 <- lsfit(X0t,prct,int=F) print(a7$coef) # 0.496 0.057 0.289 0.629 abline( 0, a7$coef[1] ); for(k in 2:4) abline( a7$coef[k], a7$coef[1] ) ## with intercept plot(X0t[,1],prct,pch=16,main="with intercept") a8 <- lsfit(X0t,prct,int=T) print(a8$coef) # -2.352 1.058 -0.004 -0.048 -0.049 abline( a8$coef[1], a8$coef[2] ) for(k in 3:5) abline( a8$coef[1] + a8$coef[k], a8$coef[2] ) a9 <- lsfit(X0t[,1],prct,int=T) # need this now (reduced model) sseF <- sum(a8$res^2); sseR <- sum(a9$res^2) Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-5) ) # 0.335 (obviously not sign) pval <- 1-pf(Ftest,3,31) # 0.800 can drop > ls.print(a8) R-Square=0.9809 Estimate Std.Err t-value Pr(>|t|) Intercept -2.3522 0.1844 -12.7550 0.0000 X1 1.0585 0.0446 23.7183 0.0000 X2 -0.0039 0.0370 -0.1046 0.9174 X3 -0.0482 0.0544 -0.8866 0.3821 X4 -0.0493 0.0732 -0.6736 0.5056 ## shipping too vague (or some prices included shipping and some did not??) ## other way log(price) vs log(points) XX <- cbind( log( ml[,1] ), ifelse(ml[,3]==2,1,0), ifelse(ml[,3]==3,1,0), ifelse(ml[,3]==4,1,0) ); dimnames(XX) <- list(NULL,c("pts","s2","s3","s4")) pts <- log( ml[,2] ) a10 <- lsfit(XX,pts) ############################################## #### Problem 3 ############################## ############################################## tmp <- read.table("APPENC07.txt") yt <- log10(tmp[,1]-76310) # transformed selling price X <- matrix( scan("houses.txt"), ncol=18,byrow=T ) ls.house <- lsfit( scale(X), scale(yt) ) > ls.print( ls.house ) Residual Standard Error=0.4233 R-Square=0.827 F-statistic (df=18, 503)=133.584 p-value=0 Estimate Std.Err t-value Pr(>|t|) Intercept 0.0000 0.0185 0.0000 1.0000 zero since centered X and y X1 0.4610 0.0377 12.2276 0.0000 square feet X2 0.0589 0.0308 1.9096 0.0568 bed 3 X3 0.0846 0.0330 2.5615 0.0107 bed 4 X4 0.0391 0.0299 1.3062 0.1921 bed 5 or more (curiously small) X5 0.0845 0.0318 2.6600 0.0081 bath 2 X6 0.2165 0.0409 5.2987 0.0000 bath 3 X7 0.1988 0.0430 4.6261 0.0000 bath 4 or more (again smaller) X8 0.0608 0.0224 2.7105 0.0069 A/C X9 0.0824 0.0321 2.5704 0.0104 cars 2 X10 0.1299 0.0363 3.5751 0.0004 cars 3 or more X11 0.0303 0.0191 1.5876 0.1130 pool (A/C cools you more?) X12 0.1447 0.0259 5.5919 0.0000 year built (newness counts) X13 0.0686 0.0317 2.1669 0.0307 quality 2 X14 0.1954 0.0357 5.4669 0.0000 quality 1 X15 0.0142 0.0231 0.6123 0.5406 style 1 X16 -0.0824 0.0262 -3.1467 0.0017 style 7 (more expensive but???) X17 0.1351 0.0202 6.6792 0.0000 lot size X18 -0.0227 0.0189 -1.1994 0.2310 adjacent to highway (surprise! # R^2 looks impressive to me plot( scale(yt), cbind(1,scale(X)) %*% ls.house$coef ); abline(0,1) hist( ls.house$res,20 ); xx <- seq(-3,1.4,l=201) lines( xx, 522*0.2*dnorm(xx,0,sqrt(var(ls.house$res))), col=2, lwd=2 ) ### (b) leverages hii <- rowSums( svd(X)$u^2 ) plot(hii) ii <- seq(522)[ hii> 0.10 ] X[ii,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 [1,] 2283 1 0 0 0 1 0 1 0 1 0 1997 0 1 1 0 18524 1 [2,] 2060 0 0 0 0 1 0 1 1 0 0 1997 1 0 1 0 38623 1 [3,] 2087 0 0 0 1 0 0 1 1 0 0 1966 1 0 1 0 24764 1 [4,] 2081 0 1 0 1 0 0 1 1 0 0 1980 1 0 0 0 24993 1 [5,] 1696 1 0 0 0 1 0 1 1 0 0 1978 1 0 0 0 22294 1 [6,] 2222 0 1 0 1 0 0 0 1 0 0 1955 1 0 1 0 71527 1 [7,] 2110 0 0 1 0 1 0 1 1 0 0 1957 1 0 1 0 15332 1 [8,] 1774 0 1 0 1 0 0 0 1 0 0 1963 0 0 1 0 15528 1 [9,] 1592 1 0 0 1 0 0 1 1 0 0 1957 0 0 1 0 11221 1 [10,] 1748 1 0 0 1 0 0 1 1 0 0 1972 0 0 1 0 23939 1 [11,] 1985 1 0 0 0 0 0 0 1 0 0 1948 0 0 1 0 69975 1 [12,] 2344 0 1 0 0 1 0 1 1 0 0 1925 0 1 1 0 86004 0 [13,] 2017 1 0 0 0 0 0 0 0 0 0 1958 1 0 0 0 86571 0 table(X[,18]) 0 1 ## ALL of the highway houses have high leverage 511 11 sort(X[,17])[515:522] [1] 70240 71527 75232 80886 86004 86248 86571 86830 # other 2 big lots