# October 19, 2005 # Solutions for HW #6 # compute sum of squares of vector ss <- function(y,mean=F) { if(mean) sum((y-mean(y))^2) else sum(y^2) } ##### Problem 8.8 ### (a) cp <- read.table("CH06PR18.txt") # (y,x1,x2,x3,x4) n=81 y <- cp[,1]; xm1 <- mean(cp[,2]) # mean of variable 1 (in 2nd column) X <- cbind( 1, cp[,1+1]-xm1, (cp[,1+1]-xm1)^2, cp[,2+1], cp[,4+1] ) ans <- lsfit(X,y, int=F) # I am doing the intercept yhat <- c( X %*% ans$coef ) plot(y,yhat,pch=16); abline(c(0,1)) # FIT LOOKS OK with quadratic term ### (b) adjusted R^2 see p.226 ls.print( ans ); r2 <- 0.6131 # R^2 = 0.6131 from output r2a <- 1 - (81-1)/(81-5)*(1-r2) # 0.593 adjusted R^2 ### (c) is quadratic term helpful? # H0: beta2 = 0 # from ls.print X2 is the quadratic term # b2 s(b2) t p-val # X2 0.0141 0.0058 2.4306 0.0174 # p-val < 0.05 so reject H0 (i.e. need quadratic term) ### (d) prediction and confidence interval x <- c( 1, 8-xm1, (8-xm1)^2, 16, 250000 ) yh <- sum( ans$coef * x ) mse <- sum( ans$resid^2 ) / (81-5) var.y <- mse * x %*% solve( t(X) %*% X ) %*% x # = .37345^2 tval <- qt( 0.975, 81-5 ) # 1.991673 ci <- c( yh - tval*sqrt(var.y), yh + tval*sqrt(var.y) ) # (16.46,17.94) ### (e) express in original variables # y = 10.189 - .1818 x1 + .01415 x1^2 + .3140 X2 + .000008046 X4 # becomes after substituting x1 = X1 - 7.8642 # y = 12.4938 - .4044 X1 + .01415 X1^2 + .314 X2 + .000008046 X4 ##### Problem 8.38 senic <- read.table("APPENC01.txt") # 113 by 12 ### (a) quadratic model y <- senic[,11] # number nurses x <- senic[,12]; xm <- mean(x) # available facilities xm = 43.159 X <- cbind( x-xm, (x-xm)^2 ) # center variable in quadratic model ans <- lsfit(X,y) # Y = 150.079 + 7.066 x1 + 0.1012 x1^2 # x1 = centered x plot( cbind(1,X) %*% ans$coef, ans$res, pch=16); abline(h=0) # looks good # although variance grows ### (b) R^2 # ls.print(ans) returns # Residual Standard Error=82.3078 # R-Square=0.6569 # R2=.66 # F-statistic (df=2, 110)=105.3216 # p-value=0 # # Estimate Std.Err t-value Pr(>|t|) # Intercept 150.0792 9.9414 15.0964 0e+00 # X1 7.0662 0.5125 13.7869 0e+00 # X2 0.1012 0.0272 3.7157 3e-04 # significant quadratic # # ls.print( lsfit( x-xm, y ) ) returns R-Square=0.6139 # yes significant improvement in model to include quadratic term ### (c) quadratic term significant (see part b) ##### Problem 8.40 senic <- as.matrix(read.table("APPENC01.txt")) # 113 by 12 ### (a) linear model with indicator for medical school y <- senic[,4] # infection risk x <- senic[,c(2,3,6,8)] # length.stay/age/x-ray/medical.school x[,4] <- 2-x[,4] # change (1,2) to (1,0) indicator ans <- lsfit(x,y) # y = 0.8574 + 0.2888 x1 - 0.0181 x2 + 0.0200 x3 + 0.2878 x4 ### (b) confidence interval for beta4-hat mse <- sum( ans$res^2 ) / ( length(y) - 5 ) # error MSE df = 113-5 # standard deviation of beta-hat-4 sd.b4 <- sqrt( mse * solve( t(cbind(1,x)) %*% cbind(1,x) )[5,5] ) # 0.3067 tval <- qt(0.99, length(y)-5) # for 98% CI --- strange choice ci <- ans$coef[5] + c(-1,1) * tval * sd.b4 # -0.4364 < b4 < 1.0120 # medical school does not appear to add to prediction of infection risk ### (c) add interaction terms x2 <- cbind( x, x[,2]*x[,4], x[,3]*x[,4] ) # add 2 interaction terms ans2 <- lsfit(x2,y) ls.print(ans2) # Y = .9941 + .2641 X1 - .0228 X2 + .0243 X3 - 5.695 X4 + # .1558 X2 X4 - .0241 X3 X4 # H0: b5=b6=00 vs. not both zero sse.f <- sum( ans2$res^2 ) # 122.0468 sse.r <- sum( lsfit(x2[,1:4],y)$res^2 ) F <- ( (sse.r - sse.f) / 2 ) / ( sse.f / (113-7) ) # 2.256574 pval <- 1 - pf( F, 2, 113-7 ) # 10.97% ## just not signficant....fail to reject H0 ##### Problem 8.41 other senic variables y <- senic[,2] # length of stay X <- senic[, c(3,5,10,12)] # age/culturing/census/facilities # add NE/NC/S regions (W=baseline) X <- cbind(X, senic[,9]==1, senic[,9]==2, senic[,9]==3 ) ### (a) ans <- lsfit(X,y) # Y = 2.0478 + .1037 X1 + .0403 X2 + .00660 X3 - .0208 X4 + # 2.150 X5 + 1.1903 X6 + .6335 X7 ### (b) Ho: b2=0 (culturing ratio) ls.print(ans) # Estimate Std.Err t-value Pr(>|t|) # Intercept 2.0478 1.8130 1.1296 0.2612 # V3 0.1037 0.0315 3.2960 0.0013 # V5 0.0403 0.0143 2.8177 0.0058 # V10 0.0066 0.0014 4.7003 0.0000 # V12 -0.0208 0.0144 -1.4449 0.1515 # 2.1500 0.4615 4.6585 0.0000 # 1.1903 0.4371 2.7235 0.0076 # 0.6335 0.4276 1.4816 0.1414 # read off V5 line ==> t-test = 2.8177 and pval=.0058 ==> reject H0 ### (c) confidence intervals for b5, b6, b7 tval <- -qt( .025/3, 113-7 ) # bonferronie of 95% alpha 2.1500 + c(-1,1) * tval * .4615 # 1.0274 < b5 < 3.2726 1.1903 + c(-1,1) * tval * .4371 # 0.1267 < b6 < 2.2533 0.6335 + c(-1,1) * tval * .4276 # -0.4067 < b7 < 1.6737 # South and West not statistical different # NE is statistical different from West as is NC ## note the standard errors are given in ls.print