#	Novmeber 17, 2005

#	Solutions first exam --- Stat 410 --- Dr. Scott


##############################################
####  Problem 1 ##############################
##############################################


#	read in abalone entree prices (2004 dollars)

tmp <- read.table("fish.txt",skip=1)		# skip=1 necessary
x.fish <- ( as.numeric(tmp[,1]) - 1924 ) / ( 2005-1924 )
y.fish <- as.numeric(tmp[,2])

###   bring in HW3 code for    lack-of-fit anova  (see homework 3)
###   modify the function to only do box-cox on y-variable


myreg <- function(x,y) { n <- length(x);  xbar <- mean(x); ybar <- mean(y)
  if(length(y)!=n) stop("lengths do not match")
  b1 <- sum( (x-xbar)*(y-ybar) ) / sum( (x-xbar)^2 ); b0 <- ybar - b1 * xbar
  yhat <- b0 + b1*x;  attr(yhat,"label") <- "predictions"
  ehat <- y - yhat;   attr(ehat,"label") <- "residuals"
  ssto <- sum((y-ybar)^2); sse <- sum(ehat^2); mse <- sse/(n-2); ssr <- ssto-sse
  list(b=c(b0,b1),x=x,y=y,yhat=yhat,ehat=ehat,ssto=ssto,sse=sse,ssr=ssr,mse=mse)
 }

proj3 <- function(x,y,ll=seq(-2,2,length=71),transform=" ") {
  a <- myreg(x,y);  n <- length(x);  par(mfrow=c(2,2)) # plot ordinary regression

  plot(x,y,pch="o",xlab=attr(a$x,"label"),ylab=attr(a$y,"label"),cex=.75,
    main=paste("Regress",attr(a$y,"label"),"on",attr(a$x,"label"))); abline(a$b)

  frame(); par(usr=c(0,1,0,1),mar=c(5,0,2,0)+.1); box() # ANOVA table
  text(.5,.9,"ANOVA TABLE"); abline(h=c(.8,.7,.2))
  text(c(.1,.3,.45,.6,.75,.9),rep(.75,6),c("Source","SS","df","MS","F","p-value"))
  text(rep(.1,3),c(.6,.5,.1),c("Regression","Error     ","Total     "))
  text(rep(.3,3),c(.6,.5,.1),round(c(a$ssr,a$sse,a$ssto),4))
  text(rep(.45,3),c(.6,.5,.1),as.integer(c(1,n-2,n-1)))
  text(rep(.6,2),c(.6,.5),round(c(a$ssr,a$mse),4)); ft <- a$ssr/a$mse
  text(c(.75,.9),rep(.6,2),round(c(ft,1-pf(ft,1,n-2)),4))

  xu <- unique(x); nc <- length(xu); sspe <- 0		# check for repeated values
  xuu <- NULL	# list of duplicate x's
  if(nc<n){ for(i in seq(nc)) { j <- seq(n)[x==xu[i]]
      if(length(j)>1) { xuu <- c(xuu,xu[i]) }
      sspe <- sspe + sum( (y[j]-mean(y[j]))^2 ) }
    sslf <- a$sse-sspe; mslf <- sslf/(nc-2); mspe <- sspe/(n-nc)
    text(rep(.15,2),c(.4,.3),c("LackFit","PureErr"))
    text(rep(.3,2),c(.4,.3),round(c(sslf,sspe),4))
    text(rep(.45,2),c(.4,.3),as.integer(c(nc-2,n-nc)))
    text(rep(.6,2),c(.4,.3),round(c(mslf,mspe),4)); ft <- mslf/mspe
    text(c(.75,.9),rep(.4,2),round(c(ft,1-pf(ft,nc-2,n-nc)),4))
  }
  par(mar=c(5,4,4,2)+.1)

  xx <- sqrt(a$mse)*qnorm( ((1:n)-.375)/(n+.25) ); yy <- sort(a$ehat); par(pty="s")
  plot( xx,yy,cex=.75,xlim=range(xx,yy),ylim=range(xx,yy),
    xlab="expected value",ylab="residuals", main=paste("Normal Probability Plot  (",
	round(cor(xx,yy),3),")")); abline(c(0,1))
  text(xx[n/4],yy[3*n/4],paste("corr=",round(cor(xx,yy),4),sep="")); par(pty="m")

  ysse <- NULL
  for( lamb in ll ) { yy <- trans(y,lamb); ysse <- c(ysse, myreg(x,yy)$sse) }

  plot(ll,ysse,type="l",ylim=range(ysse)*c(.99,1),xlab="lambda",ylab="SSE",
    main=paste("Box-Cox",transform))
  text(ll[3],ysse[1],"Y")
  markmin(ll,ysse,2); return(xuu)
}

markmin <- function(x,y,lty) {i <- seq(x)[y==min(y)]; print(x[i]); abline(v=x[i],lty=lty)}

trans <- function(y,ll) {       # Box-Cox
  k2 <- exp( sum(log(y))/length(y) )    # geometric mean
  if( abs(ll)<1.e-4 ) { w <- k2 * log(y) } else { w <- (y^ll-1) / ( ll * k2^(ll-1) ) }
  return(w) }


### (a)  plot data    lsfit    CI's for beta's


plot(x.fish,y.fish,pch=16)

ls.fish <- lsfit( x.fish, y.fish ); abline(ls.fish)
ls.print( ls.fish )


##  Residual Standard Error=6.6795
##  R-Square=0.838
##  F-statistic (df=1, 47)=243.1838
##  p-value=0
##  
##            Estimate Std.Err t-value Pr(>|t|)
##  Intercept  -6.6568  1.9439 -3.4245   0.0013
##  X          56.4458  3.6196 15.5944   0.0000

tt <- qt( .975, 49-2 )		# 2.0117
b0 <- -6.6568 + 2.0117 * c(-1,1) * 1.9439	# ( -10.57, -2.75 )
b1 <- 56.4457 + 2.0117 * c(-1,1) * 3.6196	# (  49.16, 63.73 )

# or  mse <- sum( ls.fish$res^2) / 47
#     sqrt( mse * diag( solve( t( cbind(1,x.fish) ) %*% cbind(1,x.fish) ) ) )
#	gives the std.errors


### (b)   lack-of-fit test


proj3(x.fish,y.fish,ll=seq(-2,1.1,l=71),transform="(price only)") -> tmp

# WOW  p-value = .297        looks like a lousy fit!!!

hist( ls.fish$res, 15 )
qqnorm(       ls.fish$res   )
qqnorm( scale(ls.fish$res ) ); abline(c(0,1))

###  NOTE:  r^2 = 0.942  which is less than 0.977  in Table B.6  ==> NOT NORMAL


### (c)   polynomial fits

> ls.fish2 <- lsfit( cbind(x.fish,x.fish^2         ), y.fish )
> ls.fish3 <- lsfit( cbind(x.fish,x.fish^2,x.fish^3), y.fish )

> par(mfrow=c(1,1)); plot(x.fish,y.fish,pch=16)
> xm <- seq(0,1,l=101)
> lines( xm, cbind(1,xm,xm^2     ) %*% ls.fish2$coef , col=2, lwd=2)
> lines( xm, cbind(1,xm,xm^2,xm^3) %*% ls.fish3$coef , col=3, lwd=2)

> ls.print(ls.fish2)


          Estimate Std.Err t-value Pr(>|t|)
Intercept   9.6798  2.0139  4.8064   0.0000
x.fish    -23.1177  8.4116 -2.7483   0.0085
x.fish^2   72.4304  7.4180  9.7641   0.0000


> ls.print(ls.fish3)


R-Square=0.9482

          Estimate Std.Err t-value Pr(>|t|)
Intercept   7.8099  2.9471  2.6500   0.0111
x.fish     -5.6441 21.7604 -0.2594   0.7965
x.fish^2   31.8113 47.2200  0.6737   0.5040
x.fish^3   25.6022 29.3913  0.8711   0.3883


##  quadratic term significant, but cubic is not


### (d)   box-cox transformation


#  this was plotted in part (b)    ll = -.3614  is best choice

#  ll=0 is the log-transform and the criterion is almost the same (so OK)

yt <- trans(y.fish,-.3611); par(mfrow=c(2,2))
plot(x.fish, yt, pch=16); lst.fish <- lsfit(x.fish,yt); abline(lst.fish)


### (e)   best box-cox transformation   residuals

hist( lst.fish$res )
qqnorm(       lst.fish$res   )
qqnorm( scale(lst.fish$res ) ) -> tmp; abline(c(0,1))

cor( tmp$x, tmp$y)	# correlation is not 0.992   so accept normality


### (f)  lack-of-fit

proj3( x.fish,yt )

##   p-value is 0.36   so don't reject lack-of-fit


##############################################
####  Problem 2 ##############################
##############################################


#	read in rewards data

ml <- read.table("mlol.rewards.txt",header=T);  par(mfrow=c(2,2))
points <- as.numeric(ml[,1])	# points
price <- as.numeric(ml[,2])	# price $$
ship <- as.numeric(ml[,3])	# shipping 1-2-3-4


### (a)   which is the "y" variable  ???

plot( price, points, pch=16); a1 <- lsfit(price,points); abline(a1)
plot( points, price, pch=16); a2 <- lsfit(points,price); abline(a2)

> ls.print(a1)

R-Square=0.9762
          Estimate   Std.Err t-value Pr(>|t|)
Intercept 1342.759 3553.1240  0.3779   0.7078
X          121.937    3.2648 37.3487   0.0000


> ls.print(a2)
R-Square=0.9762
          Estimate Std.Err t-value Pr(>|t|)
Intercept   3.9159 28.8428  0.1358   0.8928
X           0.0080  0.0002 37.3487   0.0000		# same significance!!!


###   put other line on this graph  (very close)

abline( -a1$coef[1]/a1$coef[2], 1/a1$coef[2], lty=2 )

	### note:  not receiprocal slopes   a2$coef[2] * a1$coef[2] =  0.976206


### (b)   which is more appropriate form?


###    Q:  number of dollars per reward point = .008   ==> price = b . points

###    Q:   one reward point =  ??? dollars   ==>  points = b . price  form


a3 <- lsfit( points, price, int=F )

plot( points, price, pch=16); abline(a2); abline(0,a3$coef,col=2)

##  a3$coef = ( 0, 0.00802 )     a1$coef = c( 3.91585  0.00801)   but look the same

a4 <- qqnorm(a3$res); cor(a4$x,a4$y)   # = 0.867    much less than 0.969  NOT NORMAL?
hist(a3$res); plot( density( a3$res),type="l")


### other way

a3a <- lsfit( price, points, int=F )
plot( price, points, pch=16); abline(a1); abline(0,a3a$coef,col=2)
##  a3a$coef = ( 0, 122.64 )     a1$coef = c( 1342.76  121.94)   but look the same

a4a <- qqnorm(a3a$res); cor(a4$x,a4$y)   # = 0.867    much less than 0.969  NOT NORMAL?
hist(a3a$res); plot( density( a3a$res),type="l")


### (c)    include shipping indicators

X0 <- cbind( ml[,1], ifelse(ml[,3]==2,1,0), ifelse(ml[,3]==3,1,0),
	ifelse(ml[,3]==4,1,0) )

###  choice of leaving intercept in or out .... either makes
###  sense....  just need to have proper degrees of freedom

a5 <- lsfit(X0,price)		# can be compared to a2

sseF <- sum(a5$res^2); sseR <- sum(a2$res^2)
Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-5) )		# 0.702 (obviously not sign)
pval <- 1-pf(Ftest,3,32)				# 0.558


a6 <- lsfit(X0,price,int=F)	# can be compared to a3

sseF <- sum(a6$res^2); sseR <- sum(a3$res^2)
Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-4) )		# 0.716 (obviously not sign)
pval <- 1-pf(Ftest,3,32)				# 0.550


### (d)   log10 transform	(no intercept form)

X0t <- cbind( log10(X0[,1]), X0[,2:4] )		# log10 points
prct  <- log10(price)				# log10 price

plot(X0t[,1],prct,pch=16,xlim=range(0,X0t[,1]),ylim=range(0,prct),main="no intercept")
a7 <- lsfit(X0t,prct,int=F)

print(a7$coef)		#  0.496 0.057 0.289 0.629 

abline( 0, a7$coef[1] ); for(k in 2:4) abline( a7$coef[k], a7$coef[1] )


		## with intercept

plot(X0t[,1],prct,pch=16,main="with intercept")
a8 <- lsfit(X0t,prct,int=T)

print(a8$coef)		#         -2.352     1.058    -0.004    -0.048    -0.049 

abline( a8$coef[1], a8$coef[2] )
  for(k in 3:5) abline( a8$coef[1] + a8$coef[k], a8$coef[2] )


a9 <- lsfit(X0t[,1],prct,int=T)		# need this now (reduced model)

sseF <- sum(a8$res^2); sseR <- sum(a9$res^2)
Ftest <- ( (sseR-sseF)/3 ) / ( sseF / (36-5) )		# 0.335 (obviously not sign)
pval <- 1-pf(Ftest,3,31)				# 0.800    can drop


> ls.print(a8)

R-Square=0.9809

          Estimate Std.Err  t-value Pr(>|t|)
Intercept  -2.3522  0.1844 -12.7550   0.0000
X1          1.0585  0.0446  23.7183   0.0000
X2         -0.0039  0.0370  -0.1046   0.9174
X3         -0.0482  0.0544  -0.8866   0.3821
X4         -0.0493  0.0732  -0.6736   0.5056


	##  shipping too vague   (or some prices included shipping and some did not??)


##  other way    log(price) vs log(points)

XX <- cbind( log( ml[,1] ), ifelse(ml[,3]==2,1,0), ifelse(ml[,3]==3,1,0),
	ifelse(ml[,3]==4,1,0) ); dimnames(XX) <- list(NULL,c("pts","s2","s3","s4"))
pts <- log( ml[,2] )

a10 <- lsfit(XX,pts)


##############################################
####  Problem 3 ##############################
##############################################


tmp <- read.table("APPENC07.txt")
yt <- log10(tmp[,1]-76310)		# transformed selling price
X <- matrix( scan("houses.txt"), ncol=18,byrow=T )

ls.house <- lsfit( scale(X), scale(yt) )


> ls.print( ls.house )
Residual Standard Error=0.4233
R-Square=0.827
F-statistic (df=18, 503)=133.584
p-value=0

          Estimate Std.Err t-value Pr(>|t|)
Intercept   0.0000  0.0185  0.0000   1.0000   zero since centered X and y
X1          0.4610  0.0377 12.2276   0.0000   square feet
X2          0.0589  0.0308  1.9096   0.0568   bed 3
X3          0.0846  0.0330  2.5615   0.0107   bed 4
X4          0.0391  0.0299  1.3062   0.1921   bed 5 or more (curiously small)
X5          0.0845  0.0318  2.6600   0.0081   bath 2
X6          0.2165  0.0409  5.2987   0.0000   bath 3
X7          0.1988  0.0430  4.6261   0.0000   bath 4 or more (again smaller)
X8          0.0608  0.0224  2.7105   0.0069   A/C
X9          0.0824  0.0321  2.5704   0.0104   cars 2
X10         0.1299  0.0363  3.5751   0.0004   cars 3 or more
X11         0.0303  0.0191  1.5876   0.1130   pool  (A/C cools you more?)
X12         0.1447  0.0259  5.5919   0.0000   year built (newness counts)
X13         0.0686  0.0317  2.1669   0.0307   quality 2
X14         0.1954  0.0357  5.4669   0.0000   quality 1
X15         0.0142  0.0231  0.6123   0.5406   style 1
X16        -0.0824  0.0262 -3.1467   0.0017   style 7 (more expensive but???)
X17         0.1351  0.0202  6.6792   0.0000   lot size
X18        -0.0227  0.0189 -1.1994   0.2310   adjacent to highway (surprise!


#	R^2 looks impressive to me

plot( scale(yt), cbind(1,scale(X)) %*% ls.house$coef ); abline(0,1)
hist( ls.house$res,20 ); xx <- seq(-3,1.4,l=201)
lines( xx, 522*0.2*dnorm(xx,0,sqrt(var(ls.house$res))), col=2, lwd=2 )


### (b)   leverages


hii <- rowSums( svd(X)$u^2 )
plot(hii)

ii <- seq(522)[ hii> 0.10 ]
X[ii,]


        1  2 3 4 5 6 7 8 9 10 11   12  13 14 15 16  17   18
 [1,] 2283 1 0 0 0 1 0 1 0  1  0  1997  0  1  1  0 18524  1
 [2,] 2060 0 0 0 0 1 0 1 1  0  0  1997  1  0  1  0 38623  1
 [3,] 2087 0 0 0 1 0 0 1 1  0  0  1966  1  0  1  0 24764  1
 [4,] 2081 0 1 0 1 0 0 1 1  0  0  1980  1  0  0  0 24993  1
 [5,] 1696 1 0 0 0 1 0 1 1  0  0  1978  1  0  0  0 22294  1
 [6,] 2222 0 1 0 1 0 0 0 1  0  0  1955  1  0  1  0 71527  1
 [7,] 2110 0 0 1 0 1 0 1 1  0  0  1957  1  0  1  0 15332  1
 [8,] 1774 0 1 0 1 0 0 0 1  0  0  1963  0  0  1  0 15528  1
 [9,] 1592 1 0 0 1 0 0 1 1  0  0  1957  0  0  1  0 11221  1
[10,] 1748 1 0 0 1 0 0 1 1  0  0  1972  0  0  1  0 23939  1
[11,] 1985 1 0 0 0 0 0 0 1  0  0  1948  0  0  1  0 69975  1
[12,] 2344 0 1 0 0 1 0 1 1  0  0  1925  0  1  1  0 86004  0
[13,] 2017 1 0 0 0 0 0 0 0  0  0  1958  1  0  0  0 86571  0


table(X[,18])

  0   1 		##  ALL of the highway houses have high leverage
511  11 

sort(X[,17])[515:522]
[1] 70240 71527 75232 80886 86004 86248 86571 86830	# other 2 big lots