# =============================================================================
# Linear Regression
# =============================================================================
# Frequentist and Bayesian (non-informative prior) regression of bird
# extinction times on nesting, size, and status, using the birdextinct data
# shipped with LearnBayes.
library(LearnBayes)  # library() errors if missing; require() only warns
data(birdextinct)

# Bind the columns we use explicitly instead of attach()-ing the data frame
# (attach() masks the search path and is a well-known footgun).
time    <- birdextinct$time
nesting <- birdextinct$nesting
size    <- birdextinct$size
status  <- birdextinct$status
species <- birdextinct$species

# =============================================================================
# Exploratory analysis
# =============================================================================

# Distribution of the dependent variable (Y): skewed, so model on log scale.
hist(time)
logtime <- log(time)
hist(logtime)

# Relationship between Y and each predictor.
# Nesting (X1) - notice positive linear relationship
plot(nesting, logtime)
out <- logtime > 3  # flag extreme extinction times and label their species
text(nesting[out], logtime[out], label = species[out], pos = 3)

# Size (X2), a 0/1 indicator - jitter to separate overplotted points
plot(jitter(size), logtime, xaxp = c(0, 1, 1))

# Status (X3), a 0/1 indicator
plot(jitter(status), logtime, xaxp = c(0, 1, 1))

# =============================================================================
# Inference
# =============================================================================

# Frequentist approach --------------------------------------------------------
fit <- lm(logtime ~ nesting + as.factor(size) + as.factor(status))
summary(fit)

# Bayesian approach using non-informative prior --------------------------------
y <- logtime
n <- length(y)
x <- as.matrix(cbind(rep(1, n), birdextinct[, 3:5]))  # design matrix + intercept

# Precompute quantities reused by every posterior draw.
Vb      <- solve(t(x) %*% x)                              # (X'X)^{-1}
betahat <- Vb %*% t(x) %*% y                              # OLS = posterior mean
s2      <- t(y - x %*% betahat) %*% (y - x %*% betahat)   # residual sum of squares

# MCMC settings.
# NOTE: the name T is kept because later parts of the script read it, but it
# masks the logical constant T (TRUE) -- always spell out TRUE/FALSE below.
T <- 10000
k <- ncol(x)
sigma2 <- rep(NA, T)        # preallocated draws of the error variance
beta   <- matrix(NA, T, k)  # preallocated draws of the coefficients

# Sample from the joint posterior:
#   sigma2 | y       ~ InvGamma((n - k)/2, s2/2)
#   beta | sigma2, y ~ N(betahat, sigma2 * Vb)
for (t in seq_len(T)) {
  sigma2[t] <- rigamma(1, (n - k) / 2, s2 / 2)
  beta[t, ] <- rmnorm(1, betahat, sigma2[t] * Vb)
}

# Distribution of the posterior samples.
par(mfrow = c(2, 2))
hist(beta[, 2], main = "Nesting",  xlab = expression(beta[1]))
hist(beta[, 3], main = "Size",     xlab = expression(beta[2]))
hist(beta[, 4], main = "Status",   xlab = expression(beta[3]))
hist(sigma2,    main = "Variance", xlab = expression(sigma^2))
par(mfrow = c(1, 1))

# Posterior summary statistics: mean and 95% credible interval.
cat("Posterior mean of beta:", apply(beta, 2, mean), "\n")
cat("Quantiles for beta:\n")
apply(beta, 2, quantile, c(0.025, 0.5, 0.975))
cat("Quantiles for sigma^2:", quantile(sigma2, c(0.025, 0.5, 0.975)), "\n")

# =============================================================================
# Prediction
# =============================================================================

# Four hypothetical covariate sets: nesting fixed at 4, crossed with all
# combinations of size and status.
x_f00 <- data.frame(nesting = rep(4, 4),
                    size    = c(rep(0, 2), rep(1, 2)),
                    status  = rep(c(0, 1), 2))

# Frequentist approach --------------------------------------------------------
predict(fit, x_f00, interval = "prediction")

# Bayesian approach: approximate posterior predictive density ------------------
x_f <- as.matrix(cbind(rep(1, 4), x_f00))
n_f <- nrow(x_f)
ytilde <- matrix(NA, T, n_f)
for (t in seq_len(T)) {
  ytilde[t, ] <- rmnorm(1, x_f %*% beta[t, ], sigma2[t] * diag(n_f))
}

# Distribution of posterior predictive samples.
# BUG FIX: the original passed prob = T, but T was reassigned to 10000 above,
# so the flag relied on silent numeric-to-logical coercion; use TRUE.
c.labels <- c("A", "B", "C", "D")
par(mfrow = c(2, 2))
for (j in 1:4) {
  hist(ytilde[, j], main = paste("Covariate Set", c.labels[j]),
       xlab = "log time", prob = TRUE, breaks = 20)
}
par(mfrow = c(1, 1))

# Posterior predictive mean and 95% credible interval per covariate set.
cbind(apply(ytilde, 2, mean),
      t(apply(ytilde, 2, quantile, c(0.025, 0.975))))

# =============================================================================
# Model checking and robustness:
# Evaluate whether the data are consistent with the fitted model
# =============================================================================
# Simulate future samples y1*,...,yn* from the posterior predictive
# distribution, conditional on the training covariates.
# Posterior predictive replicates of the observed responses: one simulated
# data set y* per retained MCMC draw, conditional on the training design x.
ystar <- matrix(NA, T, n)
for (t in seq_len(T)) {
  ystar[t, ] <- rmnorm(1, x %*% beta[t, ], sigma2[t] * diag(n))
}

# Compare each observed y with the distribution of its simulated y* values.
y_store <- cbind(y,
                 apply(ystar, 2, mean),
                 t(apply(ystar, 2, quantile, c(0.025, 0.975))))
colnames(y_store)[2] <- "y_star_mean"

# Pointwise 95% posterior predictive intervals (row 1 = lower, row 2 = upper).
pred.ci <- apply(ystar, 2, quantile, c(0.025, 0.975))

par(mfrow = c(1, 1))
matplot(rbind(1:n, 1:n), pred.ci, type = "l", lty = 1,
        xlab = "Observation Index", ylab = "Log time", col = "black")

# Overlay the actual extinction log-times (y) and label observations that
# fall outside their 95% credible interval.
points(1:n, logtime, pch = 19)
# BUG FIX: the original only flagged values above the upper bound; "outside
# the interval" must also include values below the lower bound.
out <- (logtime < pred.ci[1, ]) | (logtime > pred.ci[2, ])
text((1:n)[out], logtime[out], label = species[out], pos = 4)