############################################################################### # # # Mode Tree S-plus Functions # # # # These functions, together with the FORTRAN code in the companion file # # "modetree.f", will allow the user to generate univariate mode trees, as # # described in Minnotte and Scott, (1993) "The Mode Tree: A Tool for # # Visualization of Nonparametric Density Features," Journal of Computational # # and Graphical Statistics, 2, 51-68. # # # # The primary functions are the first four. # # # # modetree - generates a basic mode tree including only modes and branchings # # enhmtree - generates an enhanced mode tree including modes, branchings, # # mode sizes, bumps, and antimodes # # modetreeplot - plots a basic mode tree (if doplot=T, called automatically # # from modetree) # # enhmtreeplot - plots an enhanced mode tree (if doplot=T, called # # automatically from enhmtree) # # # # The remaining functions exist primarily to be called from the above # # functions, but may be useful themselves. # # # # bin1 - bins a data set into nbin bins for use in nash1 # # nash1 - approximates a kernel density estimate with normal kernel using # # an appropriately weighted average shifted histogram (ASH) # # nicerange - range of a data set extended on each side (default: 10%) # # matcher - matches elements from two lists as described in the appendix # # to Minnotte and Scott # # quadest - given three ordered pairs, finds the extremum of the interpolated # # parabola # # findmodes - given sequence of values representing a density function, # # identify number of modes, locations of modes, and indices of modes # # and antimodes # # findbumps - given sequence of values representing a density function, # # identify bumps (regions of negative second derivative) # # # # All functions are copyright 1993, Michael C. Minnotte except bin1 and # # nicerange, copyright 1986, David W. Scott, and nash1, copyright 1993 David # # W. Scott and Michael C. Minnotte. # # # # The authors may be contacted at minnotte@math.usu.edu and scottdw@rice.edu .# ############################################################################### modetree<- function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/500), maintit = "", xlb="", nbin = 500, nh = 200, doplot = T, docat=T) # modetree - generates a basic mode tree including only modes and branchings # X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit - # main title on plot; xlb - x label on plot; nbin - number of bins for data # and ASH; nh - number of different bandwidths; doplot - plot tree on # current device if TRUE; docat - indicate status if TRUE # Copyright 1993, Michael C. Minnotte { tree <- NULL modes <- NULL bins <- bin1(X, nbin = nbin) maxh <- ran[1] minh <- ran[2] loghlist <- seq(log(maxh), log(minh), length = nh) n <- length(X) hlist <- exp(loghlist) dh<- hlist[2]/hlist[1] est <- nash1(bins, hlist[1]) oldmodes <- findmodes(est$x, est$y) tree <- cbind(oldmodes$modes[, 1], rep(hlist[1], oldmodes$nm), rep( 0, oldmodes$nm), rep(0, oldmodes$nm)) count <- 0 for(i in 2:length(hlist)) { if(docat) cat("h number ", i, " starting.\n") est <- nash1(bins, hlist[i]) newmodes <- findmodes(est$x, est$y) mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1]) for(j in 1:newmodes$nm) { if(mout[j, 1] <= oldmodes$nm) tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + mout[j, 1], 1)) else { oldfrom <- (1:oldmodes$nm)[newmodes$modes[ j, 2] > oldmodes$am[, 1] & newmodes$ modes[j, 2] < oldmodes$am[, 2]] newfrom <- mout[oldfrom, 2] tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + oldmodes$nm + newfrom, 2)) } } count <- count + oldmodes$nm oldmodes <- newmodes } if(doplot) modetreeplot(tree, maintit, xlb) return(tree) } ############################################################################### enhmtree<- function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/200), maintit = "", xlb = "", nbin = 500, nh = 200, doplot = T, docat = T) # enhmtree - generates an enhanced mode tree including modes, branchings, # mode sizes, bumps, and antimodes # X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit - # main title on plot; xlb - x label on plot; nbin - number of bins for data # and ASH; nh - number of different bandwidths; doplot - plot tree on # current device if TRUE; docat - indicate status if TRUE # Copyright 1993, Michael C. Minnotte { tree <- NULL am <- NULL bins <- bin1(X, nbin = nbin) maxh <- ran[1] minh <- ran[2] loghlist <- seq(log(maxh), log(minh), length = nh) hlist <- exp(loghlist) n <- length(X) est <- nash1(bins, hlist[1]) d <- est$x[2] - est$x[1] oldbumps <- findbumps(est$x, est$y) nob <- nrow(oldbumps) shade <- cbind(oldbumps, rep(hlist[1], nob), rep(0, nob), rep(0, nob)) oldmodes <- findmodes(est$x, est$y) if(oldmodes$nm == 1) tree <- c(oldmodes$modes[1, 1], hlist[1], 0, 1) else for(i in 1:oldmodes$nm) { temp <- .Fortran("excise", as.double(est$y), g = double(nbin), coeff = double(1), as.integer(nbin), as.integer(oldmodes$am[i, 1]), as.integer(oldmodes$am[i, 2]), as.double(d)) wd <- sum(est$y - temp$g) * d tree <- rbind(tree, c(oldmodes$modes[i, 1], hlist[ 1], 0, wd)) } onm1<-oldmodes$nm-1 if(oldmodes$nm > 1){ oldam<-rep(0,onm1) for(j in 1:onm1) oldam[j] <- quadest(est$x[(oldmodes$am[j + 1, 1] - 1):(oldmodes$am[j + 1, 1] + 1)], est$y[ (oldmodes$am[j + 1, 1] - 1):(oldmodes$am[j + 1, 1] + 1)]) am<-cbind(oldam,rep(hlist[1],onm1),rep(0,onm1), rep(0,onm1))} count <- 0 countb<-0 countam <- 0 for(i in 2:length(hlist)) { if(docat) cat("h number ", i, " starting.\n") est <- nash1(bins, hlist[i]) newbumps <- findbumps(est$x, est$y) nnb <- nrow(newbumps) mblout <- matcher(newbumps[,1],oldbumps[,1])[1:nnb,1] mblout[mblout>nob] <- -countb mbrout <- matcher(newbumps[,2],oldbumps[,2])[1:nnb,1] mbrout[mbrout>nob] <- -countb newbumps <- cbind(newbumps, rep(hlist[i], nnb), countb + mblout, countb + mbrout) shade <- rbind(shade, newbumps) countb <- countb + nob nob <- nnb oldbumps <- newbumps # shade <- rbind(shade, cbind(bumps, rep(hlist[i], nrow(bumps)))) newmodes <- findmodes(est$x, est$y) mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1]) for(j in 1:newmodes$nm) { if(newmodes$nm == 1) wd <- 1 else { temp <- .Fortran("excise", as.double(est$y), g = double(nbin), coeff = double(1), as.integer(nbin), as.integer(newmodes$am[j, 1]), as.integer(newmodes$am[j, 2]), as.double(d)) wd <- sum(est$y - temp$g) * d } if(mout[j, 1] <= oldmodes$nm) tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + mout[j, 1], wd)) else { oldfrom <- (1:oldmodes$nm)[newmodes$modes[ j, 2] > oldmodes$am[, 1] & newmodes$ modes[j, 2] < oldmodes$am[, 2]] newfrom <- mout[oldfrom, 2] tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + oldmodes$nm + newfrom, -1 * wd)) } } count <- count + oldmodes$nm if(newmodes$nm > 1) { nnm1<-newmodes$nm-1 newam<-rep(0,nnm1) for(j in 1:nnm1) newam[j] <- quadest(est$x[(newmodes$am[j + 1, 1] - 1):(newmodes$am[j + 1, 1] + 1)], est$y[ (newmodes$am[j + 1, 1] - 1):(newmodes$am[j + 1, 1] + 1)]) if (is.null(am)) am<-cbind(newam,rep(hlist[i],nnm1),rep(0,nnm1), rep(0,nnm1)) else { mout<-matcher(newam,oldam) for (j in 1:nnm1) if (mout[j,1]<=onm1) am<-rbind(am,c(newam[j], hlist[i],countam+ mout[j,1],1)) else am<-rbind(am,c(newam[j], hlist[i],0,0)) } countam<-countam + onm1 onm1<-nnm1 oldam<-newam } oldmodes <- newmodes } out <- list(tree = tree, shade = shade, am = am) if(doplot) enhmtreeplot(out, maintit, xlb=xlb) return(out) } ############################################################################### modetreeplot<- function(tree, maintit = "", xlb="") # modetreeplot - plots a basic mode tree (if doplot=T, called automatically # from modetree) # tree - mode tree (output from modetree); maintit - main title; xlb - x label # Copyright 1993, Michael C. Minnotte { nr <- nrow(tree) plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim = nicerange(tree[, 1]), ylim = range(tree[, 2]), log = "y", xlab = xlb,ylab = "h", main = maintit) t1 <- (1:nrow(tree))[tree[, 4] == 1] t2 <- (1:nrow(tree))[tree[, 4] == 2] segments(tree[t1, 1], tree[t1, 2], tree[tree[t1, 3], 1], tree[tree[ t1, 3], 2], lty = 1) segments(tree[t2, 1], tree[t2, 2], tree[tree[t2, 3], 1], tree[tree[ t2, 3], 2], lty = 2) } ############################################################################### enhmtreeplot<- function(enhmtree, maintit = "", xlb="", wd = 0, cols=c(16,1,2,1,0,1), amp=3) # enhmtreeplot - plots an enhanced mode tree (if doplot=T, called # automatically from enhmtree) # tree - mode tree (output from modetree); maintit - main title; xlb - x label; # wd - width of line indicating mode consisting of entire density (default of # 0 calculates so that no mode line goes beyond its bump's shaded area); # cols - colors: [1] - modes (polygon), [2] - modes (segments), [3] - bumps # (polygons), [4] - antimodes (points/segments), [5] - background/antibumps # (polygon), [6] - labels/box (lines/text), selected values look good for # X11 graphics mode, for openlook cyan-to-red I like c(5,5,2,1,6,1), and # for (b&w) postscript I like c(1,1,2,1,0,1); amp - antimode points, if # positive one dot for every amp h values will be plotted, if nonpositive, # line type -amp will be used. # Copyright 1993, Michael C. Minnotte { par(las = 1) tree <- enhmtree$tree shade <- enhmtree$shade am <- enhmtree$am nr <- nrow(tree) plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim = range(shade[, 1:2]), ylim = range(shade[, 3]), log = "y", xlab = xlb, ylab = "h", main = maintit, col=cols[6]) h <- unique(shade[, 3]) lh <- length(h) logsep <- log(h[2]) - log(h[1]) logsep2 <- logsep/2 # factor <- sqrt(h[1]/h[2]) # polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])), # c(rep(min(shade[, 3])/factor, 2), rep(max(shade[, 3]) * factor, # 2)), border = F, col = cols[5]) polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])), c(rep(min(shade[, 3]), 2), rep(max(shade[, 3]), 2)), border = F, col = cols[5]) s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] > 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5], 2], shade[shade[s1,4], 1], rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3], shade[shade[s1,4],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] == 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], (shade[s1,2] + shade[s1+1,1])/2, shade[shade[s1,4], 1], rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,4],3], shade[shade[s1,4],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) s1 <- (1:nrow(shade))[shade[,4] == 0 & shade[,5] > 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5],2], (shade[s1,1] + shade[s1-1,2])/2, rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3], shade[shade[s1,5],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) if(wd == 0) { gt <- (1:nrow(tree))[tree[, 4] < 0] mt <- tree[gt, 3] ms <- rep(0, length(mt)) for(i in 1:length(mt)) { ms[i] <- (1:nrow(shade))[shade[, 3] == tree[mt[i], 2] & shade[, 1] < tree[mt[i], 1] & shade[, 2] > tree[mt[i], 1]] } wd <- min((shade[ms, 2] - tree[mt, 1])/abs(tree[mt, 4]), ( tree[mt, 1] - shade[ms, 1])/abs(tree[mt, 4])) * .9 } gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0] nt <- length(gt) x1 <- cbind(tree[gt, 1] - wd * abs(tree[gt, 4]), tree[gt, 1] + wd * abs( tree[gt, 4]), tree[tree[gt, 3], 1] + wd * abs(tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] - wd * abs(tree[tree[gt, 3], 4]), rep( NA, nt)) y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[ gt, 3], 2], rep(NA, nt)) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[1]) t2 <- (1:nrow(tree))[tree[, 4] < 0] gt <- tree[t2, 3] nt <- length(gt) x1 <- cbind(tree[gt, 1] - wd * abs(tree[tree[gt, 3], 4]), tree[gt, 1] + wd * abs(tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] + wd * abs( tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] - wd * abs(tree[ tree[gt, 3], 4]), rep(NA, nt)) y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[ gt, 3], 2], rep(NA, nt)) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[1]) gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0] segments(tree[gt, 1], tree[gt, 2], tree[tree[gt, 3], 1], tree[tree[gt, 3], 2], col=cols[2]) segments(tree[t2, 1], tree[t2, 2], tree[tree[t2, 3], 1], tree[tree[ t2, 3], 2], lty = 3, col=cols[2]) if (amp > 0) { h<-unique(am[,2]) h<-c(h,rep(0,ceiling(length(h)/amp)*amp-length(h))) h<-matrix(h,amp) goodh<-h[1,] goodam<-NULL for (i in 1:length(goodh)) goodam<-rbind(goodam,am[am[,2]==goodh[i],]) points(goodam[, 1], goodam[, 2], pch = ".",col=cols[4]) } else { a1 <- (1:nrow(am))[am[, 4] == 1] segments(am[a1, 1], am[a1, 2], am[am[a1, 3], 1], am[am[a1, 3], 2], col = cols[4], lty = -amp) # a2 <- (1:nrow(am))[am[, 2] == am[nrow(am), 2]] # segments(am[a2, 1], am[a2, 2], am[a2, 1], am[a2, 2]/factor, # col = cols[4], lty = -amp) } } ############################################################################### bin1<- function(x, ab = nicerange(x), nbin = 500) # bin1 - bins a data set into nbin bins for use in nash1 # x - (unbinned) data; ab - range for bins; nbin - number of bins # Copyright 1986, David W. Scott { if(ab[1] >= ab[2]) fatal("Interval vector has negative orientation") if(nbin <= 0) fatal("Number of bin intervals nonpositive") a <- .Fortran("bin1", as.double(x), as.integer(length(x)), as.double(ab), as.integer(nbin), bc = double(nbin), nskip = integer(1)) return(list(bc = a$bc, ab = ab, nskip = a$nskip)) } ############################################################################### nash1<- function(bins, h) # nash1 - approximates a kernel density estimate with normal kernel using # an appropriately weighted average shifted histogram (ASH) # bins - binned data; h - bandwidth (s.d. of normal kernel) # Copyright 1993, David W. Scott and Michael C. Minnotte { if(h <= 0) stop("Smoothing parameter m nonpositive") bc <- bins$bc ab <- bins$ab nbin <- length(bc) a <- .Fortran("nash1", as.double(h), as.double(bc), as.integer(nbin), as.double(ab), t = double(nbin), f = double(nbin), work = double(nbin)) return(list(x = a$t, y = a$f, ab = ab, h = h, n = sum(bc), nskip = bins $nskip)) } ############################################################################### nicerange<- function(x, beta = 0.2) # nicerange - range of a data set extended on each side (default: 10%) # x - unbinned data; beta - total fraction of data range to add (split between # the two sides) # Copyright 1986, David W. Scott { ab <- range(x) # interval surrounding data del <- ((ab[2] - ab[1]) * beta)/2 return(c(ab + c(.Uminus(del), del))) } ############################################################################### matcher<- function(alist, blist) # matcher - matches elements from two lists as described in the appendix # to Minnotte and Scott # alist - first vector of sorted values; blist - second vector of sorted # values # Copyright 1993, Michael C. Minnotte { na <- length(alist) nb <- length(blist) alist <- c(alist, rep(0, 100 - na)) blist <- c(blist, rep(0, 100 - nb)) mout <- matrix(0, 100, 2) out <- .Fortran("matcher", as.double(alist), as.double(blist), as.integer(na), as.integer(nb), mout = as.integer(mout)) return(matrix(out$mout, 100, 2)) } ############################################################################### quadest<- function(x, y) # quadest - given three ordered pairs, finds the extremum of the interpolated # parabola # x - 3-vector of x values; y - 3-vector of y values # Copyright 1993, Michael C. Minnotte { a <- ((y[1] - y[2])/(x[1] - x[2]) - (y[2] - y[3])/(x[2] - x[3]))/( x[1] - x[3]) b <- (y[1] - y[2])/(x[1] - x[2]) - a * (x[1] + x[2]) quad <- - b/(2 * a) return(quad) } ############################################################################### findmodes<- function(x, y) # findmodes - given sequence of values representing a density function, # identify number of modes, locations of modes, and indices of modes # and antimodes # x - x-values returned by nash1; y - density values returned by nash1 # Copyright 1993, Michael C. Minnotte { nbin <- length(x) out <- .Fortran("findmodes", as.double(y), as.double(x), as.integer(nbin), rm = double(100), mc = integer(100), ml = integer(100), mr = integer(100), nm = integer(1)) return(nm = out$nm, modes = cbind(out$rm[1:out$nm], out$mc[1:out$nm]), am = cbind(out$ml[1:out$nm], out$mr[1:out$nm])) } ############################################################################### findbumps<- function(x, y) # findbumps - given sequence of values representing a density function, # # identify bumps (regions of negative second derivative) # # x - x-values returned by nash1; y - density values returned by nash1 # Copyright 1993, Michael C. Minnotte { n <- length(y) f2 <- c(1, y[1:(n - 2)] - 2 * y[2:(n - 1)] + y[3:n], 1) s <- ifelse(f2 < 0, -1, 1) s2 <- s[2:n] - s[1:(n - 1)] i1 <- (1:(n - 1))[s2 == -2] + 1 i2 <- (1:(n - 1))[s2 == 2] return(cbind(x[i1], x[i2])) }