## From c:/homeNEW/R/library/gm ## source("R/gm.R") ## package.skeleton("gm",path='gm') ## ## In cygwin: ## $ cd c:/homeNEW/R/library ## $ Rcmd INSTALL gm ## ## Last updated: October 12, 2004 ## 2004-10-16: Changed sub in gm (extract substring up to pattern) to subst ## to avoid masking sub in R base. ## Functions that may need modification for R are in gm.R.originalcode ## ## Changes ## Feb 15 2005: Tri function to build symmetric matrix from unique elements and Triinv. ## Oct 12 2004: moved some functions to gm.R.originalcode ## updated: td ## new: xqplot ## ## Nov 5, 2003: tests of normality by Dave Cummins, V. Devanarayan # # fun.SSC # a collection of useful functions # # Character: # # ch <- as.character # tr: translate # subst: extract first portion of string to first matching character # # Graphics # # td: trellis.device # myplot # brace # ell # data.ell # shadow # slice # # est: estimation with linear models # # diags: # cis: confidence and prediction intervals for linear models # tt: table with confidence intervals for coefficients # disp: display regression output in minimalist form # vif.lm : variance inflation factors # # # Data manipulation # # stack: probably better done with 'merge' # total: adds border of totals to matrix # dfapply: # tochar: # tofactor: # dfapply # # Arrays: # abind: glues two arrays along a face (generalizes c, rbind, cbind) # atotal: borders an array with totals (generalizes total) # see also: aperm (generalizes t) # # File manipulation: # # fopen: open window # ex: export data frame to Excel file # export: what's the diff?? # # SHOULD DO: # w2l: (wide to long) # # # 00-11-06: added logic functions from pascal:~georges/library/home/home.s # ????? NOT UP TO DATE .... REPLACE WITH VERSION ON MACK # # This is currently the master version 01-12-21 # Log: # Added mods from HeenanBlaikie to: Key, stack and superpose3 # attach("C:\\Program Files\\sp2000\\library\\home\\_Data" ,1) # search() # RUN # detach( what = "C:\\Program Files\\sp2000\\library\\home\\_Data" ) # detach( what = "C:\\Program Files\\sp2000\\library\\home\\_Data" ) # library(home) ".First.lib" <- function(lib, pkg) { cat("panel.superpose may be masked\n") invisible() } #{ # cat("panel.superpose may be masked\n") # cat("\n") # cat("3.0.2 nlme.formula modified to report number of starting values needed\n") # cat("\n") # cat("predict.nlme is corrected version for 3.3.1 -- replace with recent version\n") # cat(" when availab1e\n") # cat("\n") # cat("summary.default is modified to include standard deviations\n") # cat("\n") # cat("This version was last updated November 15, 2002 on the Compaq laptop\n") # cat("List of masked functions:\n") # print(masked()) # invisible() #} # Lagging # - interesting if 'index' is continuous and/or if there # are missing waves. # - need some kind of intrapolation to compute a derivative # at the time of observation # The first function here only works well with complete data # and an integer index ranging from 1 to n. (n can vary from # subject to subject) na.rm <- function(x) x[!is.na(x)] Lag <- function(x,id,idx,check=T) { # computes Lagged values but without intrapolation # Written 020907 by G. Monette at the Robarts RDC if (check) { comb <- paste(id,idx) if (any(duplicated(comb))) stop("id not unique in each level of idx") } ret <- x id <- as.character(id) names(x) <- id for ( i in max(idx):2) { to.pos <- idx == i from <- x[idx == (i-1)] ids <- names( x[to.pos]) ret[to.pos] <- from[ids] } ret[idx==1] <- NA ret } LagI <- function(x,id,time,lag=1,delta=.4,check=T) { # computes Lagged values with intrapolation # Written 020907 by G. Monette at the Robarts RDC # lags by intrapolating # with complete data at each value of index, this does the same thing # as Lag # If values of Lag are skipped then we get linear intrapolations. # Note that 'delta' should be small enough so that values of x are # at least delta apart. However, too small a value for delta introduces # numerical error # if (check) { comb <- paste(id,time) if (any(duplicated(comb))) stop("id not unique in each level of idx") } ret <- x id <- as.character(id) names(x) <- id names(time) <- id for (nn in unique(id)){ pos <- id == nn xx <- x[pos] tt <- time[pos] topred <- tt-delta drop <- is.na(xx)|is.na(tt) xxc <- xx[!drop] ttc <- tt[!drop] nl <- length(xxc) if ( nl > 0) { if ( nl > 1 ) { xx.p <- approx(ttc,xxc,tt-delta)$y xx.t <- approx(ttc,xxc,tt-delta/2)$y xx.lag <- xx.t - ((2*lag-delta)/delta)*(xx.t-xx.p) } else xx.lag <- NA } else xx.lag <- NA ret[pos] <- xx.lag } ret } DiffI <- function(xx,...) { # Change since last time period with intrapolation is # last time period is missing # Written 020907 by G. Monette at the Robarts RDC xx - LagI(xx,...) } ### ### RGL functions ### testing <- function(){ library(help=rgl) rgl x <- 1:20 y <- 1:20 + rnorm(20) z <- 1:20 + rnorm(20) rgl.open() } ### ### Modifications of car functions ### influence.plot <- function (model, scale = 10, col = c(1, 2), labels = names(rstud), cex.lab = 1, ...) { ## Modified from car by J. Fox to include a cex.lab argument hatval <- hatvalues(model) rstud <- rstudent(model) cook <- sqrt(cookd(model)) scale <- scale/max(cook, na.rm = TRUE) p <- length(coef(model)) n <- length(rstud) cutoff <- sqrt(4/(n - p)) plot(hatval, rstud, xlab = "Hat-Values", ylab = "Studentized Residuals", type = "n", ...) abline(v = c(2, 3) * p/n, lty = 2) abline(h = c(-2, 0, 2), lty = 2) points(hatval, rstud, cex = scale * cook, col = ifelse(cook > cutoff, col[2], col[1])) if (labels[1] != FALSE) identify(hatval, rstud, labels, cex = cex.lab) } tr <- function(x,from = levels(x),to.){ # improved version that leaves things unchanged if # they aren't in from xx <- as.character(x) from <- c(from, unique(xx)) to. <- c(to., unique(xx)) to. [ match( x, from ) ] } ## tr(c('a','b','b','a','x'), c('b','v'), c('B',"V")) ######### ######### ######### TRELLIS ######### ######### # sampler() td <- function(basecol = NULL, col=c(3,5,4,6,7,8,2), lty=1:7, lwd=1, pch = 1:7, cex = 0.8, font = 1, len = 7, long = F, new = F, record = T, colsets = c('plot.symbol','plot.line','dot.symbol', 'dot.line','cloud.3d','box.dot'),...) { # Modified for R: Oct. 10, 2004 # Note: By using col and lty/pch with lengths that are relatively prime and # by using the len argument, one can generate unique combinations e.g. for len = 42 with # col of length 6 and pch of length 7 # last updated: GAM June 17,2003 # # reset superpose.symbol and superpose.line so they have consistent length # equal to the max of input parameters: # sps: cex, col, font, pch # spl: lty, col, lwd # or to len # This allows distinctive line styles, for example for 12 objects by using # good lty's: 1,4,8,6,3,5,7 # and good col's: 1,8,6,3,4,5,2 # require(lattice) if ( long ) { col <- c(3,5,4,6,8,2) # drop yellow len <- 42 # generates 42 unique combinations of pch/lty and col } if(new) trellis.device(theme = col.whitebg, record = record, new = new, ...) # NOTE: fixed panel.superpose so lty argument # is passed to points for type = 'b' len <- max(len,length(col),length(lty),length(lwd),length(pch),length(cex),length(font)) spl <- trellis.par.get("superpose.line") spl$lty <- rep(lty,length=len) spl$col <- rep(col,length=len) spl$lwd <- rep(lwd,length=len) trellis.par.set("superpose.line",spl) sps <- trellis.par.get("superpose.symbol") sps$pch <- rep(pch, length=len) sps$col <- rep(col, length=len) sps$cex <- rep(cex, length=len) sps$font <- rep(font, length=len) trellis.par.set("superpose.symbol",sps) list(superpose.symbol=sps, superpose.line=spl) if ( !is.null(basecol)) { for ( ii in colsets ) { tt <- trellis.par.get(ii) tt$col <- basecol trellis.par.set(ii,tt) } } invisible(attr(.Device,"trellis.settings")) } ## ?sample ## ds <- data.frame(x=1:10, y = sample(LETTERS[1:3],10, rep =T)) ## xqplot(ds) ## zz <- jobeval[1:101,] ## td(new=T) ## xqplot(jobeval) xqplot <- function(x, labels = dimnames(x)[[2]], ..., ask = F, ptype = "quantile", mfrow = findmfrow ( ncol(x)), mcex = 1.2, maxlab = 12 , debug = F, mar = c(5,2,3,1), text.cex.factor = 1 , left.labs = F) { ## Adapted from myplot.data.frame for R by G. Monette, Oct. 25, 2004 ## maxlab is maximum number of labels left.labs <- rep( left.labs, length = ncol(x)) findmfrow <- function( x ) { if ( x > 9) c(3,4) else cbind( '1'=c(1,1),'2'=c(1,2),'3'=c(2,2), '4'=c(2,2),'5'=c(2,3),'6'=c(2,3), '7'=c(3,3), '8'=c(3,3), '9'=c(3,3)) [, x] } opt <- par( mfrow = mfrow, mar = mar + 0.1 ) on.exit(par(opt)) iscat <- function( x ) is.factor(x) || is.character(x) compute.cex <- function( x ) { ll <- length(x) cex <- 2 * ifelse( ll < 5, 2, ifelse( ll < 10, 1, ifelse( ll < 20, .7, .5)))/mfrow[1] } for ( ii in 1: dim(x)[2]) { vv <- x[[ii]] nam <- labels[[ii]] Nmiss <- sum(is.na(vv)) N <- length(vv) if ( iscat(vv) ){ tt <- table(vv) xlab <- paste("N =", N ) if ( Nmiss > 0 ) { tt <- c( "" = sum(is.na(vv)), tt) xlab <- paste(xlab, " Nmiss =", Nmiss) } ll <- names(tt) nn <- length(ll) if ( left.labs[ii] ) barplot( tt, horiz = T, xlab = xlab, cex.names = text.cex.factor * compute.cex(nn) ) else { zm <- barplot( tt, names = rep("",nn), horiz = T, xlab = xlab) ## If nn > maxlab drop labels for smaller frequencies sel <- rep( T, length(tt)) tt.sorted <- rev(sort(tt)) if ( nn > maxlab ) sel <- tt > tt.sorted[maxlab] if (debug) { print(sel) print(nam) print(tt) print(tt.sorted) print(maxlab) print(tt.sorted[maxlab]) print(sel) print(zm[sel]) print(rep(max(tt),nn)[sel]) print( ll[sel]) } if ( any(sel) ) text( rep( max( tt ), nn)[sel] , zm[sel], ll[sel], adj = 1, cex = text.cex.factor * compute.cex( nn )) } } else { vv <- vv[!is.na(vv)] xxvar <- ppoints(length(vv)) xlab <- paste("Fraction of",N-Nmiss,"(NA:",Nmiss,")") if ( pmatch( ptype, 'normal', nomatch = 0) == 1 ) { xxvar <- qnorm( ppoints(length(xvar)) ) xlab <- paste("Normal quantile for",N-Nmiss,"obs. (NA:",Nmiss,")") } ## Plot continuous if ( N == Nmiss ) { xxvar <- 1 vv <- 1 plot( xxvar, vv, xlab = xlab, ylab="", type = 'n') text( 1, 1, "NA") } else { plot(xxvar, sort(vv), xlab = xlab, ylab = "Data", ...) xm <- mean(vv) xs <- sqrt(var(vv)) abline( h= xm,lty=1) abline( h= c(xm-xs,xm+xs),lty=2) } } ## Titles for all plots mtext(labels[ii], 3, 1.5, cex = mcex) } invisible(0) } subst <- function(obj, pat) { ## 04-10-16: in R masks sub in base # extract first portion of string up to but # excluding first char matching expression # added: 02-12-16 if (isfact <- is.factor(obj)) obj <- as.character(obj) nmax <- max(nchar(obj)) lastex <- regexpr(pat,obj) - 1 lastex <- ifelse(lastex < 1, nmax, lastex) obj <- substring(obj,1,lastex) if(isfact) obj <- factor(obj) obj } # td(new=T) # # Strip functions # strip1 <- function(...) strip.default(..., style = 1) strip2 <- function(...) strip.default(..., style = 2) strip3 <- function(...) strip.default(..., style = 3) strip4 <- function(...) strip.default(..., style = 4) strip5 <- function(...) strip.default(..., style = 5) strip6 <- function(...) strip.default(..., style = 6) # # panel.superpose functions # "panel.superpose.lowess" <- function(x, y, subscripts, groups, lwd = superpose.line$lwd, lty = superpose.line$lty, font = superpose.symbol$font, col = NULL, f = 2/3, ...) { # this function was cobbled from panel.superpose # -- aey 1/26/2000 (Anne E. York) superpose.symbol <- trellis.par.get("superpose.symbol") superpose.line <- trellis.par.get("superpose.line") groups <- as.numeric(as.factor(groups))[subscripts] max.groups <- max(groups) lty <- rep(lty, length = max.groups) lwd <- rep(lwd, length = max.groups) font <- rep(font, length = max.groups) if(is.null(col)) { sl <- rep(superpose.line$col, length = max.groups) ss <- rep(superpose.symbol$col, length = max.groups) col <- numeric(max.groups) for(i in 1:max.groups) col[i] <- sl[i] } else col <- rep(col, length = max.groups) N <- seq(along = groups) for(i in sort(unique(groups))) { which <- N[groups == i] # j <- which[order(x[which])] # sort in x j <- which # no sorting lines(lowmiss(x[j], y[j], f = f), col = col[i], lwd = lwd[i], lty = lty[i]) } } "lowmiss" <- function(x, y, f = 2/3) { yes <- x != "NA" & y != "NA" lowess(x[yes], y[yes], f) } panel.superpose2 <- function(x, y, subscripts, groups, index, type = "p", lwd = superpose.line$lwd, lty = superpose.line$lty, pch = superpose.symbol$pch, cex = superpose.symbol$cex, font = superpose.symbol$font, col = NULL, ...) { # modified by Georges Monette 01-03-11 # from Splus panel.superpose so that symbol and line type are chosen # by a variable index # this is particularly useful when separate lines for each group but # wanting plot style to be determined by a different variable # Note that BOTH groups and index must be specified in xyplot superpose.symbol <- trellis.par.get("superpose.symbol") superpose.line <- trellis.par.get("superpose.line") groups <- as.numeric(as.factor(groups))[subscripts] index <- index[subscripts] max.index <- max(index,na.rm=T) type <- rep(type, length = max.index) lty <- rep(lty, length = max.index) lwd <- rep(lwd, length = max.index) pch <- rep(pch, length = max.index) cex <- rep(cex, length = max.index) font <- rep(font, length = max.index) if(is.null(col)) { sl <- rep(superpose.line$col, length = max.index) ss <- rep(superpose.symbol$col, length = max.index) col <- numeric(max.index) for(i in 1:max.index) col[i] <- if(type[i] == "l") sl[i] else ss[i] } else col <- rep(col, length = max.index) N <- seq(along = groups) for(i in sort(unique(groups))) { which <- N[groups == i] # j <- which[order(x[which])] # sort in x j <- which # no sorting ind <- (index[j])[1] if(type[ind] == "l") lines(x[j], y[j], col = col[ind], lwd = lwd[ind], lty = lty[ind], type = type[ind], ...) else points(x[j], y[j], col = col[ind], pch = pch[ind], cex = cex[ind], font = font[ind], type = type[ind], ...) } } xsel <- function( arr , index , cols = NULL, drop = T) { # exact selection to avoid partial matching into row.names of matrix or data.frame if ( is.null( rn <- row.names(arr))) stop("need non-null row.names") if ( is.null(cols) ) arr [ match( index, rn ), , drop = drop] else arr [ match( index, rn ), cols, drop=drop] } Key2 <- function(text, type = c('p','l'), columns = length(text), ...) { # use type = 'b' to get both a line and a character for any row # generates argument for 'key' in xyplot # -- to change lty etc, in a plot, use td() to change setting to trellis # since Key uses trellis setting and ignores lty,etc. arguments to xyplot # spl <- trellis.par.get("superpose.line") sps <- trellis.par.get("superpose.symbol") if (is.list(text)) ret <- list( columns = columns, text = text, ...) else ret <- list( columns = columns, text = list(text),...) if (any( type == 'p') | any(type == 'b')) { ret <- c(ret, points = list(Rows(sps, 1:length(text)))) ret$points$col[type == "l"] <- 0 } if (any( type == 'l') | any(type == 'b')){ ret <- c(ret, lines = list(Rows(spl, 1:length(text)))) ret$lines$col[type == "p"] <- 0 } ret } # panel.superpose # Key ( levels( dd$Name ) ) # search() # rm(dd) Key <- function(text, type = c('p','l'), columns = length(text),lwd = superpose.line$lwd, lty = superpose.line$lty, pch = superpose.symbol$pch, cex = superpose.symbol$cex, font = superpose.symbol$font, col = NULL, ...) { # # Improvement over previous Key which is now Key2 # # # generates argument for 'key' in xyplot # -- to change lty etc, in a plot, use td() to change setting to trellis # since Key uses trellis setting and ignores lty,etc. arguments to xyplot # superpose.symbol <- trellis.par.get("superpose.symbol") superpose.line <- trellis.par.get("superpose.line") max.groups <- length(text) type <- rep(type, length = max.groups) lty <- rep(lty, length = max.groups) lwd <- rep(lwd, length = max.groups) pch <- rep(pch, length = max.groups) cex <- rep(cex, length = max.groups) font <- rep(font, length = max.groups) if(is.null(col)) { sl <- rep(superpose.line$col, length = max.groups) ss <- rep(superpose.symbol$col, length = max.groups) col <- numeric(max.groups) for(i in 1:max.groups) col[i] <- if(type[i] == "l") sl[i] else ss[i] } spl <- list(col=col,lty=lty,lwd=lwd) sps <- list(cex=cex, col=col, font=font, pch = pch) print(spl) print(sps) # spl <- Rows( list(col=col,lty=lty,lwd=lwd) ,1:length(text)) # sps <- Rows( list(cex=cex, col=col, font=font, pch = pch) ,1:length(text)) type <- rep(type,length=length(text)) if (is.list(text)) ret <- list( columns = columns, text = text, ...) else ret <- list( columns = columns, text = list(text),...) spl$col <- ifelse( type == 'p', sps$col, spl$col) spl$lty <- ifelse( type == 'p', sps$pch, spl$lty) ret <- c(lines=list(spl),ret, type = list(type)) ret } # Key(levels(dd$Name)) zzz <- function(x, y, subscripts, groups, type = "p", lwd = superpose.line$lwd, lty = superpose.line$lty, pch = superpose.symbol$pch, cex = superpose.symbol$cex, font = superpose.symbol$font, col = NULL, ...) { superpose.symbol <- trellis.par.get("superpose.symbol") superpose.line <- trellis.par.get("superpose.line") groups <- as.numeric(as.factor(groups))[subscripts] max.groups <- max(groups) type <- rep(type, length = max.groups) lty <- rep(lty, length = max.groups) lwd <- rep(lwd, length = max.groups) pch <- rep(pch, length = max.groups) cex <- rep(cex, length = max.groups) font <- rep(font, length = max.groups) if(is.null(col)) { sl <- rep(superpose.line$col, length = max.groups) ss <- rep(superpose.symbol$col, length = max.groups) col <- numeric(max.groups) for(i in 1:max.groups) col[i] <- if(type[i] == "l") sl[i] else ss[i] } else col <- rep(col, length = max.groups) N <- seq(along = groups) for(i in sort(unique(groups))) { which <- N[groups == i] # j <- which[order(x[which])] # sort in x j <- which # no sorting if(type[i] == "l") lines(x[j], y[j], col = col[i], lwd = lwd[i], lty = lty[i], type = type[i], ...) else points(x[j], y[j], col = col[i], pch = pch[i], cex = cex[i], font = font[i], type = type[i], ...) } } panel.superpose3 <- function(x, y, subscripts, groups, Type ,type = "p", lwd = superpose.line$lwd, lty = superpose.line$lty, pch = superpose.symbol$pch, cex = superpose.symbol$cex, font = superpose.symbol$font, col = NULL, ...) { # # Uses Type ('l' or 'p') to plot points or lines within each level of groups # superpose.symbol <- trellis.par.get("superpose.symbol") superpose.line <- trellis.par.get("superpose.line") groups <- as.numeric(as.factor(groups))[subscripts] Type <- as.character( as.factor(Type)[subscripts] ) max.groups <- max(groups) type <- rep(type, length = max.groups) lty <- rep(lty, length = max.groups) lwd <- rep(lwd, length = max.groups) pch <- rep(pch, length = max.groups) cex <- rep(cex, length = max.groups) font <- rep(font, length = max.groups) if(is.null(col)) { sl <- rep(superpose.line$col, length = max.groups) ss <- rep(superpose.symbol$col, length = max.groups) col <- numeric(max.groups) for(i in 1:max.groups) col[i] <- if(type[i] == "l") sl[i] else ss[i] } else col <- rep(col, length = max.groups) N <- seq(along = groups) for(i in sort(unique(groups))) { which <- N[groups == i] # j <- which[order(x[which])] # sort in x j <- which # no sorting tt <- Type[j] xx <- x[j] yy <- y[j] if (any(jj <- tt == 'l') ){ lines(xx[jj], yy[jj], col = col[i], lwd = lwd[i], lty = lty[i], type= 'l', ...) } if (any(jj <- tt == 'p') ) { points(xx[jj], yy[jj], col = col[i], pch = pch[i], cex = cex[ i], font = font[i], type = 'p', ...) } } } # # Trellis # resources <- function(expr) { # From V&R: S Programming 2000 loc <- sys.parent(1) if (loc == 1) loc <- F on.exit(cat("Timing stopped at:", proc.time() - time, "\n")) expr <- substitute(expr) stime <- proc.time() mem.tally.reset() w <- eval(expr, local = loc) etime <- proc.time() mem <- mem.tally.report() on.exit() names(mem) <- c("Cache", "Working") # proc.time() gives length 1 on Windows. if( length(stime) == 1) stime <- c(NA,NA,stime,0,0) if( length(etime) == 1) etime <- c(NA,NA,etime,0,0) time <- etime - stime time[3] <- max( time[3] , time[1] + time[2] , na.rm=T) print( c( CPU = time[1] + time[2], Elapsed = time[3], "% CPU" = round(( 100* (time[1] + time[2]))/time[3], 1), Child = time[4] + time[5], mem)) invisible(w) } sampler <- function() { # sample of lines and symbols require(lattice) y <- 16*(0:15) x <- 0:15 print(xyplot( y ~ x, type = 'n', main = 'pch', panel = function(x,y,...) { for ( i in x) { for ( j in y ) { panel.xyplot(i,j,pch=i+j) } } })) print(xyplot( y ~ x, type = 'n', main = 'lty', panel = function(x,y,...) { for ( i in x) { panel.xyplot(c(i,i),range(y),type='l',lty=i,col=1) } })) print(xyplot( y ~ x, type = 'n', main = 'col', panel = function(x,y,...) { for ( i in x) { panel.xyplot(c(i,i),range(y),type='l',lty=1,col=i) } })) z <- expand.grid(y=1:7,x=0:2) # print(z$x, z$y, ylim=c(0,7)) print(xyplot( y ~ x , z, ylim =c(0,7),groups = y, type='b',panel= function(x,y,...) { panel.superpose(x,y,...) spl <- as.data.frame(trellis.par.get('superpose.line')) sps <- as.data.frame(trellis.par.get('superpose.symbol')) for ( i in y) { labl <- paste( names(spl), spl[i,], sep='=', collapse =',') labs <- paste( names(sps), sps[i,], sep='=', collapse =',') panel.text(0,i-1/4,labl,adj=0) panel.text(1,i-1/4,labs,adj=0) } } )) invisible(0) } # sampler() curvePlot <- function( expr, xlim = c(0,1), len = 200, x = seq( xlim[1],xlim[2], len= len), ylab = deparse(substitute(expr)),...) { # Modified from V&R, S Programming 2000 plot(x, eval(substitute(expr)), type="l", ylab = ylab, ...) } mypanel <- function(x, y, ...,family='gaussian') { panel.xyplot(x,y,...) z <- na.omit(data.frame(x=x,y=y)) if(length(unique(z$x))>5) panel.loess(z$x, z$y,...,family=family) if(nrow(z) > 3) panel.lmline(z$x, z$y,...) } mypanel2 <- function(x, y, ...) { panel.xyplot(x,y,...) if(diff(range(x)) > 0) panel.lmline(x, y,...) } # # NA type I # nasummary <- function( x , debug = F) { # x assumed to be a data.frame we change everything to a factor # print variables in the order they would be added to minimize the # number of additional cases dropped at each stage : i.e. forward # stepwise addition of variables to minimizes additional number of # cases dropped x <- as.data.frame(x) nr <- nrow(x) for ( i in seq(along=x)) { if( is.character(x[[i]])) x[[i]] <- factor(x[[i]]) x [[i]] <- as.numeric( x[[i]]) } varn <- names(x) if ( any ( duplicated( varn ) )) { add.to.name <- ifelse( duplicated( varn), paste(".",as.character(1:length(varn)),sep=""), "") names(x) <- paste( varn, add.to.name , sep = "") mess <- paste("Duplicated vars:", paste(varn[duplicated(varn)],collapse=" ")) warning( mess ) } x <- as.matrix(x) x <- 1*is.na(x) if (debug ) print(x) numnas <- apply(x , 2, sum) x <- rbind(x,0) x <- rbind(x,0) # make sure matrix doesn't disappear ret <- rep(0,ncol(x)) names(ret) <- rep("",ncol(x)) n <- ncol(x) for ( i in 1:n) { #if (debug ) print (x) nas <- apply(x, 2, sum) minpos <- order(nas)[1] # first index with fewest remaining nas addlmiss <- nas[minpos] ret[i] <- addlmiss names(ret)[i] <- dimnames(x)[[2]][minpos] if (debug ) print(ret) x <- x[ x[,minpos] != 1, - minpos, drop = F] } retd <- data.frame( drop=ret, dropped=cumsum(ret), left = nr-cumsum(ret)) retd$nas <- numnas[row.names(retd)] retd <- retd[, c(4,1:3) ] class(retd) <- c("nasummary",class(retd)) retd } plot.nasummary <- function( x, ... ) { # definitely needs work ... stay with naplot(naclus(....)) !! N <- x$dropped[1] + x$left[1] usr <- par("usr") eps <- 0.015 * diff(usr[1:2]) epsy <- 0.015 * diff(usr[3:4]) plot(x$dropped,x$nas) text(x$dropped,jitter(x$nas),row.names(x)) } # Data frame and factor manipulation # add labels to variables to produce comments label <- function(x,...) UseMethod("label") label.lme <- function( x, ... ) { z <- model.frame( x$call$formula, x$call$data) label(z) <- label(x$call$data) label.data.frame(z) } label.data.frame <- function(x) { ret <- character(length(x)) ret <- lapply( x , label ) class(ret) <- "label.list" label(ret) <- attr(x,"label") ret } print.label.list <- function(x, ...) { cat(label(x),"\n") z <- data.frame( name=I(names(x)), label=I(c(x,recursive=T)),row.names=NULL) sel <- nchar(z$label) > 0 z <- z[ sel , ] print.data.frame(z, ...) invisible(x) } label.default <- function(x) { x <- attr(x,"label") if ( is.null (x) ) x <- "" x } "label<-" <- function(x,y) { attr(x,"label") <- y x } form <- function(x, ...) UseMethod("form") form.lme <- function(x, ...) formula(x,...) form.nlme <- function(x, ...) formula.lme(x, ...) aggdev <- function(dd, groups, FUN = mean, ..., suffix = c(".b",".w")) { # creates a data.frame with two vars for each var in data.frame dd: # one var is aggregate by groups, other is deviation within groups ret <- list() for ( i in seq(along=dd) ) { nn <- names(dd)[i] nnb <- paste(nn,suffix[1],sep="") nnw <- paste(nn,suffix[2],sep="") x <- dd[[i]] xb <- tapply(x, groups, FUN, ...) [ tapply( x, groups)] xw <- x - xb ret[[nnb]] <- xb ret[[nnw]] <- xw } ret <- as.data.frame(ret) ret } drop.levels <- function(x) { # drops empty levels but otherwise preserves order of levels if ( !is.factor(x)) return (x) } # add totals to matrix total <- function(x,FUN=sum, ..., rowlab="Total",collab=rowlab,rowfirst=F,colfirst=F) { dn <- dimnames(x) if(is.null(dn)) dn <- list(NULL,NULL) if(length(dn[[1]])==0) dn[[1]] <- paste("Row",1:dim(x)[1]) if(length(dn[[2]])==0) dn[[2]] <- paste("Col",1:dim(x)[2]) x <- cbind( x, apply(x, 1, FUN, ...)) x <- rbind( x, apply(x, 2, FUN, ...)) dimnames(x) <- list( c( dn[[1]],rowlab), c(dn[[2]],collab)) if (colfirst ) x <- x[, c( ncol(x), 1:(ncol(x)-1))] if (rowfirst ) x <- x[c(nrow(x), 1:(nrow(x)-1)),] x } # add totals to an array # 1 add total to last dimension of array: atotal <- function(arr,FUN=sum) { d <- dim(arr) if (length(d) == 1) { arr <- c(arr) d <- dim(arr) } if(is.character(FUN)) FUN <- get(FUN, mode = "function") else if(mode(FUN) != "function") { farg <- substitute(FUN) if(mode(farg) == "name") FUN <- get(farg, mode = "function") else stop(paste("\"", farg, "\" is not a function", sep = "")) } if (is.null(d)) return (c( arr, Total=FUN(arr))) n <- length(d) ret <- arr ind <- 1:n for ( i in n:1) { new <- apply(ret,ind[-i],FUN) ret <- abind( ret, new, i, "Total") } ret } ## ?aperm abind <- function(arr1,arr2,d,facename="") { # glue arr1 to arr2 along dimension d, # i.e. along face 'not d' # Note dim(arr1) and dim(arr2) must be equal except for dim(arr?)[d] # Result, ret, has dim same dim(arr1) excep dim(ret)[d] which is sum # If length(dim(arr2)) is 1 less than length(dim(arr1)) then arr2 is # treated as parallel to a face of arr1 # G.Monette 02-13-18 # d1 <- dim(arr1) n1 <- length(d1) dn1 <- dimnames(arr1) d2 <- dim(arr2) n2 <- length(d2) dn2 <- dimnames(arr2) arenull <- is.null(dn1) & is.null(dn2) if (is.null(dn1)){ dn1 <- lapply( as.list(d1), function(x) seq(1,x)) dimnames(arr1) <- dn1 } if ( n1 != n2 ) { d2 <- d1 d2[d] <- 1 dn2 <- dn1 dn2[[d]] <- facename dimnames(arr2) <- NULL dim(arr2) <- d2 dimnames(arr2) <- dn2 n2 <- n1 } if (is.null(dn2)){ dn2 <- lapply( as.list(d2), function(x) seq(1,x)) dimnames(arr2) <- dn2 } # check input for consistency # ... later # perm <- 1:n1 perm[c(d,n1)] <- c(n1,d) # perm is an involution # # permute arr1 # arr.p1 <- aperm(arr1,perm) # # permute arr2 if length of dimension same as arr1 # arr.p2 <- aperm(arr2,perm) dret <- d1[perm] dret[n1] <- dret[n1] + d2[d] dnret <- dn1 dnret[[d]] <- c(dnret[[d]],dn2[[d]]) ret <- c(arr.p1, arr.p2) dim(ret) <- dret # # permute response back # ret <- aperm(ret, perm) dimnames(ret) <- dnret ret } ## zf <- factor( c('a','b',NA,'c','a')) ## tab(zf) ## table( dda[,c('Skill','Skill.c')]) tab <- function(...) { require(Hmisc) isl <- function(x,...) is.list(x) if( !isl(...)) a <- list(...) else a <- as.list(...) for ( ii in 1:length(a)) if ( is.factor( a[[ii]])) a[[ii]] <- na.include(a[[ii]]) a <- c( a, exclude = NULL ) do.call("table", a) } ## zdframe <- data.frame( a = c('a','b','a','c'), x=c(1,2,2,1)) ## tab(zdframe) ## tab(zdframe$a, zdframe$x) ## tab(zdframe['a'], zdframe['x']) # NEW: to transform to character or factor ch <- function(x,...) UseMethod("ch") ch.default <- as.character ch.factor <- as.character ch.formula <- function(x,...) { x <- as.character(x) if ( length(x) == 2) ret <- paste(x[1],x[2],sep="") else if ( length(x) == 3) ret <- paste(x[2],x[1],x[3],sep="") else stop("wrong length in as.character(x)") ret } ch.data.frame <- function(x, ...) tochar(x) tochar <- function(x) { for ( i in seq(along=x) ) if (is.factor(x[[i]])) x[[i]]<-as.character(x[[i]]) x } tofactor <- function(x) { for ( i in seq(along=x) ) if (is.character(x[[i]])) x[[i]]<-factor(x[[i]]) x } # NEW SILLY IDEA TO GET COMMENTS TO FOLLOW OUTPUT OF PREVIOUS COMMAND: . <- "" class(.) <- "." print.. <- function(x,...) { wid <- .Options$width x <- paste(rep("#",wid),collapse="") # cat(x,"\n",sep="") cat("\n") invisible(x) } "%~%" <- function(x,y) y[grep(x,y)] "and"<- function(a, b) { # # note: keeps the order in a, result has no duplicates # if(is.null(a) || is.null(b)) return(NULL) ret <- b[match(a, b, 0)] if(length(ret) == 0) return(NULL) unique(ret) } "%and%"<- function(a, b) and(a, b) "%eq%"<- function(a, b) eq(a, b) "%in%"<- function(a, b) match(a, b, nomatch = 0) > 0 "%less%"<- function(a, b) { if(is.null(a)) return(NULL) a[!match(a, b, 0)] } "%or%" <- function(x, y) { or(x,y) } or <- function(x,y) union(x,y) na2f <- function(x) { x [ is.na(x) ] <- F x } na20 <- function(x) { x [ is.na(x) ] <- 0 x } myplot <- function(x,...) UseMethod("myplot") myplot.default <- function(x,...) myplot.data.frame(as.data.frame(x),...) myplot.data.frame <- function(x, labels = dimnames(x)[[2]], ..., ask = F, ptype = "quantile", mfrow = zz ( ncol(x)), mcex = 1.2) { # 02-12-15: added mean and sd for numeric vars # 00-12-23: set par(mfrow=...) within myplot # 00-10-21: modification to avoid trellis so we can have many plots to a page zz <- function( x ) { if ( x > 9) c(3,4) else cbind( '1'=c(1,1),'2'=c(1,2),'3'=c(2,2), '4'=c(2,2),'5'=c(2,3),'6'=c(2,3), '7'=c(3,3), '8'=c(3,3), '9'=c(3,3)) [, x] } on.exit(par(opt)) for ( i in names(x) ) if (is.character(x[[i]])) x[[i]]<-as.factor(x[[i]]) opt <- par(mfrow=mfrow) num.var <- dim(x)[2] if(!is.vector(labels) || !is.character(labels)) stop("labels must be a character vector") if(nrow(x) <= 1) stop("must have at least 2 rows in data frame") if(!is.null(names(labels))) labels <- labels[dimnames(x)[[2]]] if(num.var > 1) { # print(ask) # debug oldpar <- par(ask = ask) on.exit(par(oldpar)) } for(i in seq(length = num.var)) { xvar <- x[, i] if ( is.factor (xvar) ) { xvar <- na.include(xvar) z <- table(xvar) zmai <- par("mai") # print(zmai) # debug lab.len <- max(nchar(names(z))) zmai[2] <- zmai[2] + par("cin")[1] * abs(lab.len -5) old <- par(mai=zmai) barplot(z, names=names(z), horiz = T) par(old) } else { Nmiss <- sum(is.na(xvar)) N <- length(xvar) xvar <- xvar[!is.na(xvar)] xxvar <- ppoints(length(xvar)) xlab <- paste("Fraction of",N-Nmiss,"(NA:",Nmiss,")") if ( pmatch( ptype, 'normal', nomatch = 0) == 1 ) { xxvar <- qnorm( ppoints(length(xvar)) ) xlab <- paste("Normal quantile for",N-Nmiss,"obs. (NA:",Nmiss,")") } if(N==Nmiss) { xvar <- 0 xxvar <- 0 } if(is.category(xvar)) dotchart(table(xvar), lines = F, xlab = "Count", ...) else { plot(xxvar, sort(xvar), xlab = xlab, ylab = "Data", ...) xm <- mean(xvar) xs <- sqrt(var(xvar)) abline( h= xm,lty=1) abline( h= c(xm-xs,xm+xs),lty=2) } if(N==Nmiss) text(0,0,"NA", cex = 2) } mtext(labels[i], 3, 1.5, cex = mcex) } invisible() } myplot2 <- function(x, ...) UseMethod("myplot2") myplot2.data.frame <- function(x, labels = dimnames(x)[[2]], ..., ask = F,by=NULL, ptype = "quantile", mfrow = zz(ncol( x)), mcex = 1.2) { # adapted from myplot.data.frame .... for numerical variables with more than one group # BUGS: works only on numerical variables # 00-12-23: set par(mfrow=...) within myplot # 00-10-21: modification to avoid trellis so we can have many plots to a page if ( is.null(by) ) UseMethod('myplot') zz <- function(x) { if(x > 9) c(3, 4) else cbind("1" = c(1, 1), "2" = c(1, 2), "3" = c(2, 2), "4" = c(2, 2), "5" = c(2, 3), "6" = c(2, 3), "7" = c(3, 3), "8" = c(3, 3 ), "9" = c(3, 3))[, x] } for(i in names(x)) if(is.character(x[[i]])) x[[i]] <- as.factor(x[[i]]) x <- x [ sapply( x, function(x) !is.factor(x))] # keep only numeric opt <- par(mfrow = mfrow) num.var <- dim(x)[2] if(!is.vector(labels) || !is.character(labels)) stop("labels must be a character vector") if(nrow(x) <= 1) stop("must have at least 2 rows in data frame") if(!is.null(names(labels))) labels <- labels[dimnames(x)[[2]]] if(num.var > 1) { # print(ask) # debug oldpar <- par(ask = ask) on.exit(par(oldpar)) } for(i in seq(length = num.var)) { xx <- x[[i]] if (is.numeric(xx) && !is.factor(xx)) { myplot2.default( xx, by, title = names(x)[i],mcex=mcex) } } invisible() } myplot2.default <- function( x, by, title = deparse(substitute(x)), xlab=paste("NA:", sum(!ok)), ylab="Data", col = 1:8, mcex=1) { ok <- !is.na(x) by.cat <- unique(as.character(by)) n.cat <- length(by.cat) x <- x[ok] by <- by[ok] xvar <- ppoints(length(x)) plot(xvar,x,xlab=xlab,ylab=ylab,type='n') for ( i in 1:n.cat) { ok <- by == by.cat[i] if ( sum(ok) > 0) { xx <- x[ok] xvar <- ppoints(length(xx)) lines(xvar,sort(xx), col = col[i]) } } mtext(title, 3, 1.5, cex = mcex) invisible() } # dd #### zm <- barplot( table( dd$SensDmd), names= rep("",length(names(table(dd$SensDmd)))), horiz = T) ###################### ###################### ###################### ###################### ###################### "[<-.factor"<-function(x, ..., value) { # reports dropped levels cl <- class(x) class(x) <- NULL l <- levels(x) co <- attr(x, "contrasts") levels(x) <- NULL is.na.value <- is.na(value) if(inherits(value, "factor")) { class(value) <- NULL value <- levels(value)[value] } jj <- match(value, l) x[...] <- jj if(any(is.na(jj) & !is.na.value)) { non.match <- paste( unique(setdiff(value, l)), collapse = " ") mess <- paste("Following turned to NA':", non.match, collapse = " ") warning(mess) } levels(x) <- l attr(x, "contrasts") <- co class(x) <- cl x } type2 <- function(fit) { Terms <- as.terms(fit) drop.mat <- attr(Terms,"factors") form <- attr(Terms,"formula") vars <- dimnames(drop.mat)[[1]] for ( nn in vars ) { } } diags <- function(x, ...) UseMethod("diags") diags.lm <- function(x, y, ..., ask, labels = names(residuals(x)), showlabs = text) { # diags.lm # graphical diagnostics for lm, locally first-order for glm # enlarged version of plot.lm with emphasis on diagnostics # G. Monette, Dec. 94 # modified Nov. 97, May 98 # Slight modification to pairs adding labels, Jan 03 if(!missing(ask)) { op <- par(ask = ask) on.exit(par(op)) } form <- formula(x) f <- predict(x) r <- residuals(x) nams <- names(r) if(!missing(labels)) { nams <- names(residuals(x)) # # if labels not same length as residuals assume it's a vector # of len == original data and select elements included in residuals if(length(nams) != length(labels)) labels <- labels[nams] } ret <- NULL if(missing(y)) { y <- f + r yname <- deparse(form[[2]]) } else yname <- deparse(substitute(y)) fname <- paste("Fitted:", deparse(form[[3]]), collapse = " ") plot(f, y, xlab = fname, ylab = yname, main = "Dependent var. vs. Predicted", ...) abline(0, 1, lty = 1) lines(supsmu(f,y)) showlabs(f, y, labels,...) # # get influence diags and model matrix while looking at first plot # lmi <- lm.influence(x) hat <- lmi$hat sigma <- lmi$sigma # drop 1 sigma mm <- scale(model.matrix(x), scale = F) # centres each column mp <- predict(x, type = "terms") comp.res <- mp + r # effect + residual # # Absolute residual vs. predicted # plot(f, abs(r), xlab = fname, ylab = deparse(substitute(abs(resid(x)))), main = "Absolute Residual vs. Predicted", ...) showlabs(f, abs(r), labels, ...) # # # Normal quantile plot # zq <- qqnorm(r, main = "Normal Quantile Plot", ylab = "Residual", sub = fname) qqline(r) showlabs(zq, labels,...) # # # Symmetry plot of residuals (Lawrence C. Hamilton, Regression with # Graphics, Duxbury, 1992) n <- length(r) r.o <- sort(r) half <- (n + 1)/2 if(n %% 2 == 1) { # n is odd med <- r.o[half] below <- med - r.o[half:1] above <- r.o[half:n] - med } else { # n is even med <- sum(r.o[c(half, half + 1)])/2 below <- med - r.o[(n/2):1] above <- r.o[(n/2 + 1):n] - med } opt <- par(pty = "s") ran <- range(c(below, above)) plot(below, above, main = "Symmetry plot of residuals", xlab = "Distance below median", ylab = "Distance above median", xlim = ran, ylim = ran) abline(0, 1, lty = 2) par(opt) # # # Studentized residual vs. leverage # std.r <- r/(sigma * sqrt(1 - hat)) plot(hat, std.r, xlab = "Leverage (hat)", ylab = yname, sub = fname, main = "Studentized residual vs. Leverage", ...) showlabs(hat, std.r, labels,...) # plot(lmi$sig, std.r) # # # Added variable plot (partial regression leverage plot) # and partial residual plot # dr1 <- drop1(x, . ~ ., keep = c("residuals", "x.residuals")) drk <- dr1$keep # print(dimnames(drk)[[1]]) for(ii in dimnames(drk)[[1]]) { # print(paste("Doing AVP for", ii)) xr <- drk[[ii, "x.residuals"]] yr <- drk[[ii, "residuals"]] if(is.null(dim(xr))) { # one degree of freedom effect symbols(xr, yr, circles = hat, inches = max(hat), xlab = paste("Residual of", ii, "on other predictors"), ylab = paste( "Residual of", yname, "on all but", ii), main = paste("Added variable plot for", ii), sub = "radius of circles proportional to leverage") abline(lm(yr ~ xr)) lines(supsmu(xr, yr)) showlabs(xr, yr, labels,...) # partial residual plot symbols(mm[, ii], comp.res[, ii], circles = hat, inches = max(hat), xlab = paste(ii, "- mean(", ii, ")"), ylab = paste("Residual of", yname, "plus component for", ii), main = paste( "Partial residual plot for", ii), sub = "radius of circles proportional to leverage") lines(supsmu(mm[, ii], comp.res[, ii])) showlabs(mm[, ii], comp.res[, ii], labels,...) } else { symbols(mp[, ii], comp.res[, ii], circles = hat, inches = max(hat), xlab = paste("Effect of", ii), ylab = paste("Residual of", yname, "plus component for", ii), main = paste( "Modified partial residual plot for", ii), sub = "radius of circles proportional to leverage" ) lines(supsmu(mp[, ii], comp.res[, ii])) showlabs(mp[, ii], comp.res[, ii], labels,...) for(iii in dimnames(xr)[[2]]) { symbols(xr[, iii], yr, circles = hat, inches = max(hat), xlab = paste("Residual of", iii, "on all predictors but", ii), ylab = paste( "Residual of", yname, "on all but", ii), main = paste("Modified added variable plot for", iii), sub = "radius of circles proportional to leverage") lines(supsmu(xr[, iii], yr)) showlabs(xr[, iii], yr, labels,...) } } } # # effect of dropping one observation DFBETA # nams <- dimnames(lmi$coefficients)[[1]] pairs(lmi$coefficients) pairs(lmi$coefficients, panel = function(x,y,nams){ points(x,y) text(x,y,nams) }, nams = nams) # main = "Effect of dropping one case", sub = fname) invisible(0) } # library(home,T) # detach(what=1) ## ## ## Extensions of tapply ## ## # zdf$n <- rep(1:5,2) # sapply(dfapply(zdf$x, zdf[c('a','b','n')],min),data.class) # dimnames(tapply( zdf$x, zdf[c('a','b','n')], min)) # sapply(zdf,data.class) dfapply <- function( x , categ, fun, y='Y', ... ) { # like tapply but returns a data frame of the 'expand.grid' list. # PROBLEMS: # should dfapply return only non-missing combinations of the categories # or should it do a cartesian product like tapply # here we opted for the latter so functions like length could be used # (of course we then need to transfom NAs to 0) # Setting up the 'categ' data frame is easy the first way but hard the second # because it's hard to keep the mode of the original variables categ <- as.data.frame(categ) y <- tapply( x, categ, fun, ...) ret <- expand.grid(dimnames(y)) is.num <- sapply(categ, function(x) data.class(x)=='numeric') if ( any(is.num) ) { for ( i in (1:length(is.num))[is.num]) { ret[[i]] <- as.numeric(as.character(ret[[i]])) } } if ( match('Y',names(ret),0) > 0 ) warning("Variable name Y exists in data.frame and has been replaced by value returned") ret$Y <- c(y) ret } # library(help=home) # search() # detach(what=1) # library(home,T) # # fun.ssc # brace <- function( x1=0, y1=0, x2=0, y2=1, right = T, rad = .2) { uin <- par("uin") ux <- uin[1] uy <- uin[2] dx <- x2 - x1 dy <- y2 - y1 alpha <- atan(ux*dx , uy*dy ) scale <- sqrt( (ux*dx)^2 + (uy*dy)^2) # length in inches if ( scale > 5* rad ) rad <- rad / scale qcirc <- cbind( cos((0:10)*pi/20), sin( (0:10)*pi/20)) qcircr <- cbind( cos((10:0)*pi/20), sin( (10:0)*pi/20)) rot <- function( theta) t( cbind( c(cos(theta),sin(theta)), c(-sin(theta), cos(theta)))) seg1 <- t( t(rad * qcirc %*% rot( -pi/2)) + c(0, rad)) seg4 <- t( t(rad * qcirc) + c( 0, 1 - rad)) seg3 <- t( t( (rad * qcircr) %*% rot( pi )) + c(2*rad,.5+rad)) seg2 <- t( t( (rad * qcircr) %*% rot(pi/2)) + c(2*rad,.5-rad)) bra <- rbind(seg1,seg2,seg3,seg4) if ( !right ) bra <- bra %*% diag( c(-1,1)) bra <- scale * bra %*% rot(-alpha) bra <- bra %*% diag( c(1/ux, 1/uy)) bra <- t( t(bra) + c(x1,y1)) bra } data.ell <- function(x, y, radius = 1, style = 1, regwidth = 100, cifactor = 1,plot=T,...) { dmat <- na.omit(cbind(x, y)) v <- var(dmat) m <- apply(dmat, 2, mean) ell (m, v, radius = radius, style = style, regwidth = regwidth, cifactor = cifactor,plot=plot,...) } ell <- function(m, v, radius = 1, style = 1, regwidth = 100, cifactor = 1,plot=F,...) { # # For a normal theory confidence ellipse: # m is theta.hat # v is estimated variance of theta.hat # radius is [nu is dfs for error] # qt(.975, nu) for an ellipse whose shadows are ordinary 95% CI # sqrt( 1* qf(.95, 1, nu )) does exactly the same thing # sqrt( 2* qf(.95, 2, nu )) produces a 95% confidence ellipse # ibid produces an ellipse whose shadows are # Scheffe 95% CIs adjusted for fishing in a # space of dimension 2 # sqrt( d * qf( .95, d, nu)) is a 95% confidence ellipse adjusted for # fishing in a space of dimension d # sqrt( qf( 1 - .05/nt, 1, nu) produces an ellipse whose shadows are 95% # CIs with a Bonferroni adjustment for nt tests. # # Draws a data ellipse by transforming and translating a circle # # style: use binary OR of # 1 Ellipse # 2 Regression line # 4 Parallelogram # 8 CI # 16 vertical bands # 32 SE bands # # Latest version: 02-03-08 # NOTE Exactly the same as data.ell except for first three lines ... # so we'll call this for data.ell # if (is.list(m)) { mm <- m m <- c(mm[[1]]) v <- mm[[2]] } rads <- (2 * pi * (0:100))/100 circle <- cbind( x=NA, y=NA) if( style %% 2 ) { # ellipse circle <- rbind( circle,cbind(cos(rads), sin(rads))) } style <- style %/% 2 if( style %% 2 ) { # regression line circle <- rbind( circle, c(NA,NA), c(-regwidth,0),c(regwidth,0)) } style <- style %/% 2 if( style %% 2) { # box circle <- rbind( circle, c(NA,NA), c(1,1),c(-1,1),c(-1,-1),c(1,-1),c(1,1)) } style <- style %/% 2 if( style %% 2) { # interval circle <- rbind( circle, c(NA,NA), c(-regwidth,-cifactor*regwidth), c( regwidth, cifactor*regwidth), c(NA,NA), c(-regwidth, cifactor*regwidth), c( regwidth,-cifactor*regwidth)) } style <- style %/% 2 if( style %% 2) { # vertical band circle <- rbind( circle, c(NA,NA), c(1,-regwidth), c(1,+regwidth), c(NA,NA), c(-1,-regwidth), c(-1,+regwidth)) } style <- style %/% 2 if( style %% 2) { # SE band circle <- rbind( circle, c(NA,NA), c(-regwidth,1), c(+regwidth,1), c(NA,NA), c(-regwidth,-1), c(+regwidth,-1)) } ret <-t(m + radius * t(circle %*% chol(v))) dimnames(ret) <- list(NULL, names(m)) if (plot) lines(ret,...) invisible(ret) } ## ## print(ell( c(a=1,b=2), diag(2))) ## est <- function(fit, L =diag(length(coef(fit))), sel=NULL, radius = 1){ cc <- coef(fit) fits <- summary(fit) if(!is.null(sel)) { dimnames(L) <- list(names(cc),names(cc)) L <- L[sel,] } eta <- L %*% cc V <- (radius * fits$sigma)^2 * fits$cov.unscaled V <- L %*% V %*% t(L) ret <- list( eta = eta, V=V) attr(ret,"df") <- fit$df.resid attr(ret,"sigma") <- fits$sigma ret } # # manipulating ellipses # # represent as list(center, varmat) shadow <- function( ell, line) { # return coordinates of tangents points and shadow onto line # ell has form list(center,varmat) # line has form list(point, direction) if( !is.list(line)) line <- list(c(0,0),line) dir <- line[[2]] cent <- line[[1]] m <- ell[[1]] V <- ell[[2]] fact <- c(1/sqrt( t(dir) %*% V %*% dir)) ell1 <- m + fact * V %*% dir ell2 <- m - fact * V %*% dir line1 <- dir * t(dir)%*%(ell1 - cent)/(t(dir)%*%dir) + cent line2 <- dir * t(dir)%*%(ell2 - cent)/(t(dir)%*%dir) + cent ret <- rbind( c(ell1), c(line1), c(NA,NA), c(ell2), c(line2)) ret } slice <- function( ell, along, tangent = NULL, radius = c(-1,1) ){ m <- c(ell[[1]]) V <- ell[[2]] if ( length(radius) == 1) radius <- radius * c(-1,1) if (!is.null(tangent)) { z <- solve(V) %*% tangent along <- c(z[2],-z[1]) } print(along) print(V) len <- sqrt(c( t(along) %*% solve(V) %*% along )) along <- along/len ret <- rbind(radius[1]*along +m, radius[2]*along+m) ret } # shapiro-wilk test "shapiro.wilk.test" <- function(x){ ##this function is an S version of the procedure described by # J. P. Royston (1982) in "An Extension of Shapiro and Wilk's W Test # for Normality to Large Samples" from Applied Statistics,31 no.2 # pp115-124. # n <- length(x) index <- 1:n m <- qnorm((index - 0.375)/(n + 0.25)) y <- sort(x) mu <- mean(y) SSq <- sum((y - mu)^2) astar <- 2 * m ends <- c(1, n) astar.p <- astar[ - ends] if(n <= 20) m <- n - 1 else m <- n if(m < 20) aa <- gamma(0.5 * (m + 1))/(sqrt(2) * gamma(0.5 * m + 1)) else { f1 <- (6 * m + 7)/(6 * m + 13) f2 <- exp(1)/(m + 2) f3 <- (m + 1)/(m + 2) f3 <- f3^(m - 2) aa <- f1 * sqrt(f2 * f3) } astar.1 <- (aa * sum(astar.p^2))/(1 - 2 * aa) astar.1 <- sqrt(astar.1) astar[1] <- - astar.1 astar[n] <- astar.1 A <- astar/sqrt(sum(astar^2)) W <- (sum(A * y)^2)/SSq if(n <= 20) { u <- log(n) - 3 lambda <- 0.118898 + 0.133414 * u + 0.327907 * u^2 logmu <- -0.37542 - 0.492145 * u - 1.124332 * u^2 - 0.199422 * u^3 logsigma <- -3.15805 + 0.729399 * u + 3.01855 * u^2 + 1.558776 * u^3 } if(n > 20) { u <- log(n) - 5 lambda <- 0.480385 + 0.318828 * u - 0.0241665 * u^3 + 0.00879701 * u^4 + 0.002989646 * u^5 logmu <- -1.91487 - 1.37888 * u - 0.04183209 * u^2 + 0.1066339 * u^3 - 0.03513666 * u^4 - 0.01504614 * u^5 logsigma <- -3.73538 - 1.015807 * u - 0.331885 * u^2 + 0.1773538 * u^3 - 0.01638782 * u^4 - 0.03215018 * u^5 + 0.003852646 * u^6 } mu <- exp(logmu) sigma <- exp(logsigma) y <- (1 - W)^lambda z <- (y - mu)/sigma p <- 1 - pnorm(z) if(n < 7) { warning("n is too small for this program to correctly estimate p") p <- NA } if(n > 2000) { warning("n is too large for this program to correctly estimate p") p <- NA } out <- list(W = W, n = n, p = p) out } ## ## modified summary.default to include sd ## "summary.default" <- function(object, ..., digits = max(options()$digits - 3, 3)) { if(length(levels(object))) return(summary.factor(object, ...)) value <- if(is.numeric(object)) { nas <- is.na(object) object <- object[!nas] qq <- quantile(object) qq <- signif(c(qq[1:3], mean(object), qq[4:5]), digits) names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") if (length(object) > 1) qq <- c(qq,"St.Dev."= sqrt(var(object))) if(any(nas)) c(qq, "NA's" = sum(nas)) else qq } else if(is.recursive(object) && !is.language(object) && (n <- length(object))) { sumry <- array("", c(n, 3), list(names(object), c("Length", "Class", "Mode"))) ll <- numeric(n) for(i in 1:n) { ii <- object[[i]] ll[i] <- length(ii) sumry[i, 2] <- class(ii) sumry[i, 3] <- mode(ii) } sumry[, 1] <- format(ll) class(sumry) <- "table" sumry } else c(Length = length(object), Class = class(object), Mode = mode(object)) class(value) <- "table" value } ## ## vif ## vif <- function(object, ...) UseMethod("vif") vif.default <- function(object, ...) stop("No default method for vif. Sorry.") vif.lm <- function(object, ...) { V <- summary(object)$cov.unscaled Vi <- crossprod(model.matrix(object)) nam <- names(coef(object)) k <- match("(Intercept)", nam, nomatch = FALSE) # Jürgen Groß # University of Dortmund # Vogelpothsweg 87 # D-44221 Dortmund, Germany # gross@statistik.uni-dortmund.de v1 <- diag(V) v2 <- diag(Vi) uc.struct <- structure(v1 * v2, names = nam) if(k) { v1 <- diag(V)[-k] v2 <- diag(Vi)[-k] - Vi[k, -k]^2 / Vi[k, k] nam <- nam[-k] c.struct <- structure(v1 * v2, names = nam) return(Centered.VIF = c.struct, Uncentered.VIF = uc.struct) } else { warning(paste("No intercept term", "detected. Uncentered VIFs computed.")) return(Uncentered.VIF = uc.struct) } } ################################################################### # Program name: BHtests_version2.s # Updated: December 5, 1997. # Authors: Dave Cummins, V. Devanarayan # Eli Lilly & Company -- Cummins_DJ@lilly.com # Purpose: # Splus functions for implementing the Brown-Hettmansperger tests # for skewness, kurtosis and normal tails (see JASA, Dec '96). # For validation purposes, data from their paper have been included # in the variable my.y in the bottom part of this code. In # order to use this program for other data sets, simply call the # function: BHtests(my.data) after sourcing the file: BHtests_version2.s #################################################################### options(digits=15) # estimate of sigma est.sigma <- function(y) { y <- sort(y) n <- length(y) vec1 <- 1:(n-1) vec2 <- 2:n sum(( y[vec2] - y[vec1] )*dnorm( qnorm( vec1/n ) )) } # estimate of d3 est.d3 <- function(y) { y <- sort(y) n <- length(y) vec1 <- 1:(n-2) vec2 <- 2:(n-1) d3hat <- 2^(-.5)*y[1]*( -dnorm(qnorm(1/n))*qnorm(1/n) ) + 2^(-.5)*sum( y[vec2]*( dnorm( qnorm( vec1/n ) )*qnorm( vec1/n ) - dnorm( qnorm( vec2/n ) )*qnorm( (2:(n-1))/n ))) + 2^(-.5)*y[n]*( dnorm(qnorm((n-1)/n))*qnorm((n-1)/n) ) d3hat } # estimate of d4 est.d4 <- function(y) { y <- sort(y) n <- length(y) vec1 <- 1:(n-2) vec2 <- 2:(n-1) d4hat <- 6^(-.5)*y[1]*( -dnorm(qnorm(1/n))*(qnorm(1/n)^2 - 1) ) + 6^(-.5)*sum( y[vec2]*( dnorm(qnorm(vec1/n))*(qnorm(vec1/n)^2 - 1) - dnorm(qnorm(vec2/n))*(qnorm(vec2/n)^2 - 1) ) ) + 6^(-.5)*y[n]*( dnorm(qnorm((n-1)/n))*(qnorm((n-1)/n)^2 - 1) ) d4hat } # Statistic Z1 (not adjusted for small sample bias) z1 <- function(y) { sqrt(3*length(y))*est.d3(y)/est.sigma(y) } # Statistic Z2 (not adjusted for small sample bias) z2 <- function(y) { sqrt(4*length(y))*est.d4(y)/est.sigma(y) } # This function computes bias adjustment corrections for small samples # n: sample size, r: number of replications biasadj <- function(n,r) { z1a <- matrix(0,r,1) z2a <- matrix(0,r,1) # The set.seed below will give results that match the JASA paper # set.seed(158) cat("Iteration:") for (i in 1:r) { if( trunc(i/100)==i/100 ) cat(i,".. ") ry <- rnorm(n) z1a[i] <- z1(ry) z2a[i] <- z2(ry) } cat("\n") cat("\n") c( mean(z2a), sqrt(var(z1a)), sqrt(var(z2a)) ) } # This is the function that actually computes the test-statistics # and p-values for skewness, kurtosis and nonnormal tails. # This function calls the other functions defined above. BHtests <- function(y,r=1000,plotit=T,legend.size=1) { ################################################################### # Program name: BHtests_version2.s # Updated: December 5, 1997. # Authors: Dave Cummins, V. Devanarayan # Purpose: # Splus functions for implementing the Brown-Hettmansperger tests # for skewness, kurtosis and normal tails (see JASA, Dec '96). # For validation purposes, data from their paper have been included # in the variable my.y in the bottom part of this program. In # order to use this program for other data sets, simply call the # function: BHtests(my.data) after sourcing the file: BHtests_version2.s ################################################################### y <- sort(y) buv <- biasadj(n=length(y),r) bn <- buv[1] un <- buv[2] vn <- buv[3] Z1ab <- sqrt(3*length(y))*est.d3(y)/(un*est.sigma(y)) pZ1 <- 2*( 1-pnorm(abs(Z1ab)) ) Z2ab <- max( (sqrt(4*length(y))*est.d4(y)/est.sigma(y) - bn)/vn, 0 ) pZ2 <- 1-pnorm(Z2ab) Tab <- Z1ab^2 + Z2ab^2 pT <- 1-pnorm(sqrt(Tab)) + .5*exp(-Tab/2) Z1ab <- round(Z1ab,5) pZ1 <- round(pZ1,5) Z2ab <- round(Z2ab,5) pZ2 <- round(pZ2,5) Tab <- round(Tab,5) pT <- round(pT,5) cat("Bias Adjusted Test Statistic for Skewness =",Z1ab,'\n') cat("with asymptotic P-value for a two-sided test =",pZ1,'\n','\n') cat("Bias Adjusted Test Statistic for Kurtosis =",Z2ab,'\n') cat("with asymptotic P-value for a one-sided test =",pZ2,'\n','\n') cat("Bias Adjusted Test Statistic for Normal Tails =",Tab,'\n') cat("with asymptotic P-value for a two-sided test =",pT,'\n','\n','\n') if(plotit==T) { ## if(names(dev.cur())=="null device") motif() n <- length(y) vec <- seq(from=0,to=n,by=1) vec[(n+1)] <- 0 phi <- dnorm(qnorm(vec/n)) b <- phi[1:n] - phi[2:(n+1)] lambda <- t(b) %*% b a <- b / lambda # equivalent to : solve( t(b)%*%b ) %*% b plot(a,y, ylab="Observed Data", xlab="Quantiles of Standard Normal (bias-adj)") if(legend.size>0) { y.width <- max(y) - min(y) a.width <- max(a) - min(a) text.top <- max(y) - .05 * y.width*legend.size text.bott <- max(y) - .40*y.width*legend.size text.left <- min(a) + .05 * a.width*legend.size text.right <- min(a) + .5 * a.width*legend.size lines(a,a*sqrt(var(y))+mean(y)) legend(x=c(text.left,text.right),y=c(text.top,text.bott), type="n",bty="n",cex=legend.size, legend=c(" p-values ", " ------------- ", paste("Skewness: ",pZ1,sep="")," ", paste("Kurtosis: ",pZ2,sep="")," ", paste("Normal Tails: ",pT,sep="")) ) } } list(skewness=pZ1,kurtosis=pZ2,tails=pT) } ### ### General graphic functions ### { ## txt: allows formulas for text txt <- function(x1,... ){ UseMethod("txt") } txt.default <- function (x1, ...) { text( x1, ...) } txt.formula <- function(formula,data, labels=rownames(data), subset, ...) { cl <- match.call() # print(cl) mf <- match.call(expand.dots = FALSE) # print(mf) m <- match(c("formula", "data", "subset","labels"), names(mf), 0) mf <- mf[c(1, m)] # print(mf) mf[[1]] <- as.name("model.frame") mf <- eval(mf, parent.frame()) # cat("Post eval:\n") # print (mf) if ( ! is.null( mf[["(labels)"]] ))labs <- mf[["(labels)"]] else labs <- rownames(mf) text( mf[[2]], mf[[1]], labs, ...) } } { pad <- function(x,n=1,...) { ## pad: prepends blanks to put some light between points and labels ## does the same job as 'offset' but the latter doesn't always seem to work ## Methods: default and factor UseMethod("pad") } pad.default <- function(x,n=1){ # warning("Consider using par offset instead of pad") paste(substring(" ",1,n), x, sep ="") } pad.factor <- function(x,...) { factor( pad( as.character(x), ...)) } } csv <- function(x,quote='"',sep=",") { cat( quote, paste(x, collapse = paste(quote,quote,sep=sep)), quote,'\n', sep="") # output for cutting and pasting into source } csvlist <- function(x,quote='"',sep=",") { ## good for entering start values cat(paste( "\t",names(x),"=", x, ',\n')) } { ## rnd: round that can be applied to print data.frames rnd <- function(x, ...) { UseMethod("rnd") } rnd.default <- function(x,...) { round(x, ...) } rnd.factor <- function(x, ...){ x } rnd.data.frame <- function(x, ...) { as.data.frame( lapply( x, rnd, ...)) } } ### ### developed for math 4730 in 2004/05 ### ### ### R Functions for MATH4730 ### Last update: Nov. 3, 2004 ### ### Testing the General Linear Hypothesis with normal models ### dfe <- function(mod) mod$df.residual glh <- function(mod, L = diag(length(coef(mod))) , h0 = 0, q = qt(.975, dfe(mod)) ) { ## Genereal Linear Hypothesis for Normal regression model ## Arguments: ## mod: the output of lm ## L: a hypothesis matrix or vector (turned into a matrix with one row) ## h0: the null value of L %*% beta. ## Details: ## gets eta.hat and var(eta.hat) ## computes F statistic to test H0: L beta = h0 ## To do: ## 1. should improve numerical aspects of algorithm ## 2. should test linear independence of hypothesis matrix ## 3. need help file ## Written by G. Monette, Oct. 25, 2004 L <- rbind(L) # makes L into a matrix if it's a vector dimnames(L)[[2]] <- names(coef(mod)) ## print(L) dfh <- nrow(L) eta.hat <- L %*% coef(mod) # does the right thing if L is just a vector nu2 <- mod$df.residual mod.sum <- summary(mod) xpx.inv <- mod.sum$cov.unscaled var.eta <- mod.sum$sigma^2 * ( L %*% xpx.inv %*% t(L)) F.obs <- (t( eta.hat - h0 ) %*% solve( var.eta) %*% (eta.hat - h0))/dfh p.val <- 1 - pf(F.obs, dfh, nu2) # confidence intervals se.eta <- sqrt(diag(var.eta)) cis <- cbind( EtaHat = eta.hat-h0, SE = sqrt(diag(var.eta)), LowerBound = (eta.hat-h0)-q*se.eta, UpperBound =(eta.hat-h0)+q*se.eta) dimnames(cis) <- list( dimnames(L)[[1]], c("EtaHat-h0", "SE", "LowerBound", "UpperBound")) # print(cis) ret <- list(Call = match.call(), model = mod, L = L, h0 = h0, eta.hat = eta.hat, var.eta.hat = var.eta, df.residual = nu2, F = F.obs, "Overall p.val" = p.val, q=deparse(substitute(q)),"Confidence Intervals" = cis) class(ret) <- "glh" ret } ## glh(fit, L = diag(13) [ 11:13,]) ## glh(fit, L = diag(13) [ 11:13,], q= 3*qf(.95,3,dfe(fit))) ### ### Data frame at a glance: ### xqplot: quantile or bar plots for each variable in a data frame ### xqplot <- function(x, labels = dimnames(x)[[2]], ..., ask = F, ptype = "quantile", mfrow = findmfrow ( ncol(x)), mcex = 1.2, maxlab = 12 , debug = F, mar = c(5,1.5,3,1)) { ## Adapted from myplot.data.frame for R by G. Monette, Oct. 25, 2004 ## maxlab is maximum number of labels findmfrow <- function( x ) { if ( x > 9) c(3,4) else cbind( '1'=c(1,1),'2'=c(1,2),'3'=c(2,2), '4'=c(2,2),'5'=c(2,3),'6'=c(2,3), '7'=c(3,3), '8'=c(3,3), '9'=c(3,3)) [, x] } opt <- par( mfrow = mfrow, mar = mar + 0.1 ) on.exit(par(opt)) iscat <- function( x ) is.factor(x) || is.character(x) compute.cex <- function( x ) { ll <- length(x) cex <- 2 * ifelse( ll < 5, 2, ifelse( ll < 10, 1, ifelse( ll < 20, .7, .5)))/mfrow[1] } for ( ii in 1: dim(x)[2]) { vv <- x[[ii]] nam <- labels[[ii]] Nmiss <- sum(is.na(vv)) N <- length(vv) if ( iscat(vv) ){ ff <- as.character(vv) ff[is.na(ff)] <- "" ff <- as.factor(ff) tt <- table(ff) ll <- names(tt) nn <- length(ll) xlab <- paste("N =", N ," Nmiss =", Nmiss) zm <- barplot( tt, names = rep("",nn), horiz = T, xlab = xlab) ## If nn > maxlab drop labels for smaller frequencies sel <- rep( T, length(tt)) tt.sorted <- rev(sort(tt)) if ( nn > maxlab ) sel <- tt > tt.sorted[maxlab] if (debug) { print(sel) print(nam) print(tt) print(tt.sorted) print(maxlab) print(tt.sorted[maxlab]) print(sel) print(zm[sel]) print(rep(max(tt),nn)[sel]) print( ll[sel]) } if ( any(sel) ) text( rep( max( tt ), nn)[sel] , zm[sel], ll[sel], adj = 1, cex = compute.cex( nn )) } else { vv <- vv[!is.na(vv)] xxvar <- ppoints(length(vv)) xlab <- paste("Fraction of",N-Nmiss,"(NA:",Nmiss,")") if ( pmatch( ptype, 'normal', nomatch = 0) == 1 ) { xxvar <- qnorm( ppoints(length(xvar)) ) xlab <- paste("Normal quantile for",N-Nmiss,"obs. (NA:",Nmiss,")") } ## Plot continuous if ( N == Nmiss ) { xxvar <- 1 vv <- 1 plot( xxvar, vv, xlab = xlab, ylab="", type = 'n') text( 1, 1, "NA") } else { plot(xxvar, sort(vv), xlab = xlab, ylab = "Data", ...) xm <- mean(vv) xs <- sqrt(var(vv)) abline( h= xm,lty=1) abline( h= c(xm-xs,xm+xs),lty=2) } } ## Titles for all plots mtext(labels[ii], 3, 1.5, cex = mcex) } invisible(0) } ncv.test.lm <- function (model, var.formula, data = NULL, subset, na.action, ...) { ## corrects a typo in the function in car if ((!is.null(class(model$na.action))) && class(model$na.action) == "exclude") model <- update(model, na.action = na.omit) sumry <- summary(model) residuals <- residuals(model, type = "pearson") S.sq <- df.residual(model) * (sumry$sigma)^2/sum(!is.na(residuals)) U <- (residuals^2)/S.sq if (missing(var.formula)) { mod <- lm(U ~ fitted.values(model)) varnames <- "fitted.values" var.formula <- ~fitted.values df <- 1 } else { if (missing(na.action)) { na.action <- if (is.null(model$na.action)) options()$na.action else parse(text = paste("na.", class(model$na.action), sep = "")) # typo corrected by GM } m <- match.call(expand.dots = FALSE) if (is.matrix(eval(m$data, sys.frame(sys.parent())))) m$data <- as.data.frame(data) m$formula <- var.formula m$var.formula <- m$model <- m$... <- NULL m[[1]] <- as.name("model.frame") mf <- eval(m, sys.frame(sys.parent())) response <- attr(attr(mf, "terms"), "response") if (response) stop(paste("Variance formula contains a response.")) mf$U <- U .X <- model.matrix(as.formula(paste("U~", as.character(var.formula)[2], "-1")), data = mf) mod <- lm(U ~ .X) df <- sum(!is.na(coefficients(mod))) - 1 } SS <- anova(mod)$"Sum Sq" RegSS <- sum(SS) - SS[length(SS)] Chisq <- RegSS/2 result <- list(formula = var.formula, formula.name = "Variance", ChiSquare = Chisq, Df = df, p = 1 - pchisq(Chisq, df), test = "Non-constant Variance Score Test") class(result) <- "chisq.test" result } ### ### Functions for Math 6627 2004/05 ### ## data ellipse dell <- data.ell ## confidence ellipse for glh or similar output cell <- function(glh, type="t" , confidence = .95) { radius <- switch(type, t= qt( 1 - (1-confidence)/2, glh$df.residual), qf(confidence, 2, glh$df.residual) ) ret <- ell(c( glh$eta.hat), glh$var.eta.hat, radius = radius) } ## glh(fit, diag(12)[2:3,]) ## plot(cell( glh( fit, diag(12)[ 2:3, ]))) ## ell