sub.sample <- function(x, fixed.percentage = FALSE, min.count = TRUE, count = 300, tolerance = 5, min.percentage = FALSE, percentage = 10, max.percentage = 90) # fixed.percentage: if TRUE all subsamples = percentage% # (overrides min.count and min. percentage) # x = data frame with samples in columns and taxa in rows # min.count: if TRUE, each subsample should contain a minimum number of individuals # count = minimum number of individuals in subsample (if min.count = TRUE) # tolerance = the number of individuals less than count that will constitute # an acceptable subsample # min.percentage: if TRUE, then each subsample should be at least a minimum percentage # percentage: the minimum percentage subsample size # max.percentage = percentage subsample beyond which the subsampling effort outweighs the effort of sorting the whole sample: therefore, whole sample taken { ss.fixed.perc <- function(x,percentage) { col.headers <- names(x) x <- as.matrix(x) ns <- dim(x)[2] samp <- sapply(x[,1],rbinom, n=1, prob=percentage/100) if(ns > 1) { subsampled.set <- data.frame(samp) number.in.subsample <- sum(samp) subsample.size <- percentage/100 for(i in 2:ns) { samp <- sapply(x[,i],rbinom, n=1, prob=percentage/100) subsampled.set <- cbind(subsampled.set, samp) number.in.subsample <- c(number.in.subsample,sum(samp)) subsample.size <- c(subsample.size,percentage/100) } } else { subsampled.set <- samp number.in.subsample <- sum(samp) subsample.size <- percentage/100 } names(subsampled.set) <- col.headers list(subsampled.set=subsampled.set, subsample.size=subsample.size, number.in.subsample=number.in.subsample) } ss.fixed.count.sss <- function(x,count,max.percentage) { col.headers <- names(x) x <- as.matrix(x) ns <- dim(x)[2] tots <- apply(x,2,sum) sss <- round(count/tots,2) sss[!sss<((max.percentage+1)/100)] <- 1 sss } ss.fixed.count1 <- function(x,sss) { col.headers <- names(x) x <- as.matrix(x) ns <- dim(x)[2] samp <- sapply(x[,1],rbinom, n=1, prob=sss[1]) if(ns > 1) { subsampled.set <- data.frame(samp) number.in.subsample <- sum(samp) subsample.size <- sss for(i in 2:ns) { samp <- sapply(x[,i],rbinom, n=1, prob=sss[i]) subsampled.set <- cbind(subsampled.set, samp) number.in.subsample <- c(number.in.subsample,sum(samp)) } } else { subsampled.set <- data.samp number.in.subsample <- sum(samp) subsample.size <- sss } names(subsampled.set) <- col.headers list(subsampled.set=subsampled.set, subsample.size=subsample.size, number.in.subsample=number.in.subsample) } ss.fixed.count2 <- function(x,ss.x,ss.x.sss,count,tolerance,max.percentage) { residues <- x - ss.x number.required <- count - apply(ss.x,2,sum) number.required[number.required < tolerance] <- 0 #identifies adequately sorted samples for no further subsanpling sss.outof100 <- ceiling(100*(1-ss.x.sss)* number.required/apply(residues,2,sum))/100 sss.outof100[is.na(sss.outof100)] <- 0 sss.outof100[(sss.outof100 + ss.x.sss) > max.percentage/100] <- 1 #identifies samples requiring > max.percentage subsample, and flags them for complete sorting sss <- sss.outof100/(1-ss.x.sss) sss[sss > 1] <- 1 # assumes 100 cells in subsampler: i.e if ss.x.sss was 10%, sss is rounded to 0.10, and then expressed as an unrounded proportion of 90 cells (/1-ss.x.sss) ss.x1 <- ss.fixed.count1(residues, sss) subsampled.set <- ss.x + ss.x1$subsampled.set subsample.size <- ss.x.sss + sss.outof100 subsample.size[subsample.size>1] <- 1 number.in.subsample <- apply(ss.x,2,sum) + ss.x1$number.in.subsample list(subsampled.set = subsampled.set, subsample.size = subsample.size, number.in.subsample = number.in.subsample) } if(fixed.percentage){ ss.fixed.perc(x, percentage) } else { if(min.count && !min.percentage) { sss <- ss.fixed.count.sss(x,count,max.percentage) working.ss <- ss.fixed.count1(x, sss) working.sss <- working.ss$subsample.size while(sum(working.sss) > 0) { working.ss <- ss.fixed.count2(x, working.ss$subsampled.set, working.ss$subsample.size, count, tolerance, max.percentage) working.sss <- (count - apply(working.ss$subsampled.set,2,sum)) working.sss[working.sss < tolerance] <- 0 working.ss } working.ss } else { if(min.count && min.percentage) { working.ss <- ss.fixed.perc(x, percentage) working.sss <- working.ss$subsample.size while(sum(working.sss) > 0) { working.ss <- ss.fixed.count2(x, working.ss$subsampled.set, working.ss$subsample.size, count, tolerance, max.percentage) working.sss <- (count - apply(working.ss$subsampled.set,2,sum)) working.sss[working.sss < tolerance] <- 0 working.ss } working.ss } else { if(!min.count) { cat("\n\nYou must specify one of the following:\n\n") cat("1. fixed.percentage = TRUE (overrides other options)\n\n") cat("2. min.count = TRUE & min.percentage = FALSE\n") cat(" (= fixed count subsampling)\n\n") cat("3. min.count = TRUE & min.percentage = TRUE\n") cat(" (= fixed count subsampling conditional on a min. percentage)\n\n") } } } } } ssN <- function(x, N, spp.names = NULL) { nsamp <- dim(x)[2] if(is.null(names(x))) samp.names <- c(1:nsamp) else samp.names <- names(x) nspp <- dim(x)[1] spp <- c(1:nspp) if(is.null(spp.names)) spp.names <- spp if(is.data.frame(x)) ss.x <- x else ss.x <- data.frame(array(0, dim = c(nspp, nsamp)), row.names = spp.names) for(a in 1:nsamp) { xa <- x[,a] if(sum(xa) > N) { inds <- rep(spp[xa > 0], times = xa[xa > 0]) o <- order(runif(length(inds))) inds <- inds[o] inds.ss <- inds[1:N] # first break must = 0 to 'trick' hist into counting # first two elements: without this it lumps the first two. ss.x[,a] <- hist(inds.ss, br = c(0:nspp), plot = FALSE)$counts } else ss.x[,a] <- x[,a] } ss.x <- data.frame(t(ss.x), row.names = samp.names) ss.x <- t(ss.x) ss.x } rarefact <- function(x, ESn = c(25, 50, 100, 150, 200, 250), num.reps = 100, subset = NULL) { nsamp <- dim(x)[2] if(is.null(names(x))) samp.names <- c(1:nsamp) else samp.names <- names(x) nspp <- dim(x)[1] spp <- c(1:nspp) nESn <- length(ESn) cat("\nplease wait: conducting", num.reps*nsamp, "randomizations\n") ES.out <- c(sample = "first", N.inds = 0, mean.ES = 0, var.ES = 0) sample <- "temp" N.inds <- 0 mean.ES <- 0 var.ES <- 0 if(!is.null(subset)) { ES.subset.out <- c(sample = "first", N.inds = 0, mean.ES.subset = 0, var.ES.subset = 0) mean.ES.subset <- 0 var.ES.subset <- 0 } for(a in 1:nsamp) { xa <- x[,a] inds <- rep(spp[xa > 0], times = xa[xa > 0]) count.spp <- hist(inds, br = c(0:nspp), plot = FALSE)$counts tot.spp <- length(count.spp[count.spp > 0]) tot.inds <- sum(xa) ESi <- array(0, dim = c(num.reps, nESn)) if(!is.null(subset)) { ESi.subset <- array(0, dim = c(num.reps, nESn)) inds.subset <- inds[inds >= min(subset)] inds.subset <- inds.subset[inds.subset <= max(subset)] count.spp.subset <- hist(inds.subset, br = c(0:nspp), plot = FALSE)$counts tot.spp.subset <- length(count.spp.subset[count.spp.subset > 0]) } for(i in 1:num.reps) { o <- order(runif(length(inds))) inds <- inds[o] for(j in 1:nESn) { if(ESn[j] < tot.inds) { inds.ss <- inds[1:ESn[j]] count.ES <- hist(inds.ss, br = c(0:nspp), plot = FALSE)$counts ESi[i,j] <- length(count.ES[count.ES > 0]) if(!is.null(subset)) { inds.subset <- inds.ss[inds.ss >= min(subset)] inds.subset <- inds.subset[inds.subset <= max(subset)] count.ES.subset <- hist(inds.subset, br = c(0:nspp), plot = FALSE)$counts ESi.subset[i,j] <- length(count.ES.subset[count.ES.subset > 0]) } } else ESi[i,j] <- NA } } ES <- apply(ESi, 2, mean, na.rm = TRUE) vES <- apply(ESi, 2, var, na.rm = TRUE) sample <- c(sample, rep(samp.names[a], times = nESn + 1)) N.inds <- c(N.inds, ESn, tot.inds) mean.ES <- c(mean.ES, ES, tot.spp) var.ES = c(var.ES, vES, 0) outlen <- length(mean.ES) if(!is.null(subset)) { ES.subset <- apply(ESi.subset, 2, mean, na.rm = TRUE) vES.subset <- apply(ESi.subset, 2, var, na.rm = TRUE) mean.ES.subset <- c(mean.ES.subset, ES.subset, tot.spp.subset) var.ES.subset = c(var.ES.subset, vES.subset, 0) ES.out.subset <- data.frame(sample = sample[2:outlen], N.inds = N.inds[2:outlen], mean.ES.subset = mean.ES.subset[2:outlen], var.ES.subset = var.ES.subset[2:outlen]) } } ES.out <- data.frame(sample = sample[2:outlen], N.inds = N.inds[2:outlen], mean.ES = mean.ES[2:outlen], var.ES = var.ES[2:outlen]) if(!is.null(subset)) list(ES = ES.out, ES.subset = ES.out.subset) else { ES <- ES.out ES } } ssN.nr <- function(x, N, num.ss = 10, spp.names = NULL) { if(N*num.ss > sum(x)) cat("\n\nNumber of individuals to be subsampled exceeds number in sample\n\n") else { nspp <- length(x) spp <- c(1:nspp) if(is.null(spp.names)) spp.names <- spp ss.x <- data.frame(array(0, dim = c(nspp, num.ss)), row.names = spp.names) inds <- rep(spp[x > 0], times = x[x > 0]) o <- order(runif(length(inds))) inds <- inds[o] for(a in 1:num.ss) { inds.ss <- inds[((N*a)-N+1):(N*a)] # first break must = 0 to 'trick' hist into counting # first two elements: without this it lumps the first two. ss.x[,a] <- hist(inds.ss, br = c(0:nspp), plot = FALSE)$counts } ss.x <- data.frame(t(ss.x)) ss.x <- t(ss.x) ss.x } }