################################################################ ## Exploratory data analysis for threshold detection ## For LTER Working group on threshold detection ## Initial coding by Aaron Ellison (Harvard) & Sapna Sharma ## (Wisconsin/Chicago) ## Final coding and commenting by Aaron Ellison (AME) ## Initial version: 23 May 2011 ## Revised: 3 October 2011 (AME) ## 11 October 2011 (AME) ## Current version: 11 October 2011 ## Questions? TANSTAAFL ############################################################### ############################################################### ## Required libraries ############################################################### require(plotrix) require(strucchange) ############################################################### ## Data should be: csv, txt, or equivalent file ## Columns: time, driver, response ## LTER data: time in years ## SET WORKING DIRECTORY if needed ## REPLACE "mydata.csv" below with your data file ############################################################### setwd( ) mydata <- read.csv("mydata.csv", header=TRUE) ## OR if .txt ## mydata <- read.table("mydata.txt", header=TRUE, sep="\t") mytime <- mydata$time mydriver <- mydata$driver myresponse <- mydata$response ############################################################### ## Standardize data (sd units), and interpolate over NAs if necessary ## Fit a loess curve to visualize trend if any ## Create time-series object for later use ############################################################### myresponse <- (myresponse-mean(myresponse, na.rm=TRUE))/sd(myresponse, na.rm=TRUE) ############################################################### #User must define value1, value2, value3 for interpolation ############################################################### mystart <- value1 #beginning of useful data (time point, indexes response) myend <- value2 #end of useful data (time point, indexes response) myfrequency <- value3 #frequency of time (1 = annual, etc.) ############################################################### #interpolation for missing data #may not be required for all datasets #uncomment if needed # interp.data <- myresponse # for (i in 1:length(interp.data)) # {if (is.na(interp.data[i])) # interp.data[i] <- rnorm(1,mean(interp.data[mystart:myend],na.rm=T), # sd(interp.data[mystart:myend],na.rm=T))} ############################################################### ############################################################### # replace "interp.data" with "myresponse" here and in plot routines # if no interpolation was needed myresponse.loess <- loess(interp.data ~ mytime) interp.data <- ts(interp.data, start=mystart, frequency = myfrequency) ############################################################### ## Change-point analysis (strucchange) ############################################################### y <- interp.data #place-holder ## difference or detrend, you decide # difference the time series to remove trend y.diff <- diff(y) # OR detrend relative to loess to remove trend y.detrend <- y-predict(myresponse.loess) # first use basic model that fits constant to the ts data # then plot the result ocus.y.detrend <- efp(y.detrend ~1, type="OLS-CUSUM") plot(ocus.y.detrend) # Fstats gives a sequence of F statistics # compute and plot fs.y.detrend <- Fstats(y.detrend~1) plot(fs.y.detrend) # breakpoint estimate for the F-statistic # this will be slow for very long (thousands of points) time series breakpoints(fs.y.detrend) # this computes all possible breakpoints # this will be slow for very long (thousands of points) time series bp.y.detrend <- breakpoints(y.detrend~1) # model selection (n breakpoints)using minimum of BIC or RSS plot(bp.y.detrend) # this identifies one breakpoint bp1.y.detrend <- breakpoints(bp.y.detrend, breaks=1) # fit a null model with no breakpoint fm0.y.detrend <- lm(y.detrend ~ 1) # use the breakpoint for coefficients on either side y.detrend.fac <- breakfactor(bp1.y.detrend, breaks=1) # fits alternative model with breakpoints in y.detrend.fac fm1.y.detrend <- lm(y.detrend ~ y.detrend.fac -1, ) ## this process could be repeated for multiple breakpoints... # return coefficents coef(fm0.y.detrend) coef(fm1.y.detrend) # plot the detrended series then add the predicted values from strucchange models plot(y.detrend) lines(ts(predict(fm0.y.detrend), start=mystart, frequency=myfrequency), col="red", lty=3) lines(ts(predict(fm1.y.detrend), start=mystart, frequency=myfrequency), col="blue", lty=1) ############################################################### ## Finished plots ## NEED TO SET BEGINNING (value 4) AND END (value 5) OF TIME SERIES ############################################################### mytime1 <- value4 mytime2 <- value5 mytimes <- c(mytime1, mytime2) par(mfrow=c(2,2)) ############################################################### # 1. Driver alone (time series) plot(mytime, mydriver,col="black", font.axis=2, font.lab=2,cex=1.25, cex.lab=1.75, cex.axis=1.5, xlim=mytimes), type="b", pch=19, xlab="", ylab="Driver", xlab="Time", cex=2, font=2) ############################################################### ############################################################### # 2. Response alone (time series) my.ylims <- c(min(myresponse)-.25, max(myresponse+.25)) # plot original data plot(myresponse~mytime, pch=19, ylim=my.ylims, col="black",cex=1.25, ylab="Population anomaly", xlab="TIME", main="", font.axis=2, font.lab=2, cex.lab=1.75, cex.axis=1.5) # overlay time series with interpolation lines(interp.data, col="grey50", lwd=2) # overlay loess smoother lines(ts(predict(myresponse.loess), start=mystart, frequency=myfrequency), col="black", lwd=2) # add optional arrows to indicate change points # change.x <- year.of.change # height.y1 <- top of arrow # height.y2 <- bottom of arrow # arrows(change.x, height.y1, change.x, height.y2, col="black", #length=.15, angle=25, lwd=2) ############################################################### ############################################################### # 3. histograms - bimodality may indicate change point hist(myresponse, col="black", # may want to define breaks/bins xlab=" ", ylab="Population anomaly", main=" ", border="white", lwd=2, font.axis=2, cex=1.25, cex.lab=2.5, cex.axis=2.25, prob=TRUE) lines(density(myresponse), col="darkgrey", lwd=2) ############################################################### # 4. rolling variance # window size is value6 (integer, less than or equal to length of time series): # note can use other variance measures in rollapply # substitute function for "var" in rollapply() mywin <- value6 myresponse.win=rollapply(y.diff, value6, var, align="right") plot(myresponse.win) # buffer lines to beginning of time series if desired # add arrows if desired # lines(myresponse.win, lwd=2, col="black") # lines(c(mytime1,myfirstbreakyear), c(myresponse.win[1], # myresponse.win[1]), col="black", lwd=2, lty=2) # arrows(1993, -0.05, x1=1993, y1=-0.3, code= 2, col="black", # length=0.15, angle=25, lwd=2) ############################################################### ############################################################### # 5. response vs. driver # this needs to be customized for each dataset!! # but the idea is to plot pre-break and post-break data on the same plot # then add linear (or nonlinear) models to pre-break and post-break data # this example is for penguins vs. sea-ice, where pre is before break # and post is after break; ice is sea-ice extent and peng is penguin # population size plot(pre.ice, pre.peng, col="black", xlim=c(50,260), ylim=c(-1.75,1.25), xlab="Sea ice duration (days)", ylab="Population anomaly", font.axis=2, font.lab=2, cex=1.25, cex.lab=1.75, cex.axis=1.5) points(post.ice, post.peng, pch=19, col="black", cex=1.25) lines(pre.ice, predict(lm(pre.peng~pre.ice)), col="grey50", lwd=2) lines(post.ice, predict(lm(post.peng~post.ice)), col="black", lwd=2) ###############################################################