###################################### ### HARVARD FOREST DATASET PREVIEW ### ###################################### # This script creates PDF files of summary statistics for data files # contained in datasets in the Harvard Forest Data Archive. # The resulting previews are intended to help users identify datasets # of possible interest and are not intended to be comprehensive. # The script currently supports dataTable and otherEntity files. # The user provides a dataset ID (e.g. "hf123") and optionally a file number # (e.g. 4). If the file number is set to zero, all files in the dataset are # processed. The PDF file(s) are stored on the user's local machine. Summary # files are not created for otherEntity files but the MD5 checksum value is # displayed in the console. # The script retrieves the EML file for the dataset and reads the data # file into an R data frame, using the following mapping from EML data # types to R data types: # EML ratio or interval -> R numeric # EML dateTime (format YYYY-MM-DD) -> R Date # EML dateTime (format YYYY) -> R numeric # EML nominal, ordinal, other dateTime -> R character # EML dateTime values with format YYYY-MM-DDThh:mm(:ss) are included in the # table of summary statistics and are converted to POSIXct for plotting. # The PDF file contains information about the data file, a list of variable # definitions, a table of summary statistics for numeric, date, and datetime # variables, and a series of time-series plots (using the R plot command) # or scatterplot matrices (using the R pairs command), depending on the # value of the timeSeries element in additionalMetadata. # # In order to keep file size and computational time within reasonable # limits, long data tables are shortened by sampling at regular intervals # to about 10000 rows for time-series plots and about 1000 rows for # scatterplot matrices. For the latter, a new variable containing the # row number is added and variables are plotted in groups of six # variables at a time, with the last varible repeated in the next group. # As a result, not all possible combinations of variables are plotted for # wide tables. For both types of plots, variables may be repeated in the # final plot to maintain consistent plot size. # Updated to read data files with comma, semicolon or tab as field delimiter. # Emery R Boose # revised 4-August-2020 #--------------------------INITIALIZE-----------------------------# # libraries library(tools) library(RCurl) library(XML) library(gplots) # dataset id & data file number (0 = all files) dataset.id <- "hf346" datafile.num <- 0 # server urls for data archive hf.archive.url1 <- "https://harvardforest.fas.harvard.edu/data/" hf.archive.url2 <- "http://harvardforest.fas.harvard.edu/data/" # local directory for data archive hf.archive.dir <- "C:/DA/" # local directory for EML files hf.eml.dir <- paste(hf.archive.dir, "eml/", sep="") # local directory where PDF file(s) will be written plot.dir <- "C:/DA/plots/" # maximum number of columns & lines on a page max.cols <- 70 max.lines <- 50 #---------------------------EML FILE------------------------------# # get parsed eml file get.parsed.eml <- function(hf.eml.dir, dataset.id) { # get file path eml.path <- paste(hf.eml.dir, dataset.id, ".xml", sep="") # warning if eml file does not exist if (!file.exists(eml.path)) { msg <- paste(toupper(dataset.id), ": EML file not found", sep="") warning(msg) eml <- NULL } else { eml <- xmlTreeParse(eml.path, useInternalNodes=TRUE) } return(eml) } # parse time-series string parse.time.series.string <- function(time.series.st) { # split string and convert to integer vector split.st <- strsplit(time.series.st, ",") time.series.ids <- NULL for (i in 1:length(split.st[[1]])) { time.series.ids <- append(time.series.ids, as.integer(split.st[[1]][i])) } return(time.series.ids) } # get time-series ids get.time.series.ids <- function(eml) { xpath <- paste("//additionalMetadata/metadata/additionalClassifications/timeSeries", sep="") node.set <- getNodeSet(eml, xpath, fun=xmlValue) if (length(node.set) == 0) { time.series.ids <- NULL } else { time.series.st <- node.set[[1]] time.series.ids <- parse.time.series.string(time.series.st) } return(time.series.ids) } # get datafile id for specified file number get.datafile.id <- function(eml, dataset.id, datafile.num) { # create string for datafile.id if (datafile.num > 9) { st <- toString(datafile.num) } else { st <- paste("0", toString(datafile.num), sep="") } datafile.id <- paste(dataset.id, "-", st, sep="") # data table: check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']", sep="") # other entity: check lower & upper case xpath3 <- paste("//otherEntity[@id='", datafile.id, "']", sep="") xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']", sep="") # check if data file id exists node.set1 <- getNodeSet(eml, xpath1) node.set2 <- getNodeSet(eml, xpath2) node.set3 <- getNodeSet(eml, xpath3) node.set4 <- getNodeSet(eml, xpath4) # warning if no data table or other entity if (length(node.set1) == 0 & length(node.set2) == 0 & length(node.set3) == 0 & length(node.set4) == 0) { msg <- paste(toupper(dataset.id), ": no data table or other entity with the number ", datafile.num, sep="") warning(msg) datafile.id <- NULL } return(datafile.id) } # get all data file ids from node set get.all.datafile.ids <- function(node.set) { datafile.ids <- NULL num <- xmlSize(node.set) if (num > 0) { id.node.set <- xmlSApply(node.set, xmlGetAttr, "id") for (i in 1:num) { datafile.id <- tolower(id.node.set[[i]]) datafile.ids <- append(datafile.ids, datafile.id) } } return(datafile.ids) } # get data file ids get.datafile.ids <- function(eml, dataset.id) { # get datafile ids for data tables xpath <- paste("//dataTable", sep="") node.set <- getNodeSet(eml, xpath) datatable.ids <- get.all.datafile.ids(node.set) # get datafile ids for other entities xpath <- paste("//otherEntity", sep="") node.set <- getNodeSet(eml, xpath) otherentity.ids <- get.all.datafile.ids(node.set) # combine datafile ids datafile.ids <- append(datatable.ids, otherentity.ids) # warning if no data files if (is.null(datafile.ids)) { msg <- paste(toupper(dataset.id), ": no data tables or other entities in this dataset", sep="") warning(msg) } return(datafile.ids) } # get data file type get.datafile.type <- function(eml, datafile.id) { # data table: check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']", sep="") node.set <- getNodeSet(eml, xpath1) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2) if (length(node.set) > 0) return("dataTable") # other entity: check lower & upper case xpath1 <- paste("//otherEntity[@id='", datafile.id, "']", sep="") xpath2 <- paste("//otherEntity[@id='", toupper(datafile.id), "']", sep="") node.set <- getNodeSet(eml, xpath1) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2) if (length(node.set) > 0) return("otherEntity") # otherwise return null return(NULL) } # get path for data file get.datafile.path <- function(eml, datafile.id) { # data table: check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/distribution/online/url", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/distribution/online/url", sep="") # other entity: check lower & upper case xpath3 <- paste("//otherEntity[@id='", datafile.id, "']/physical/distribution/online/url", sep="") xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/distribution/online/url", sep="") node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath3, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath4, fun=xmlValue) # get data file url on server datafile.url <- node.set[[1]] # get data file path on local machine if (substr(datafile.url, 1, 5) == "https") { datafile.path <- gsub(hf.archive.url1, hf.archive.dir, datafile.url) } else { datafile.path <- gsub(hf.archive.url2, hf.archive.dir, datafile.url) } # warning if data file does not exist if (!file.exists(datafile.path)) { msg <- paste(toupper(datafile.id), ": data file not found", sep="") warning(msg) datafile.path <- NULL } return(datafile.path) } # get checksum for data file get.datafile.checksum <- function(eml, datafile.id) { # data table: check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/authentication", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/authentication", sep="") # other entity: check lower & upper case xpath3 <- paste("//otherEntity[@id='", datafile.id, "']/physical/authentication", sep="") xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/authentication", sep="") # try node sets node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath3, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath4, fun=xmlValue) # warning if checksum is missing if (length(node.set) == 0) { msg <- paste(toupper(datafile.id), ": checksum is missing", sep="") warning(msg) datafile.checksum <- NULL } else { datafile.checksum <- node.set[[1]] } return(datafile.checksum) } # get data table name get.datafile.name <- function(eml, datafile.id) { # check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/entityName", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/entityName", sep="") node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) datafile.name <- node.set[[1]] return(datafile.name) } # get data table description get.datafile.description <- function(eml, datafile.id) { # check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/entityDescription", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/entityDescription", sep="") node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) datafile.description <- node.set[[1]] return(datafile.description) } # get data file size in bytes get.datafile.size <- function(eml, datafile.id, datafile.type) { # check lower & upper case if (datafile.type == "dataTable") { xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/size", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/size", sep="") } else { xpath1 <- paste("//otherEntity[@id='", datafile.id, "']/physical/size", sep="") xpath2 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/size", sep="") } node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) # warning if file size is missing if (length(node.set) == 0) { msg <- paste(toupper(datafile.id), ": file size is missing", sep="") warning(msg) datafile.size <- NULL } else { datafile.size <- node.set[[1]] } return(datafile.size) } # get number of records for data table get.datafile.number.of.records <- function(eml, datafile.id) { # check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/numberOfRecords", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/numberOfRecords", sep="") node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) # warning if number of records is missing if (length(node.set) == 0) { msg <- paste(toupper(datafile.id), ": number of records is missing", sep="") warning(msg) datafile.records <- NULL } else { datafile.records <- node.set[[1]] } return(datafile.records) } # get field delimiter for data table get.field.delimiter <- function(eml, datafile.id) { # check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter", sep="") node.set <- getNodeSet(eml, xpath1, fun=xmlValue) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue) field.delimiter <- node.set[[1]] # modify for tab if (field.delimiter == "\\t") { field.delimiter <- "\t" } return(field.delimiter) } # get node set for data table get.data.table.node.set <- function(eml, datafile.id) { # check lower & upper case xpath1 <- paste("//dataTable[@id='", datafile.id, "']/attributeList/attribute", sep="") xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/attributeList/attribute", sep="") node.set <- getNodeSet(eml, xpath1) if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2) return(node.set) } # get R variable type get.variable.type <- function(node.set.element) { # eml ratio or interval -> R numeric x1 <- xpathApply(node.set.element, "measurementScale/ratio", xmlValue) x2 <- xpathApply(node.set.element, "measurementScale/interval", xmlValue) if (length(x1) > 0 | length(x2) > 0) { var.type <- "numeric" } else { x3 <- xpathApply(node.set.element, "measurementScale/dateTime/formatString", xmlValue) if (length(x3) > 0) { if (x3[[1]] == "YYYY-MM-DD") { # eml dateTime in YYYY-MM-DD format -> R Date var.type <- "Date" } else if (x3[[1]] == "YYYY") { # eml dateTime in YYYY format -> R numeric var.type <- "numeric" } else { # eml dateTime in other formats -> R character var.type <- "character" } } else { # eml nominal, ordinal -> R character var.type <- "character" } } return(var.type) } # get all variable types for data table get.all.variable.types <- function(node.set) { num <- xmlSize(node.set) var.types <- NULL for (i in 1:num) { node.set.element <- node.set[[i]] var.type <- get.variable.type(node.set.element) var.types <- append(var.types, var.type) } return(var.types) } # get variable format string get.variable.format <- function(node.set.element) { x <- xpathApply(node.set.element, "measurementScale/dateTime/formatString", xmlValue) if (length(x) > 0) var.format <- x[[1]] else var.format <- "" return(var.format) } # get all variable format strings for data table get.all.variable.formats <- function(node.set) { num <- xmlSize(node.set) var.formats <- NULL for (i in 1:num) { node.set.element <- node.set[[i]] var.format <- get.variable.format(node.set.element) var.formats <- append(var.formats, var.format) } return(var.formats) } # get variable definition get.variable.definition <- function(node.set.element) { x <- xpathApply(node.set.element, "attributeDefinition", xmlValue) if (length(x) > 0) var.def <- x[[1]] else var.def <- "" return(var.def) } # get all variable definitions for data table get.all.variable.definitions <- function(node.set) { num <- xmlSize(node.set) var.defs <- NULL for (i in 1:num) { node.set.element <- node.set[[i]] var.def <- get.variable.definition(node.set.element) var.defs <- append(var.defs, var.def) } return(var.defs) } # get variable units get.variable.unit <- function(node.set.element) { x <- xpathApply(node.set.element, "measurementScale/interval/unit/standardUnit", xmlValue) if (length(x) > 0) { var.unit <- x[[1]] } else { x <- xpathApply(node.set.element, "measurementScale/ratio/unit/standardUnit", xmlValue) if (length(x) > 0) { var.unit <- x[[1]] } else { x <- xpathApply(node.set.element, "measurementScale/interval/unit/customUnit", xmlValue) if (length(x) > 0) { var.unit <- x[[1]] } else { x <- xpathApply(node.set.element, "measurementScale/ratio/unit/customUnit", xmlValue) if (length(x) > 0) { var.unit <- x[[1]] } else { var.unit <- "" } } } } return(var.unit) } # get all variable units for data table get.all.variable.units <- function(node.set) { num <- xmlSize(node.set) var.units <- NULL for (i in 1:num) { node.set.element <- node.set[[i]] var.unit <- get.variable.unit(node.set.element) var.units <- append(var.units, var.unit) } return(var.units) } #-----------------------SUMMARY STATS TABLE-------------------------# # identify numeric, date & datetime variables get.summary.index <- function(df, var.types, var.formats) { sum.index <- array() for (i in 1:length(df)) { if (var.types[i] == "numeric" | var.types[i] == "Date" | var.formats[i] == "YYYY-MM-DDThh:mm" | var.formats[i] == "YYYY-MM-DDThh:mm:ss") { sum.index[i] <- TRUE } else { sum.index[i] <- FALSE } } return(sum.index) } # get summary variable types get.summary.variable.types <- function(sum.index, var.types, var.formats) { if (all(sum.index == FALSE)) { sum.var.types <- NULL } else { sum.var.types <- var.types[sum.index] sum.var.formats <- var.formats[sum.index] for (i in 1:length(sum.var.types)) { if (sum.var.formats[i] == "YYYY-MM-DDThh:mm") { sum.var.types[i] <- "POSIXct-1" } else if (sum.var.formats[i] == "YYYY-MM-DDThh:mm:ss") { sum.var.types[i] <- "POSIXct-2" } } } return(sum.var.types) } # get summary variable definitions get.summary.variable.definitions <- function(sum.index, var.defs) { if (all(sum.index == FALSE)) { sum.var.defs <- NULL } else { sum.var.defs <- var.defs[sum.index] } return(sum.var.defs) } # get summary variable units get.summary.variable.units <- function(sum.index, var.units) { if (all(sum.index == FALSE)) { sum.var.units <- NULL } else { sum.var.units <- var.units[sum.index] } return(sum.var.units) } # create summary data frame get.summary.data.frame <- function(df, sum.index) { if (all(sum.index == FALSE)) { df.sum <- NULL } else if (sum(sum.index, na.rm=TRUE) == 1) { df.sum <- as.data.frame(df[ , sum.index]) i <- which(sum.index == TRUE) colnames(df.sum) <- names(df[i]) } else { df.sum <- df[ , sum.index] } return(df.sum) } # wrap lines with indent as needed word.wrap.lines <- function (st, xstr="") { z <- strsplit(st, " ") col <- 1 for (i in 1:length(z[[1]])) { if (z[[1]][[i]] != "") { col <- col + nchar(z[[1]][[i]]) + 1 if (col > max.cols) { xstr <- paste(xstr, "\n ", sep="") col <- 4 } xstr <- paste(xstr, z[[1]][[i]], " ", sep="") } } xstr <- paste(xstr, "\n", sep="") return(xstr) } # add column headings for variable table add.column.headings <- function(xstr="") { xstr <- paste(xstr, "Variable Min Median Mean Max NAs\n", sep="") xstr <- paste(xstr, "------------------------------------------------------------------------\n", sep="") return(xstr) } # create summary stats table create.summary.table <- function(df, df.sum, datafile.id, sum.var.types, sum.var.defs, sum.var.units, datafile.path) { # get data file name datafile.name <- get.datafile.name(eml, datafile.id) # get data file description datafile.description <- get.datafile.description(eml, datafile.id) # calculate md5 checksum from data file checksum <- md5sum(datafile.path) # create summary page xstr <- paste("Harvard Forest Data Archive ", toupper(datafile.id), "\n\n", sep="") xstr <- paste(xstr, "Data File:\n\n", sep="") xstr <- paste(xstr, "Name = ", datafile.name, "\n", sep="") st <- paste("Description = ", datafile.description, sep="") xstr <- word.wrap.lines(st, xstr) xstr <- paste(xstr, "Rows = ", nrow(df), " Columns = ", length(df), "\n", sep="") xstr <- paste(xstr, "MD5 checksum = ", checksum, "\n\n", sep="") # check for variables to summarize if (is.null(df.sum)) { xstr <- paste(xstr, "No date or numerical variables\n", sep="") textplot(capture.output(writeLines(xstr))) } else { # variable definitions & units xstr <- paste(xstr, "Variables:\n\n", sep="") lines <- 11 for (i in 1:length(df.sum)) { if (sum.var.units[i] != "") { var.unit <- paste(" (", sum.var.units[i], ")", sep="") } else { var.unit <- "" } st <- paste(names(df.sum[i]), " = ", sum.var.defs[i], var.unit, sep="") xstr <- word.wrap.lines(st, xstr) new.lines <- nchar(st) %/% max.cols + 1 lines <- lines + new.lines # add new page if necessary if (lines > max.lines) { textplot(capture.output(writeLines(xstr))) xstr <- "" lines <- 0 } } # write summary page to file if (lines > 0) textplot(capture.output(writeLines(xstr))) # create summary table xstr <- add.column.headings() lines <- 2 for (i in 1:length(df.sum)) { # variable name xstr <- paste(xstr, substr(paste(names(df.sum[i]), " ", sep=""), 1, 12), sep="") # all values missing if (all(is.na(df.sum[ , i]))) { xnas <- sum(is.na(df.sum[ , i])) xstr <- paste(xstr," ", xnas, "\n", sep="") # datetime variable } else if (sum.var.types[i] == "POSIXct-1" | sum.var.types[i] == "POSIXct-2") { # get min, max & NAs only xmin <- min(df.sum[ , i], na.rm=TRUE) xmax <- max(df.sum[ , i], na.rm=TRUE) xnas <- sum(is.na(df.sum[ , i])) xstr <- paste(xstr, xmin, " ", xmax, " ", formatC(xnas,0,5,"f"), "\n", sep="") # get all stats } else { xmin <- min(df.sum[ , i], na.rm=TRUE) xmedian <- median(df.sum[ , i], na.rm=TRUE) xmean <- mean(df.sum[ , i], na.rm=TRUE) xmax <- max(df.sum[ , i], na.rm=TRUE) xnas <- sum(is.na(df.sum[ , i])) # date variable if (sum.var.types[i] == "Date") { xstr <- paste(xstr, " ", xmin, " ", xmedian, " ", xmean, " ", xmax, formatC(xnas,0,8,"f"), "\n", sep="") # numeric variable } else { if (xmin < -1000000 | xmax > 1000000) { d <- 0 } else { d <- 3 } xstr <- paste(xstr, formatC(xmin,d,12,"f"), formatC(xmedian,d,12,"f"), formatC(xmean,d,12,"f"), formatC(xmax,d,12,"f"), formatC(xnas,0,8,"f"), "\n", sep="") } } lines <- lines + 1 # add new page if necessary if (lines > max.lines) { textplot(capture.output(writeLines(xstr))) xstr <- add.column.headings() lines <- 0 } } # write summary table to file if (lines > 0) textplot(capture.output(writeLines(xstr))) } } #---------------------------PLOTS------------------------------# # get plot type get.plot.type <- function(datafile.id, time.series.ids) { num <- as.integer(substr(datafile.id, 7, 8)) if (num %in% time.series.ids) { plot.type <- "time-series" } else { plot.type <- "scatterplot-matrix" } return(plot.type) } # create plot data frame get.plot.data.frame <- function(df.sum, sum.var.types, max.rows) { # start with summary data frame df.plt <- df.sum if (!is.null(df.plt)) { # resample rows at regular intervals if necessary plot.rows <- nrow(df.plt) if (plot.rows > max.rows) { skip <- round(plot.rows/max.rows) index <- seq(from=1, to=plot.rows, by=skip) df.plt <- df.plt[index, , drop=FALSE] } # convert standard datetime to POSIXct for (i in 1:length(df.plt)) { if (sum.var.types[i] == "POSIXct-1") { df.plt[ , i] = as.POSIXct(df.plt[ , i], format="%Y-%m-%dT%H:%M") } else if(sum.var.types[i] == "POSIXct-2") { df.plt[ , i] = as.POSIXct(df.plt[ , i], format="%Y-%m-%dT%H:%M:%S") } } # remove empty variables keep <- NULL if (is.vector(df.plt)) { if (all(is.na(df.plt))) { df.plt <- NULL } } else { for (i in 1:length(df.plt)) { if (!all(is.na(df.plt[ ,i]))) { keep <- append(keep, i) } } if (is.null(keep)) { df.plt <- NULL } else if (length(keep) == 1) { col.name <- names(df.plt[keep]) df.plt <- as.data.frame(df.plt[ , keep]) colnames(df.plt) <- col.name } else { df.plt <- df.plt[ , keep] } } } return(df.plt) } #--------------------------TIME SERIES-----------------------------# # create time-series plot create.time.series.plot <- function(df.plt, datafile.id) { if (!is.null(df.plt)) { # max number of variables per plot max.var <- 4 # get number of plots var.num <- length(df.plt) if (var.num <= max.var) { plot.num <- 1 } else { plot.num <- var.num %/% max.var plot.last <- var.num %% max.var if (plot.last > 0) plot.num <- plot.num + 1 } # create each plot for (i in 1:plot.num) { first.col <- (i-1)*max.var + 1 if (plot.num == 1) { last.col <- first.col + var.num - 1 } else if (i < plot.num) { last.col <- first.col + max.var - 1 } else { first.col <- var.num - max.var + 1 last.col <- var.num } # subset for variables to plot if (first.col == last.col ) { zz <- as.data.frame(df.plt[ , c(first.col:last.col)]) colnames(zz) <- names(df.plt[first.col]) } else { zz <- df.plt[ , c(first.col:last.col)] } # create time series df.ts <- ts(zz) # create plot plot.title <- paste(toupper(datafile.id), " Plot ", i, sep="") plot(df.ts, main=plot.title, col="blue") } } } #----------------------SCATTERPLOT MATRIX--------------------------# # create scatterplot matrix create.scatterplot.matrix <- function(df.plt, datafile.id) { if (!is.null(df.plt)) { # column for row number row.num <- seq(from=1, to=nrow(df.plt), by=1) # number of variables to plot var.num <- length(df.plt) # maximum number of variables per plot max.var <- 5 # get number of plots (repeat last variable in next plot) if (var.num <= max.var) { plot.num <- 1 } else { plot.num <- 1 + (var.num-5) %/% (max.var-1) plot.last <- (var.num-5) %% (max.var-1) if (plot.last > 0) plot.num <- plot.num + 1 } # create each plot for (i in 1:plot.num) { # select columns to plot if (plot.num == 1) { first.col <- 1 last.col <- var.num } else { if (i < plot.num) { first.col <- (i-1)*max.var - i + 2 last.col <- first.col + max.var - 1 } else { first.col <- var.num - max.var + 1 last.col <- var.num } } # subset for variables to plot if (first.col == last.col) { col.name <- names(df.plt[first.col]) df.sp <- as.data.frame(df.plt[ , first.col]) colnames(df.sp) <- col.name } else { df.sp <- df.plt[ , c(first.col:last.col)] } # add column for row number df.sp <- cbind(row.num, df.sp) colnames(df.sp)[1] <- "row" # create plot plot.title <- paste(toupper(datafile.id), " Plot ", i, sep="") pairs(df.sp, main=plot.title, col="blue") } } } #---------------------PROCESS DATA FILE------------------------# # display column headings in console display.column.headings <- function() { st <- "File MD5 Checksum Preview Rows Bytes" writeLines(st) st <- "----------------------------------------------------------------------------" writeLines(st) } # read data file into data frame get.datafile <- function(datafile.path, var.types, field.delimiter) { con <- file(datafile.path) df <- read.csv(con, header=TRUE, sep=field.delimiter, colClasses=var.types) return(df) } # create preview for data file create.datafile.preview <- function(eml, time.series.ids, datafile.id, datafile.path) { # get data file type datafile.type <- get.datafile.type(eml, datafile.id) # calculate checksum from data file checksum <- md5sum(datafile.path) # get data file checksum from eml datafile.checksum <- get.datafile.checksum(eml, datafile.id) # warning if checksum values do not match if (!is.null(datafile.checksum)) { if (datafile.checksum != checksum) { msg <- paste(toupper(datafile.id), ": checksum values do not match", sep="") warning(msg) } } # get data file size in bytes f.size <- file.size(datafile.path) # get file size from eml datafile.size <- get.datafile.size(eml, datafile.id, datafile.type) # warning if file sizes do not match if (!is.null(datafile.size)) { if (datafile.size != f.size) { msg <- paste(toupper(datafile.id), ": file sizes do not match", sep="") warning(msg) } } # data table: create summmary table & plots if (datafile.type == "dataTable") { # create PDF file pdf.file <- paste(plot.dir, datafile.id, ".pdf", sep="") pdf(pdf.file) par(mar=c(0.5,0.5,0.5,0.5)) # get field delimiter field.delimiter <- get.field.delimiter(eml, datafile.id) # get data table node set node.set <- get.data.table.node.set(eml, datafile.id) # get R variable types var.types <- get.all.variable.types(node.set) # get variable format strings var.formats <- get.all.variable.formats(node.set) # get variable definitions var.defs <- get.all.variable.definitions(node.set) # get variable units var.units <- get.all.variable.units(node.set) # read data file into data frame df <- get.datafile(datafile.path, var.types, field.delimiter) # get number of records from eml datafile.records <- get.datafile.number.of.records(eml, datafile.id) # warning if numbers of records do not match if (!is.null(datafile.records)) { if (datafile.records != nrow(df)) { msg <- paste(toupper(datafile.id), ": numbers of records do not match", sep="") warning(msg) } } # get actual number of records as a string n.rows <- substr(paste(toString(nrow(df)), " ", sep=""), 1, 10) # create summary stats table sum.index <- get.summary.index(df, var.types, var.formats) sum.var.types <- get.summary.variable.types(sum.index, var.types, var.formats) sum.var.defs <- get.summary.variable.definitions(sum.index, var.defs) sum.var.units <- get.summary.variable.units(sum.index, var.units) df.sum <- get.summary.data.frame(df, sum.index) create.summary.table(df, df.sum, datafile.id, sum.var.types, sum.var.defs, sum.var.units, datafile.path) # create plots if (is.null(df.sum)){ msg <- paste(checksum, " No plots ", n.rows, f.size, sep="") } else { # get plot type plot.type <- get.plot.type(datafile.id, time.series.ids) # time-series plot if (plot.type == "time-series") { df.plt <- get.plot.data.frame(df.sum, sum.var.types, max.rows=10000) create.time.series.plot(df.plt, datafile.id) msg <- paste(checksum, " Time Series ", n.rows, f.size, sep="") } else { # scatterplot matrix df.plt <- get.plot.data.frame(df.sum, sum.var.types, max.rows=1000) create.scatterplot.matrix(df.plt, datafile.id) msg <- paste(checksum, " Scatterplots ", n.rows, f.size, sep="") } } # close PDF file dev.off() # other entity: display md5 checksum } else if (datafile.type == "otherEntity") { # display checksum in console msg <- paste(checksum, " No preview ", f.size, sep="") # neither data table nor other entity } else { msg <- "Data type not currently supported" } # display result message st <- paste(toupper(datafile.id), ' ', msg, sep="") writeLines(st) } #------------------------MAIN PROGRAM---------------------------# # read eml file eml <- get.parsed.eml(hf.eml.dir, dataset.id) if (!is.null(eml)) { # get data file numbers for time-series plots time.series.ids <- get.time.series.ids(eml) # get data file ids for data tables & other entities if (datafile.num > 0) { datafile.ids <- get.datafile.id(eml, dataset.id, datafile.num) } else { datafile.ids <- get.datafile.ids(eml, dataset.id) } if (!is.null(datafile.ids)) { # display column headings display.column.headings() # process each data file for (i in 1:length(datafile.ids)) { datafile.id <- datafile.ids[i] # get data file path datafile.path <- get.datafile.path(eml, datafile.id) if (!is.null(datafile.path)) { # create preview or display MD5 checksum create.datafile.preview(eml, time.series.ids, datafile.id, datafile.path) } } } } ############################ DONE ##############################