######################################
### HARVARD FOREST DATASET PREVIEW ###
######################################

# This script creates PDF files of summary statistics for data files
# contained in datasets in the Harvard Forest Data Archive.
# The resulting previews are intended to help users identify datasets 
# of possible interest and are not intended to be comprehensive.
# The script currently supports dataTable and otherEntity files.

# The user provides a dataset ID (e.g. "hf123") and optionally a file number
# (e.g. 4).  If the file number is set to zero, all files in the dataset are
# processed. The PDF file(s) are stored on the user's local machine. Summary
# files are not created for otherEntity files but the MD5 checksum value is
# displayed in the console.

# The script retrieves the EML file for the dataset and reads the data 
# file into an R data frame, using the following mapping from EML data 
# types to R data types:

#   EML ratio or interval -> R numeric
#   EML dateTime (format YYYY-MM-DD) -> R Date
#   EML dateTime (format YYYY) -> R numeric
#   EML nominal, ordinal, other dateTime -> R character

# EML dateTime values with format YYYY-MM-DDThh:mm(:ss) are included in the 
# table of summary statistics and are converted to POSIXct for plotting.

# The PDF file contains information about the data file, a list of variable
# definitions, a table of summary statistics for numeric, date, and datetime 
# variables, and a series of time-series plots (using the R plot command) 
# or scatterplot matrices (using the R pairs command), depending on the 
# value of the timeSeries element in additionalMetadata.
# 
# In order to keep file size and computational time within reasonable 
# limits, long data tables are shortened by sampling at regular intervals
# to about 10000 rows for time-series plots and about 1000 rows for
# scatterplot matrices. For the latter, a new variable containing the
# row number is added and variables are plotted in groups of six
# variables at a time, with the last varible repeated in the next group.
# As a result, not all possible combinations of variables are plotted for
# wide tables. For both types of plots, variables may be repeated in the 
# final plot to maintain consistent plot size.

# Updated to read data files with comma, semicolon or tab as field delimiter.

# Emery R Boose

# revised 4-August-2020

#--------------------------INITIALIZE-----------------------------#

# libraries
library(tools)
library(RCurl)
library(XML)
library(gplots)

# dataset id & data file number (0 = all files)

dataset.id <- "hf346"
datafile.num <- 0

# server urls for data archive
hf.archive.url1 <- "https://harvardforest.fas.harvard.edu/data/"
hf.archive.url2 <- "http://harvardforest.fas.harvard.edu/data/"

# local directory for data archive
hf.archive.dir <- "C:/DA/"

# local directory for EML files
hf.eml.dir <- paste(hf.archive.dir, "eml/", sep="")

# local directory where PDF file(s) will be written
plot.dir <- "C:/DA/plots/"

# maximum number of columns & lines on a page
max.cols <- 70
max.lines <- 50

#---------------------------EML FILE------------------------------#

# get parsed eml file
get.parsed.eml <- function(hf.eml.dir, dataset.id) {
  # get file path
  eml.path <- paste(hf.eml.dir, dataset.id, ".xml", sep="")
  # warning if eml file does not exist
  if (!file.exists(eml.path)) {
    msg <- paste(toupper(dataset.id), ": EML file not found", sep="")
    warning(msg)
    eml <- NULL
  } else {
    eml <- xmlTreeParse(eml.path, useInternalNodes=TRUE)
  }
  return(eml)
}

# parse time-series string
parse.time.series.string <- function(time.series.st) {
  # split string and convert to integer vector
  split.st <- strsplit(time.series.st, ",")
  time.series.ids <- NULL
  for (i in 1:length(split.st[[1]])) {
     time.series.ids <- append(time.series.ids, as.integer(split.st[[1]][i]))
  }
  return(time.series.ids)
}

# get time-series ids
get.time.series.ids <- function(eml) {
  xpath <- paste("//additionalMetadata/metadata/additionalClassifications/timeSeries", sep="")
  node.set <- getNodeSet(eml, xpath, fun=xmlValue)   
  if (length(node.set) == 0) {
    time.series.ids <- NULL
  } else {
    time.series.st <- node.set[[1]]
    time.series.ids <- parse.time.series.string(time.series.st)
  }  
  return(time.series.ids)
}

# get datafile id for specified file number
get.datafile.id <- function(eml, dataset.id, datafile.num) {
  # create string for datafile.id
  if (datafile.num > 9) {
    st <- toString(datafile.num)
  } else {
    st <- paste("0", toString(datafile.num), sep="")
  }
  datafile.id <- paste(dataset.id, "-", st, sep="")
  # data table: check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']", sep="")
  # other entity: check lower & upper case
  xpath3 <- paste("//otherEntity[@id='", datafile.id, "']", sep="")
  xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']", sep="")
  # check if data file id exists
  node.set1 <- getNodeSet(eml, xpath1)
  node.set2 <- getNodeSet(eml, xpath2)
  node.set3 <- getNodeSet(eml, xpath3)
  node.set4 <- getNodeSet(eml, xpath4)
  # warning if no data table or other entity
  if (length(node.set1) == 0 & length(node.set2) == 0 & length(node.set3) == 0 & length(node.set4) == 0) {
    msg <- paste(toupper(dataset.id), ": no data table or other entity with the number ", datafile.num, sep="")
    warning(msg)
    datafile.id <- NULL
  }
  return(datafile.id)  
}

# get all data file ids from node set
get.all.datafile.ids <- function(node.set) {
  datafile.ids <- NULL
  num <- xmlSize(node.set)
  if (num > 0) {
    id.node.set <- xmlSApply(node.set, xmlGetAttr, "id")
    for (i in 1:num) {
      datafile.id <- tolower(id.node.set[[i]])
      datafile.ids <- append(datafile.ids, datafile.id)
    }
  }
  return(datafile.ids)
}

# get data file ids
get.datafile.ids <- function(eml, dataset.id) {
  # get datafile ids for data tables
  xpath <- paste("//dataTable", sep="")
  node.set <- getNodeSet(eml, xpath)
  datatable.ids <- get.all.datafile.ids(node.set)
  # get datafile ids for other entities
  xpath <- paste("//otherEntity", sep="")
  node.set <- getNodeSet(eml, xpath)
  otherentity.ids <- get.all.datafile.ids(node.set)
  # combine datafile ids
  datafile.ids <- append(datatable.ids, otherentity.ids)

  # warning if no data files
  if (is.null(datafile.ids)) {
    msg <- paste(toupper(dataset.id), ": no data tables or other entities in this dataset", sep="")
    warning(msg)
  }
  return(datafile.ids)
}

# get data file type
get.datafile.type <- function(eml, datafile.id) {
  # data table: check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']", sep="")
  node.set <- getNodeSet(eml, xpath1)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2)
  if (length(node.set) > 0) return("dataTable")
  
  # other entity: check lower & upper case
  xpath1 <- paste("//otherEntity[@id='", datafile.id, "']", sep="")
  xpath2 <- paste("//otherEntity[@id='", toupper(datafile.id), "']", sep="")
  node.set <- getNodeSet(eml, xpath1)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2)
  if (length(node.set) > 0) return("otherEntity")
  
  # otherwise return null
  return(NULL)
}

# get path for data file
get.datafile.path <- function(eml, datafile.id) {
  # data table: check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/distribution/online/url", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/distribution/online/url", sep="")
  # other entity: check lower & upper case
  xpath3 <- paste("//otherEntity[@id='", datafile.id, "']/physical/distribution/online/url", sep="")
  xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/distribution/online/url", sep="")

  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath3, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath4, fun=xmlValue)
  
  # get data file url on server
  datafile.url <- node.set[[1]]

  # get data file path on local machine
  if (substr(datafile.url, 1, 5) == "https") {
    datafile.path <- gsub(hf.archive.url1, hf.archive.dir, datafile.url)
  } else {
    datafile.path <- gsub(hf.archive.url2, hf.archive.dir, datafile.url)
  }
   
  # warning if data file does not exist
  if (!file.exists(datafile.path)) {
    msg <- paste(toupper(datafile.id), ": data file not found", sep="")
    warning(msg)
    datafile.path <- NULL
  }
  return(datafile.path)
}

# get checksum for data file
get.datafile.checksum <- function(eml, datafile.id) {
  # data table: check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/authentication", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/authentication", sep="")
  # other entity: check lower & upper case
  xpath3 <- paste("//otherEntity[@id='", datafile.id, "']/physical/authentication", sep="")
  xpath4 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/authentication", sep="")

  # try node sets
  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath3, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath4, fun=xmlValue)
  
  # warning if checksum is missing
  if (length(node.set) == 0) {
    msg <- paste(toupper(datafile.id), ": checksum is missing", sep="")
    warning(msg)
    datafile.checksum <- NULL
  } else {
    datafile.checksum <- node.set[[1]]
  }
  return(datafile.checksum)
}

# get data table name
get.datafile.name <- function(eml, datafile.id) {
  # check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/entityName", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/entityName", sep="")
  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)
  datafile.name <- node.set[[1]]
  return(datafile.name)
}

# get data table description
get.datafile.description <- function(eml, datafile.id) {
  # check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/entityDescription", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/entityDescription", sep="")
  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)
  datafile.description <- node.set[[1]]
  return(datafile.description)
}

# get data file size in bytes
get.datafile.size <- function(eml, datafile.id, datafile.type) {
  # check lower & upper case
  if (datafile.type == "dataTable") {
    xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/size", sep="")
    xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/size", sep="")
  } else {
    xpath1 <- paste("//otherEntity[@id='", datafile.id, "']/physical/size", sep="")
    xpath2 <- paste("//otherEntity[@id='", toupper(datafile.id), "']/physical/size", sep="")
  }

  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)

  # warning if file size is missing
  if (length(node.set) == 0) {
    msg <- paste(toupper(datafile.id), ": file size is missing", sep="")
    warning(msg)
    datafile.size <- NULL
  } else {
    datafile.size <- node.set[[1]]
  }
  return(datafile.size)
}

# get number of records for data table
get.datafile.number.of.records <- function(eml, datafile.id) {
  # check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/numberOfRecords", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/numberOfRecords", sep="")
  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)

  # warning if number of records is missing
  if (length(node.set) == 0) {
    msg <- paste(toupper(datafile.id), ": number of records is missing", sep="")
    warning(msg)
    datafile.records <- NULL
  } else {
    datafile.records <- node.set[[1]]
  }
  return(datafile.records)
}

# get field delimiter for data table
get.field.delimiter <- function(eml, datafile.id) {
  # check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter", sep="")
  node.set <- getNodeSet(eml, xpath1, fun=xmlValue)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2, fun=xmlValue)
  field.delimiter <- node.set[[1]]
  # modify for tab
  if (field.delimiter == "\\t") {
    field.delimiter <- "\t"
  }
  return(field.delimiter)
}

# get node set for data table
get.data.table.node.set <- function(eml, datafile.id) {
  # check lower & upper case
  xpath1 <- paste("//dataTable[@id='", datafile.id, "']/attributeList/attribute", sep="")
  xpath2 <- paste("//dataTable[@id='", toupper(datafile.id), "']/attributeList/attribute", sep="")
  node.set <- getNodeSet(eml, xpath1)
  if (length(node.set) == 0) node.set <- getNodeSet(eml, xpath2)
  return(node.set)
}

# get R variable type
get.variable.type <- function(node.set.element) {
  # eml ratio or interval -> R numeric
  x1 <- xpathApply(node.set.element, "measurementScale/ratio", xmlValue)
  x2 <- xpathApply(node.set.element, "measurementScale/interval", xmlValue)
  if (length(x1) > 0 | length(x2) > 0) {
    var.type <- "numeric"
  } else {
    x3 <- xpathApply(node.set.element, "measurementScale/dateTime/formatString", xmlValue)
    if (length(x3) > 0) {
      if (x3[[1]] == "YYYY-MM-DD") {
        # eml dateTime in YYYY-MM-DD format -> R Date
        var.type <- "Date"
      } else if (x3[[1]] == "YYYY") {
        # eml dateTime in YYYY format -> R numeric
        var.type <- "numeric"
      } else {
        # eml dateTime in other formats -> R character
        var.type <- "character"
      }
    } else {
      # eml nominal, ordinal -> R character
      var.type <- "character"
    }
  }
  return(var.type)
}

# get all variable types for data table
get.all.variable.types <- function(node.set) {
  num <- xmlSize(node.set)
  var.types <- NULL
  for (i in 1:num) {
    node.set.element <- node.set[[i]]
    var.type <- get.variable.type(node.set.element)
    var.types <- append(var.types, var.type)
  }
  return(var.types)
}

# get variable format string
get.variable.format <- function(node.set.element) {
  x <- xpathApply(node.set.element, "measurementScale/dateTime/formatString", xmlValue)
  if (length(x) > 0) var.format <- x[[1]]
  else var.format <- ""
  return(var.format)
}

# get all variable format strings for data table
get.all.variable.formats <- function(node.set) {
  num <- xmlSize(node.set)
  var.formats <- NULL
  for (i in 1:num) {
    node.set.element <- node.set[[i]]
    var.format <- get.variable.format(node.set.element)
    var.formats <- append(var.formats, var.format)
  }
  return(var.formats)
}

# get variable definition
get.variable.definition <- function(node.set.element) {
  x <- xpathApply(node.set.element, "attributeDefinition", xmlValue)
  if (length(x) > 0) var.def <- x[[1]]
  else var.def <- ""
  return(var.def)
}

# get all variable definitions for data table
get.all.variable.definitions <- function(node.set) {
  num <- xmlSize(node.set)
  var.defs <- NULL
  for (i in 1:num) {
    node.set.element <- node.set[[i]]
    var.def <- get.variable.definition(node.set.element)
    var.defs <- append(var.defs, var.def)
  }
  return(var.defs)
}

# get variable units
get.variable.unit <- function(node.set.element) {
  x <- xpathApply(node.set.element, "measurementScale/interval/unit/standardUnit", xmlValue)
  if (length(x) > 0) {
    var.unit <- x[[1]]
  } else {
    x <- xpathApply(node.set.element, "measurementScale/ratio/unit/standardUnit", xmlValue)
    if (length(x) > 0) {
      var.unit <- x[[1]]
    } else {
      x <- xpathApply(node.set.element, "measurementScale/interval/unit/customUnit", xmlValue)
      if (length(x) > 0) {
        var.unit <- x[[1]]
      } else {
        x <- xpathApply(node.set.element, "measurementScale/ratio/unit/customUnit", xmlValue)
        if (length(x) > 0) {
          var.unit <- x[[1]]
        } else {
          var.unit <- ""    
        }
      }
    }
  }
  return(var.unit)
}

# get all variable units for data table
get.all.variable.units <- function(node.set) {
  num <- xmlSize(node.set)
  var.units <- NULL
  for (i in 1:num) {
    node.set.element <- node.set[[i]]
    var.unit <- get.variable.unit(node.set.element)
    var.units <- append(var.units, var.unit)
  }
  return(var.units)
}

#-----------------------SUMMARY STATS TABLE-------------------------#

# identify numeric, date & datetime variables
get.summary.index <- function(df, var.types, var.formats) {
  sum.index <- array()
  for (i in 1:length(df)) {
    if (var.types[i] == "numeric" | var.types[i] == "Date" | var.formats[i] == "YYYY-MM-DDThh:mm" | var.formats[i] == "YYYY-MM-DDThh:mm:ss") {
      sum.index[i] <- TRUE
    } else {
      sum.index[i] <- FALSE
    }
  }
  return(sum.index)  
}

# get summary variable types
get.summary.variable.types <- function(sum.index, var.types, var.formats) {
  if (all(sum.index == FALSE)) {
    sum.var.types <- NULL
  } else {
    sum.var.types <- var.types[sum.index]
    sum.var.formats <- var.formats[sum.index]
    for (i in 1:length(sum.var.types)) {
      if (sum.var.formats[i] == "YYYY-MM-DDThh:mm") {
        sum.var.types[i] <- "POSIXct-1"
      } else if (sum.var.formats[i] == "YYYY-MM-DDThh:mm:ss") {
        sum.var.types[i] <- "POSIXct-2"
      }
    }
  }
  return(sum.var.types)
}

# get summary variable definitions
get.summary.variable.definitions <- function(sum.index, var.defs) {
  if (all(sum.index == FALSE)) {
    sum.var.defs <- NULL
  } else {
    sum.var.defs <- var.defs[sum.index]
  }
  return(sum.var.defs)
}

# get summary variable units
get.summary.variable.units <- function(sum.index, var.units) {
  if (all(sum.index == FALSE)) {
    sum.var.units <- NULL
  } else {
    sum.var.units <- var.units[sum.index]
  }
  return(sum.var.units)
}

# create summary data frame
get.summary.data.frame <- function(df, sum.index) {
  if (all(sum.index == FALSE)) {
    df.sum <- NULL
  } else if (sum(sum.index, na.rm=TRUE) == 1) {
    df.sum <- as.data.frame(df[ , sum.index])
    i <- which(sum.index == TRUE)
    colnames(df.sum) <- names(df[i])
  } else {
    df.sum <- df[ , sum.index]
  }
  return(df.sum)
}

# wrap lines with indent as needed
word.wrap.lines <- function (st, xstr="") {
  z <- strsplit(st, " ")
  col <- 1
  for (i in 1:length(z[[1]])) {
    if (z[[1]][[i]] != "") {
      col <- col + nchar(z[[1]][[i]]) + 1
      if (col > max.cols) {
        xstr <- paste(xstr, "\n   ", sep="")
        col <- 4
      }
      xstr <- paste(xstr, z[[1]][[i]], " ", sep="")
    }
  }
  xstr <- paste(xstr, "\n", sep="")
  return(xstr)
}

# add column headings for variable table
add.column.headings <- function(xstr="") {
  xstr <- paste(xstr, "Variable         Min         Median      Mean        Max         NAs\n", sep="")
  xstr <- paste(xstr, "------------------------------------------------------------------------\n", sep="")
  return(xstr)      
}

# create summary stats table
create.summary.table <- function(df, df.sum, datafile.id, sum.var.types, sum.var.defs, sum.var.units, datafile.path) {  
  # get data file name
  datafile.name <- get.datafile.name(eml, datafile.id)
  # get data file description
  datafile.description <- get.datafile.description(eml, datafile.id)
  # calculate md5 checksum from data file
  checksum <- md5sum(datafile.path)

  # create summary page
  xstr <- paste("Harvard Forest Data Archive ", toupper(datafile.id), "\n\n", sep="")
  xstr <- paste(xstr, "Data File:\n\n", sep="")
  xstr <- paste(xstr, "Name = ", datafile.name, "\n", sep="")

  st <- paste("Description = ", datafile.description, sep="")
  xstr <- word.wrap.lines(st, xstr)

  xstr <- paste(xstr, "Rows = ", nrow(df), "  Columns = ", length(df), "\n", sep="")
  xstr <- paste(xstr, "MD5 checksum = ", checksum, "\n\n", sep="")
  
  # check for variables to summarize
  if (is.null(df.sum)) {
    xstr <- paste(xstr, "No date or numerical variables\n", sep="")
    textplot(capture.output(writeLines(xstr)))
  } else {
    # variable definitions & units
    xstr <- paste(xstr, "Variables:\n\n", sep="")
    lines <- 11
    for (i in 1:length(df.sum)) {
      if (sum.var.units[i] != "") {
        var.unit <- paste(" (", sum.var.units[i], ")", sep="")
      } else {
        var.unit <- ""
      }
      st <- paste(names(df.sum[i]), " = ", sum.var.defs[i], var.unit, sep="")

      xstr <- word.wrap.lines(st, xstr)
      new.lines <- nchar(st) %/% max.cols + 1
      lines <- lines + new.lines

      # add new page if necessary
      if (lines > max.lines) {
        textplot(capture.output(writeLines(xstr)))
        xstr <- ""
        lines <- 0
      }
    }
    # write summary page to file
    if (lines > 0) textplot(capture.output(writeLines(xstr)))
    
    # create summary table
    xstr <- add.column.headings()
    lines <- 2

    for (i in 1:length(df.sum)) {
      # variable name
      xstr <- paste(xstr, substr(paste(names(df.sum[i]), "               ", sep=""), 1, 12), sep="")
  
      # all values missing
      if (all(is.na(df.sum[ , i]))) {
        xnas <- sum(is.na(df.sum[ , i]))
        xstr <- paste(xstr,"                                                     ", xnas, "\n", sep="")            
      
      # datetime variable
      } else if (sum.var.types[i] == "POSIXct-1" | sum.var.types[i] == "POSIXct-2") {
        # get min, max & NAs only
        xmin <- min(df.sum[ , i], na.rm=TRUE)
        xmax <- max(df.sum[ , i], na.rm=TRUE)      
        xnas <- sum(is.na(df.sum[ , i]))
        xstr <- paste(xstr, xmin, "                ", xmax, "   ", formatC(xnas,0,5,"f"), "\n", sep="")
    
      # get all stats  
      } else {
        xmin <- min(df.sum[ , i], na.rm=TRUE)
        xmedian <- median(df.sum[ , i], na.rm=TRUE)
        xmean <- mean(df.sum[ , i], na.rm=TRUE)
        xmax <- max(df.sum[ , i], na.rm=TRUE)
        xnas <- sum(is.na(df.sum[ , i]))
      
        # date variable
        if (sum.var.types[i] == "Date") {
          xstr <- paste(xstr, "  ", xmin, "  ", xmedian, "  ", xmean, "  ", xmax, formatC(xnas,0,8,"f"), "\n", sep="")
        
          # numeric variable
        } else {
          if (xmin < -1000000 | xmax > 1000000) {
            d <- 0
          } else {
            d <- 3
          }    
          xstr <- paste(xstr,
          formatC(xmin,d,12,"f"),
          formatC(xmedian,d,12,"f"),
          formatC(xmean,d,12,"f"),
          formatC(xmax,d,12,"f"),
          formatC(xnas,0,8,"f"),
          "\n", sep="")
        }
      }
      lines <- lines + 1
      # add new page if necessary
      if (lines > max.lines) {
        textplot(capture.output(writeLines(xstr)))
        xstr <- add.column.headings()
        lines <- 0
      }
    }
    # write summary table to file
    if (lines > 0) textplot(capture.output(writeLines(xstr)))
  }  
}

#---------------------------PLOTS------------------------------#

# get plot type
get.plot.type <- function(datafile.id, time.series.ids) {
  num <- as.integer(substr(datafile.id, 7, 8))  
  if (num %in% time.series.ids) {
    plot.type <- "time-series"
  } else {
    plot.type <- "scatterplot-matrix"
  }
  return(plot.type)
}

# create plot data frame
get.plot.data.frame <- function(df.sum, sum.var.types, max.rows) {
  # start with summary data frame
  df.plt <- df.sum

  if (!is.null(df.plt)) {
    # resample rows at regular intervals if necessary
    plot.rows <- nrow(df.plt)
    if (plot.rows > max.rows) {
      skip <- round(plot.rows/max.rows)
      index <- seq(from=1, to=plot.rows, by=skip)
      df.plt <- df.plt[index, , drop=FALSE]
    }
    
    # convert standard datetime to POSIXct
    for (i in 1:length(df.plt)) {
      if (sum.var.types[i] == "POSIXct-1") {
        df.plt[ , i] = as.POSIXct(df.plt[ , i], format="%Y-%m-%dT%H:%M")
      } else if(sum.var.types[i] == "POSIXct-2") {
        df.plt[ , i] = as.POSIXct(df.plt[ , i], format="%Y-%m-%dT%H:%M:%S")
      }
    }

    # remove empty variables
    keep <- NULL
    if (is.vector(df.plt)) {
      if (all(is.na(df.plt))) {
        df.plt <- NULL
      }
    } else {
      for (i in 1:length(df.plt)) {
        if (!all(is.na(df.plt[ ,i]))) {
          keep <- append(keep, i)
        }
      }
      if (is.null(keep)) {
        df.plt <- NULL
      } else if (length(keep) == 1) {
        col.name <- names(df.plt[keep])
        df.plt <- as.data.frame(df.plt[ , keep])
        colnames(df.plt) <- col.name
      } else {
        df.plt <- df.plt[ , keep]
      }
    }
  }  
  return(df.plt)
}

#--------------------------TIME SERIES-----------------------------#

# create time-series plot
create.time.series.plot <- function(df.plt, datafile.id) {
  if (!is.null(df.plt)) {
    # max number of variables per plot
    max.var <- 4
    
    # get number of plots
    var.num <- length(df.plt)
    if (var.num <= max.var) {
      plot.num <- 1
    } else {
      plot.num <- var.num %/% max.var
      plot.last <- var.num %% max.var
      if (plot.last > 0) plot.num <- plot.num + 1
    }  

    # create each plot
    for (i in 1:plot.num) {
      first.col <- (i-1)*max.var + 1
      if (plot.num == 1) {
        last.col <- first.col + var.num - 1
      } else if (i < plot.num) {
        last.col <- first.col + max.var - 1
      } else {
        first.col <- var.num - max.var + 1
        last.col <- var.num
      }  
      
      # subset for variables to plot
      if (first.col == last.col ) {
        zz <- as.data.frame(df.plt[ , c(first.col:last.col)])
        colnames(zz) <- names(df.plt[first.col])
      } else {
        zz <- df.plt[ , c(first.col:last.col)]
      }    
      
      # create time series
      df.ts <- ts(zz)

      # create plot
      plot.title <- paste(toupper(datafile.id), " Plot ", i, sep="")     
      plot(df.ts, main=plot.title, col="blue")
    }
  }
}

#----------------------SCATTERPLOT MATRIX--------------------------#

# create scatterplot matrix
create.scatterplot.matrix <- function(df.plt, datafile.id) {
  if (!is.null(df.plt)) {
    # column for row number
    row.num <- seq(from=1, to=nrow(df.plt), by=1)
    
    # number of variables to plot
    var.num <- length(df.plt)
    
    # maximum number of variables per plot
    max.var <- 5
    
    # get number of plots (repeat last variable in next plot)
    if (var.num <= max.var) {
      plot.num <- 1
    } else {
      plot.num <- 1 + (var.num-5) %/% (max.var-1)
      plot.last <- (var.num-5) %% (max.var-1)
      if (plot.last > 0) plot.num <- plot.num + 1      
    }

    # create each plot
    for (i in 1:plot.num) {
      # select columns to plot
      if (plot.num == 1) {
        first.col <- 1
        last.col <- var.num
      } else {
        if (i < plot.num) {
          first.col <- (i-1)*max.var - i + 2
          last.col <- first.col + max.var - 1
        } else {
          first.col <- var.num - max.var + 1
          last.col <- var.num
        }
      }
 
      # subset for variables to plot
      if (first.col == last.col) {
        col.name <- names(df.plt[first.col])
        df.sp <- as.data.frame(df.plt[ , first.col])
        colnames(df.sp) <- col.name
      } else {
        df.sp <- df.plt[ , c(first.col:last.col)]
      }

      # add column for row number
      df.sp <- cbind(row.num, df.sp)
      colnames(df.sp)[1] <- "row"

      # create plot
      plot.title <- paste(toupper(datafile.id), " Plot ", i, sep="")
      pairs(df.sp, main=plot.title, col="blue")
    }
  }
}

#---------------------PROCESS DATA FILE------------------------#

# display column headings in console
display.column.headings <- function() {
  st <- "File      MD5 Checksum                      Preview       Rows      Bytes"
  writeLines(st)
  st <- "----------------------------------------------------------------------------"
  writeLines(st)
}

# read data file into data frame
get.datafile <- function(datafile.path, var.types, field.delimiter) {
  con <- file(datafile.path)
  df <- read.csv(con, header=TRUE, sep=field.delimiter, colClasses=var.types)
  return(df)
}

# create preview for data file
create.datafile.preview <- function(eml, time.series.ids, datafile.id, datafile.path) {
  # get data file type
  datafile.type <- get.datafile.type(eml, datafile.id)

  # calculate checksum from data file
  checksum <- md5sum(datafile.path)

  # get data file checksum from eml
  datafile.checksum <- get.datafile.checksum(eml, datafile.id)
  # warning if checksum values do not match
  if (!is.null(datafile.checksum)) {
    if (datafile.checksum != checksum) {
      msg <- paste(toupper(datafile.id), ": checksum values do not match", sep="")
      warning(msg)
    }
  }

  # get data file size in bytes
  f.size <- file.size(datafile.path)

  # get file size from eml
  datafile.size <- get.datafile.size(eml, datafile.id, datafile.type)
  # warning if file sizes do not match
  if (!is.null(datafile.size)) {
    if (datafile.size != f.size) {
      msg <- paste(toupper(datafile.id), ": file sizes do not match", sep="")
      warning(msg)
    }
  }
  
  # data table: create summmary table & plots
  if (datafile.type == "dataTable") {
    # create PDF file
    pdf.file <- paste(plot.dir, datafile.id, ".pdf", sep="")
    pdf(pdf.file)
    par(mar=c(0.5,0.5,0.5,0.5))

    # get field delimiter
    field.delimiter <- get.field.delimiter(eml, datafile.id)
    # get data table node set
    node.set <- get.data.table.node.set(eml, datafile.id)
    # get R variable types
    var.types <- get.all.variable.types(node.set)
    # get variable format strings
    var.formats <- get.all.variable.formats(node.set)
    # get variable definitions
    var.defs <- get.all.variable.definitions(node.set)
    # get variable units
    var.units <- get.all.variable.units(node.set)

    # read data file into data frame
    df <- get.datafile(datafile.path, var.types, field.delimiter)

    # get number of records from eml
    datafile.records <- get.datafile.number.of.records(eml, datafile.id)
    # warning if numbers of records do not match
    if (!is.null(datafile.records)) {
      if (datafile.records != nrow(df)) {
        msg <- paste(toupper(datafile.id), ": numbers of records do not match", sep="")
        warning(msg)
      }
    }

    # get actual number of records as a string
    n.rows <- substr(paste(toString(nrow(df)), "          ", sep=""), 1, 10)
    
    # create summary stats table
    sum.index <- get.summary.index(df, var.types, var.formats)
    sum.var.types <- get.summary.variable.types(sum.index, var.types, var.formats)
    sum.var.defs <- get.summary.variable.definitions(sum.index, var.defs)
    sum.var.units <- get.summary.variable.units(sum.index, var.units)
    df.sum <- get.summary.data.frame(df, sum.index)
    create.summary.table(df, df.sum, datafile.id, sum.var.types, sum.var.defs, sum.var.units, datafile.path)

    # create plots
    if (is.null(df.sum)){
      msg <- paste(checksum, "  No plots      ", n.rows, f.size, sep="")
    } else {
      # get plot type
      plot.type <- get.plot.type(datafile.id, time.series.ids)

      # time-series plot
      if (plot.type == "time-series") {
        df.plt <- get.plot.data.frame(df.sum, sum.var.types, max.rows=10000)
        create.time.series.plot(df.plt, datafile.id)
        msg <- paste(checksum, "  Time Series   ", n.rows, f.size, sep="")
      } else {

      # scatterplot matrix
        df.plt <- get.plot.data.frame(df.sum, sum.var.types, max.rows=1000)
        create.scatterplot.matrix(df.plt, datafile.id)
        msg <- paste(checksum, "  Scatterplots  ", n.rows, f.size, sep="")
      }
    }
    # close PDF file
    dev.off()  

  # other entity: display md5 checksum
  } else if (datafile.type == "otherEntity") {
    # display checksum in console
    msg <- paste(checksum, "  No preview              ", f.size, sep="")
  
  # neither data table nor other entity
  } else {
    msg <- "Data type not currently supported"
  }

  # display result message
  st <- paste(toupper(datafile.id), '  ', msg, sep="")
  writeLines(st)
}

#------------------------MAIN PROGRAM---------------------------#

# read eml file
eml <- get.parsed.eml(hf.eml.dir, dataset.id)

if (!is.null(eml)) {
  # get data file numbers for time-series plots
  time.series.ids <- get.time.series.ids(eml)
  
  # get data file ids for data tables & other entities
  if (datafile.num > 0) {
    datafile.ids <- get.datafile.id(eml, dataset.id, datafile.num)
  } else {
    datafile.ids <- get.datafile.ids(eml, dataset.id)
  }
  
  if (!is.null(datafile.ids)) {
    # display column headings
    display.column.headings()
    
    # process each data file
    for (i in 1:length(datafile.ids)) {
      datafile.id <- datafile.ids[i]
      
      # get data file path
      datafile.path <- get.datafile.path(eml, datafile.id)
    
      if (!is.null(datafile.path)) {
        # create preview or display MD5 checksum
        create.datafile.preview(eml, time.series.ids, datafile.id, datafile.path)
      }
    }
  }
}

############################ DONE ##############################