From 755b4b5bc516cd1603e4930047da1d8ab7602855 Mon Sep 17 00:00:00 2001 From: Peder <pbac@dtu.dk> Date: Sat, 12 Jun 2021 23:37:42 +0200 Subject: [PATCH] Close to version 1 --- DESCRIPTION | 3 +- R/bspline.R | 2 +- R/cache_name.R | 20 ++- R/complete_cases.R | 100 +++++++++++++ R/data.list.R | 10 +- R/forecastmodel.R | 2 +- R/in_range.R | 24 +-- R/lagdf.R | 4 +- R/lagdl.R | 13 +- R/lm_fit.R | 13 +- R/lm_optim.R | 5 +- R/plot_ts.R | 15 +- R/residuals.R | 24 ++- R/rls_fit.R | 2 +- R/rls_optim.R | 7 +- R/rls_reduce.R | 74 ---------- R/rls_summary.R | 22 +-- R/rmse.R | 2 +- R/score.R | 83 ++++++++--- R/score_fit.R | 58 -------- R/step_optim.R | 217 ++++++++++++++++++++++------ inst/CITATION | 5 +- make.R | 12 +- misc-R/reduce-test.R | 54 ------- misc-R/step_optim-test.R | 72 +++++++++ tests/testthat.R | 4 + tests/testthat/test-newtest.R | 3 + tests/testthat/test-rls-heat-load.R | 2 - vignettes/forecast-evaluation.Rmd | 215 +++++++++++++++------------ vignettes/make.R | 13 +- vignettes/online-updating.Rmd | 121 ++++++++++------ vignettes/setup-and-use-model.Rmd | 44 +++--- vignettes/setup-data.Rmd | 44 +++--- 33 files changed, 779 insertions(+), 510 deletions(-) create mode 100644 R/complete_cases.R delete mode 100644 R/rls_reduce.R delete mode 100644 R/score_fit.R delete mode 100644 misc-R/reduce-test.R create mode 100644 misc-R/step_optim-test.R create mode 100644 tests/testthat.R create mode 100644 tests/testthat/test-newtest.R diff --git a/DESCRIPTION b/DESCRIPTION index 9d12fff..ec365a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,10 +21,11 @@ Suggests: knitr, rmarkdown, R.rsp, - testthat (>= 2.1.0), + testthat (>= 3.0.0), data.table, plotly VignetteBuilder: knitr RoxygenNote: 7.1.1 URL: https://onlineforecasting.org BugReports: https://lab.compute.dtu.dk/packages/onlineforecast/-/issues +Config/testthat/edition: 3 diff --git a/R/bspline.R b/R/bspline.R index 24aa9e7..56ae11f 100644 --- a/R/bspline.R +++ b/R/bspline.R @@ -86,7 +86,7 @@ bspline <- function(X, Boundary.knots = NA, intercept = FALSE, df = NULL, knots # First find the horizons, they are used in the end nms <- nams(X) # All columns must be like "k12" - #if(length(grep("^[k|h][[:digit:]]+$", nms)) != length(nms)){ stop("All column names must indicate a horizon, so start with k and then an integer") } + #if(length(grep("[k|h][[:digit:]]+$", nms)) != length(nms)){ stop("All column names must indicate a horizon, so start with k and then an integer") } # Run for each horizon and calculate the basis splines L <- lapply(1:ncol(X), function(i) { if (is.na(Boundary.knots[1])) { diff --git a/R/cache_name.R b/R/cache_name.R index d9260d9..bdffef0 100644 --- a/R/cache_name.R +++ b/R/cache_name.R @@ -39,17 +39,18 @@ #' # For this example use a temporary directory #' # In real use this should not be temporary! (changes between R sessions with tempdir()) #' cachedir <- tempdir() -#' +#' +#' # Uncomment to run: #' # First time it takes at least 1 sec. -#' fun(x=2,y=2) +#' #fun(x=2,y=2) #' # Second time it loads the cache and is much faster -#' fun(x=2,y=2) +#' #fun(x=2,y=2) #' # Try changing the arguments (x,y) and run again #' #' # See the cache file(s) -#' dir(cachedir) +#' #dir(cachedir) #' # Delete the cache folder -#' unlink(cachedir, recursive=TRUE) +#' #unlink(cachedir, recursive=TRUE) #' #' # Demonstrate how cache_name() is functioning #' # Cache using the all objects given in the function calling, i.e. 
both x and y
@@ -81,7 +82,14 @@
 cache_name <- function(..., cachedir = "cache"){
     # Get the name, definition and arguments of the function from which cache_name was called
     funname <- strsplit(deparse(sys.calls()[[sys.nframe()-1]]), "\\(")[[1]][1]
-    fundef <- digest::digest(attr(eval(parse(text=funname)), "srcref"))
+    # Find the function in the nearest environment in the stack (i.e. parent calls)
+    for(i in rev(sys.parents())){
+        if(funname %in% ls(parent.frame(i+1))){
+            val <- mget(funname, parent.frame(i+1))
+            break
+        }
+    }
+    fundef <- digest::digest(attr(eval(val[[funname]]), "srcref"))
     # if no arguments were given, then use the arguments function from which cache_name was called
     if(length(list(...)) == 0){
         funargs <- digest::digest(as.list( match.call(definition = sys.function( -1 ), call = sys.call(-1)))[-1])
diff --git a/R/complete_cases.R b/R/complete_cases.R
new file mode 100644
index 0000000..3fe0beb
--- /dev/null
+++ b/R/complete_cases.R
@@ -0,0 +1,100 @@
+#' Returns a logical vector indicating the time points which are complete
+#'
+#' Given a forecast matrix the forecasts are lagged "+k" steps to align them and
+#' then 'complete.cases()' is run on that.
+#'
+#' Given a list of forecast matrices, only the points where all matrices (and all their horizons) are complete are TRUE.
+#'
+#' @title Find complete cases in forecast matrices
+#' @param object A data.frame (with columns named 'kxx') or a list of
+#'     data.frames.
+#' @param kseq integer vector: If given then only these horizons are processed.
+#' @return A logical vector specifying for each time point if there are no missing
+#'     values across all horizons.
+#' @author Peder Bacher
+#'
+#' @examples
+#' # Take a small data set
+#' D <- subset(Dbuilding, 1:20, kseq=1:5)
+#' # Check the forecast matrix of ambient temperature
+#' D$Ta
+#' # Which are complete over all horizons? 
The first are not since not all horizons +#' # have a value there (after lagging) +#' complete_cases(D$Ta) +#' # Same goes if given as a list +#' complete_cases(D["Ta"]) +#' # and if more than one is given +#' complete_cases(D[c("Ta","I")]) +#' +#' # Set some NA of some horizon +#' D$I$k3[8:9] <- NA +#' # Now they are recognized as not complete +#' complete_cases(D[c("Ta","I")]) +#' +#' # If we deal with residuals, which are observations and there for have column names "hxx" +#' Resid <- residuals(D$Ta, D$Taobs) +#' names(Resid) +#' # With columns with "h" instead of "k" no lagging occurs in complete_cases +#' complete_cases(Resid) +#' # +#' Resid2 <- Resid +#' Resid$h3[8:9] <- NA +#' complete_cases(list(Resid,Resid2)) +#' +#' @rdname complete_cases +#' @importFrom stats complete.cases +#' @export +complete_cases <- function(object, kseq = NA){ + UseMethod("complete_cases") +} + + +#' @rdname complete_cases +#' @importFrom stats complete.cases +#' @export +complete_cases.list <- function(object, kseq=NA){ + if(length(grep("[k][[:digit:]]+$", nams(object[[1]])))){ + prefix <- "k" + }else{ + prefix <- "h" + } + # Do it only for the given kseq horizons, if kseq is not set, then take it from the first + if(is.na(kseq[1])){ + kseq <- as.integer(gsub(prefix, "", nams(object[[1]]))) + } + # Check that they all have kseq horizons + ok <- rep(TRUE, nrow(object[[1]])) + # + for(i in 1:length(object)){ + if(!all(kseq %in% as.integer(gsub(prefix, "", nams(object[[i]]))))){ + stop("Not all forecast matrices have kseq horizons: ",kseq) + } + ok <- ok & !is.na(object[[i]][ ,pst(prefix,kseq)]) + } + # + ok <- as.data.frame(ok) + if(prefix == "k"){ + # Lag to match resiuduals in time + names(ok) <- pst("k",kseq) + ok <- lagdf(ok, "+k") + } + # Finally, the vector with TRUE for all points with no NAs for any forecast + ok <- apply(ok, 1, all) + # Just set NAs to false + ok[is.na(ok)] <- FALSE + # + return(ok) +} + + +#' @rdname complete_cases +#' @importFrom stats complete.cases +#' @export +complete_cases.data.frame <- function(object, kseq=NA){ + # Make a distinction between "kxx" which are forecasts or "hxx", which are observations + if(length(grep("[k][[:digit:]]+$", nams(object)[1]))){ + # Forecasts, so lag them + object <- lagdf(object, "+k") + } + complete.cases(object) +} diff --git a/R/data.list.R b/R/data.list.R index d57a7ac..df10779 100644 --- a/R/data.list.R +++ b/R/data.list.R @@ -117,7 +117,7 @@ subset.data.list <- function(x, subset = NA, nms = NA, kseq = NA, lagforecasts = X <- D[[nms[i]]] if(class(X)[1] == "data.frame" ){ # Check if holds forecasts by checking if any name is "kxx" - if(length(grep("^k[[:digit:]]+$", names(X))) > 0){ + if(length(grep("k[[:digit:]]+$", names(X))) > 0){ # If it holds forecasts, check that they are all there if( !all(pst("k",kseq) %in% names(X)) ){ warning(pst("The variable ",nms[i]," contain ",pst(names(X),collapse=",")," hence doesn't contain all k in kseq = ",pst(kseq,collapse=","))) @@ -160,7 +160,7 @@ subset.data.list <- function(x, subset = NA, nms = NA, kseq = NA, lagforecasts = val <- lapply(D[nms], function(X) { if (any(class(X) == "data.frame")) { # Check if holds forecasts by checking if any name is "kxx" - if(length(grep("^k[[:digit:]]+$", names(X))) > 0){ + if(length(grep("k[[:digit:]]+$", names(X))) > 0){ return(X[subset,pst("k",kseq), drop=FALSE]) }else{ return(X[subset, , drop=FALSE]) @@ -173,7 +173,7 @@ subset.data.list <- function(x, subset = NA, nms = NA, kseq = NA, lagforecasts = # Lag the forecasts k if specified if(lagforecasts){ val <- 
lapply(val, function(X){ - if(any(class(X) == "data.frame") & length(grep("^k[[:digit:]]+$",names(X))) > 0) { + if(any(class(X) == "data.frame") & length(grep("k[[:digit:]]+$",names(X))) > 0) { return(lagdf.data.frame(X, lagseq="+k")) }else{ return(X) @@ -213,7 +213,7 @@ as.data.frame.data.list <- function(x, row.names=NULL, optional=FALSE, ...){ val <- as.data.frame(val) } # Fix names of data.frames (i.e. forecasts, their names are now "kxx", but should be X.kxx) - i <- grep("^k[[:digit:]]+$", names(val)) + i <- grep("k[[:digit:]]+$", names(val)) if(length(i) > 0){ names(val)[i] <- pst(names(x)[i],".",names(val)[i]) } @@ -382,7 +382,7 @@ check.data.list <- function(object){ Forecasts$nrow[i] <- nrow(D[[nm]]) } # Check the colnames, are they unique and all k+integer? - if(!length(unique(grep("^k[[:digit:]]+$",colnms,value=TRUE))) == length(colnms)){ + if(!length(unique(grep("k[[:digit:]]+$",colnms,value=TRUE))) == length(colnms)){ Forecasts$colnames[i] <- "X" } if(!length(unique(sapply(colnms, function(colnm){ class(D[[nm]][ ,colnm]) }))) == 1){ diff --git a/R/forecastmodel.R b/R/forecastmodel.R index 24dd10d..b1fc9d0 100644 --- a/R/forecastmodel.R +++ b/R/forecastmodel.R @@ -395,7 +395,7 @@ print.forecastmodel <- function(x, ...){ cat("\nOutput:",model$output) cat("\nInputs: ") if(length(model$inputs) == 0 ){ - cat("\nNo inputs") + cat("\nNo inputs\n\n") }else{ cat(names(model$inputs)[1],"=",model$inputs[[1]]$expr,"\n") if(length(model$inputs) > 1){ diff --git a/R/in_range.R b/R/in_range.R index 2ca3268..cfad0a7 100644 --- a/R/in_range.R +++ b/R/in_range.R @@ -48,15 +48,21 @@ #' @export in_range <- function(tstart, time, tend=NA, timezone=NA) { - if (class(tstart)[1] == "character") - tstart <- ct(tstart) - if (is.na(tend)) + if (is.na(tend)){ tend <- time[length(time)] - if (class(tend)[1] == "character") - tend <- ct(tend) - if (is.na(timezone)){ - timezone <- attr(D$t, "tzone") } - - ct(tstart, tz = timezone) < time & time <= ct(tend, tz = timezone) + if(is.numeric(time)){ + return(tstart < time & time <= tend) + }else{ + if (class(tstart)[1] == "character"){ + tstart <- ct(tstart) + } + if (class(tend)[1] == "character"){ + tend <- ct(tend) + } + if (is.na(timezone)){ + timezone <- attr(time, "tzone") + } + return(ct(tstart, tz = timezone) < time & time <= ct(tend, tz = timezone)) + } } diff --git a/R/lagdf.R b/R/lagdf.R index e56ec9c..e3d3269 100644 --- a/R/lagdf.R +++ b/R/lagdf.R @@ -114,12 +114,12 @@ lagdf.data.frame <- function(x, lagseq) { if (lagseq %in% c("+k","+h")) { lagseq <- rep(0, length(nms)) ## lagseq according to the k value of the columnnames - i <- grep("^[k|h][[:digit:]]+$", nms) + i <- grep("[k|h][[:digit:]]+$", nms) lagseq[i] <- as.integer(sapply(strsplit(nms[i], "[k|h]"), function(x){ x[length(x)] })) } else if (lagseq %in% c("-k","-h")) { lagseq <- rep(0, length(nms)) ## lagseq according to the negative k value of the columnnames - i <- grep("^[k|h][[:digit:]]+$", nms) + i <- grep("[k|h][[:digit:]]+$", nms) lagseq[i] <- -as.integer(sapply(strsplit(nms[i], "[k|h]"), function(x){ x[length(x)] })) } } diff --git a/R/lagdl.R b/R/lagdl.R index 5f6696f..92068f7 100644 --- a/R/lagdl.R +++ b/R/lagdl.R @@ -10,19 +10,18 @@ #' #' #' @title Lagging which returns a data.list -#' @param x The data.list to be lagged. +#' @param DL The data.list to be lagged. #' @param lagseq The integer(s) setting the lag steps. #' @return A data.list. -#' @seealso \code{\link{lagdl.data.frame}} which is run when \code{x} is a data.frame. 
#' @examples #' # The values are simply shifted in each data.frame with lagdf #' #'@export -lagdl <- function(D, lagseq){ - iseq <- which(sapply(D,class) %in% c("data.frame","matrix")) - D[iseq] <- lapply(iseq, function(i){ - lagdf(D[[i]], lagseq) +lagdl <- function(DL, lagseq){ + iseq <- which(sapply(DL,class) %in% c("data.frame","matrix")) + DL[iseq] <- lapply(iseq, function(i){ + lagdf(DL[[i]], lagseq) }) - return(D) + return(DL) } diff --git a/R/lm_fit.R b/R/lm_fit.R index d23d40f..4c3db31 100644 --- a/R/lm_fit.R +++ b/R/lm_fit.R @@ -37,9 +37,8 @@ #' # Define a simple model #' model <- forecastmodel$new() #' model$output <- "y" -#' model$add_inputs(Ta = "Ta", +#' model$add_inputs(Ta = "lp(Ta, a1=0.9)", #' mu = "one()") -#' model$add_regprm("rls_prm(lambda=0.99)") #' #' # Before fitting the model, define which points to include in the evaluation of the score function #' D$scoreperiod <- in_range("2010-12-20", D$t) @@ -53,13 +52,13 @@ #' class(fit) #' # The one-step forecast #' plot(D$y, type="l") -#' lines(fit$Yhat$k1, col=2) +#' lines(lagvec(fit$Yhat$k1,-1), col=2) #' # Get the residuals #' plot(residuals(fit)$h1) #' # Score for each horizon -#' score_fit(fit) +#' score(residuals(fit)) #' -#' # The lm_fit don't put anything in +#' # The lm_fit don't put anything in this field #' fit$Lfitval #' # Find the lm fits here #' model$Lfits @@ -68,6 +67,10 @@ #' # Some diurnal pattern is present #' acf(residuals(fit)$h1, na.action=na.pass, lag.max=96) #' +#' # Run with other parameters and return the RMSE +#' lm_fit(c(Ta__a1=0.8), model, D, scorefun=rmse, returnanalysis=FALSE) +#' lm_fit(c(Ta__a1=0.9), model, D, scorefun=rmse, returnanalysis=FALSE) +#' #' @importFrom stats lm residuals #' @export lm_fit <- function(prm=NA, model, data, scorefun = NA, returnanalysis = TRUE, printout = TRUE){ diff --git a/R/lm_optim.R b/R/lm_optim.R index 567823e..a49cdb6 100644 --- a/R/lm_optim.R +++ b/R/lm_optim.R @@ -15,6 +15,7 @@ #' @param kseq The horizons to fit for (if not set, then model$kseq is used) #' @param scorefun The function to be score used for calculating the score to be optimized. #' @param cachedir A character specifying the path (and prefix) of the cache file name. If set to \code{""}, then no cache will be loaded or written. See \url{https://onlineforecasting.org/vignettes/nice-tricks.html} for examples. +#' @param cacheload A logical controlling whether to load the cache if it exists. #' @param printout A logical determining if the score function is printed out in each iteration of the optimization. #' @param method The method argument for \code{\link{optim}}. #' @param ... 
Additional parameters to \code{\link{optim}} @@ -56,7 +57,7 @@ #' #' @importFrom stats optim #' @export -lm_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cachererun=FALSE, printout=TRUE, method="L-BFGS-B", ...){ +lm_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cacheload=FALSE, printout=TRUE, method="L-BFGS-B", ...){ ## Take the parameters bounds from the parameter bounds set in the model init <- model$get_prmbounds("init") lower <- model$get_prmbounds("lower") @@ -80,7 +81,7 @@ lm_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cache cnm <- cache_name(lm_fit, lm_optim, m$outputrange, m$regprm, m$transform_data(data), data[[m$output]], scorefun, init, lower, upper, cachedir = cachedir) # Load the cached result if it exists - if(file.exists(cnm) & !cachererun){ + if(file.exists(cnm) & !cacheload){ res <- readRDS(cnm) # Set the optimized parameters into the model model$insert_prm(res$par) diff --git a/R/plot_ts.R b/R/plot_ts.R index ac7057f..5290400 100644 --- a/R/plot_ts.R +++ b/R/plot_ts.R @@ -24,6 +24,7 @@ #' @param mains A character vector with the main for each plot. #' @param mainouter A character with the main at the top of the plot (can also be added afterwards with \code{title(main, outer=TRUE)}). #' @param legendtexts A list with the legend texts for each plot (replaces the names of the variables). +#' @param colormaps A list of colormaps, which will be used in each plot. #' @param xat POSIXt specifying where the ticks on x-axis should be put. #' @param usely If TRUE then plotly will be used. #' @param plotit If FALSE then the plot will not be generated, only data returned. @@ -91,7 +92,7 @@ plot_ts.data.list <- function(object, patterns=".*", xlim = NA, ylims = NA, xlab # set kseq if not specified if( is.na(kseq[1]) ){ tmp <- unique(unlist(lapply(DL, function(x){names(x)}))) - tmp <- tmp[grep("^[k|h][[:digit:]]+$", tmp)] + tmp <- tmp[grep("[k|h][[:digit:]]+$", tmp)] if( length(tmp) > 0 ){ kseq <- sort(as.integer(gsub("[k|h]","",tmp))) } } # Generate a data.frame with the series to be plotted @@ -173,7 +174,8 @@ plot_ts.data.list <- function(object, patterns=".*", xlim = NA, ylims = NA, xlab #' @importFrom graphics par title #' @export plot_ts.data.frame <- function(object, patterns=".*", xlim = NA, ylims = NA, xlab = "", ylabs = NA, - mains = NA, mainouter="", legendtexts = NA, colormaps = NA, xat = NA, usely=FALSE, plotit = TRUE, p = NA, namesdata=NA, ...) { + mains = NA, mainouter="", legendtexts = NA, colormaps = NA, xat = NA, + usely=FALSE, plotit = TRUE, p = NA, namesdata=NA, ...) { # Take par_ts setup parameters from options if there p <- par_ts(fromoptions=TRUE, p=p, ...) 
# @@ -196,7 +198,7 @@ plot_ts.data.frame <- function(object, patterns=".*", xlim = NA, ylims = NA, xla if(is.na(ylims[1]) & length(ylims)==1){ ylims <- as.list(rep(NA,length(patterns))) } if(is.na(ylabs[1]) & length(ylabs)==1){ ylabs <- rep(NA,length(patterns)) } if(is.na(legendtexts[1]) & length(legendtexts)==1){ legendtexts <- as.list(rep(NA,length(patterns))) } - if(is.na(colormaps[1]) & length(colormaps)==1){ colormaps <- as.list(rep(NA,length(patterns))) } + if(is.na(colormaps[1]) & length(colormaps)==1){ colormaps <- as.list(rep(NA,length(patterns))) } # if(usely){ # with plotly @@ -301,7 +303,8 @@ plot_ts_iseq <- function(data, pattern, xnm, namesdata){ # Plot all columns found with regex pattern #' @importFrom graphics plot lines axis title axis.POSIXct mtext par legend plot_ts_series <- function(data, pattern, iplot = 1, - ylim = NA, xlab = "", main = "", mainline = -1.2, colormap = NA, legendtext = NA, xat = NA, plotit = TRUE, p = NA, namesdata = NA, xaxis = TRUE, ...) { + ylim = NA, xlab = "", main = "", mainline = -1.2, colormap = NA, legendtext = NA, + xat = NA, plotit = TRUE, p = NA, namesdata = NA, xaxis = TRUE, ...) { # # Take par_ts setup parameters from options or defaults p <- par_ts(fromoptions=TRUE, p=p, ...) @@ -476,7 +479,7 @@ plot_ts_series <- function(data, pattern, iplot = 1, #' @export plot_ts.rls_fit <- function(object, patterns = c("^y$|^Yhat$","^Residuals$","CumAbsResiduals$",pst("^",names(fit$Lfitval[[1]]),"$")), xlim = NA, ylims = NA, xlab = "", ylabs = NA, mains = "", mainouter="", legendtexts = NA, - xat = NA, usely=FALSE, plotit=TRUE, p=NA, kseq = NA, ...){ + colormaps = NA, xat = NA, usely=FALSE, plotit=TRUE, p=NA, kseq = NA, ...){ fit <- object # Calculate the residuals Residuals <- residuals(fit) @@ -531,7 +534,7 @@ plot_ts.rls_fit <- function(object, patterns = c("^y$|^Yhat$","^Residuals$","Cum #patterns[patterns == "!!RLSinputs!!"] <- pst("^",nmsinput,"$")) # Make a plot of the RLS coefficients for each horizon plot_ts(data, patterns, xlim = xlim, ylims = ylims, xlab = xlab, ylabs = ylabs, - mains = mains, mainouter=mainouter, legendtexts = legendtexts, xat = xat, usely=usely, p=p, kseq = kseq, ...) + mains = mains, mainouter=mainouter, legendtexts = legendtexts, colormaps=colormaps, xat = xat, usely=usely, p=p, kseq = kseq, ...) } # Return the data invisible(data) diff --git a/R/residuals.R b/R/residuals.R index a87a87b..16b2dbf 100644 --- a/R/residuals.R +++ b/R/residuals.R @@ -8,9 +8,11 @@ #' @param object The forecast matrix (a data.frame with kxx as column names, Yhat in returned fits). #' @param y The observations vector. #' @param ... Not used. -#' @return A data.frame with the residuals for each horizon. +#' @return If object is a matrix or data.frame: a data.frame with the residuals for each horizon. +#' If object is a list: A list with residuals from each element. #' #' @examples +#' # ?? 
list example #' # Just a vector to be forecasted #' n <- 100 #' D <- data.list() @@ -41,11 +43,11 @@ #' @rdname residuals #' @export residuals.data.frame <- function(object, y, ...){ - Yhat <- object - # Add some checking at some point - Residuals <- y - lagdf(Yhat, "+k") + # Improvements: Add some checking at some point + # Calculate the residuals + Residuals <- y - lagdf(object, "+k") # Named with hxx (it's not a forecast, but an observation available at t) - names(Residuals) <- gsub("k","h",names(Residuals)) + nams(Residuals) <- gsub("k","h",nams(Residuals)) # return(Residuals) } @@ -55,10 +57,16 @@ residuals.data.frame <- function(object, y, ...){ residuals.matrix <- residuals.data.frame #' @rdname residuals -#' @param object The value from a fit a forecastmodel (currently \code{\link{lm_fit}} or \code{\link{rls_fit}}. -#' @param ... Not used. +#' @export +residuals.list <- function(object, y, ...){ + # Each element is a forecast matrix, so do it on each + return(lapply(object, residuals, y=y)) +} + +#' @rdname residuals #' @export residuals.forecastmodel_fit <- function(object, ...){ fit <- object - residuals(fit$Yhat, fit$data[[fit$model$output]]) + return(residuals(fit$Yhat, fit$data[[fit$model$output]])) } + diff --git a/R/rls_fit.R b/R/rls_fit.R index e1b1299..794f327 100644 --- a/R/rls_fit.R +++ b/R/rls_fit.R @@ -96,7 +96,7 @@ #' # See rmse as a function of horizon #' fit <- rls_fit(val$par, model, D, scorefun = rmse) #' plot(fit$scoreval, xlab="Horizon k", ylab="RMSE") -#' # See ?score_fit for a little more consistent way of calculating this +#' # See ?score for a little more consistent way of calculating this #' #' #' # Try adding a low-pass filter to Ta diff --git a/R/rls_optim.R b/R/rls_optim.R index 4a50980..c01cc77 100644 --- a/R/rls_optim.R +++ b/R/rls_optim.R @@ -17,10 +17,11 @@ #' #' @title Optimize parameters for onlineforecast model fitted with RLS #' @param model The onlineforecast model, including inputs, output, kseq, p -#' @param data The data.list including the variables used in the model. +#' @param data The data.list which holds the data on which the model is fitted. #' @param kseq The horizons to fit for (if not set, then model$kseq is used) #' @param scorefun The function to be score used for calculating the score to be optimized. #' @param cachedir A character specifying the path (and prefix) of the cache file name. If set to \code{""}, then no cache will be loaded or written. See \url{https://onlineforecasting.org/vignettes/nice-tricks.html} for examples. +#' @param cacheload A logical controlling whether to load the cache if it exists. #' @param printout A logical determining if the score function is printed out in each iteration of the optimization. #' @param method The method argument for \code{\link{optim}}. #' @param ... 
Additional parameters to \code{\link{optim}} @@ -60,7 +61,7 @@ #' #' #' @export -rls_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cachererun=FALSE, printout=TRUE, method="L-BFGS-B", ...){ +rls_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cacheload=FALSE, printout=TRUE, method="L-BFGS-B", ...){ # Take the parameters bounds from the parameter bounds set in the model init <- model$get_prmbounds("init") lower <- model$get_prmbounds("lower") @@ -85,7 +86,7 @@ rls_optim <- function(model, data, kseq = NA, scorefun = rmse, cachedir="", cach # This is maybe smarter, don't have to calculate the transformation of the data: cnm <- cache_name(m$regprm, getse(m$inputs, nms="expr"), m$output, m$prmbounds, m$kseq, data, objfun, init, lower, upper, cachedir = cachedir) cnm <- cache_name(rls_fit, rls_optim, m$outputrange, m$regprm, m$transform_data(data), data[[m$output]], scorefun, init, lower, upper, kseq, cachedir = cachedir) # Load the cached result if it exists - if(file.exists(cnm) & !cachererun){ + if(file.exists(cnm) & !cacheload){ res <- readRDS(cnm) # Set the optimized parameters into the model model$insert_prm(res$par) diff --git a/R/rls_reduce.R b/R/rls_reduce.R deleted file mode 100644 index a2caac7..0000000 --- a/R/rls_reduce.R +++ /dev/null @@ -1,74 +0,0 @@ -## #' @importFrom parallel mclapply -## rls_reduce <- function(model, data, prmreduce=list(NA), kseq = NA, scorefun = rmse){ -## ## prm test -## ##prmreduce <- list(I__degree = c(min=1, init=7), mu_tday__nharmonics = c(min=1, init=7)) -## prmin <- unlist(getse(prmreduce, 1)) -## pr <- unlist(getse(prmreduce, 2)) -## # -## m <- model$clone_deep() -## # Insert the starting p reduction values -## if(!is.na(prmreduce[1])){ -## m$insert_prm(pr) -## } -## # -## valref <- rls_optim(m, data, kseq, printout=FALSE)$value -## # -## while(TRUE){ -## # -## message("------------------------------------") -## message("Reference score value",valref) -## # -------- -## # Remove inputs one by one -## message("\nRemoving inputs one by one") -## valsrm <- mclapply(1:length(model$inputs), function(i){ -## mr <- m$clone_deep() -## mr$inputs[[i]] <- NULL -## rls_optim(mr, data, kseq, printout=FALSE)$value -## }) -## valsrm <- unlist(valsrm) -## names(valsrm) <- names(m$inputs) -## message("Scores") -## print(valsrm) -## # -------- -## # Reduce parameter values if specified -## if(!is.na(pr[1])){ -## message("\nReducing prm with -1 one by one") -## valspr <- mclapply(1:length(pr), function(i){ -## mr <- m$clone_deep() -## p <- pr -## # Only count down if above minimum -## if( p[i] >= prmin[i] ){ -## p[i] <- p[i] - 1 -## } -## mr$insert_prm(p) -## val <- rls_optim(mr, data, kseq, printout=FALSE)$value -## # -## return(val) -## }) -## valspr <- unlist(valspr) -## names(valspr) <- names(pr) -## message("Scores") -## print(valspr) -## } -## # Is one the reduced smaller than the current ref? 
-## if( min(c(valsrm,valspr)) < valref ){ -## if(which.min(c(min(valsrm),min(valspr))) == 1){ -## # One of the models with one of the inputs removed is best -## imin <- which.min(valsrm) -## message("Removing input",names(m$inputs)[imin]) -## m$inputs[[imin]] <- NULL -## }else{ -## # One of the models with reduced parameter values is best -## imin <- which.min(valspr) -## pr[imin] <- pr[imin] - 1 -## m$insert_prm(pr) -## message("Reduced parameter",names(pr)[imin],"to:",pr[imin]) -## } -## valref <- min(c(valsrm,valspr)) -## }else{ -## # No improvement obtained from reduction, so return the current model -## message("------------------------------------\n\nDone") -## return(m) -## } -## } -## } diff --git a/R/rls_summary.R b/R/rls_summary.R index da05735..15e1610 100644 --- a/R/rls_summary.R +++ b/R/rls_summary.R @@ -27,7 +27,6 @@ #' @param object of class \code{rls_fit}, so a fit calculated by \code{\link{rls_fit}}. #' @param scoreperiod logical (or index). If this scoreperiod is given, then it will be used over the one in the fit. #' @param scorefun The score function to be applied on each horizon. -#' @param usecomplete Use on the set of observations which is complete on all horizons. #' @param printit Print the result. #' @param ... Not used. #' @return A list of: @@ -67,15 +66,18 @@ #' #' @importFrom stats sd #' @export -rls_summary <- function(object, scoreperiod = NA, scorefun = rmse, usecomplete = TRUE, printit = TRUE, ...){ +rls_summary <- function(object, scoreperiod = NA, scorefun = rmse, printit = TRUE, ...){ fit <- object # + if(is.na(scoreperiod[1])){ + scoreperiod <- fit$data$scoreperiod + } + # scipen <- options(scipen=10)$scipen - # - tmp <- score_fit(fit, scoreperiod, usecomplete, scorefun) - scoreval <- tmp$scoreval - scoreperiodused <- tmp$scoreperiod - retval <- list(scorefun = scorefun, scoreval = scoreval, scoreperiod = scoreperiodused) + # Calculate the score for each horizon + scoreval <- score(residuals(fit), scoreperiod, scorefun=scorefun) + # The result to return + retval <- list(scorefun = scorefun, scoreval = scoreval, scoreperiod = scoreperiod) # Return the result before print? if(!printit){ return(retval) @@ -91,12 +93,12 @@ rls_summary <- function(object, scoreperiod = NA, scorefun = rmse, usecomplete = cat(" ",names(m$regprm)[i],"=",unlist(m$regprm[i]),"\n") } # - cat("\nScoreperiod:",sum(scoreperiodused),"observations are included.\n") + cat("\nScoreperiod:",sum(scoreperiod),"observations are included.\n") # cat("\nRLS coeffients summary stats (cannot be used for significance tests):\n") coef <- t(sapply(1:length(fit$Lfitval[[1]]), function(i){ val <- sapply(fit$Lfitval, function(Theta){ - Theta[scoreperiodused,i] + Theta[scoreperiod,i] }) # m <- mean(val,na.rm=TRUE) @@ -126,7 +128,7 @@ rls_summary <- function(object, scoreperiod = NA, scorefun = rmse, usecomplete = cat(pst("\n",toupper(scorename),":\n")) print(tmp) cat("\n") - invisible(list(scorefun = scorefun, scoreval = scoreval, scoreperiod = scoreperiodused)) + invisible(list(scorefun = scorefun, scoreval = scoreval, scoreperiod = scoreperiod)) } #' @importFrom stats sd diff --git a/R/rmse.R b/R/rmse.R index 12de721..f90710d 100644 --- a/R/rmse.R +++ b/R/rmse.R @@ -13,7 +13,7 @@ #' @title Computes the RMSE score. #' @param x a numerical vector of residuals. #' @return The RMSE score. -#' @seealso \code{\link{score}()} for calculation of a score for the k'th horizon, and \code{\link{score_fit}()} which takes a forecastmodel fit and returns score taking scoreperiod etc. into account. 
+#' @seealso \code{\link{score}()} for calculation of a score for the k'th horizon
 #' @name rmse
 #' @examples
 #'
diff --git a/R/score.R b/R/score.R
index e394c3d..3759178 100644
--- a/R/score.R
+++ b/R/score.R
@@ -9,10 +9,11 @@
 #' Applies the \code{scorefun} on all horizons (each column) of the residuals matrix. See the description of each parameter for more details.
 #'
 #' @title Calculate the score for each horizon.
-#' @param Residuals A matrix with residuals (columns named \code{hxx}) for which to calculate the score for each horizon.
-#' @param scoreperiod as a logical vector controlling which points to be included in the score calculation. If NA then all values are included.
-#' @param usecomplete if TRUE then only the values available for all horizons are included (i.e. if at one time point there is a missing value, then values for this time point is removed for all horizons in the calculation).
+#' @param object A data.frame or matrix with residuals (columns named \code{hxx}), or a list of such objects, for which to calculate the score for each horizon.
+#' @param scoreperiod as a logical vector controlling which points to be included in the score calculation. If NA then all values are included (depending on 'usecomplete').
+#' @param usecomplete If TRUE then only the values available for all horizons are included (i.e. if at one time point there is a missing value, then values for this time point is removed for all horizons in the calculation).
+#' @param ... Additional arguments passed on to the score function.
 #' @return A list with the a numeric vector with the score value for each horizon and the applied \code{scoreperiod} (note can be different from the given scoreperiod, if only complete observations are used (as per default)).
 #' @examples
 #'
@@ -24,36 +25,70 @@
 #' Resid <- residuals(Yhat, y)
 #'
 #' # Calculate the score for the k1 horizon
-#' score(Resid)$scoreval
+#' score(Resid)
 #'
-#' # The first values were excluded, since there are NAs
+#' # In the beginning the horizons have NAs
 #' head(Resid)
-#' score(Resid)$scoreperiod
+#' # Default is that only complete cases over all horizons are included
+#' score(Resid)
+#' # So including all cases for each horizon will give different values
+#' score(Resid, usecomplete=FALSE)
 #'
-#' @importFrom stats complete.cases
+#' # Given a list
+#' # The residuals for each horizon
+#' Resid2 <- residuals(persistence(y,kseq=1:4)+rnorm(100), y)
+#'
+#' score(list(Resid,Resid2))
+#'
+#' @rdname score
 #' @export
-score <- function(Residuals, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse){
-    # If no scoreperiod is given, then use all
-    if(is.na(scoreperiod[1])){
-        scoreperiod <- rep(TRUE,nrow(Residuals))
+score <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse, ...){
+    UseMethod("score")
+}
+
+
+#' @rdname score
+#' @export
+score.list <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse, ...){
+    # Use only complete cases
+    if(usecomplete){
+        tmp <- complete_cases(object)
+    }else{
+        tmp <- rep(TRUE,nrow(object))
+    }
+    if(!is.na(scoreperiod[1])){
+        scoreperiod <- tmp & scoreperiod
     }else{
-        # Do checking of scoreperiod
-        txt <- "It must be set to an index (int or logical) defining which points to be evaluated in the scorefun()."
- if( length(scoreperiod) != nrow(Residuals) ){ - stop("scoreperiod is not same length as nrow(Residuals): ",txt) - }else{ - if( all(is.na(scoreperiod)) ){ stop("scoreperiod is all NA: ",txt) } - } + scoreperiod <- tmp + } + # Run on each element, usecomplete has been dealt with + sapply(object, score, usecomplete=FALSE, scoreperiod=scoreperiod, scorefun=scorefun, ...=...) +} + + +#' @rdname score +#' @export +score.data.frame <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse, ...){ + if(is.na(scoreperiod[1])){ + scoreperiod <- rep(TRUE,nrow(object)) } - # Take only the rows which have a value for each horizon? if(usecomplete){ - scoreperiod <- scoreperiod & complete.cases(Residuals) + scoreperiod <- complete_cases(object) & scoreperiod + } + # If no scoreperiod is given, then use all + # Do checking of scoreperiod + txt <- "It must be set to an index (int or logical) defining which points to be evaluated in the scorefun()." + if( length(scoreperiod) != nrow(object) ){ + stop("scoreperiod is not same length as nrow(object): ",txt) + }else{ + if( all(is.na(scoreperiod)) ){ stop("scoreperiod is all NA: ",txt) } } # Calculate the objective function for each horizon - scoreval <- sapply(1:ncol(Residuals), function(i){ - scorefun(Residuals[scoreperiod,i]) + scoreval <- sapply(1:ncol(object), function(i){ + scorefun(object[scoreperiod,i], ...) }) - nams(scoreval) <- gsub("h","k",nams(Residuals)) + nams(scoreval) <- gsub("h","k",nams(object)) # - return(list(scoreval=scoreval,scoreperiod=scoreperiod)) + return(scoreval) } + diff --git a/R/score_fit.R b/R/score_fit.R deleted file mode 100644 index 5ddb7a0..0000000 --- a/R/score_fit.R +++ /dev/null @@ -1,58 +0,0 @@ - -#' Calculate the score for each horizon for a forecast model fit. -#' -#' For evaluation of the score on each horizon, as specified in the fit. Use it for a consistent evaluation. -#' -#' @title Calculates scores for a forecast model fit. -#' @param fit A model fit -#' @param scoreperiod as an index (logical or integer) defining which points to inlude in the score calculation. If NA, the \code{scoreperiod} from the \code{fit$model} object is used. -#' @param usecomplete Only use points where -#' @param scorefun The score function applied, per default \code{\link{rmse}}. -#' @seealso \code{\link{rmse}} and \code{\link{score}} which are used in this function. -#' @return A list with: -#' - \code{scoreval} is the score value -#' - \code{scoreperiod} is the score period used (can be different that the one in arguments \code{fit} or \code{scoreperiod}) -#' - \code{scorename} is the name of the score function applied -#' @export -score_fit <- function(fit, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse){ - # Calculate the score for each horizon - if("scorefun" %in% names(as.list(match.call()))){ - scorename <- as.list(match.call())$scorefun - }else{ - scorename = "rmse" - } - - # Check score period - txt <- "It must be set to an index (int or logical) defining which points to be evaluated in the scorefun()." - if(is.na(scoreperiod[1])){ - if("scoreperiod" %in% nams(fit$data)){ - scoreperiod <- fit$data$scoreperiod - }else{ - stop("scoreperiod is not set. 
Set it in the data used in the fit function or as argument in the present call: ",txt)
-        }
-    }
-
-    # Calculate the Residuals if they were not in fit
-    if( !"Residuals" %in% names(fit) ){
-        # Calculate the residuals
-        Residuals <- residuals(fit$Yhat, fit$data[fit$model$output])
-    }else{
-        Residuals <- fit$Residuals
-    }
-
-    # Calculate the score
-    tmp <- score(Residuals, scoreperiod, usecomplete)
-    scoreval <- tmp$scoreval
-    scoreperiod <- tmp$scoreperiod
-
-    # Give a warning if the score
-    if(!is.na(fit$scoreval[1])){
-        if(length(fit$scoreval) != length(scoreval)){
-            warning("fit contains 'scoreval', which is different in length than the scoreval calculated here (probably for different horizons (kseq))")
-        }else if(!all(fit$scoreval == scoreval)){
-            warning("fit contains 'scoreval' which is different from the",scorename,"score calculated now (can also be because of 'usecomplete = TRUE', such that only points which have forecasts for all horizons are included.")
-        }
-    }
-    #
-    list(scoreval=scoreval, scoreperiod=scoreperiod, scorename=scorename)
-}
diff --git a/R/step_optim.R b/R/step_optim.R
index ea455ea..18ac60c 100644
--- a/R/step_optim.R
+++ b/R/step_optim.R
@@ -1,53 +1,183 @@
+# Do this in a separate file to see the generated help:
+#library(devtools)
+#document()
+#load_all(as.package("../../onlineforecast"))
+#?step_optim
+
+#' Forward and backward model selection
+#'
+#' This function takes a model and carries out a model selection by stepping
+#' backward, forward or in both directions.
+#'
+#' The full model containing all inputs must be given. In each step new models
+#' are generated, with either one removed input or one added input, and then all
+#' the generated models are optimized and their scores compared. If any new
+#' model has an improved score compared to the currently selected model, then
+#' the new one is selected and the process is repeated until no further improvement is
+#' obtained.
+#'
+#' In addition to selecting inputs, integer parameters can also be stepped
+#' through, e.g. the order of basis splines or the number of harmonics in
+#' Fourier series.
+#'
+#' The stepping process is different depending on the direction. In addition to
+#' the full model, a starting model can be given, in which case the selection process
+#' will start from that model.
+#'
+#' If the direction is "both", which is the default (same as "backwardboth"), then the
+#' stepping is:
+#' - In the first step inputs are removed one-by-one
+#' - In following steps, inputs still in the model are removed one-by-one, and
+#' inputs not in the model are added one-by-one
+#'
+#' If the direction is "backward":
+#' - Inputs are only removed in each step
+#'
+#' If the direction is "forwardboth":
+#' - In the first step all inputs are removed
+#' - In following steps (same as "both")
+#'
+#' If the direction is "forward": - In the first step all inputs are removed and
+#' from there inputs are only added
+#'
+#' For stepping through integer variables in the transformation stage,
+#' these have to be set in the "prm" argument. The stepping process will follow
+#' the input selection described above.
+#'
+#' @title Forward and backward model selection
+#' @param modelfull The full forecastmodel containing all inputs which
+#'     can be included in the selection.
+#' @param data The data.list which holds the data on which the model is fitted.
+#' @param kseq The horizons to fit for (if not set, then model$kseq is used)
+#' @param prm A list of integer parameters to be stepped. Given using the same
+#'     syntax as parameters for optimization, e.g. `list(I__degree = c(min=3,
+#'     max=7))` will step the "degree" for input "I".
+#' @param direction The direction to be used in the selection process.
+#' @param modelstart A forecastmodel. If it's set then it will be used as the
+#'     selected model from the first step of the stepping. It should be a sub
+#'     model of the full model.
+#' @param optimfun The function which will carry out the optimization in each step.
+#' @param scorefun The score function used.
+#' @param ... Additional arguments which will be passed on to optimfun, e.g. to control how many optimization steps are taken (see \code{\link{optim}}).
+#'
+#' @return A list with the result of each step:
+#' - '$model' is the model selected in each step
+#' - '$result' is the result returned by the optimfun
+#'
+#' @examples
+#'
+#'
+#' # The data, just a rather short period to keep running times short
+#' D <- subset(Dbuilding, c("2010-12-15", "2011-02-01"))
+#' # Set the score period
+#' D$scoreperiod <- in_range("2010-12-22", D$t)
+#' #
+#' D$tday <- make_tday(D$t, 1:36)
+#' # Generate an input which is just random noise, i.e. should be removed in the selection
+#' set.seed(83792)
+#' D$noise <- make_input(rnorm(length(D$t)), 1:36)
+#'
+#' # The full model
+#' model <- forecastmodel$new()
+#' # Set the model output
+#' model$output = "heatload"
+#' # Inputs (transformation step)
+#' model$add_inputs(Ta = "Ta",
+#'                  noise = "noise",
+#'                  mu_tday = "fs(tday/24, nharmonics=5)",
+#'                  mu = "one()")
+#' # Regression step parameters
+#' model$add_regprm("rls_prm(lambda=0.9)")
+#' # Optimization bounds for parameters
+#' model$add_prmbounds(lambda = c(0.9, 0.99, 0.9999))
+#'
+#' # Select a model, just run it for a single horizon
+#' kseq <- 5
+#' #
+#' prm <- list(mu_tday__nharmonics = c(min=3, max=7))
+#'
+#' # Run all selection schemes
+#' Lboth <- step_optim(model, D, kseq, prm, "both")
+#' Lforward <- step_optim(model, D, kseq, prm, "forward")
+#' Lbackward <- step_optim(model, D, kseq, prm, "backward")
+#' Lbackwardboth <- step_optim(model, D, kseq, prm, "backwardboth")
+#' Lforwardboth <- step_optim(model, D, kseq, prm, "forwardboth")
+#'
+#' # Note that caching can be really smart. The cache files are located in the
+#' # cachedir folder (a folder in the current working directory, which can be
+#' # removed with unlink(foldername)). See e.g. `?rls_optim` for how the caching works
+#' # L <- step_optim(model, D, kseq, prm, "forward", cachedir="cache", cacheload=FALSE)
+#'
+#'
 #' @importFrom parallel mclapply
 #'
+#' @export
-step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("backward","forward","backwardboth","forwardboth"), optimfun = rls_optim, scorefun = rmse, ...){
+step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c("both","backward","forward","backwardboth","forwardboth"), modelstart=NA, optimfun = rls_optim, scorefun = rmse, ...){
     # Do:
-    # - change all lapply
+    # - checking of input, model, ...
+    # - change all lapply to mclapply
     # - Maybe have "cloneit" argument in optimfun, then don't clone inside optim.
     # - Add argument controlling how much is kept in each iteration (e.g all fitted models)
+    # - Insert the inputs in the order they are in the full model, then cache might be more consistent (and the order is kept)
+    # - For score make sure that only the same points are included for all models, or print warning if not!
+ # - With lm we should use have some regularization, so use AIC or BIC as the score + # - Differentiate the parameters given to optimfun for the selected model and for the new models in each step. E.g. take more steps on the selected model. # # - Help: prm <- list(I__degree = c(min=1, max=7), mu_tday__nharmonics = c(min=1, max=7)) # - help: It's not checked that it's the score is calculated on the same values! WARNING should be printed if some models don't forecast same points # - # First direction is default + # For keeping all the results + L <- list() + # First val of direction is default if not set if(length(direction) > 1){ direction <- direction[1] } - # Don't change the given - mfull <- model$clone_deep() - # Insert the starting prm values - if(!is.na(prm[1])){ - if( direction == "backward" ){ - mfull$insert_prm(unlist(getse(prm, "max"))) - }else if( direction == "forward" ){ - mfull$insert_prm(unlist(getse(prm, "min"))) - }else{ - # Both directions, then start at init, or halfway between min and max - i <- which(sapply(prm, function(x){ "init" %in% names(x) })) - if(length(i)){ - mfull$insert_prm(unlist(getse(prm[i], "init"))) - } - i <- which(sapply(prm, function(x){ !"init" %in% names(x) })) - if(length(i)){ - mfull$insert_prm(round(unlist(getse(prm[i], "max")) - unlist(getse(prm[i], "min")) / 2)) + # Different start up if start model is given + if( is.na(modelstart)[1] ){ + # + mfull <- modelfull$clone_deep() + # Insert the starting prm values into the full model (must be in it, since added inputs are taken from there) + if(!is.na(prm[1])){ + if( direction == "backward" ){ + mfull$insert_prm(unlist(getse(prm, "max"))) + }else if( direction == "forward" ){ + mfull$insert_prm(unlist(getse(prm, "min"))) + }else{ + # Both directions, then start at init, or halfway between min and max + i <- which(sapply(prm, function(x){ "init" %in% names(x) })) + if(length(i)){ + mfull$insert_prm(unlist(getse(prm[i], "init"))) + } + i <- which(sapply(prm, function(x){ !"init" %in% names(x) })) + if(length(i)){ + mfull$insert_prm(round(unlist(getse(prm[i], "max")) - unlist(getse(prm[i], "min")) / 2)) + } } } - } - # For keeping all the results - L <- list() - # - m <- mfull$clone_deep() - if(length(grep("backward",direction))){ - # Optimize from the full model + # Init current model + m <- mfull$clone_deep() + # + if(length(grep("forward",direction))){ + # Remove all inputs + m$inputs <- list() + # Start with no fit + istep <- 0 + res <- list(value=Inf, par=m$get_prmbounds("init")) + }else{ + # Optimize from the full model + res <- optimfun(m, data, kseq, printout=TRUE, ...) + # Keep it + istep <- 1 + L[[istep]] <- list(model = m$clone_deep(), result = res) + } + }else{ + # The full model will not be changed from here, so don't need to clone it + mfull <- modelfull + m <- modelstart$clone() + # Optimize from the model res <- optimfun(m, data, kseq, printout=TRUE, ...) 
# Keep it istep <- 1 L[[istep]] <- list(model = m$clone_deep(), result = res) - }else{ - # Optimize from the null model - m$inputs <- list() - # Must be set - istep <- 0 - res <- list(value=Inf, par=m$get_prmbounds("init")) } # Helper c_mStep <- function(l, nms){ @@ -63,15 +193,17 @@ step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("back # Insert the optimized parameters from last step m$prmbounds[names(res$par),"init"] <- res$par # - message("------------------------------------") - message("Reference score value: ",res$value) + message("------------------------------------\n") + message("Current model:") + print(m) + message("Current score: ",res$value,"\n") # -------- mStep <- list() # Generate the input modified models if(length(grep("backward|both", direction))){ # Remove input from the current model one by one if(length(m$inputs) > 1){ - tmp <- mclapply(1:length(m$inputs), function(i){ + tmp <- lapply(1:length(m$inputs), function(i){ ms <- m$clone_deep() # Remove one input ms$inputs[[i]] <- NULL @@ -84,7 +216,7 @@ step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("back # Add input one by one iin <- which(!names(mfull$inputs) %in% names(m$inputs)) if(length(iin)){ - tmp <- mclapply(iin, function(i){ + tmp <- lapply(iin, function(i){ ms <- m$clone_deep() # Add one input ms$inputs[[length(ms$inputs) + 1]] <- mfull$inputs[[i]] @@ -99,7 +231,7 @@ step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("back if(!is.na(prm[1])){ if(length(grep("backward|both", direction))){ # Count down the parameters one by one - tmp <- mclapply(1:length(prm), function(i){ + tmp <- lapply(1:length(prm), function(i){ p <- m$get_prmvalues(names(prm[i])) # If the input is not in the current model, then p is NA, so don't include it for fitting if(!is.na(p)){ @@ -118,7 +250,7 @@ step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("back } if(length(grep("forward|both", direction))){ # Count up the parameters one by one - tmp <- mclapply(1:length(prm), function(i){ + tmp <- lapply(1:length(prm), function(i){ p <- m$get_prmvalues(names(prm[i])) # If the input is not in the current model, then p is NA, so don't include it for fitting if(!is.na(p)){ @@ -138,9 +270,10 @@ step_optim <- function(model, data, kseq = NA, prm=list(NA), direction = c("back } # Optimize all the step models - resStep <- mclapply(1:length(mStep), function(i, ...){ - res <- try(optimfun(mStep[[i]], data, kseq, printout=FALSE, ...)) - if(class(res) == "try-error"){ browser() } + resStep <- lapply(1:length(mStep), function(i, ...){ + res <- optimfun(mStep[[i]], data, kseq, printout=FALSE, ...) +# res <- try() +# if(class(res) == "try-error"){ browser() } message(names(mStep)[[i]], ": ", res$value) return(res) }, ...) diff --git a/inst/CITATION b/inst/CITATION index 38f0980..e46f8ca 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -22,5 +22,6 @@ citEntry( author = "Peder Bacher and Hjörleifur G. 
Bergsteinsson", year = "2020", note = "R package version 0.9.3", - url = "https://onlineforecasting.org" -) + url = "https://onlineforecasting.org", + textVersion = paste("The manual.") + ) diff --git a/make.R b/make.R index cf5912c..421fa40 100644 --- a/make.R +++ b/make.R @@ -41,7 +41,7 @@ library(roxygen2) # http://r-pkgs.had.co.nz/tests.html # # Initialize first time the the testing framework -#use_testthat() +# use_testthat() # Init new test #use_test("newtest") @@ -50,17 +50,17 @@ library(roxygen2) #test() # # Run the examples +#load_all(as.package("../onlineforecast")) #run_examples() # # Run tests in a single file -# load_all(as.package("../onlineforecast")) # test_file("tests/testthat/test-rls-heat-load.R") # ---------------------------------------------------------------- # Build the package document() -build(".", vignettes=FALSE) +build(".", vignettes=TRUE) # Install it install.packages("../onlineforecast_0.9.4.tar.gz") @@ -80,11 +80,11 @@ library(onlineforecast) # Test before release devtools::check() -devtools::check_built("../onlineforecast_0.9.3.tar.gz") +devtools::check_built("../onlineforecast_0.9.4.tar.gz") # Does give different results than check() above -#system("R CMD check --as-cran ../onlineforecast_0.9.3.tar.gz") -system("R CMD check ../onlineforecast_0.9.3.tar.gz") +#system("R CMD check --as-cran ../onlineforecast_0.9.4.tar.gz") +system("R CMD check ../onlineforecast_0.9.4.tar.gz") unlink("onlineforecast.Rcheck/", recursive=TRUE) # Use for more checking: diff --git a/misc-R/reduce-test.R b/misc-R/reduce-test.R deleted file mode 100644 index 78c9bec..0000000 --- a/misc-R/reduce-test.R +++ /dev/null @@ -1,54 +0,0 @@ - - - -# Load the current package -library("devtools") -pack <- as.package("../../onlineforecast") -load_all(pack) - -# Set the data in D to simplify notation -D <- Dbuilding -# Set the score period -D$scoreperiod <- in_range("2010-12-22", D$t) -# -D$tday <- make_tday(D$t, 2) -# Generate noise input -set.seed(83792) -D$noise <- make_input(rnorm(length(D$t)), 2) - -# Generate new object (R6 class) -model <- forecastmodel$new() -# Set the model output -model$output = "heatload" -# Inputs (transformation step) -model$add_inputs(Ta = "Ta", - I = "bspline(tday, Boundary.knots = c(6,18), degree = 5, intercept=TRUE) %**% I", - noise = "noise", - mu_tday = "fs(tday/24, nharmonics=10)", - mu = "one()") -# Regression step parameters -model$add_regprm("rls_prm(lambda=0.9)") -# Optimization bounds for parameters -model$add_prmbounds(lambda = c(0.9, 0.99, 0.9999)) - - -# Reduce the model -object <- model -data <- D -kseq <- 2 -prm <- list(I__degree = c(min=1, max=7), mu_tday__nharmonics = c(min=1, max=7)) -optimfun = rls_optim -scorefun = rmse - - -L <- step_optim(object, data, kseq, prm, "forward", optimfun, scorefun, cachedir="cache", cachererun=FALSE) -L <- step_optim(object, data, kseq, prm, "backward", optimfun, scorefun, cachedir="cache", cachererun=FALSE) -L <- step_optim(object, data, kseq, prm, "backwardboth", optimfun, scorefun, cachedir="cache", cachererun=FALSE) -L <- step_optim(object, data, kseq, prm, "forwardboth", optimfun, scorefun, cachedir="cache", cachererun=FALSE) - -unlist(getse(getse(L, "model"), "prm")) -sum(unlist(getse(getse(L, "result"), "counts"))) - - - - diff --git a/misc-R/step_optim-test.R b/misc-R/step_optim-test.R new file mode 100644 index 0000000..419f026 --- /dev/null +++ b/misc-R/step_optim-test.R @@ -0,0 +1,72 @@ + +# Load the current package +library("devtools") +pack <- as.package("../../onlineforecast") +load_all(pack) + + +# 
Set the data in D to simplify notation
+D <- Dbuilding
+# Set the score period
+D$scoreperiod <- in_range("2010-12-22", D$t)
+#
+D$tday <- make_tday(D$t, 2)
+# Generate noise input
+set.seed(83792)
+D$noise <- make_input(rnorm(length(D$t)), 2)
+
+# Generate new object (R6 class)
+model <- forecastmodel$new()
+# Set the model output
+model$output = "heatload"
+# Inputs (transformation step)
+model$add_inputs(Ta = "Ta",
+                 I = "bspline(tday, Boundary.knots = c(6,18), degree = 5, intercept=TRUE) %**% I",
+                 noise = "noise",
+                 mu_tday = "fs(tday/24, nharmonics=5)",
+                 mu = "one()")
+# Regression step parameters
+model$add_regprm("rls_prm(lambda=0.9)")
+# Optimization bounds for parameters
+model$add_prmbounds(lambda = c(0.9, 0.99, 0.9999))
+
+
+# Select a model
+kseq <- 2
+prm <- list(I__degree = c(min=3, max=7), mu_tday__nharmonics = c(min=3, max=7))
+
+L <- step_optim(model, D, kseq, prm, "forward")
+L <- step_optim(model, D, kseq, prm, "backward")
+L <- step_optim(model, D, kseq, prm, "backwardboth")
+L <- step_optim(model, D, kseq, prm, "forwardboth")
+
+
+# Caching (the cache files are located in the cachedir folder in the current working directory, it can be removed with unlink(foldername))
+L <- step_optim(model, D, kseq, prm, "forward", cachedir="cache", cacheload=FALSE)
+L <- step_optim(model, D, kseq, prm, "backward", cachedir="cache", cacheload=FALSE)
+L <- step_optim(model, D, kseq, prm, "backwardboth", cachedir="cache", cacheload=FALSE)
+L <- step_optim(model, D, kseq, prm, "forwardboth", cachedir="cache", cacheload=FALSE)
+
+
+
+# Start from a given model
+modelstart <- model$clone_deep()
+modelstart$inputs <- modelstart$inputs[-2:-4]
+L <- step_optim(model, D, kseq, prm, "both", modelstart=modelstart)
+L <- step_optim(model, D, kseq, prm, "forward", modelstart=modelstart)
+
+# See the result in detail (L holds the selected model and its optim result in each step)
+L[[1]]
+L[[2]]
+
+# The models
+getse(L, "model")
+# The optim results
+getse(L, "result")
+#
+unlist(getse(getse(L, "model"), "prm"))
+sum(unlist(getse(getse(L, "result"), "counts")))
+
+
+
+
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..1f95ad4
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(onlineforecast)
+
+test_check("onlineforecast")
diff --git a/tests/testthat/test-newtest.R b/tests/testthat/test-newtest.R
new file mode 100644
index 0000000..8849056
--- /dev/null
+++ b/tests/testthat/test-newtest.R
@@ -0,0 +1,3 @@
+test_that("multiplication works", {
+  expect_equal(2 * 2, 4)
+})
diff --git a/tests/testthat/test-rls-heat-load.R b/tests/testthat/test-rls-heat-load.R
index 054105f..426fd9d 100644
--- a/tests/testthat/test-rls-heat-load.R
+++ b/tests/testthat/test-rls-heat-load.R
@@ -1,5 +1,3 @@
-context("running RLS test")
-
 ## Load current package (must be outcommented)
 ## library("devtools")
 ## load_all(as.package("../../../onlineforecast"))
diff --git a/vignettes/forecast-evaluation.Rmd b/vignettes/forecast-evaluation.Rmd
index f7d9a7b..612ed61 100644
--- a/vignettes/forecast-evaluation.Rmd
+++ b/vignettes/forecast-evaluation.Rmd
@@ -2,9 +2,12 @@
 title: "Forecast evaluation"
 author: "Peder Bacher"
 date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
+output:
+  rmarkdown::html_vignette:
+    toc: true
+    toc_depth: 3
 vignette: >
-  %\VignetteIndexEntry{Forecast evaluation}
+  %\VignetteIndexEntry{Forecast evaluation}
   %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
@@ -16,6 +19,7 @@ library(knitr)
vignettename <- "forecast-evaluation" # REMEMBER: IF CHANGING IN THE shared-init (next block), then copy to the others! ``` + <!--shared-init-start--> ```{r init, cache=FALSE, include=FALSE, purl=FALSE} # Width will scale all @@ -28,6 +32,7 @@ figheight2 <- 6.5 figheight3 <- 8 figheight4 <- 9.5 figheight5 <- 11 + # Set the size of squared figures (same height as full: figheight/figwidth) owsval <- 0.35 ows <- paste0(owsval*100,"%") @@ -42,7 +47,7 @@ fhs <- figwidth * owsval # Set the knitr options knitr::opts_chunk$set( collapse = TRUE, - comment = "## ", + comment = "##!! ", prompt = FALSE, cache = TRUE, cache.path = paste0("../tmp/vignettes/tmp-",vignettename,"/"), @@ -54,25 +59,32 @@ knitr::opts_chunk$set( ) options(digits=3) -hook_output <- knit_hooks$get("output") -knit_hooks$set(output = function(x, options) { + +# For cropping output and messages +cropfun <- function(x, options, func){ lines <- options$output.lines - if (is.null(lines)) { - return(hook_output(x, options)) # pass to default hook - } - x <- unlist(strsplit(x, "\n")) - more <- "## ...output cropped" - if (length(lines)==1) { # first n lines - if (length(x) > lines) { - # truncate the output, but add .... - x <- c(head(x, lines), more) - } - } else { - x <- c(more, x[lines], more) + ## if (is.null(lines)) { + ## return(func(x, options)) # pass to default hook + ## } + if(!is.null(lines)){ + x <- unlist(strsplit(x, "\n")) + i <- grep("##!!",x) + if(length(i) > lines){ + # truncate the output, but add .... + x <- x[-i[(lines+1):length(i)]] + x[i[lines]] <- pst(x[i[lines]], "\n\n## ...output cropped") + } + # paste these lines together + x <- paste(c(x, ""), collapse = "\n") } - # paste these lines together - x <- paste(c(x, ""), collapse = "\n") - hook_output(x, options) + x <- gsub("!!","",x) + func(x, options) +} + +hook_chunk <- knit_hooks$get("chunk") + +knit_hooks$set(chunk = function(x, options) { + cropfun(x, options, hook_chunk) }) ``` @@ -107,7 +119,7 @@ Just start by: D <- Dbuilding # Keep the model output in y (just easier code later) D$y <- D$heatload -# +# Make time of day in forecast matrix D$tday <- make_tday(D$t, 0:36) ``` @@ -171,18 +183,18 @@ model$add_prmbounds(Ta__a1 = c(0.8, 0.9, 0.99), I__a1 = c(0.6, 0.9, 0.99), lambda = c(0.9, 0.99, 0.9999)) model$add_regprm("rls_prm(lambda=0.9)") -model$kseq <- c(3,18) -# Optimize the parameters -model$prm <- rls_optim(model, D)$par +# Optimize the parameters, only on two horizons +kseqopt <- c(3,18) +rls_optim(model, D, kseqopt) ``` Fit for all horizons and see the fit summary: ```{r} -# Fit for all horizons +# Forecast for all horizons model$kseq <- 1:36 # Fit with RLS fit1 <- rls_fit(model$prm, model, D) -# Check the fit +# See the summary of the fit summary(fit1) ``` @@ -191,31 +203,26 @@ Fourier series. 
 It can simply be added to the current model object:
 ```{r, output.lines=10}
 # Add a diurnal curve using fourier series
 model$add_inputs(mu_tday = "fs(tday/24, nharmonics=4)")
-model$kseq <- c(3,18)
 # Optimize the parameters
-model$prm <- rls_optim(model, D)$par
+rls_optim(model, D, kseq=kseqopt)
 ```
 
 Fit for all horizons and see the fit summary:
 ```{r}
-# Fit for all horizons
-model$kseq <- 1:36
 # Fit with RLS
 fit2 <- rls_fit(model$prm, model, D)
 # Check the fit
 summary(fit2)
 ```
-
-
 Keep the forecasts for plotting and later analysis:
 ```{r}
-# Keep the forecasts from each model
+# Keep the forecasts from each model by just inserting them in the data.list
 D$Yhat1 <- fit1$Yhat
 D$Yhat2 <- fit2$Yhat
 ```
 
-Plot the full score period:
+Plot the forecasts for the full score period:
 ```{r, fig.height=figheight2}
 # Plot to see the forecasts for the shortest and the longest horizon
 plot_ts(subset(D,D$scoreperiod), c("^y|^Yhat1","^y|^Yhat2"), kseq = c(1,36))
 ```
@@ -251,10 +258,8 @@ the latest value lagged a given period from the forecast time point.
 First the simple persistence:
 ```{r}
-# Just keep the horizons
-kseq <- 1:36
-# The simple persistence
-D$YhatP <- persistence(D$y, kseq)
+# The simple persistence (forecast for same horizons as the model)
+D$YhatP <- persistence(D$y, model$kseq)
 # Plot a few horizons
 plot_ts(D, c("^y$|YhatP$"), c("2011-01-05","2011-01-10"), kseq=c(1,24,36))
 ```
@@ -269,7 +274,7 @@ time point, at the same time of day, as the forecast time point (i.e.
 `tod(t+k)`). It can be obtained by:
 ```{r}
 # Use the argument perlen to set the period length
-D$YhatDP <- persistence(D$y, kseq, perlen=24)
+D$YhatDP <- persistence(D$y, model$kseq, perlen=24)
 # Plot a few horizons
 plot_ts(D, c("^y$|YhatDP$"), c("2011-01-05","2011-01-10"), kseq=c(1,24,36))
 ```
@@ -282,13 +287,14 @@ lag values (going >48 the forecasts will be 72 lagged and so fourth).
 Now it's just a matter of calculating the score, as a function of the horizon,
 for each model and compare them.
 
-We have kept the forecasts in the same format for each model, we can find them by:
+We have kept the forecasts in forecast matrices for each model, so we can find them by:
 ```{r}
 # Find the forecasts in D
 nms <- grep("^Yhat", names(D), value=TRUE)
 nms
 ```
-So it's the small model, large model, and simple and diurnal persistence, respectively.
+So it's the small and the large model, and as reference the simple and the
+diurnal persistence model.
 
 One quite important point: When comparing forecasts from different models
 exactly the same forecast points must be included. When NAs are present, not all
@@ -299,42 +305,60 @@ So to make sure that exactly the same points are included in the score
 calculation, we must only forecast points where all forecasts are available
 (i.e. non-NA):
 ```{r}
-# The non-NA for the first forecast
-ok <- !is.na(D[[nms[1]]])
-# Go through the remaining: all must be non-NA for a point
-for(nm in nms[-1]){
-    ok <- ok & !is.na(D[[nm]])
-}
-ok <- as.data.frame(ok)
-names(ok) <- pst("k",kseq)
-# Lag to match resiuduals in time
-ok <- lagdf(ok, "+k")
-# Only the score period
-ok <- ok & D$scoreperiod
-# Finally, the vector with TRUE for all points with no NAs for any forecast
-ok <- apply(ok, 1, all)
+# Find all complete cases for all forecasts and horizons
+ok <- complete_cases(D[nms])
 ```
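For intuition, `complete_cases()` does essentially what the removed code above did by hand. A minimal sketch of that equivalence, assuming the forecast matrices in `D[nms]` and the `lagdf()` "+k" alignment used in the old code (the agreement is expected, not guaranteed):

```r
# Sketch: manual version of complete_cases() for a list of forecast matrices
okmanual <- Reduce("&", lapply(D[nms], function(Yhat){
    # Lag each horizon "+k" to align it in time, then require no NAs in any column
    apply(!is.na(lagdf(Yhat, "+k")), 1, all)
}))
# Expected to agree with the packaged function
all(okmanual == ok)
```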
-How many points are left?
+Check how many of the points are complete, and how many there are in total:
 ```{r}
 sum(ok)
 length(ok)
 ```
-Now the residuals can be calculated and the score:
+The forecasts will always have NAs from the start, e.g.:
 ```{r}
-# Use the residuals function
+D$Yhat1[1:11, 1:10]
+```
+but in this particular case other periods with NAs exist:
+```{r}
+D$y[59:72]
+D$YhatP[59:72, 1]
+```
+They have been excluded as non-complete cases.
+
+Actually we only want to include the scoreperiod in the evaluation:
+```{r}
+ok <- ok & D$scoreperiod
+```
+and in that period all models have complete forecasts:
+```{r}
+sum(ok)
+sum(D$scoreperiod)
+```
+
+We can now use the `score()` function for calculating the score with only the
+complete cases found above:
+```{r}
+# The score as a function of the horizon
 R <- residuals(D$Yhat1, D$y)
-# And the score as a function of the horizon
-score(R, scoreperiod=ok)$scoreval
+score(R, ok & D$scoreperiod)
+```
+Actually, the default is to only use complete cases, hence:
+```{r}
+# Only complete cases are used per default
+score(R, D$scoreperiod) == score(R, ok & D$scoreperiod)
+```
+Whether to use complete cases only can be controlled by:
+```{r}
+# The score as a function of the horizon
+score(R, usecomplete=FALSE) == score(R)
 ```
-Calculated the score (default is RMSE) for all models:
+These steps can be made for all models in one go, where per default only
+complete cases across all models are used:
 ```{r}
-RMSE <- sapply(nms, function(nm){
-    score(residuals(D[[nm]],D$y), ok)$scoreval
-})
+RMSE <- score(residuals(D[nms], D$y), D$scoreperiod)
 ```
 ```{r, include=FALSE}
@@ -346,47 +370,41 @@ RMSE <- sapply(nms, function(nm){
 Plot the RMSE as a function of the horizon:
 ```{r, fig.height=figheight2}
-RMSE <- as.data.frame(RMSE)
-names(RMSE) <- nms
-
-plot(0, type="n", xlim=range(kseq), ylim=range(RMSE), xlab="Horizon k", ylab="RMSE (kW)")
-for(i in 1:length(RMSE)){
-    points(kseq, RMSE[ ,i], type="b", col=i)
+plot(0, type="n", xlim=range(model$kseq), ylim=range(RMSE), xlab="Horizon k", ylab="RMSE (kW)")
+for(i in 1:ncol(RMSE)){
+    points(model$kseq, RMSE[ ,i], type="b", col=i)
 }
+legend("topleft", nms, lty=1, col=1:length(nms))
 ```
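To make concrete what these RMSE numbers are, here is a small hand calculation for one model and one horizon. It assumes the default RMSE score and the objects defined above; treating the result as matching the `k1`/`Yhat1` entry is an assumption about the return format of `score()`:

```r
# Sketch: the 1-step RMSE of the small model computed by hand
r1 <- D$y - lagvec(D$Yhat1$k1, 1)   # align the k=1 forecasts in time and take the residuals
sqrt(mean(r1[ok]^2, na.rm=TRUE))    # ok: complete cases within the score period (from above)
# Expected to correspond to the k1 entry for Yhat1 in the RMSE object above
```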
-
 ### Training set and test set
 
-As explained, it is most times not necessary to divide in a train and a test
-set, when fitting recursively, however it can be useful sometimes.
+As explained, it is most often not necessary to divide into a train and a test set
+when fitting recursively (i.e. using RLS); however, it can sometimes be useful.
 An easy approach is to set a logical vector, which is TRUE until the end of the
-training period:
+training period (with a week of burn-in):
 ```{r plottrain}
-D$trainperiod <- in_range(D$t[1]-1, D$t, "2011-02-01")
+D$trainperiod <- in_range(D$t[7*24]-1, D$t, "2011-02-01")
 plot(D$t, D$trainperiod)
 ```
 then optimize the parameters only on this period, by taking a subset:
 ```{r, output.lines=10}
-model$kseq <- c(3,18)
 # Optimize the parameters
-model$prm <- rls_optim(model, subset(D,D$trainperiod))$par
+rls_optim(model, subset(D,D$trainperiod), kseqopt)
 ```
 and then fit on the entire set:
 ```{r}
-# Fit for all horizons
-model$kseq <- 1:36
 # Fit with RLS
-fittmp <- rls_fit(model$prm, model, D)
+fit <- rls_fit(model$prm, model, D)
 ```
 
 Finally, the score can be calculated on the period following the train period by:
 ```{r scorefit}
-score_fit(fittmp, !D$trainperiod)$scoreval
+score(residuals(fit$Yhat, D$y), !D$trainperiod)
 ```
 
 In this way it's rather easy to set up different schemes, like optimizing the
@@ -415,7 +433,28 @@ Plot for the larger model (plots not included here):
 plot_ts(fit2, kseq=kseq)
 ```
 
-A shorter period (plots not included here):
+A pairs plot with residuals and inputs to see if patterns are left:
+```{r plotpairs, fig.height=figwidth}
+kseq <- c(1,36)
+D$Residuals <- residuals(fit2)[ ,pst("h",kseq)]
+D$hour <- aslt(D$t)$hour
+pairs(D, subset=D$scoreperiod, pattern="Residuals|Ta|I|hour|^t$", kseq=kseq)
+```
+So inspecting the two upper rows, there are no clear patterns to be seen for the
+mean (red lines are not deviating from zero). Some differences in the marginal
+distribution of the residuals are seen, e.g. for the hour the variance is
+clearly highest in the mornings.
+
+Examine how well the dynamics are modelled with the auto-correlation and cross-correlations:
+```{r plotacf, fig.height=figwidth, echo=-1}
+par(mfrow=c(2,2))
+acf(D$Residuals$h1, na.action=na.pass)
+ccf(lagvec(D$Ta$k1,1), D$Residuals$h1, na.action=na.pass)
+ccf(lagvec(D$I$k1,1), D$Residuals$h1, na.action=na.pass)
+ccf(lagvec(D$I$k1,1), D$Residuals$h1, na.action=na.pass)
+```
+
+Plots of the fits for a shorter period (plots not included here):
 ```{r, fig.height=figheight5, fig.keep="none"}
 xlim <- c("2011-01-01","2011-01-14")
 plot_ts(fit1, xlim=xlim, kseq=kseq)
@@ -443,17 +482,7 @@ The RLS coefficients are in the fit for each horizon:
 str(fit1$Lfitval$k1)
 ```
 
-A pairs plot with residuals and inputs to see if patterns are left:
-```{r plotpairs, fig.height=figwidth}
-kseq <- c(1,36)
-D$Residuals <- residuals(fit2)[ ,pst("h",kseq)]
-D$hour <- aslt(D$t)$hour
-pairs(D, subset=D$scoreperiod, pattern="Residuals|Ta|I|hour|^t$", kseq=kseq)
-```
-So inspecting the two upper rows, there are no clear patterns to be seen for the
-mean (red lines are not deviating from zero). Some differences in the marginal
-distribution of the residuals are seen, e.g. for the hour the variance is
-clearly highest in the mornings.
+
 Histograms and box-plots to find patterns.
 From the histogram and qq-norm plot:
 ```{r}
diff --git a/vignettes/make.R b/vignettes/make.R
index 8953203..4f91390 100644
--- a/vignettes/make.R
+++ b/vignettes/make.R
@@ -1,4 +1,3 @@
-# REMEMBER TO CHANGE IN shared-init in all
 library(knitr)
 library(rmarkdown)
@@ -14,21 +13,21 @@ makeit <- function(nam, openit=FALSE, clean=TRUE){
     render(namrmd, output_file=paste0(dirnam,nam), clean=clean)
     purl(namrmd)
     system(paste0("mv ",nam,".R ",dirnam,nam,".R"))
-    if(openit){ system(paste0("chromium-browser ",dirnam,nam,".html &")) }
+    if(openit){ system(paste0("chromium-freeworld ",dirnam,nam,".html &")) }
 }
 
 # unlink(paste0(dirnam,"tmp-setup-data/"), recursive=TRUE)
-makeit("setup-data", openit=TRUE)
+makeit("setup-data", openit=FALSE)
 
 # unlink(paste0(dirnam,"tmp-setup-and-use-model/"), recursive=TRUE)
 makeit("setup-and-use-model", openit=FALSE, clean=TRUE)
 
 #
-unlink(paste0(dirnam,"tmp-output/tmp-forecast-evaluation/"), recursive=TRUE)
+unlink(paste0(dirnam,"tmp-forecast-evaluation/"), recursive=TRUE)
 makeit("forecast-evaluation", openit=FALSE)
 
-# Finish and include it!!
-## unlink(paste0(dirnam,"tmp-output/tmp-online-updating/"), recursive=TRUE)
-## makeit("online-updating", openit=FALSE)
+#
+unlink(paste0(dirnam,"tmp-output/tmp-online-updating/"), recursive=TRUE)
+makeit("online-updating", openit=FALSE)
diff --git a/vignettes/online-updating.Rmd b/vignettes/online-updating.Rmd
index 2d3fe5a..d9ef959 100644
--- a/vignettes/online-updating.Rmd
+++ b/vignettes/online-updating.Rmd
@@ -32,6 +32,7 @@ figheight2 <- 6.5
 figheight3 <- 8
 figheight4 <- 9.5
 figheight5 <- 11
+
 # Set the size of squared figures (same height as full: figheight/figwidth)
 owsval <- 0.35
 ows <- paste0(owsval*100,"%")
@@ -46,7 +47,7 @@ fhs <- figwidth * owsval
 # Set the knitr options
 knitr::opts_chunk$set(
   collapse = TRUE,
-  comment = "## ",
+  comment = "##!! ",
   prompt = FALSE,
   cache = TRUE,
   cache.path = paste0("../tmp/vignettes/tmp-",vignettename,"/"),
@@ -58,27 +59,33 @@ knitr::opts_chunk$set(
 )
 options(digits=3)
-hook_output <- knit_hooks$get("output")
-knit_hooks$set(output = function(x, options) {
+
+# For cropping output and messages
+cropfun <- function(x, options, func){
   lines <- options$output.lines
-  if (is.null(lines)) {
-    return(hook_output(x, options))   # pass to default hook
-  }
-  x <- unlist(strsplit(x, "\n"))
-  more <- "## ...output cropped"
-  if (length(lines)==1) {        # first n lines
-    if (length(x) > lines) {
-      # truncate the output, but add ....
-      x <- c(head(x, lines), more)
-    }
-  } else {
-    x <- c(more, x[lines], more)
+  ## if (is.null(lines)) {
+  ##   return(func(x, options))   # pass to default hook
+  ## }
+  if(!is.null(lines)){
+    x <- unlist(strsplit(x, "\n"))
+    i <- grep("##!!",x)
+    if(length(i) > lines){
+      # truncate the output, but add ....
+      x <- x[-i[(lines+1):length(i)]]
+      x[i[lines]] <- pst(x[i[lines]], "\n\n## ...output cropped")
+    }
+    # paste these lines together
+    x <- paste(c(x, ""), collapse = "\n")
   }
-  # paste these lines together
-  x <- paste(c(x, ""), collapse = "\n")
-  hook_output(x, options)
-})
+  x <- gsub("!!","",x)
+  func(x, options)
+}
+
+hook_chunk <- knit_hooks$get("chunk")
+knit_hooks$set(chunk = function(x, options) {
+  cropfun(x, options, hook_chunk)
+})
 ```
 
 [onlineforecasting]: https://onlineforecasting.org/articles/onlineforecasting.pdf
@@ -88,7 +95,11 @@ knit_hooks$set(output = function(x, options) {
 
 ## Intro
 
-This vignette explains how to
+This vignette explains how to use the package in a real online operation, where
+the following is repeated in real time: new data is received, the model is updated
+and forecasts are calculated. At a lower frequency the parameters are
+optimized, e.g. the updating is executed every hour and the parameters are
+optimized once a week.
 
 Load the package:
 ```{r, echo=1:2, purl=1:2}
@@ -115,75 +126,99 @@ model$add_regprm("rls_prm(lambda=0.9)")
 model$add_prmbounds(Ta__a1 = c(0.5, 0.9, 0.9999),
                     lambda = c(0.9, 0.99, 0.9999))
 model$kseq <- 1:36
-# Optimize the parameters
+# Optimize the parameters on the train period (i.e. until 2011-02-01)
 rls_optim(model, subset(D,D$trainperiod), kseq = c(3,18))
 ```
-
 ## Recursive update and prediction
 
-How to get new data and update and predict.
+Here we go through the steps of getting new data, running a model update and
+making predictions.
 
-First fit on a period
+First fit on a period:
 ```{r}
+# Keep a sequence with these points
 iseq <- which(in_range("2010-12-15",D$t,"2011-01-01"))
-Dfit <- subset(D, iseq)
-rls_fit(model$prm, model, Dfit)
+# Fit the model
+rls_fit(model$prm, model, subset(D, iseq))
 ```
 
-Now the fits are saved in the model object (its an R6 object, hence passed by reference to the functions and can be changed inside the functions). A list of fits with an entry for each horizon is in Lfits, see the two first
+Now the fits are saved in the model object (it's an R6 object, hence passed by reference to the functions and can be changed inside the functions). A list of fits with an entry for each horizon is in Lfits, see the two first:
 ```{r}
+# The data of a fit is saved in this list
 str(model$Lfits[1:2])
 ```
 
-Now new data arrives, take the point right after the fit period
+We step to the next time step, where new data arrives. Take the point right
+after the fit period and take the data for this time point:
 ```{r}
+# Next time step
 (i <- iseq[length(iseq)] + 1)
+# The new data for this time step
 Dnew <- subset(D, i)
 ```
 
-First we need to transform the new data (This must only be done once for each
-new data, since some transform functions, e.g. lp(), actually keep states, see
-the detailed description in )
+To update and predict, we need to transform the new data (this must be done
+only once for each new data, since some transform functions, e.g. lp(), actually
+keep state data, see more on this in `?lp` and `?forecastmodel`
+under `$reset_state()`):
 ```{r}
-Dnew_transformed <- model$transform_data(Dnew)
+# Transform the new data
+DnewTransformed <- model$transform_data(Dnew)
 ```
 
-Then we can update the parameters using the transformed data
+Then we can update the regression coefficients using the transformed data:
 ```{r}
-rls_update(model, Dnew_transformed, Dnew[[model$output]])
+# The value of the coefficients for horizon 1, before the update
+model$Lfits$k1$theta
+# Update the coefficients
+rls_update(model, DnewTransformed, Dnew[[model$output]])
+# The value of the coefficients for horizon 1, after the update
+model$Lfits$k1$theta
 ```
 
-Calculate predictions using the new data and the updated fits (rls coefficient estimates in model$Lfits[[k]]$theta)
+Calculate predictions using the new data and the updated fits:
 ```{r}
-yhat <- rls_predict(model, Dnew_transformed)
+# Calculate the predictions
+yhat <- rls_predict(model, DnewTransformed)
 ```
 
-Plot to see that it fits the observations
+Plot to see the predictions with the observations:
 ```{r}
+# The index for the predicted steps ahead
 iseq <- i+model$kseq
+# Plot the observations and predictions
 plot(D$t[iseq], D$heatload[iseq], type = "b", xlab = "t", ylab = "y")
 lines(D$t[iseq], yhat, type = "b", col = 2)
-legend("topright", c("observations",pst("predictions (",min(model$kseq)," to ",max(model$kseq)," steps ahead)")), lty = 1, col = 1:2)
+legend("topright", c("Observations",pst("Predictions (",min(model$kseq)," to ",max(model$kseq)," steps ahead)")), lty = 1, col = 1:2)
 ```
 
 Run this for a longer period to verify that the same forecasts are obtained
 (in one go vs. iteratively)
 
-First in one go
+First in one go on all data:
 ```{r}
-val <- rls_fit(model$prm, model, D, returnanalysis = TRUE)
+# Fit and predict on entire data
+val <- rls_fit(model$prm, model, D)
+# Keep the forecasts
 D$Yhat1 <- val$Yhat
 ```
 
-and then iteratively
+and then run iteratively through:
 ```{r}
+# The indexes of the training period
 itrain <- which(in_range("2010-12-15",D$t,"2011-01-01"))
+# The indexes of the test period
 itest <- which(in_range("2011-01-01",D$t,"2011-01-04"))
+
+# Fit on the training period
 rls_fit(model$prm, model, subset(D, itrain))
+# Prepare for the forecasts
 D$Yhat2 <- data.frame(matrix(NA, nrow(D$Yhat1), ncol(D$Yhat1)))
 names(D$Yhat2) <- names(D$Yhat1)
+
+# Step through the test period:
 for(i in itest){
     Dnew <- subset(D, i)
     Dnewtr <- model$transform_data(Dnew)
@@ -192,9 +227,7 @@ for(i in itest){
 }
 ```
 
-Compare to see the difference between the one step forecasts
+Compare to see the difference between the one step forecasts:
 ```{r}
 D$Yhat1$k1[itest] - D$Yhat2$k1[itest]
 ```
-
-Note about model$reset_states()
diff --git a/vignettes/setup-and-use-model.Rmd b/vignettes/setup-and-use-model.Rmd
index e05339d..da8044e 100644
--- a/vignettes/setup-and-use-model.Rmd
+++ b/vignettes/setup-and-use-model.Rmd
@@ -46,7 +46,7 @@ fhs <- figwidth * owsval
 # Set the knitr options
 knitr::opts_chunk$set(
   collapse = TRUE,
-  comment = "## ",
+  comment = "##!! ",
   prompt = FALSE,
   cache = TRUE,
   cache.path = paste0("../tmp/vignettes/tmp-",vignettename,"/"),
@@ -58,25 +58,33 @@ knitr::opts_chunk$set(
 )
 options(digits=3)
-hook_output <- knit_hooks$get("output")
-knit_hooks$set(output = function(x, options) {
+
+# For cropping output and messages
+cropfun <- function(x, options, func){
   lines <- options$output.lines
-  if (is.null(lines)) {
-    return(hook_output(x, options))   # pass to default hook
-  }
-  x <- unlist(strsplit(x, "\n"))
-  more <- "## ...output cropped"
-  if (length(lines)==1) {        # first n lines
-    if (length(x) > lines) {
-      # truncate the output, but add ....
-      x <- c(head(x, lines), more)
-    }
-  } else {
-    x <- c(more, x[lines], more)
+  ## if (is.null(lines)) {
+  ##   return(func(x, options))   # pass to default hook
+  ## }
+  if(!is.null(lines)){
+    x <- unlist(strsplit(x, "\n"))
+    i <- grep("##!!",x)
+    if(length(i) > lines){
+      # truncate the output, but add ....
+      #x <- c(x[-i[(lines+1):length(i)]], "```", "## ...output cropped", "```")
+      x <- x[-i[(lines+1):length(i)]]
+      x[i[lines]] <- pst(x[i[lines]], "\n\n## ...output cropped")
+    }
+    # paste these lines together
+    x <- paste(c(x, ""), collapse = "\n")
   }
-  # paste these lines together
-  x <- paste(c(x, ""), collapse = "\n")
-  hook_output(x, options)
+  x <- gsub("!!","",x)
+  func(x, options)
+}
+
+hook_chunk <- knit_hooks$get("chunk")
+
+knit_hooks$set(chunk = function(x, options) {
+  cropfun(x, options, hook_chunk)
 })
 ```
diff --git a/vignettes/setup-data.Rmd b/vignettes/setup-data.Rmd
index 7c64616..10af691 100644
--- a/vignettes/setup-data.Rmd
+++ b/vignettes/setup-data.Rmd
@@ -47,7 +47,7 @@ fhs <- figwidth * owsval
 # Set the knitr options
 knitr::opts_chunk$set(
   collapse = TRUE,
-  comment = "## ",
+  comment = "##!! ",
   prompt = FALSE,
   cache = TRUE,
   cache.path = paste0("../tmp/vignettes/tmp-",vignettename,"/"),
@@ -59,25 +59,33 @@ knitr::opts_chunk$set(
 )
 options(digits=3)
-hook_output <- knit_hooks$get("output")
-knit_hooks$set(output = function(x, options) {
+
+# For cropping output and messages
+cropfun <- function(x, options, func){
   lines <- options$output.lines
-  if (is.null(lines)) {
-    return(hook_output(x, options))   # pass to default hook
-  }
-  x <- unlist(strsplit(x, "\n"))
-  more <- "## ...output cropped"
-  if (length(lines)==1) {        # first n lines
-    if (length(x) > lines) {
-      # truncate the output, but add ....
-      x <- c(head(x, lines), more)
-    }
-  } else {
-    x <- c(more, x[lines], more)
+  ## if (is.null(lines)) {
+  ##   return(func(x, options))   # pass to default hook
+  ## }
+  if(!is.null(lines)){
+    x <- unlist(strsplit(x, "\n"))
+    i <- grep("##!!",x)
+    if(length(i) > lines){
+      # truncate the output, but add ....
+      #x <- c(x[-i[(lines+1):length(i)]], "```", "## ...output cropped", "```")
+      x <- x[-i[(lines+1):length(i)]]
+      x[i[lines]] <- pst(x[i[lines]], "\n\n## ...output cropped")
+    }
+    # paste these lines together
+    x <- paste(c(x, ""), collapse = "\n")
   }
-  # paste these lines together
-  x <- paste(c(x, ""), collapse = "\n")
-  hook_output(x, options)
+  x <- gsub("!!","",x)
+  func(x, options)
+}
+
+hook_chunk <- knit_hooks$get("chunk")
+
+knit_hooks$set(chunk = function(x, options) {
+  cropfun(x, options, hook_chunk)
 })
 ```
-- 
GitLab