diff --git a/R/complete_cases.R b/R/complete_cases.R index 3fe0bebd539fe07f0f7928aa72665eeb74aed034..8dc673256877468809cf014e38cc2059b985a295 100644 --- a/R/complete_cases.R +++ b/R/complete_cases.R @@ -63,8 +63,9 @@ complete_cases.list <- function(object, kseq=NA){ kseq <- as.integer(gsub(prefix, "", nams(object[[1]]))) } # Check that they all have kseq horizons - ok <- rep(TRUE, nrow(object[[1]])) - # + # Init a logical matrix with each point + ok <- matrix(TRUE, nrow(object[[1]]), length(kseq)) + # for(i in 1:length(object)){ if(!all(kseq %in% as.integer(gsub(prefix, "", nams(object[[i]]))))){ stop("Not all forecast matrices have kseq horizons: ",kseq) diff --git a/R/forecastmodel.R b/R/forecastmodel.R index 37ae9775adeb94908b7f6736db10335ca2f3d696..463ab14229391f33c93df1e283ccae1f889f9b72 100644 --- a/R/forecastmodel.R +++ b/R/forecastmodel.R @@ -130,6 +130,14 @@ forecastmodel <- R6::R6Class("forecastmodel", public = list( } # MUST INCLUDE SOME checks here and print useful messages if something is not right if(any(is.na(prm))){ stop(pst("None of the parameters (in prm) must be NA: prm=",prm)) } + # If given without names, then set in same order as in the prm_bounds + if(is.null(nams(prm))){ + if(length(prm) != nrow(self$prmbounds)){ + stop("prm was given without names and length not same as prmbounds, so don't know what to do") + }else{ + nams(prm) <- row.names(self$prmbounds) + } + } # Keep the prm given self$prm <- prm # Find if any opt parameters, first the one with "__" hence for the inputs @@ -400,7 +408,7 @@ print.forecastmodel <- function(x, ...){ cat(names(model$inputs)[1],"=",model$inputs[[1]]$expr,"\n") if(length(model$inputs) > 1){ for(i in 2:length(model$inputs)){ - cat(" ",names(model$inputs)[i],"=",model$inputs[[i]]$expr,"\n") + cat(" ",names(model$inputs)[i],"=",model$inputs[[i]]$expr,"\n") } } cat("\n") diff --git a/R/lapply.R b/R/lapply.R index ebead0f2acf4e99eadbda9ef2b264c22ad10ea65..9bffb1ca301fbf39cd1c73071be9f5e0d48e9cf5 100644 --- a/R/lapply.R +++ b/R/lapply.R @@ -8,8 +8,8 @@ #' @param X object to apply on #' @param FUN function to apply #' @export -lapply_cbind <- function(X, FUN){ - val <- lapply(X, FUN) +lapply_cbind <- function(X, FUN, ...){ + val <- lapply(X, FUN, ...) return(do.call("cbind", val)) } @@ -17,8 +17,8 @@ lapply_cbind <- function(X, FUN){ #' @param X object to apply on #' @param FUN function to apply #' @export -lapply_rbind <- function(X, FUN){ - val <- lapply(X, FUN) +lapply_rbind <- function(X, FUN, ...){ + val <- lapply(X, FUN, ...) return(do.call("rbind", val)) } @@ -26,8 +26,8 @@ lapply_rbind <- function(X, FUN){ #' @param X object to apply on #' @param FUN function to apply #' @export -lapply_cbind_df <- function(X, FUN){ - val <- lapply(X, FUN) +lapply_cbind_df <- function(X, FUN, ...){ + val <- lapply(X, FUN, ...) return(as.data.frame(do.call("cbind", val))) } @@ -35,8 +35,8 @@ lapply_cbind_df <- function(X, FUN){ #' @param X object to apply on #' @param FUN function to apply #' @export -lapply_rbind_df <- function(X, FUN){ - val <- lapply(X, FUN) +lapply_rbind_df <- function(X, FUN, ...){ + val <- lapply(X, FUN, ...) return(as.data.frame(do.call("rbind", val))) } diff --git a/R/score.R b/R/score.R index 3759178ae394bf87b6505779b12e35c362a8bb61..a1621a799de202f4800439cf85372f35fb743c58 100644 --- a/R/score.R +++ b/R/score.R @@ -50,6 +50,10 @@ score <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse, #' @rdname score #' @export score.list <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = rmse, ...){ + # If only on element + if(length(object) == 1){ + return(as.matrix(score(object[[1]], scoreperiod, usecomplete, scorefun, ...))) + } # Use only complete cases if(usecomplete){ tmp <- complete_cases(object) @@ -62,7 +66,7 @@ score.list <- function(object, scoreperiod = NA, usecomplete = TRUE, scorefun = scoreperiod <- tmp } # Run on each element, usecomplete has been dealt with - sapply(object, score, usecomplete=FALSE, scoreperiod=scoreperiod, scorefun=scorefun, ...=...) + lapply_cbind(object, score, usecomplete=FALSE, scoreperiod=scoreperiod, scorefun=scorefun, ...=...) } diff --git a/R/step_optim.R b/R/step_optim.R index 5caca35b34377cf96a4b8c5544add8674bc99e7f..dee77a9631d917168c4ac8fa880b2b0d4824bfd3 100644 --- a/R/step_optim.R +++ b/R/step_optim.R @@ -49,6 +49,12 @@ #' these have to be set in the "prm" argument. The stepping process will follow #' the input selection described above. #' +#' In case of missing values, especially in combination with auto-regressive +#' models, it can be very important to make sure that only complete cases are +#' included when calculating the score. By providing the `fitfun` argument then +#' the score will be calculated using only the complete cases across horizons +#' and models in each step, see the last examples. +#' #' @title Forward and backward model selection #' @param modelfull The full forecastmodel containing all inputs which will be #' can be included in the selection. @@ -61,13 +67,27 @@ #' @param modelstart A forecastmodel. If it's set then it will be used as the #' selected model from the first step of the stepping. It should be a sub #' model of the full model. -#' @param optimfun The function which will carry out the optimization in each step. +#' @param keepinputs If TRUE no inputs can be removed in a step, if FALSE then +#' any input can be removed. If given as a character vector with names of +#' inputs, then they cannot be removed in any step. +#' @param optimfun The function which will carry out the optimization in each +#' step. +#' @param fitfun A fit function, should be the same as used in optimfun(). If +#' provided, then the score is caculated with this function (instead of the +#' one called in optimfun(), hence the default is rls_fit(), which is called +#' in rls_optim()). Furthermore, information on complete cases are printed +#' and returned. #' @param scorefun The score function used. -#' @param ... Additional arguments which will be passed on to optimfun. For example control how many steps +#' @param mc.cores The mc.cores argument of mclapply. If debugging it can be +#' nessecary to set it to 1 to stop execution. +#' @param ... Additional arguments which will be passed on to optimfun. For +#' example control how many steps #' #' @return A list with the result of each step: #' - '$model' is the model selected in each step -#' - '$result' the result return by the the optimfun +#' - '$score' is the score for the model selected in each step +#' - '$optimresult' the result return by the the optimfun +#' - '$completecases' a logical vector (NA if fitfun argument is not given) indicating which time points were complete across all horizons and models for the particular step. #' #' @examples #' @@ -102,20 +122,45 @@ #' prm <- list(mu_tday__nharmonics = c(min=3, max=7)) #' #' # Note the control argument, which is passed to optim, it's now set to few -#' # iterations in the prm optimization (must be increased in real applications) +#' # Iterations in the prm optimization (MUST be increased in real applications) #' control <- list(maxit=1) #' -#' # Run all selection schemes -#' Lboth <- step_optim(model, D, kseq, prm, "forward", control=control) +#' # Run the default selection scheme, which is "both" (same as "backwardboth" if no start model is given) +#' L <- step_optim(model, D, kseq, prm, control=control) +#' +#' # The optim value from each step is returned +#' getse(L, "optimresult") +#' getse(L,"score") +#' +#' # The final model +#' L$final$model +#' +#' # Other selection schemes #' Lforward <- step_optim(model, D, kseq, prm, "forward", control=control) #' Lbackward <- step_optim(model, D, kseq, prm, "backward", control=control) #' Lbackwardboth <- step_optim(model, D, kseq, prm, "backwardboth", control=control) -#' Lforwardboth <- step_optim(model, D, kseq, prm, "forwardboth", control=control) +#' Lforwardboth <- step_optim(model, D, kseq, prm, "forwardboth", control=control, mc.cores=1) #' +#' # It's possible avoid removing specified inputs +#' L <- step_optim(model, D, kseq, prm, keepinputs = c("mu","mu_tday"), control=control) +#' #' # Give a starting model #' modelstart <- model$clone_deep() #' modelstart$inputs[2:3] <- NULL -#' Lboth <- step_optim(model, D, kseq, prm, modelstart=modelstart, control=control) +#' L <- step_optim(model, D, kseq, prm, modelstart=modelstart, control=control) +#' +#' # If a fitting function is given, then it will be used for calculating the forecasts +#' # ONLY on the complete cases in each step +#' L1 <- step_optim(model, D, kseq, prm, fitfun=rls_fit, control=control) +#' +#' # The easiest way to conclude if missing values have an influence is to +#' # compare the selection result running with and without +#' L2 <- step_optim(model, D, kseq, prm, control=control) +#' +#' # Compare the selected models +#' tmp1 <- capture.output(getse(L1, "model")) +#' tmp2 <- capture.output(getse(L2, "model")) +#' identical(tmp1, tmp2) #' #' #' # Note that caching can be really smart (the cache files are located in the @@ -123,32 +168,34 @@ #' # unlink(foldername)) See e.g. `?rls_optim` for how the caching works #' # L <- step_optim(model, D, kseq, prm, "forward", cachedir="cache", cachererun=FALSE) #' -#' #' @importFrom parallel mclapply #' #' @export -step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c("both","backward","forward","backwardboth","forwardboth"), modelstart=NA, optimfun = rls_optim, scorefun = rmse, printout = FALSE, ...){ +step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c("both","backward","forward","backwardboth","forwardboth"), modelstart=NA, keepinputs = FALSE, optimfun = rls_optim, fitfun = NA, scorefun = rmse, printout = FALSE, mc.cores = getOption("mc.cores", 2L), ...){ # Do: # - checking of input, model, ... - # - change all lapply to mclapply # - Maybe have "cloneit" argument in optimfun, then don't clone inside optim. # - Add argument controlling how much is kept in each iteration (e.g all fitted models) - # - Insert the inputs in the order they are in the full model, then cache might be more consistent (and the order is kept) - # - For score make sure that only the same points are included for all models, or print warning if not! + # - Is raised as an issue: For score make sure that only the same points are included for all models, or print warning if not! # - With lm we should use have some regularization, so use AIC or BIC as the score # - Differentiate the parameters given to optimfun for the selected model and for the new models in each step. E.g. take more steps on the selected model. # # - Help: prm <- list(I__degree = c(min=1, max=7), mu_tday__nharmonics = c(min=1, max=7)) - # - help: It's not checked that it's the score is calculated on the same values! WARNING should be printed if some models don't forecast same points # # For keeping all the results L <- list() - # First val of direction is default if not set + # The first value in direction is default if(length(direction) > 1){ direction <- direction[1] } - # Different start up if start model is given - if( is.na(modelstart)[1] ){ - # + # Init + istep <- 1 + # Different start up, if a start model is given + if( class(modelstart)[1] == "forecastmodel" ){ + # The full model will not be changed from here, so don't need to clone it + mfull <- modelfull + m <- modelstart$clone() + }else{ + # No start model if given mfull <- modelfull$clone_deep() # Insert the starting prm values into the full model (must be in it, since added inputs are taken from there) if(!is.na(prm[1])){ @@ -164,7 +211,8 @@ step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c(" } i <- which(sapply(prm, function(x){ !"init" %in% names(x) })) if(length(i)){ - mfull$insert_prm(round(unlist(getse(prm[i], "max")) - unlist(getse(prm[i], "min")) / 2)) + tmp <- unlist(getse(prm[i], "min")) + mfull$insert_prm(tmp + round((unlist(getse(prm[i], "max")) - tmp) / 2)) } } } @@ -176,23 +224,16 @@ step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c(" m$inputs <- list() # Start with no fit istep <- 0 - res <- list(value=Inf, par=m$get_prmbounds("init")) + scoreCurrent <- Inf + } + } + # Find the inputs to keep, if any + if(class(keepinputs) == "logical"){ + if(keepinputs){ + keepinputs <- nams(mfull$inputs) }else{ - # Optimize from the full model - res <- optimfun(m, data, kseq, printout=printout, ...) - # Keep it - istep <- 1 - L[[istep]] <- list(model = m$clone_deep(), result = res) + keepinputs <- "" } - }else{ - # The full model will not be changed from here, so don't need to clone it - mfull <- modelfull - m <- modelstart$clone() - # Optimize from the model - res <- optimfun(m, data, kseq, printout=printout, ...) - # Keep it - istep <- 1 - L[[istep]] <- list(model = m$clone_deep(), result = res) } # Helper c_mStep <- function(l, nms){ @@ -203,39 +244,66 @@ step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c(" # Go done <- FALSE while(!done){ + message("\n------------------------------------------------------------------------\n") + message(pst("Step ",istep,". Current model:")) + print(m) + # If the init model is not yet optimized + if(istep == 1 & length(L) == 0){ + # Optimize + res <- optimfun(m, data, kseq, printout=printout, scorefun=scorefun, ...) + # Should we forecast only on the complete cases? + if(class(fitfun) == "function"){ + # Forecast to get the complete cases + mtmp <- m$clone_deep() + mtmp$kseq <- kseq + Yhat <- fitfun(res$par, mtmp, data, printout=printout)$Yhat + scoreCurrent <- sum(score(residuals(Yhat,data[[model$output]]),data$scoreperiod)) + casesCurrent <- complete_cases(Yhat) + }else{ + scoreCurrent <- res$value + casesCurrent <- NA + } + # Keep it + istep <- 1 + L[[istep]] <- list(model = m$clone_deep(), score = scoreCurrent, optimresult = res, completecases = casesCurrent) + } + + message("Current score: ",format(scoreCurrent,digits=7)) + if(class(fitfun) == "function"){ + message("Current complete cases: ",sum(casesCurrent)," (Diff in score from optim:",L[[istep]]$optimresult$value-scoreCurrent,")") + } # Next step istep <- istep + 1 - # Insert the optimized parameters from last step - m$prmbounds[names(res$par),"init"] <- res$par - # - message("------------------------------------\n") - message("Current model:") - print(m) - message("Current score: ",res$value,"\n") # -------- mStep <- list() + # Generate the input modified models + # Remove inputs if(length(grep("backward|both", direction))){ - # Remove input from the current model one by one - if(length(m$inputs) > 1){ - tmp <- mclapply(1:length(m$inputs), function(i){ + irm <- which(!nams(m$inputs) %in% keepinputs) + # Remove any? If only one input, then don't remove it + if(length(irm) & length(m$inputs) > 1){ + tmp <- lapply(irm, function(i){ ms <- m$clone_deep() - # Remove one input ms$inputs[[i]] <- NULL return(ms) }) - mStep <- c_mStep(tmp, pst("-",names(m$inputs))) + mStep <- c_mStep(tmp, pst("-",names(m$inputs)[irm])) } } + + # Add inputs if(length(grep("forward|both", direction))){ # Add input one by one iin <- which(!names(mfull$inputs) %in% names(m$inputs)) if(length(iin)){ - tmp <- mclapply(iin, function(i){ + tmp <- lapply(iin, function(i){ ms <- m$clone_deep() # Add one input ms$inputs[[length(ms$inputs) + 1]] <- mfull$inputs[[i]] names(ms$inputs)[length(ms$inputs)] <- names(mfull$inputs)[i] + # Sort according to the order in the full model + ms$inputs <- ms$inputs[order(match(names(ms$inputs), names(mfull$inputs)))] return(ms) }) mStep <- c_mStep(tmp, pst("+",names(mfull$inputs)[iin])) @@ -246,7 +314,7 @@ step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c(" if(!is.na(prm[1])){ if(length(grep("backward|both", direction))){ # Count down the parameters one by one - tmp <- mclapply(1:length(prm), function(i){ + tmp <- lapply(1:length(prm), function(i){ p <- m$get_prmvalues(names(prm[i])) # If the input is not in the current model, then p is NA, so don't include it for fitting if(!is.na(p)){ @@ -285,30 +353,74 @@ step_optim <- function(modelfull, data, kseq = NA, prm=list(NA), direction = c(" } # Optimize all the new models - resStep <- mclapply(1:length(mStep), function(i, ...){ - res <- optimfun(mStep[[i]], data, kseq, printout=printout, ...) -# res <- try() -# if(class(res) == "try-error"){ browser() } - message(names(mStep)[[i]], ": ", res$value) - return(res) - }, ...) - names(resStep) <- names(mStep) - - # Is one the step models score smaller than the current ref? - valStep <- unlist(getse(resStep, "value")) - imin <- which.min(valStep) - if( valStep[imin] < res$value ){ - # Keep the best model - m <- mStep[[imin]] - res <- resStep[[imin]] - # Keep for the result - L[[istep]] <- list(model = m$clone_deep(), result = resStep[[imin]]) + if(length(mStep) == 0){ + # Nothing to do, so stop + done <- TRUE }else{ - # No improvement obtained from reduction, so return the current model (last in the list) - message("------------------------------------\n\nDone") + + # Run the optimization + Lstep <- mclapply(1:length(mStep), function(i, ...){ + optimfun(mStep[[i]], data, kseq, printout=printout, ...) + }, mc.cores=mc.cores, ...) + names(Lstep) <- names(mStep) + + # Complete cases considered: Should we forecast and recalculate the score on complete cases from all models? + if(class(fitfun) == "function"){ + LYhat <- mclapply(1:length(mStep), function(i){ + mtmp <- mStep[[i]]$clone_deep() + mtmp$kseq <- kseq + fitfun(Lstep[[i]]$par, mtmp, data, printout=printout)$Yhat + }, mc.cores=mc.cores) + # Use complete cases across models and horizons per default + scoreStep <- apply(score(residuals(LYhat,data[[model$output]]), data$scoreperiod), 2, sum) + casesStep <- sapply(LYhat, complete_cases) + }else{ + # Use the scores from optimfun + scoreStep <- unlist(getse(Lstep, "value")) + casesStep <- matrix(rep(NA,length(mStep)), nrow=1) + } + + # Print out + tmp <- as.matrix(unlist(getse(Lstep, "value"))) + if(scoreCurrent == Inf){ + tmp[ ,1] <- pst(format(tmp, digits=2)) + nams(tmp) <- "Score" + }else{ + tmp[ ,1] <- pst(format(100 * (scoreCurrent - tmp) / scoreCurrent, digits=2),"%") + nams(tmp) <- "Improvement" + } + if(class(fitfun) == "function"){ + tmp <- cbind(tmp, apply(casesStep != casesCurrent, 2, sum)) + nams(tmp)[2] <- "CasesDiff" + } + print(tmp) + + # Compare scores: Is one the step models score smaller than the current ref? + imin <- which.min(scoreStep) + if( scoreStep[imin] < scoreCurrent ){ + # Keep the best model (with it's optimized parameters) + m <- mStep[[imin]] + res <- Lstep[[imin]] + scoreCurrent <- scoreStep[[imin]] + casesCurrent <- casesStep[ ,imin] + # Insert the optimized parameters, such that next step starts at the optimized parameter values + if(is.null(names(res$par))){ + names(res$par) <- row.names(m$prmbounds) + } + m$prmbounds[names(res$par),"init"] <- res$par + # Keep for the final result + m$insert_prm(res$par) + L[[istep]] <- list(model = m$clone_deep(), score = scoreCurrent, optimresult = res, completecases = casesCurrent) + }else{ + # No improvement obtained from reduction, so return the current model (last in the list) + done <- TRUE + } + } + if(done){ + message("\n------------------------------------------------------------------------\n\nFinal model:") message(print(m)) - done <- TRUE } } + names(L) <- c(pst("step",1:(length(L)-1)),"final") invisible(L) }