Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Imports:
mlr3misc (>= 0.1.4),
paradox,
R6,
R.cache,
withr
Suggests:
ggplot2,
Expand Down
88 changes: 86 additions & 2 deletions R/Graph.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@
#' (and therefore their `$param_set$values`) and a hash of `$edges`.
#' * `keep_results` :: `logical(1)` \cr
#' Whether to store intermediate results in the [`PipeOp`]'s `$.result` slot, mostly for debugging purposes. Default `FALSE`.
#' * `cache` :: `logical(1)` \cr
#' Whether to cache individual [`PipeOp`]'s during "train" and "predict". Default `FALSE`.
#' Caching is performed using the [`R.cache`][R.cache::R.cache] package.
#' Caching can be disabled/enabled globally using `getOption("R.cache.enabled", TRUE)`.
#' By default, files are cached in `R.cache::getCacheRootPath()`.
#' For more information on how to set the cache path or retrieve cached items,
#' please consult the [`R.cache`][R.cache::R.cache] documentation.
#' Caching can be fine-controlled for each [`PipeOp`] by adjusting individual [`PipeOp`]'s
#' `cache`, `cache_state` and `stochastic` fields.
#'
#' @section Methods:
#' * `ids(sorted = FALSE)` \cr
Expand Down Expand Up @@ -407,6 +416,13 @@ Graph = R6Class("Graph",
} else {
map(self$pipeops, "state")
}
},
cache = function(val) {
  # Active binding for the Graph-level caching flag: PipeOps inside this Graph
  # are only cached when this flag AND the individual PipeOp's `cache` are TRUE.
  if (!missing(val)) {
    # Setter: `val` must be a single non-NA logical.
    private$.cache = assert_flag(val)
  } else {
    # Getter: current flag; the private field defaults to FALSE.
    private$.cache
  }
}
),

Expand All @@ -419,7 +435,8 @@ Graph = R6Class("Graph",
value
)
},
.param_set = NULL
.param_set = NULL,
.cache = FALSE
)
)

Expand Down Expand Up @@ -539,7 +556,7 @@ graph_reduce = function(self, input, fun, single_input) {
input = input_tbl$payload
names(input) = input_tbl$name

output = op[[fun]](input)
output = cached_pipeop_eval(self, op, fun, input)
if (self$keep_results) {
op$.result = output
}
Expand Down Expand Up @@ -609,3 +626,70 @@ predict.Graph = function(object, newdata, ...) {
}
result
}

# Cached train/predict of a PipeOp.
# 1) Caching of a PipeOp is only performed if graph and po have `cache = TRUE`,
#    i.e. both the Graph AND the PipeOp want to be cached.
# 2) Additionally, caching is only performed if 'train' or 'predict' is not stochastic
# for a given PipeOp. This can be obtained from `.$stochastic` and can be set
# for each PipeOp.
# 3) During training we have two options
# Each PipeOp stores whether it wants to do I. or II. in `.$cache_state`.
# I. Cache only state:
# This is possible if the train transform is the same as the predict transform
# and predict is comparatively cheap (i.e. filters).
# II. Cache state and output
# (All other cases)

cached_pipeop_eval = function(self, op, fun, input) {
  # Evaluate `op[[fun]](input)` (fun is "train" or "predict"), transparently
  # caching via R.cache when both the Graph (`self$cache`) and the PipeOp
  # (`op$cache`) opt in and `fun` is not listed in `op$stochastic`.
  #
  # During "train" there are two modes, selected by `op$cache_state`:
  #   I.  cache only the state (possible when the train transform equals the
  #       predict transform; avoids storing large intermediate outputs on disk)
  #   II. cache state and output together (all other cases)
  #
  # NOTE: R.cache::evalWithMemoization() does NOT evaluate the expression on a
  # cache hit; assignments made inside the expression are then skipped, so the
  # cached value must be taken from its *return value* (the original code read
  # locals assigned inside the expression, which do not exist on a hit).
  if (self$cache && op$cache) {
    require_namespaces("R.cache")
    # Key: hashes of all inputs plus the PipeOp's hash (class, id, param values).
    cache_key = list(map_chr(input, get_hash), op$hash)
    if (fun == "train") {
      if (fun %nin% op$stochastic) {
        if (op$cache_state) {
          # I. cache only the state; the last expression's value is memoized.
          state = R.cache::evalWithMemoization({
            op[[fun]](input)
            op$state
          }, key = cache_key)
          # On a cache hit "train" was not called, so restore the state here.
          if (is.null(op$state)) op$state = state
          # Predict on the train inputs instead of storing train outputs on disk.
          return(cached_pipeop_eval(self, op, "predict", input))
        } else {
          # II. cache state and output together.
          result = R.cache::evalWithMemoization({
            list(output = op[[fun]](input), state = op$state)
          }, key = cache_key)
          # On a cache hit "train" was not called, so restore the state here.
          if (is.null(op$state)) op$state = result$state
          return(result$output)
        }
      }
    } else if (fun == "predict" && !op$cache_state) {
      # During predict, only cache if the output is not cheaply reproducible
      # from the state (cache_state is FALSE) and the op is not stochastic.
      if (fun %nin% op$stochastic) {
        output = R.cache::evalWithMemoization({
          op[[fun]](input)
        }, key = cache_key)
        return(output)
      }
    }
  }
  # No-caching fallback: anything not covered by the conditions above.
  return(op[[fun]](input))
}

# Hash helper for cache keys: prefer an object's own `$hash` field (e.g. Tasks,
# PipeOps); fall back to hashing the object itself when `$hash` is absent or
# accessing it errors (e.g. atomic vectors).
get_hash = function(x) {
  hash = tryCatch(x$hash, error = function(e) NULL)
  if (is.null(hash)) {
    hash = digest(x, algo = "xxhash64")
  }
  hash
}
41 changes: 39 additions & 2 deletions R/PipeOp.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,21 @@
#' If the [`Graph`]'s `$keep_results` flag is set to `TRUE`, then the intermediate Results of `$train()` and `$predict()`
#' are saved to this slot, exactly as they are returned by these functions. This is mainly for debugging purposes
#' and done, if requested, by the [`Graph`] backend itself; it should *not* be done explicitly by `private$.train()` or `private$.predict()`.
#' * `cache` :: `logical(1)` \cr
#' Whether to cache the [`PipeOp`]'s state and/or output during "train" and "predict". Defaults to `TRUE`.
#' See the `cache` field in [`Graph`] for more detailed information on caching, as well as `cache_state` and
#' `stochastic` below.
#' * `cache_state` :: `logical(1)` \cr
#' Whether the [`PipeOp`]'s behaviour during training is equal to its behaviour during
#' prediction (other than setting a state). In this case, only the [`PipeOp`]'s state is cached.
#' This avoids caching possibly large intermediate results.
#' Defaults to `TRUE`.
#' * `stochastic` :: `character` \cr
#' Whether a [`PipeOp`] is stochastic during `"train"`, `"predict"`, or not at all: `character(0)`.
#' Defaults to `character(0)` (deterministic). Stochastic [`PipeOp`]s are not cached during the
#' respective phase.
#' A [`PipeOp`] is only cached if it is deterministic.
#'
#'
#' @section Methods:
#' * `train(input)`\cr
Expand Down Expand Up @@ -254,7 +269,6 @@ PipeOp = R6Class("PipeOp",
if (is_noop(self$state)) {
stopf("Pipeop %s got NO_OP during train but no NO_OP during predict.", self$id)
}

input = check_types(self, input, "input", "predict")
output = private$.predict(input)
output = check_types(self, output, "output", "predict")
Expand Down Expand Up @@ -296,6 +310,26 @@ PipeOp = R6Class("PipeOp",
hash = function() {
digest(list(class(self), self$id, self$param_set$values),
algo = "xxhash64")
},
cache = function(val) {
  # Active binding for this PipeOp's caching opt-in (private default TRUE).
  # Effective caching additionally requires the enclosing Graph's `cache` flag.
  if (!missing(val)) {
    # Setter: `val` must be a single non-NA logical.
    private$.cache = assert_flag(val)
  } else {
    private$.cache
  }
},
cache_state = function(val) {
  # Active binding (read-only): TRUE when only the state needs to be cached,
  # avoiding storage of possibly large intermediate outputs (private default TRUE).
  if (!missing(val)) {
    stop("cache_state is read-only!")
  }
  private$.cache_state
},
stochastic = function(val) {
if (!missing(val)) {
private$.stochastic = assert_subset(val, c("train", "predict"))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be read-only and set during initialization?

} else {
private$.stochastic
}
}
),

Expand All @@ -318,7 +352,10 @@ PipeOp = R6Class("PipeOp",
.predict = function(input) stop("abstract"),
.param_set = NULL,
.param_set_source = NULL,
.id = NULL
.id = NULL,
.cache = TRUE,
.cache_state = TRUE,
.stochastic = character(0)
)
)

Expand Down
3 changes: 2 additions & 1 deletion R/PipeOpBranch.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ PipeOpBranch = R6Class("PipeOpBranch",
ret = named_list(self$output$name, NO_OP)
ret[[self$param_set$values$selection]] = inputs[[1]]
ret
}
},
.cache = FALSE
)
)

Expand Down
14 changes: 13 additions & 1 deletion R/PipeOpChunk.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ PipeOpChunk = R6Class("PipeOpChunk",
)
}
),
active = list(
stochastic = function(val) {
  # Active binding: this op counts as stochastic during "train" iff its
  # `shuffle` parameter is set; otherwise it is deterministic (character(0)).
  if (!missing(val)) {
    assert_subset(val, c("train", "predict"))
    # NOTE(review): this assignment is never read back -- the getter below
    # derives the value from `shuffle` only, so setting `stochastic` on a
    # PipeOpChunk has no observable effect; confirm whether the setter should
    # instead be an error.
    private$.stochastic = val
  } else {
    if (self$param_set$values$shuffle) return("train")
    character(0)
  }
}
),
private = list(
.train = function(inputs) {
self$state = list()
Expand All @@ -88,7 +99,8 @@ PipeOpChunk = R6Class("PipeOpChunk",
},
.predict = function(inputs) {
rep(inputs, self$outnum)
}
},
.cache = FALSE
)
)

Expand Down
5 changes: 4 additions & 1 deletion R/PipeOpClassBalancing.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,10 @@ PipeOpClassBalancing = R6Class("PipeOpClassBalancing",
task_filter_ex(task, new_ids)
},

.predict_task = identity
.predict_task = identity,
.cache = FALSE,
.stochastic = "train",
.cache_state = FALSE
)
)

Expand Down
3 changes: 2 additions & 1 deletion R/PipeOpCopy.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ PipeOpCopy = R6Class("PipeOpCopy",
},
.predict = function(inputs) {
rep_len(inputs, self$outnum)
}
},
.cache = FALSE
)
)

Expand Down
4 changes: 3 additions & 1 deletion R/PipeOpImputeHist.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ PipeOpImputeHist = R6Class("PipeOpImputeHist",
}
feature[is.na(feature)] = sampled
feature
}
},
.cache = FALSE,
.stochastic = c("train", "predict")
)
)

Expand Down
4 changes: 3 additions & 1 deletion R/PipeOpImputeSample.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ PipeOpImputeSample = R6Class("PipeOpImputeSample",
feature[is.na(feature)] = sample(model, outlen, replace = TRUE)
}
feature
}
},
.cache = FALSE,
.stochastic = c("train", "predict")
)
)

Expand Down
3 changes: 2 additions & 1 deletion R/PipeOpNOP.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ PipeOpNOP = R6Class("PipeOpNOP",

.predict = function(inputs) {
inputs
}
},
.cache = FALSE
)
)

Expand Down
29 changes: 29 additions & 0 deletions R/PipeOpProxy.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,35 @@ PipeOpProxy = R6Class("PipeOpProxy",
)
}
),
active = list(
cache = function(val) {
  # Active binding delegating the `cache` flag to the proxied object
  # (the PipeOp or Graph stored in the `content` parameter value).
  if (!missing(val)) {
    # Setter: `val` must be a single non-NA logical; forwarded to the content.
    self$param_set$values$content$cache = assert_flag(val)
  } else {
    self$param_set$values$content$cache
  }
},
stochastic = function(val) {
  # Active binding delegating `stochastic` to the proxied content.
  # A Graph content has no `stochastic` field, so it is reported as
  # deterministic (character(0)) and attempting to set it is an error.
  if (!missing(val)) {
    assert_subset(val, c("train", "predict"))
    if (inherits(self$param_set$values$content, "Graph")) {
      # Fixed ungrammatical message ("not be set" -> "can not be set").
      stop("'stochastic' can not be set when content is a Graph!")
    } else {
      self$param_set$values$content$stochastic = val
    }
  } else {
    if (inherits(self$param_set$values$content, "Graph")) return(character(0))
    self$param_set$values$content$stochastic
  }
},
cache_state = function(val) {
  # Active binding (read-only): a Graph content reports TRUE; otherwise the
  # value is taken from the proxied PipeOp's own `cache_state`.
  if (!missing(val)) {
    stop("cache_state is read-only!")
  } else {
    if (inherits(self$param_set$values$content, "Graph")) return(TRUE)
    self$param_set$values$content$cache_state
  }
}
),
private = list(
.param_set = NULL,
.param_set_source = NULL,
Expand Down
4 changes: 3 additions & 1 deletion R/PipeOpSmote.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ PipeOpSmote = R6Class("PipeOpSmote",
}
setnames(st, "class", task$target_names)
task$rbind(st)
}
},
.cache = FALSE,
.stochastic = "train"
)
)

Expand Down
4 changes: 2 additions & 2 deletions R/PipeOpSubsample.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ PipeOpSubsample = R6Class("PipeOpSubsample",
self$state = list()
task_filter_ex(task, keep)
},

.predict_task = identity
.predict_task = identity,
.cache_state = FALSE
)
)

Expand Down
4 changes: 3 additions & 1 deletion R/PipeOpThreshold.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ PipeOpThreshold = R6Class("PipeOpThreshold",
}

list(prd$set_threshold(thr))
}
},
.cache = FALSE,
.cache_state = FALSE
)
)

Expand Down
3 changes: 2 additions & 1 deletion R/PipeOpUnbranch.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ PipeOpUnbranch = R6Class("PipeOpUnbranch",
},
.predict = function(inputs) {
filter_noop(inputs)
}
},
.cache = FALSE
)
)

Expand Down
Loading