apache · amol- · Jun 29, 2022 · May 19, 2022 · May 19, 2022 · May 19, 2022
diff --git a/r/NEWS.md b/r/NEWS.md
@@ -20,8 +20,7 @@
 # arrow 8.0.0.9000
 
 * `lubridate::parse_date_time()` datetime parser:
-  * currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added.
-  * strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported.
+  * `orders` with year, month, day, hours, minutes, and seconds components are supported.
   * the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`).
 
 # arrow 8.0.0

diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
@@ -152,11 +152,24 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") {
   x
 }
 
+#' Build formats from multiple orders
+#'
+#' This function is a vectorised version of `build_format_from_order()`. In
+#' addition to `build_format_from_order()`, it also checks if the supplied
+#' orders are currently supported.
+#'
+#' @inheritParams process_data_for_parsing
+#'
+#' @return a vector of unique formats
+#'
+#' @noRd
 build_formats <- function(orders) {
   # only keep the letters and the underscore as separator -> allow the users to
-  # pass strptime-like formats (with "%"). Processing is needed (instead of passing
+  # pass strptime-like formats (with "%"). Because the data itself is
+  # processed before parsing, the `orders` must be processed too (even if
+  # supplied in the desired format). Processing is needed (instead of passing
   # formats as-is) due to the processing of the character vector in parse_date_time()
-  orders <- gsub("[^A-Za-z_]", "", orders)
+  orders <- gsub("[^A-Za-z]", "", orders)
   orders <- gsub("Y", "y", orders)
 
   # we separate "ym', "my", and "yq" from the rest of the `orders` vector and
@@ -170,7 +183,7 @@ build_formats <- function(orders) {
     orders1 <- setdiff(orders, short_orders)
     orders2 <- intersect(orders, short_orders)
     orders2 <- paste0(orders2, "d")
-    orders <- unique(c(orders1, orders2))
+    orders <- unique(c(orders2, orders1))
   }
 
   if (any(orders == "yq")) {
@@ -179,7 +192,30 @@ build_formats <- function(orders) {
     orders <- unique(c(orders1, orders2))
   }
 
-  supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+  if (any(orders == "qy")) {
+    orders1 <- setdiff(orders, "qy")
+    orders2 <- "ymd"
+    orders <- unique(c(orders1, orders2))
+  }
+
+  ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+  ymd_hms_orders <- c(
+    "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
+    "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
+  )
+  # support "%I" hour formats
+  ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
+
+  supported_orders <- c(
+    ymd_orders,
+    ymd_hms_orders,
+    gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators
+    gsub("_", "", ymd_hms_orders),
+    ymd_ims_orders,
+    gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators
+    gsub("_", "", ymd_ims_orders)
+  )
+
   unsupported_passed_orders <- setdiff(orders, supported_orders)
   supported_passed_orders <- intersect(orders, supported_orders)
 
@@ -200,20 +236,191 @@ build_formats <- function(orders) {
   unique(formats)
 }
 
+#' Build formats from a single order
+#'
+#' @param order a single string date-time format, such as `"ymd"` or `"ymdHMS"`
+#'
+#' @return a vector of all possible formats derived from the input
+#' order
+#'
+#' @noRd
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S",
+    "I" = "%I"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  # we combine formats with and without the "-" separator; we will later
+  # coalesce through all of them (benchmarking indicated this is more
+  # computationally efficient than figuring out whether a string has
+  # separators and applying only the matching formats). The formats without
+  # a separator are the ones that match strings containing no separator
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)
+}
+
+#' Process data in preparation for parsing
+#'
+#' `process_data_for_parsing()` takes a data column and a vector of `orders` and
+#' prepares several versions of the input data:
+#'   * `processed_x` is a version of `x` where all separators were replaced with
+#'  `"-"` and multiple separators were collapsed into a single one. This element
+#'  is always returned; `attempt_parsing()` discards it when the `orders`
+#'  argument indicates we're only interested in the augmented versions of `x`.
+#'  * each of the other 3 elements augment `x` in some way
+#'    * `augmented_x_ym` - builds the `ym` and `my` formats by adding `"01"`
+#'    (to indicate the first day of the month)
+#'    * `augmented_x_yq` - transforms the `yq` format to `ymd`, by deriving the
+#'    first month of the quarter and adding `"01"` to indicate the first day
+#'    * `augmented_x_qy` - transforms the `qy` format to `ymd` in a similar
+#'    manner to `"yq"`
+#'
+#' @param x an Expression corresponding to a character or numeric vector of
+#' dates to be parsed.
+#' @param orders a character vector of date-time formats.
+#'
+#' @return a list with 4 elements, each a version of x (`NULL` if unused):
+#'  * `processed_x`
+#'  * `augmented_x_ym`
+#'  * `augmented_x_yq`
+#'  * `augmented_x_qy`
+#' @noRd
+process_data_for_parsing <- function(x, orders) {
+
+  processed_x <- x$cast(string())
+
+  # make all separators (non-letters and non-numbers) into "-"
+  processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+  # collapse multiple separators into a single one
+  processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+  # we need to transform `x` when orders are `ym`, `my`, `yq`, and `qy`
+  # for `ym` and `my` orders we add a day ("01")
+  # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+  augmented_x_ym <- NULL
+  if (any(orders %in% c("ym", "my", "Ym", "mY"))) {
+    # add day as "-01" if there is a "-" separator and as "01" if not
+    augmented_x_ym <- call_binding(
+      "if_else",
+      call_binding("grepl", "-", processed_x),
+      call_binding("paste0", processed_x, "-01"),
+      call_binding("paste0", processed_x, "01")
+    )
+  }
+
+  # for `yq` we need to transform the quarter into the start month (lubridate
+  # behaviour) and then add 01 to parse to the first day of the quarter
+  augmented_x_yq <- NULL
+  if (any(orders %in% c("yq", "Yq"))) {
+    # extract everything that comes after the `-` separator, i.e. the quarter
+    # (e.g. 4 from 2022-4)
+    quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # we should probably error if quarter is not in 1:4
+    # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+    # in 2002-4)
+    year_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  # same as for `yq`, we need to derive the month from the quarter and add a
+  # "01" to give us the first day of the month
+  augmented_x_qy <- NULL
+  if (any(orders %in% c("qy", "qY"))) {
+    quarter_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    year_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # year might be missing the final 0s when extracted from a float, hence the
+    # need to pad
+    year_x <- call_binding("str_pad", year_x, width = 4, side = "right", pad = "0")
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_qy <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  list(
+    "augmented_x_ym" = augmented_x_ym,
+    "augmented_x_yq" = augmented_x_yq,
+    "augmented_x_qy" = augmented_x_qy,
+    "processed_x" = processed_x
+  )
+}
+
+
+#' Attempt parsing
+#'
+#' This function does several things:
+#'   * builds all possible `formats` from the supplied `orders`
+#'   * processes the data with `process_data_for_parsing()`
+#'   * builds a list of the possible `strptime` Expressions for the data & formats
+#'   combinations
+#'
+#' @inheritParams process_data_for_parsing
+#'
+#' @return a list of `strptime` Expressions we can use with `coalesce`
+#' @noRd
+attempt_parsing <- function(x, orders) {
+  # translate orders into possible formats
+  formats <- build_formats(orders)
+
+  # depending on the orders argument we need to do some processing to the input
+  # data. `process_data_for_parsing()` uses the passed `orders` and not the
+  # derived `formats`
+  processed_data <- process_data_for_parsing(x, orders)
+
+  # build a list of expressions for parsing each processed_data element and
+  # format combination
+  parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats)
+
+  # if all orders are in c("ym", "my", "yq", "qy") only attempt to parse the
+  # augmented version(s) of x
+  if (all(orders %in% c("ym", "Ym", "my", "mY", "yq", "Yq", "qy", "qY"))) {
+    parse_attempt_exprs_list$processed_x <- list()
+  }
+
+  # we need the output to be a list of expressions (currently it is a list of
+  # lists of expressions due to the shape of the processed data: we have one
+  # list of expressions for each element of processed_data) -> we need to
+  # remove a level of hierarchy from the list
+  purrr::flatten(parse_attempt_exprs_list)
+}
+
+#' Build `strptime` expressions
+#'
+#' This function takes several `formats`, iterates over them and builds a
+#' `strptime` Expression for each of them. Each Expression is built with
+#' `error_is_null = TRUE`. No separator detection happens here: `formats`
+#' is used as supplied, so it is expected to already contain the variants
+#' with and without the `"-"` separator.
+#'
+#' @param x an Expression corresponding to a character or numeric vector of
+#' dates to be parsed.
+#' @param formats a character vector of formats as returned by
+#' `build_formats()`
+#'
+#' @return a list of Expressions
+#' @noRd
+build_strptime_exprs <- function(x, formats) {
+  # returning an empty list helps when iterating
+  if (is.null(x)) {
+    return(list())
+  }
+
+  map(
+    formats,
+    ~ build_expr(
+      "strptime",
+      x,
+      options = list(format = .x, unit = 0L, error_is_null = TRUE)
+    )
   )
-  outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-")
-  outcome$format
 }