Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
91b6473
unit tests for `parse_date_time()`
dragosmg Apr 1, 2022
f9c7a37
first pass at implementing `parse_date_time()`
dragosmg Apr 1, 2022
b63b4e7
added changes back in
dragosmg Apr 14, 2022
cc01f24
NEWS update
dragosmg Apr 14, 2022
35fb293
lints
dragosmg Apr 14, 2022
e7821c6
removed `insert_at_position()` and added a TODO
dragosmg Apr 14, 2022
3438344
trying out bypassing the windows custom logic
dragosmg Apr 14, 2022
0e9f63e
Merge branch 'master' into parse_date_time
dragosmg Apr 28, 2022
3cbbc9d
clean-up after merging master
dragosmg Apr 28, 2022
f97fc0c
add newline
dragosmg Apr 28, 2022
70204d1
separate locale-dependent unit tests on Windows
dragosmg Apr 28, 2022
9da29c0
add reference to locale Jira
dragosmg Apr 28, 2022
395594c
refer to the locale on Windows Jira
dragosmg Apr 29, 2022
f77246d
skip tests on windows when the R version is older than 4.0
dragosmg Apr 29, 2022
ffa667f
replace skipping on older R on windows with skip if RE2 not available
dragosmg Apr 29, 2022
1c97ca6
error early when the user supplies `orders` we don't yet support
dragosmg Apr 29, 2022
23adca4
simplify `build_formats()`
dragosmg Apr 29, 2022
94bef2d
simplify the `fast_strptime()` implementation
dragosmg Apr 29, 2022
a5e2b1a
trigger a CI failure on windows
dragosmg May 3, 2022
93f6944
remove binding and unit tests for `fast_strptime()`
dragosmg May 3, 2022
e223f51
move `HMS` failing (expected) tests to a separate block
dragosmg May 3, 2022
16c634d
test to figure out the CI `LC_TIME` locale
dragosmg May 3, 2022
e0a4db6
test
dragosmg May 3, 2022
c89cca8
set locale to `"en_US.UTF-8"`
dragosmg May 3, 2022
683f6ad
`LC_TIME="C"`
dragosmg May 3, 2022
203f0fa
final version with skip on Windows
dragosmg May 3, 2022
5774f10
move test data frames definition inside `compare_dplyr_binding()`
dragosmg May 3, 2022
ad2596b
added a Jira to update `parse_date_time` to accept strings without se…
dragosmg May 3, 2022
55e0f9b
Merge branch 'master' into parse_date_time
dragosmg May 3, 2022
8042844
Merge branch 'master' into parse_date_time
dragosmg May 4, 2022
9b6309a
moved `build_formats()` to helpers
dragosmg May 4, 2022
88beefb
register datetime parsers separately
dragosmg May 4, 2022
3717d49
style
dragosmg May 4, 2022
c6f9e00
a more elegant implementation (no hardcoding)
dragosmg May 5, 2022
98890c5
accept `orders` longer than 1
dragosmg May 5, 2022
5dd5247
updated NEWS with `parse_date_time()` limitations
dragosmg May 6, 2022
db274ce
switch back to 7.0.0.9000
dragosmg May 6, 2022
dc36fd0
one more try
dragosmg May 6, 2022
de52524
allow orders to have `"%"` (which get stripped out) and move checks o…
dragosmg May 6, 2022
0d4cc6a
stop processing formats passed via orders. assume they are correct as…
dragosmg May 6, 2022
a6b242b
return to processing `orders` with `"%"` into `formats`
dragosmg May 6, 2022
96220c4
add a test with combi formats and orders + combi strings to be parsed
dragosmg May 6, 2022
db777bf
typo
dragosmg May 6, 2022
9471e5f
skip when RE2 not available and add clarifying comments
dragosmg May 6, 2022
d1da8b2
hopefully made NEWS entry a bit clearer
dragosmg May 9, 2022
362681d
another minor NEWS update
dragosmg May 9, 2022
904a90e
Update r/NEWS.md
jonkeane May 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@
specific language governing permissions and limitations
under the License.
-->
# development version

* `lubridate::parse_date_time()` datetime parser:
* currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added.
* strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`).

# arrow 7.0.0.9000

Expand Down
45 changes: 45 additions & 0 deletions r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,48 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") {

x
}

build_formats <- function(orders) {
# only keep the letters and the underscore as separator -> allow the users to
# pass strptime-like formats (with "%"). Processing is needed (instead of passing
# formats as-is) due to the processing of the character vector in parse_date_time()
orders <- gsub("[^A-Za-z_]", "", orders)
orders <- gsub("Y", "y", orders)

supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)

# error only if there isn't at least one valid order we can try
if (length(supported_passed_orders) == 0) {
arrow_not_supported(
paste0(
oxford_paste(
unsupported_passed_orders
),
" `orders`"
)
)
}

formats_list <- map(orders, build_format_from_order)
purrr::flatten_chr(formats_list)
}

build_format_from_order <- function(order) {
year_chars <- c("%y", "%Y")
month_chars <- c("%m", "%B", "%b")
day_chars <- "%d"

outcome <- switch(
order,
"ymd" = expand.grid(year_chars, month_chars, day_chars),
"ydm" = expand.grid(year_chars, day_chars, month_chars),
"mdy" = expand.grid(month_chars, day_chars, year_chars),
"myd" = expand.grid(month_chars, year_chars, day_chars),
"dmy" = expand.grid(day_chars, month_chars, year_chars),
"dym" = expand.grid(day_chars, year_chars, month_chars)
)
outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-")
outcome$format
}
36 changes: 36 additions & 0 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ register_bindings_datetime <- function() {
register_bindings_duration()
register_bindings_duration_constructor()
register_bindings_duration_helpers()
register_bindings_datetime_parsers()
}

register_bindings_datetime_utility <- function() {
Expand Down Expand Up @@ -485,3 +486,38 @@ register_bindings_duration_helpers <- function() {
abort("Duration in picoseconds not supported in Arrow.")
})
}

register_bindings_datetime_parsers <- function() {
register_binding("parse_date_time", function(x,
orders,
tz = "UTC") {

# each order is translated into possible formats
formats <- build_formats(orders)

# make all separators (non-letters and non-numbers) into "-"
x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
# collapse multiple separators into a single one
x <- call_binding("gsub", "-{2,}", "-", x)

# TODO figure out how to parse strings that have no separators
# https://issues.apache.org/jira/browse/ARROW-16446
# we could insert separators at the "likely" positions, but it might be
# tricky given the possible combinations between dmy formats + locale

# build a list of expressions for each format
parse_attempt_expressions <- list()

for (i in seq_along(formats)) {
parse_attempt_expressions[[i]] <- build_expr(
"strptime",
x,
options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
)
}

coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)

build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
})
}
96 changes: 95 additions & 1 deletion r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -1429,7 +1429,7 @@ test_that("make_difftime()", {
) %>%
collect(),
paste0("named `difftime` units other than: `second`, `minute`, `hour`,",
" `day`, and `week` not supported in Arrow.")
" `day`, and `week` not supported in Arrow.")
)
)

Expand Down Expand Up @@ -1621,3 +1621,97 @@ test_that("`as_datetime()`", {
regexp = "Float value 10.1 was truncated converting to int64"
)
})

test_that("parse_date_time() works with year, month, and date components", {
# string processing requires RE2 library (not available on Windows with R 3.6)
skip_if_not_available("re2")
compare_dplyr_binding(
.input %>%
mutate(
parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"),
parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"),
parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy")
) %>%
collect(),
tibble::tibble(
string_ymd = c(
"2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5",
"2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11",
# not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19
# no separators and %b or %B are even more complicated (and they work in
# lubridate). not to mention locale
NA
),
string_dmy = c(
"1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021",
"6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21",
# not yet working for strings with no separators, like "10092021", "100921",
NA
),
string_mdy = c(
"09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021",
"09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21",
# not yet working for strings with no separators, like "09102021", "091021",
NA
)
)
)

# locale (affecting "%b% and "%B" formats) does not work properly on Windows
# TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done
skip_on_os("windows")
compare_dplyr_binding(
.input %>%
mutate(
parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"),
parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"),
parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy")
) %>%
collect(),
tibble::tibble(
string_ymd = c(
"2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA
),
string_dmy = c(
"12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA
),
string_mdy = c(
"Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA
)
)
)
})

test_that("parse_date_time() works with a mix of formats and orders", {
# string processing requires RE2 library (not available on Windows with R 3.6)
skip_if_not_available("re2")
test_df <- tibble(
string_combi = c("2021-09-1", "2/09//2021", "09.3.2021")
)

compare_dplyr_binding(
.input %>%
mutate(
date_from_string = parse_date_time(
string_combi,
orders = c("ymd", "%d/%m//%Y", "%m.%d.%Y")
)
) %>%
collect(),
test_df
)
})

test_that("parse_date_time() doesn't work with hour, minutes, and second components", {
test_dates_times <- tibble(
date_times = c("09-01-17 12:34:56", NA)
)

expect_warning(
test_dates_times %>%
arrow_table() %>%
mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>%
collect(),
'"ymd_HMS" `orders` not supported in Arrow'
)
})