ARROW-15517: [R] Use WriteNode in write_dataset() #12316
r/R/dataset-write.R

```diff
@@ -136,41 +136,88 @@ write_dataset <- function(dataset,
   if (inherits(dataset, "arrow_dplyr_query")) {
     # partitioning vars need to be in the `select` schema
     dataset <- ensure_group_vars(dataset)
-  } else if (inherits(dataset, "grouped_df")) {
-    force(partitioning)
-    # Drop the grouping metadata before writing; we've already consumed it
-    # now to construct `partitioning` and don't want it in the metadata$r
-    dataset <- dplyr::ungroup(dataset)
+  } else {
+    if (inherits(dataset, "grouped_df")) {
+      force(partitioning)
+      # Drop the grouping metadata before writing; we've already consumed it
+      # now to construct `partitioning` and don't want it in the metadata$r
+      dataset <- dplyr::ungroup(dataset)
+    }
+    dataset <- tryCatch(
+      as_adq(dataset),
+      error = function(e) {
+        supported <- c(
+          "Dataset", "RecordBatch", "Table", "arrow_dplyr_query", "data.frame"
+        )
+        stop(
+          "'dataset' must be a ",
+          oxford_paste(supported, "or", quote = FALSE),
+          ", not ",
+          deparse(class(dataset)),
+          call. = FALSE
+        )
+      }
+    )
   }
 
+  plan <- ExecPlan$create()
+  final_node <- plan$Build(dataset)
+  if (!is.null(final_node$sort %||% final_node$head %||% final_node$tail)) {
+    # Because sorting and topK are only handled in the SinkNode (or in R!),
+    # they wouldn't get picked up in the WriteNode. So let's Run this ExecPlan
+    # to capture those, and then create a new plan for writing
+    # TODO(ARROW-15681): do sorting in WriteNode in C++
+    dataset <- as_adq(plan$Run(final_node))
+    plan <- ExecPlan$create()
+    final_node <- plan$Build(dataset)
+  }
+
-  scanner <- Scanner$create(dataset)
   if (!inherits(partitioning, "Partitioning")) {
-    partition_schema <- scanner$schema[partitioning]
+    partition_schema <- final_node$schema[partitioning]
     if (isTRUE(hive_style)) {
-      partitioning <- HivePartitioning$create(partition_schema, null_fallback = list(...)$null_fallback)
+      partitioning <- HivePartitioning$create(
+        partition_schema,
+        null_fallback = list(...)$null_fallback
+      )
     } else {
       partitioning <- DirectoryPartitioning$create(partition_schema)
     }
   }
 
-  if (!missing(max_rows_per_file) && missing(max_rows_per_group) && max_rows_per_group > max_rows_per_file) {
-    max_rows_per_group <- max_rows_per_file
-  }
-
   path_and_fs <- get_path_and_filesystem(path)
-  options <- FileWriteOptions$create(format, table = scanner, ...)
+  output_schema <- final_node$schema
+  options <- FileWriteOptions$create(
+    format,
+    column_names = names(output_schema),
+    ...
+  )
 
+  # TODO(ARROW-16200): expose FileSystemDatasetWriteOptions in R
+  # and encapsulate this logic better
   existing_data_behavior_opts <- c("delete_matching", "overwrite", "error")
   existing_data_behavior <- match(match.arg(existing_data_behavior), existing_data_behavior_opts) - 1L
 
-  validate_positive_int_value(max_partitions, "max_partitions must be a positive, non-missing integer")
-  validate_positive_int_value(max_open_files, "max_open_files must be a positive, non-missing integer")
-  validate_positive_int_value(min_rows_per_group, "min_rows_per_group must be a positive, non-missing integer")
-  validate_positive_int_value(max_rows_per_group, "max_rows_per_group must be a positive, non-missing integer")
+  if (!missing(max_rows_per_file) && missing(max_rows_per_group) && max_rows_per_group > max_rows_per_file) {
+    max_rows_per_group <- max_rows_per_file
+  }
 
-  dataset___Dataset__Write(
+  validate_positive_int_value(max_partitions)
+  validate_positive_int_value(max_open_files)
+  validate_positive_int_value(min_rows_per_group)
+  validate_positive_int_value(max_rows_per_group)
```
**Review thread on lines +204 to +207:**

**Member:** The error message says […]

**Member (author):** I believe it means "not […]"
```diff
+
+  new_r_meta <- get_r_metadata_from_old_schema(
+    output_schema,
+    source_data(dataset)$schema,
+    drop_attributes = has_aggregation(dataset)
+  )
+  if (!is.null(new_r_meta)) {
+    output_schema$r_metadata <- new_r_meta
+  }
+  plan$Write(
+    final_node, prepare_key_value_metadata(output_schema$metadata),
     options, path_and_fs$fs, path_and_fs$path,
-    partitioning, basename_template, scanner,
+    partitioning, basename_template,
     existing_data_behavior, max_partitions,
     max_open_files, max_rows_per_file,
     min_rows_per_group, max_rows_per_group
```
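For orientation, here is a hedged sketch of the user-facing path this hunk enables: `write_dataset()` now pipes the query through `ExecPlan$Build()` into `plan$Write()` instead of materializing a `Scanner`. The sketch uses only documented arrow/dplyr API (`arrow_table()` assumes arrow >= 7.0.0); the output directory names are illustrative.

```r
library(arrow)
library(dplyr)

# Grouped input: the group variable is consumed as the partitioning,
# then ungroup()ed before the plan is built (see the else branch above)
tmp <- tempfile()
mtcars %>%
  group_by(cyl) %>%
  write_dataset(tmp)
list.files(tmp) # e.g. "cyl=4" "cyl=6" "cyl=8" (Hive-style by default)

# A sorted query exercises the run-then-rebuild branch, since sorting
# is only captured when the first ExecPlan is run
arrow_table(mtcars) %>%
  arrange(desc(mpg)) %>%
  write_dataset(tempfile(), format = "parquet")
```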
```diff
@@ -179,6 +226,6 @@ write_dataset <- function(dataset,
 validate_positive_int_value <- function(value, msg) {
   if (!is_integerish(value, n = 1) || is.na(value) || value < 0) {
-    abort(msg)
+    abort(paste(substitute(value), "must be a positive, non-missing integer"))
   }
 }
```
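A standalone sketch of the `substitute()` mechanism used above, assuming `rlang` supplies `is_integerish()` and `abort()` as elsewhere in the package; note that `msg` stays in the signature but is no longer consulted:

```r
library(rlang)

validate_positive_int_value <- function(value, msg) {
  if (!is_integerish(value, n = 1) || is.na(value) || value < 0) {
    # substitute(value) recovers the caller's argument expression
    # (a bare symbol such as `max_open_files`), so each call site no
    # longer needs to spell out its own message string
    abort(paste(substitute(value), "must be a positive, non-missing integer"))
  }
}

max_open_files <- -2L
validate_positive_int_value(max_open_files)
# Error: max_open_files must be a positive, non-missing integer
```

This relies on call sites passing a bare variable name, which all four callers above do.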
r/R/metadata.R

```diff
@@ -133,7 +133,6 @@ remove_attributes <- function(x) {
 }
 
 arrow_attributes <- function(x, only_top_level = FALSE) {
   att <- attributes(x)
   removed_attributes <- remove_attributes(x)
```

```diff
@@ -208,3 +207,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) {
     NULL
   }
 }
+
+get_r_metadata_from_old_schema <- function(new_schema,
+                                           old_schema,
+                                           drop_attributes = FALSE) {
+  # TODO: do we care about other (non-R) metadata preservation?
+  # How would we know if it were meaningful?
```
**Review thread on lines +214 to +215:**

**Member:** I think it depends on the source of […]. In the general case, the input is a collection of files and the output is a different set of files (sometimes we explode files and sometimes we merge files), so the idea of writing metadata to the output files is somewhat meaningless. In general, then, I would say no, you don't care about preservation. In Python, users can create a dataset from a single file, and we do a little bit of work to preserve the metadata on write because we want it to feel like it "round trips". When creating or appending to a dataset, users might want to specify general information about how the files were created, like "Origin": "Nightly update", but that is unrelated to the original metadata. In the future, the dataset write may append its own metadata (e.g. dataset statistics, or information about the dataset schema, such as which columns are already sorted).
```diff
+  r_meta <- old_schema$r_metadata
+  if (!is.null(r_meta)) {
+    # Filter r_metadata$columns on columns with name _and_ type match
+    common_names <- intersect(names(r_meta$columns), names(new_schema))
+    keep <- common_names[
+      map_lgl(common_names, ~ old_schema[[.]] == new_schema[[.]])
+    ]
+    r_meta$columns <- r_meta$columns[keep]
+    if (drop_attributes) {
+      # dplyr drops top-level attributes if you do summarize
+      r_meta$attributes <- NULL
+    }
+  }
+  r_meta
+}
```
**Review comment:** This condition probably still needs a test or two.
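To make the name-and-type filter above concrete, here is a minimal, self-contained sketch of the `keep` computation using plain `Schema` objects; the field names and types are invented for illustration, and `meta_cols` stands in for `names(r_meta$columns)`:

```r
library(arrow)
library(purrr)

old_schema <- schema(a = int32(), b = utf8(), c = float64())
new_schema <- schema(a = int32(), b = int64(), d = bool())

# stand-in for names(r_meta$columns): columns that carried R metadata
meta_cols <- c("a", "b", "c")

common_names <- intersect(meta_cols, names(new_schema)) # "a" "b"
keep <- common_names[
  map_lgl(common_names, ~ old_schema[[.]] == new_schema[[.]])
]
keep
# [1] "a"  -- "b" is dropped because its type changed, "c" because it
# no longer exists, so only "a" keeps its stored R attributes
```

Comparing whole fields with `==` (which dispatches to `Equals()` on Arrow objects) covers the type check, since the names are already known to match.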