apache · thisisnic · Mar 31, 2023 · Mar 31, 2023 · Apr 3, 2023 · Apr 3, 2023
@@ -472,9 +472,13 @@ names.StructArray <- function(x, ...) StructType__field_names(x$type)
 #' @export
 dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields)
 
+as_df.StructArray <- function(x, ...) {
+  as.vector(x)
+}
+
 #' @export
 as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
-  as.vector(x)
+  as.data.frame(as_df(x), row.names = row.names, optional = optional, ...)
 }
 
 #' @rdname array

@@ -93,6 +93,10 @@ ArrowTabular <- R6Class("ArrowTabular",
 
 #' @export
 as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
+  as.data.frame(as_df(x), row.names = row.names, optional = optional, ...)
+}
+
+as_df.ArrowTabular <- function(x, ...) {
   df <- x$to_data_frame()
   apply_arrow_r_metadata(df, x$metadata$r)
 }
@@ -259,3 +263,12 @@ na.omit.ArrowTabular <- function(object, ...) {
 
 #' @export
 na.exclude.ArrowTabular <- na.omit.ArrowTabular
+
+# Internal S3 generic: materialise an Arrow object as an R data frame without
+# the extra `as.data.frame()` pass that the exported methods apply on top.
+# `...` is part of the generic's signature because every method is declared as
+# `function(x, ...)` and callers such as `collect.ArrowTabular()` forward
+# their dots here; without it, any extra argument fails with "unused argument".
+as_df <- function(x, ...) {
+  UseMethod("as_df")
+}
@@ -248,7 +248,7 @@ read_delim_arrow <- function(file,
   }
 
   if (isTRUE(as_data_frame)) {
-    tab <- as.data.frame(tab)
+    tab <- as_df(tab)
   }
 
   tab

@@ -522,9 +522,13 @@ dim.Dataset <- function(x) c(x$num_rows, x$num_cols)
 #' @export
 c.Dataset <- function(...) Dataset$create(list(...))
 
+as_df.Dataset <- function(x, ...) {
+  collect.Dataset(x)
+}
+
 #' @export
 as.data.frame.Dataset <- function(x, row.names = NULL, optional = FALSE, ...) {
-  collect.Dataset(x)
+  as.data.frame(as_df(x), row.names = row.names, optional = optional, ...)
 }
 
 #' @export

@@ -22,9 +22,10 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
   out <- compute.arrow_dplyr_query(x)
   collect.ArrowTabular(out, as_data_frame)
 }
+
 collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
   if (as_data_frame) {
-    as.data.frame(x, ...)
+    as_df(x, ...)
   } else {
     x
   }

@@ -71,7 +71,7 @@ glimpse.ArrowTabular <- function(x,
   var_headings <- paste("$", center_pad(tickify(names(x)), var_types))
 
   # Assemble the data glimpse
-  df <- as.data.frame(head_tab)
+  df <- as_df(head_tab)
   formatted_data <- map_chr(df, function(.) {
     tryCatch(
       paste(pillar::format_glimpse(.), collapse = ", "),

@@ -216,6 +216,10 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query
 
 #' @export
 as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
+  as.data.frame(as_df(x), row.names = row.names, optional = optional, ...)
+}
+
+as_df.arrow_dplyr_query <- function(x, ...) {
   collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
 }
 

@@ -196,7 +196,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
   )
 
   if (isTRUE(as_data_frame)) {
-    out <- as.data.frame(out)
+    out <- as_df(out)
   }
   out
 }

@@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
   # https://issues.apache.org/jira/browse/ARROW-6830
   out <- RecordBatchStreamReader$create(file)$read_table()
   if (as_data_frame) {
-    out <- as.data.frame(out)
+    out <- as_df(out)
   }
   out
 }
@@ -84,7 +84,7 @@ read_json_arrow <- function(file,
   }
 
   if (isTRUE(as_data_frame)) {
-    tab <- as.data.frame(tab)
+    tab <- as_df(tab)
   }
   tab
 }

@@ -70,7 +70,7 @@ read_parquet <- function(file,
   }
 
   if (as_data_frame) {
-    tab <- as.data.frame(tab)
+    tab <- as_df(tab)
   }
   tab
 }

@@ -129,7 +129,14 @@ dim.RecordBatchReader <- function(x) c(NA_integer_, length(x$schema))
 
 #' @export
 as.data.frame.RecordBatchReader <- function(x, row.names = NULL, optional = FALSE, ...) {
-  as.data.frame(x$read_table(), row.names = row.names, optional = optional, ...)
+  as.data.frame(as_df(x), row.names = row.names, optional = optional, ...)
+}
+
+# Pull the reader's contents with `x$read_table()` and hand the result to
+# `as_df()`, so this method yields an R data frame like the other `as_df`
+# methods instead of the intermediate object returned by `read_table()`.
+as_df.RecordBatchReader <- function(x, ...) {
+  as_df(x$read_table())
 }
 
 #' @export

@@ -386,5 +386,5 @@ as_schema.StructType <- function(x, ...) {
 
 #' @export
 as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) {
-  as.data.frame(Table__from_schema(x))
+  as.data.frame(Table__from_schema(x), row.names = row.names, optional = optional, ...)
 }
@@ -228,7 +228,8 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields,
   // "top level" attributes, only relevant if the first object is not named and a data
   // frame
   cpp11::strings names = Rf_getAttrib(lst, R_NamesSymbol);
-  if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame")) {
+  if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame") &&
+      Rf_xlength(lst) == 1) {
     SEXP top_level = metadata[0] = arrow_attributes(VECTOR_ELT(lst, 0), true);
     if (!Rf_isNull(top_level) && XLENGTH(top_level) > 0) {
       has_top_level_metadata = true;

@@ -346,7 +346,7 @@ test_that("record_batch() handles data frame columns", {
       b = struct(x = int32(), y = int32())
     )
   )
-  out <- as.data.frame(batch)
+  out <- as_tibble(batch)
   expect_equal(out, tibble::tibble(a = 1:10, b = tib))
 
   # if not named, columns from tib are auto spliced
@@ -355,7 +355,7 @@ test_that("record_batch() handles data frame columns", {
     batch2$schema,
     schema(a = int32(), x = int32(), y = int32())
   )
-  out <- as.data.frame(batch2)
+  out <- as_tibble(batch2)
   expect_equal(out, tibble::tibble(a = 1:10, !!!tib))
 })
 
@@ -366,7 +366,7 @@ test_that("record_batch() handles data frame columns with schema spec", {
   schema <- schema(a = int32(), b = struct(x = int16(), y = float64()))
   batch <- record_batch(a = 1:10, b = tib, schema = schema)
   expect_equal(batch$schema, schema)
-  out <- as.data.frame(batch)
+  out <- as_tibble(batch)
   expect_equal(out, tibble::tibble(a = 1:10, b = tib_float))
 
   schema <- schema(a = int32(), b = struct(x = int16(), y = utf8()))
@@ -386,7 +386,7 @@ test_that("record_batch() auto splices (ARROW-5718)", {
   expect_equal(batch3, batch4)
   expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32()))
   expect_equal(
-    as.data.frame(batch3),
+    as_tibble(batch3),
     tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
   )
 
@@ -395,15 +395,15 @@ test_that("record_batch() auto splices (ARROW-5718)", {
   batch6 <- record_batch(!!!df, schema = s)
   expect_equal(batch5, batch6)
   expect_equal(batch5$schema, s)
-  expect_equal(as.data.frame(batch5), df)
+  expect_equal(as_tibble(batch5), df)
 
   s2 <- schema(x = float64(), y = utf8(), z = int16())
   batch7 <- record_batch(df, z = 1:10, schema = s2)
   batch8 <- record_batch(!!!df, z = 1:10, schema = s2)
   expect_equal(batch7, batch8)
   expect_equal(batch7$schema, s2)
   expect_equal(
-    as.data.frame(batch7),
+    as_tibble(batch7),
     tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
   )
 })
@@ -627,7 +627,7 @@ test_that("Handling string data with embedded nuls", {
   # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately
   # on `as.vector()` where as with it, the error only happens on materialization)
   skip_on_r_older_than("3.6")
-  df <- as.data.frame(batch_with_nul)
+  df <- as_tibble(batch_with_nul)
 
   expect_error(
     df$b[],
@@ -648,7 +648,7 @@ test_that("Handling string data with embedded nuls", {
     suppressWarnings(
       expect_warning(
         expect_equal(
-          as.data.frame(batch_with_nul)$b,
+          as_tibble(batch_with_nul)$b,
           c("person", "woman", "man", "camera", "tv"),
           ignore_attr = TRUE
         ),

@@ -265,7 +265,7 @@ test_that("table() handles ... of arrays, chunked arrays, vectors", {
     tab$schema,
     schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8())
   )
-  res <- as.data.frame(tab)
+  res <- as_tibble(tab)
   expect_equal(names(res), c("a", "b", "c", "x", "y"))
   expect_equal(
     res,
@@ -280,14 +280,14 @@ test_that("table() auto splices (ARROW-5718)", {
   tab2 <- Table$create(!!!df)
   expect_equal(tab1, tab2)
   expect_equal(tab1$schema, schema(x = int32(), y = utf8()))
-  expect_equal(as.data.frame(tab1), df)
+  expect_equal(as_tibble(tab1), df)
 
   s <- schema(x = float64(), y = utf8())
   tab3 <- Table$create(df, schema = s)
   tab4 <- Table$create(!!!df, schema = s)
   expect_equal(tab3, tab4)
   expect_equal(tab3$schema, s)
-  expect_equal(as.data.frame(tab3), df)
+  expect_equal(as_tibble(tab3), df)
 })
 
 test_that("Validation when creating table with schema (ARROW-10953)", {
@@ -366,7 +366,7 @@ test_that("Can create table with specific dictionary types", {
     expect_equal(sch, tab$schema)
     if (i != int64()) {
       # TODO: same downcast to int32 as we do for int64() type elsewhere
-      expect_identical(as.data.frame(tab), fact)
+      expect_identical(as_tibble(tab), fact)
     }
   }
 })
@@ -380,7 +380,7 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
   res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d")))
   tab <- Table$create(b1, b2, b3, b4)
 
-  expect_identical(as.data.frame(tab), res)
+  expect_identical(as_tibble(tab), res)
 })
 
 test_that("Table$SelectColumns()", {
@@ -711,3 +711,24 @@ test_that("as_arrow_table() errors on data.frame with NULL names", {
   names(df) <- NULL
   expect_error(as_arrow_table(df), "Input data frame columns must be named")
 })
+
+test_that("as.data.frame() on an ArrowTabular object returns a vanilla data.frame and not a tibble", {
+  df <- data.frame(x = 1)
+  out1 <- as.data.frame(arrow::arrow_table(df, name = "1"))
+  out2 <- as.data.frame(arrow::arrow_table(name = "1", df))
+  out3 <- as.data.frame(arrow::arrow_table(df))
+
+  expect_s3_class(out1, "data.frame", exact = TRUE)
+  expect_s3_class(out2, "data.frame", exact = TRUE)
+  expect_s3_class(out3, "data.frame", exact = TRUE)
+})
+
+test_that("as_tibble.ArrowTabular retains groups", {
+  # calling as_tibble.default on ArrowTabular objects results in any grouping
+  # being dropped, which is why we need as_tibble.ArrowTabular
+  df <- data.frame(x = 1:4, y = c("a", "b"))
+  df_grouped <- dplyr::group_by(df, y)
+  arrow_grouped <- arrow_table(df_grouped)
+  expect_data_frame(arrow_grouped, df_grouped)
+  expect_identical(dplyr::group_vars(as_tibble(arrow_grouped)), "y")
+})
@@ -368,7 +368,7 @@ test_that("is_in", {
 
 test_that("value_counts", {
   a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
-  result_df <- tibble::tibble(
+  result_df <- data.frame(
     values = c(1, 4, 3),
     counts = c(3L, 2L, 2L)
   )

@@ -141,15 +141,15 @@ test_that("Table$SortIndices()", {
     sort(tbl$chr, na.last = TRUE)
   )
   expect_identical(
-    as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))),
+    as_tibble(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))),
     tbl %>% arrange(int, dbl)
   )
 })
 
 test_that("RecordBatch$SortIndices()", {
   x <- record_batch(tbl)
   expect_identical(
-    as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))),
+    as_tibble(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))),
     tbl %>% arrange(desc(chr), desc(int), desc(dbl))
   )
 })
@@ -91,7 +91,7 @@ test_that("CSV scan options", {
   sb$FragmentScanOptions(options)
 
   tab <- sb$Finish()$ToTable()
-  expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA)))
+  expect_equal(as_tibble(tab), tibble(chr = c("foo", NA)))
 
   # Set default convert options in CsvFileFormat
   csv_format <- CsvFileFormat$create(
-Original file line number
+Diff line change
@@ Expand Up / @@ -248,7 +248,7 @@ read_delim_arrow <- function(file, @@
       }
       if (isTRUE(as_data_frame)) {
-        tab <- as.data.frame(tab)
+        tab <- as_df(tab)
       }
       tab
@@ Expand Down @@