2 changes: 1 addition & 1 deletion cpp/src/arrow/filesystem/s3fs_test.cc
@@ -322,7 +322,7 @@ TEST_F(S3OptionsTest, FromAssumeRole) {
class S3RegionResolutionTest : public AwsTestMixin {};

TEST_F(S3RegionResolutionTest, PublicBucket) {
- ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("ursa-labs-taxi-data"));
+ ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("voltrondata-labs-datasets"));

// Taken from a registry of open S3-hosted datasets
// at https://github.com/awslabs/open-data-registry
8 changes: 3 additions & 5 deletions docs/source/python/dataset.rst
@@ -355,7 +355,7 @@ specifying a S3 path:

.. code-block:: python

- dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])
+ dataset = ds.dataset("s3://voltrondata-labs-datasets/nyc-taxi/")
Member:
Is there also a reason for removing the partitioning option?

Member Author:
This is a newer version of the dataset that is Hive-partitioned, so it doesn't require explicitly passing partitioning.
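
For illustration only (not part of the diff): a minimal sketch of the difference between the two layouts, using the bucket paths from this PR. The old bucket is being retired, so the first call may no longer work, and publicly readable buckets with working S3 access are assumed.

```python
import pyarrow.dataset as ds

# Old layout: plain nested directories (.../2019/06/...), so the partition
# field names had to be listed explicitly.
old = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])

# New layout: Hive-style key=value directories (.../year=2019/month=6/...),
# so listing the field names is unnecessary; pass "hive" if you want the
# partition columns derived from the directory names.
new = ds.dataset("s3://voltrondata-labs-datasets/nyc-taxi/", partitioning="hive")
print(new.schema)
```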


Typically, you will want to customize the connection parameters, and then
a file system object can be created and passed to the ``filesystem`` keyword:
@@ -365,8 +365,7 @@ a file system object can be created and passed to the ``filesystem`` keyword:
from pyarrow import fs

s3 = fs.S3FileSystem(region="us-east-2")
- dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=s3,
-                      partitioning=["year", "month"])
+ dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=s3)

The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and
:class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more
@@ -387,8 +386,7 @@ useful for testing or benchmarking.

# By default, MinIO will listen for unencrypted HTTP traffic.
minio = fs.S3FileSystem(scheme="http", endpoint_override="localhost:9000")
- dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=minio,
-                      partitioning=["year", "month"])
+ dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=minio)


Working with Parquet Datasets
4 changes: 2 additions & 2 deletions python/pyarrow/_s3fs.pyx
@@ -74,8 +74,8 @@ def resolve_s3_region(bucket):

Examples
--------
- >>> fs.resolve_s3_region('registry.opendata.aws')
- 'us-east-1'
+ >>> fs.resolve_s3_region('voltrondata-labs-datasets')
+ 'us-east-2'
"""
cdef:
c_string c_bucket
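Editor's aside (not part of the diff): a short sketch of where this helper typically fits, since `S3FileSystem` wants the bucket's region up front. The bucket name is the public one used throughout this PR.

```python
from pyarrow import fs

# Resolve the region of a public bucket, then open an anonymous S3
# filesystem pointed at that region.
region = fs.resolve_s3_region("voltrondata-labs-datasets")  # expected: "us-east-2"
s3 = fs.S3FileSystem(anonymous=True, region=region)
info = s3.get_file_info("voltrondata-labs-datasets/nyc-taxi")
print(info.type)  # FileType.Directory when the prefix exists
```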
12 changes: 7 additions & 5 deletions python/pyarrow/tests/test_fs.py
@@ -1616,15 +1616,17 @@ def test_s3_real_aws():
assert fs.region == default_region

fs = S3FileSystem(anonymous=True, region='us-east-2')
- entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
+ entries = fs.get_file_info(FileSelector(
+     'voltrondata-labs-datasets/nyc-taxi'))
assert len(entries) > 0
- with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
+ key = 'voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/part-0.parquet'
+ with fs.open_input_stream(key) as f:
md = f.metadata()
assert 'Content-Type' in md
- assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
+ assert md['Last-Modified'] == b'2022-07-12T23:32:00Z'
# For some reason, the header value is quoted
# (both with AWS and Minio)
- assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
+ assert md['ETag'] == b'"4c6a76826a695c6ac61592bc30cda3df-16"'


@pytest.mark.s3
@@ -1653,7 +1655,7 @@ def test_s3_real_aws_region_selection():
@pytest.mark.s3
def test_resolve_s3_region():
from pyarrow.fs import resolve_s3_region
- assert resolve_s3_region('ursa-labs-taxi-data') == 'us-east-2'
+ assert resolve_s3_region('voltrondata-labs-datasets') == 'us-east-2'
assert resolve_s3_region('mf-nwp-models') == 'eu-west-1'

with pytest.raises(ValueError, match="Not a valid bucket name"):
1 change: 1 addition & 0 deletions r/.gitignore
@@ -18,6 +18,7 @@ vignettes/nyc-taxi/
arrow_*.tar.gz
arrow_*.tgz
extra-tests/files
+ .deps
Member Author (@wjones127, Jul 19, 2022):
I added this because it got a lot of files in it while building the R doc site; lmk if there's a good reason to not add it.

Member:
No objection; what creates this?

Member Author:
When I run pkgdown::build_site() within RStudio, it creates this directory during the "Installing package into temporary library" step.


# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here.
/tools/cpp/
1 change: 1 addition & 0 deletions r/NAMESPACE
@@ -301,6 +301,7 @@ export(float)
export(float16)
export(float32)
export(float64)
+ export(gs_bucket)
export(halffloat)
export(hive_partition)
export(infer_type)
44 changes: 43 additions & 1 deletion r/R/filesystem.R
@@ -155,6 +155,26 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F
#' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete
#' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`).
#'
+ #' `GcsFileSystem$create()` optionally takes arguments:
+ #'
+ #' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up
+ #' credentials using standard GCS configuration methods.
+ #' - `access_token`: optional string for authentication. Should be provided along
+ #' with `expiration`
+ #' - `expiration`: optional date representing point at which `access_token` will
+ #' expire.
+ #' - `json_credentials`: optional string for authentication. Point to a JSON
+ #' credentials file downloaded from GCS.
+ #' - `endpoint_override`: if non-empty, will connect to provided host name / port,
+ #' such as "localhost:9001", instead of default GCS ones. This is primarily useful
+ #' for testing purposes.
+ #' - `scheme`: connection transport (default "https")
+ #' - `default_bucket_location`: the default location (or "region") to create new
+ #' buckets in.
+ #' - `retry_limit_seconds`: the maximum amount of time to spend retrying if
+ #' the filesystem encounters errors. Default is 15 seconds.
+ #' - `default_metadata`: default metadata to write in new objects.
+ #'
#' @section Methods:
#'
#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character
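Editor's aside (not part of the diff): the options documented above have rough counterparts in pyarrow's GcsFileSystem. The sketch below is illustrative only; the Python parameter names (`anonymous`, `scheme`, `endpoint_override`) are assumptions that may differ across releases, and "some-bucket" is a placeholder.

```python
from pyarrow import fs

# Anonymous access against a local GCS emulator, mirroring the
# anonymous, scheme, and endpoint_override options listed above.
gcs = fs.GcsFileSystem(
    anonymous=True,
    scheme="http",
    endpoint_override="localhost:9001",
)
print(gcs.get_file_info(fs.FileSelector("some-bucket", allow_not_found=True)))
```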
@@ -426,7 +446,7 @@ default_s3_options <- list(
#' relative path. Note that this function's success does not guarantee that you
#' are authorized to access the bucket's contents.
#' @examplesIf FALSE
- #' bucket <- s3_bucket("ursa-labs-taxi-data")
+ #' bucket <- s3_bucket("voltrondata-labs-datasets")
#' @export
s3_bucket <- function(bucket, ...) {
assert_that(is.string(bucket))
@@ -448,6 +468,28 @@ s3_bucket <- function(bucket, ...) {
SubTreeFileSystem$create(fs_and_path$path, fs)
}

+ #' Connect to a Google Cloud Storage (GCS) bucket
+ #'
+ #' `gs_bucket()` is a convenience function to create a `GcsFileSystem` object
+ #' that holds onto its relative path
+ #'
+ #' @param bucket string GCS bucket name or path
+ #' @param ... Additional connection options, passed to `GcsFileSystem$create()`
+ #' @return A `SubTreeFileSystem` containing a `GcsFileSystem` and the bucket's
+ #' relative path. Note that this function's success does not guarantee that you
+ #' are authorized to access the bucket's contents.
+ #' @examplesIf FALSE
+ #' bucket <- gs_bucket("voltrondata-labs-datasets")
+ #' @export
+ gs_bucket <- function(bucket, ...) {
+ assert_that(is.string(bucket))
+ args <- list2(...)
+
+ fs <- exec(GcsFileSystem$create, !!!args)
+
+ SubTreeFileSystem$create(bucket, fs)
+ }
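
Editor's aside (not part of the diff): for readers more familiar with the Python side, a rough pyarrow equivalent of what `gs_bucket()` assembles (a GCS filesystem wrapped in a bucket-rooted SubTreeFileSystem), assuming the bucket is publicly readable; the Python-side parameter names are assumptions.

```python
from pyarrow import fs

# gs_bucket("voltrondata-labs-datasets") roughly corresponds to wrapping a
# GcsFileSystem in a SubTreeFileSystem rooted at the bucket name.
gcs = fs.GcsFileSystem(anonymous=True)
bucket = fs.SubTreeFileSystem("voltrondata-labs-datasets", gcs)

# Paths are now relative to the bucket root.
print(bucket.get_file_info(fs.FileSelector("nyc-taxi")))
```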

#' @usage NULL
#' @format NULL
#' @rdname FileSystem
3 changes: 2 additions & 1 deletion r/_pkgdown.yml
@@ -90,7 +90,7 @@ navbar:
href: articles/install.html
- text: Working with Arrow Datasets and dplyr
href: articles/dataset.html
- - text: Working with Cloud Storage (S3)
+ - text: Working with Cloud Storage (S3, GCS)
href: articles/fs.html
- text: Apache Arrow in Python and R with reticulate
href: articles/python.html
@@ -198,6 +198,7 @@ reference:
- title: File systems
contents:
- s3_bucket
+ - gs_bucket
- FileSystem
- FileInfo
- FileSelector
21 changes: 21 additions & 0 deletions r/man/FileSystem.Rd
27 changes: 27 additions & 0 deletions r/man/gs_bucket.Rd
2 changes: 1 addition & 1 deletion r/man/s3_bucket.Rd

(Generated .Rd documentation files; diffs not rendered.)

21 changes: 17 additions & 4 deletions r/tests/testthat/test-filesystem.R
@@ -147,7 +147,7 @@ test_that("FileSystem$from_uri", {
skip_on_cran()
skip_if_not_available("s3")
skip_if_offline()
- fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data")
+ fs_and_path <- FileSystem$from_uri("s3://voltrondata-labs-datasets")
expect_r6_class(fs_and_path$fs, "S3FileSystem")
expect_identical(fs_and_path$fs$region, "us-east-2")
})
@@ -156,11 +156,11 @@ test_that("SubTreeFileSystem$create() with URI", {
skip_on_cran()
skip_if_not_available("s3")
skip_if_offline()
- fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+ fs <- SubTreeFileSystem$create("s3://voltrondata-labs-datasets")
expect_r6_class(fs, "SubTreeFileSystem")
expect_identical(
capture.output(print(fs)),
- "SubTreeFileSystem: s3://ursa-labs-taxi-data/"
+ "SubTreeFileSystem: s3://voltrondata-labs-datasets/"
)
})

@@ -187,6 +187,19 @@ test_that("s3_bucket", {
capture.output(print(bucket)),
"SubTreeFileSystem: s3://ursa-labs-r-test/"
)
skip_on_os("windows") # FIXME
expect_identical(bucket$base_path, "ursa-labs-r-test/")
})

+ test_that("gs_bucket", {
+ skip_on_cran()
+ skip_if_not_available("gcs")
+ skip_if_offline()
+ bucket <- gs_bucket("voltrondata-labs-datasets")
+ expect_r6_class(bucket, "SubTreeFileSystem")
+ expect_r6_class(bucket$base_fs, "GcsFileSystem")
+ expect_identical(
+ capture.output(print(bucket)),
+ "SubTreeFileSystem: gs://voltrondata-labs-datasets/"
+ )
+ expect_identical(bucket$base_path, "voltrondata-labs-datasets/")
+ })