2 changes: 1 addition & 1 deletion cpp/src/arrow/filesystem/s3fs_test.cc
@@ -322,7 +322,7 @@ TEST_F(S3OptionsTest, FromAssumeRole) {
class S3RegionResolutionTest : public AwsTestMixin {};

TEST_F(S3RegionResolutionTest, PublicBucket) {
- ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("ursa-labs-taxi-data"));
+ ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("voltrondata-labs-datasets"));

// Taken from a registry of open S3-hosted datasets
// at https://github.com/awslabs/open-data-registry
8 changes: 3 additions & 5 deletions docs/source/python/dataset.rst
@@ -355,7 +355,7 @@ specifying a S3 path:

.. code-block:: python

- dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])
+ dataset = ds.dataset("s3://voltrondata-labs-datasets/nyc-taxi/")
Member:
Is there also a reason for removing the partitioning option?

Member Author:
This is a newer version of the dataset that is Hive-partitioned, so it doesn't require explicitly passing partitioning.
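
For illustration only (not part of the diff): a minimal sketch of the difference between the two layouts, using the bucket paths from this PR. The old bucket is being retired, so the first call may no longer work, and publicly readable buckets with working S3 access are assumed.

```python
import pyarrow.dataset as ds

# Old layout: plain nested directories (.../2019/06/...), so the partition
# field names had to be listed explicitly.
old = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])

# New layout: Hive-style key=value directories (.../year=2019/month=6/...),
# so listing the field names is unnecessary; pass "hive" if you want the
# partition columns derived from the directory names.
new = ds.dataset("s3://voltrondata-labs-datasets/nyc-taxi/", partitioning="hive")
print(new.schema)
```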


Typically, you will want to customize the connection parameters, and then
a file system object can be created and passed to the ``filesystem`` keyword:
@@ -365,8 +365,7 @@ a file system object can be created and passed to the ``filesystem`` keyword:
from pyarrow import fs

s3 = fs.S3FileSystem(region="us-east-2")
- dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=s3,
-                      partitioning=["year", "month"])
+ dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=s3)

The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and
:class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more
@@ -387,8 +386,7 @@ useful for testing or benchmarking.

# By default, MinIO will listen for unencrypted HTTP traffic.
minio = fs.S3FileSystem(scheme="http", endpoint_override="localhost:9000")
- dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=minio,
-                      partitioning=["year", "month"])
+ dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=minio)


Working with Parquet Datasets
4 changes: 2 additions & 2 deletions python/pyarrow/_s3fs.pyx
@@ -74,8 +74,8 @@ def resolve_s3_region(bucket):

Examples
--------
- >>> fs.resolve_s3_region('registry.opendata.aws')
- 'us-east-1'
+ >>> fs.resolve_s3_region('voltrondata-labs-datasets')
+ 'us-east-2'
"""
cdef:
c_string c_bucket
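Editor's aside (not part of the diff): a short sketch of where this helper typically fits, since `S3FileSystem` wants the bucket's region up front. The bucket name is the public one used throughout this PR.

```python
from pyarrow import fs

# Resolve the region of a public bucket, then open an anonymous S3
# filesystem pointed at that region.
region = fs.resolve_s3_region("voltrondata-labs-datasets")  # expected: "us-east-2"
s3 = fs.S3FileSystem(anonymous=True, region=region)
info = s3.get_file_info("voltrondata-labs-datasets/nyc-taxi")
print(info.type)  # FileType.Directory when the prefix exists
```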
12 changes: 7 additions & 5 deletions python/pyarrow/tests/test_fs.py
@@ -1616,15 +1616,17 @@ def test_s3_real_aws():
assert fs.region == default_region

fs = S3FileSystem(anonymous=True, region='us-east-2')
- entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
+ entries = fs.get_file_info(FileSelector(
+     'voltrondata-labs-datasets/nyc-taxi'))
assert len(entries) > 0
- with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
+ key = 'voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/part-0.parquet'
+ with fs.open_input_stream(key) as f:
md = f.metadata()
assert 'Content-Type' in md
- assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
+ assert md['Last-Modified'] == b'2022-07-12T23:32:00Z'
# For some reason, the header value is quoted
# (both with AWS and Minio)
- assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
+ assert md['ETag'] == b'"4c6a76826a695c6ac61592bc30cda3df-16"'


@pytest.mark.s3
@@ -1653,7 +1655,7 @@ def test_s3_real_aws_region_selection():
@pytest.mark.s3
def test_resolve_s3_region():
from pyarrow.fs import resolve_s3_region
- assert resolve_s3_region('ursa-labs-taxi-data') == 'us-east-2'
+ assert resolve_s3_region('voltrondata-labs-datasets') == 'us-east-2'
assert resolve_s3_region('mf-nwp-models') == 'eu-west-1'

with pytest.raises(ValueError, match="Not a valid bucket name"):
1 change: 1 addition & 0 deletions r/.gitignore
@@ -18,6 +18,7 @@ vignettes/nyc-taxi/
arrow_*.tar.gz
arrow_*.tgz
extra-tests/files
+ .deps
Member Author (@wjones127, Jul 19, 2022):
I added this because it got a lot of files in it while building the R doc site; lmk if there's a good reason to not add it.

Member:
No objection; what creates this?

Member Author:
When I run pkgdown::build_site() within RStudio, it creates this directory during the "Installing package into temporary library" step.


# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here.
/tools/cpp/
1 change: 1 addition & 0 deletions r/NAMESPACE
@@ -301,6 +301,7 @@ export(float)
export(float16)
export(float32)
export(float64)
+ export(gs_bucket)
export(halffloat)
export(hive_partition)
export(infer_type)
44 changes: 43 additions & 1 deletion r/R/filesystem.R
@@ -155,6 +155,26 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F
#' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete
#' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`).
#'
+ #' `GcsFileSystem$create()` optionally takes arguments:
+ #'
+ #' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up
+ #' credentials using standard GCS configuration methods.
+ #' - `access_token`: optional string for authentication. Should be provided along
+ #' with `expiration`
+ #' - `expiration`: optional date representing point at which `access_token` will
+ #' expire.
+ #' - `json_credentials`: optional string for authentication. Point to a JSON
+ #' credentials file downloaded from GCS.
+ #' - `endpoint_override`: if non-empty, will connect to provided host name / port,
+ #' such as "localhost:9001", instead of default GCS ones. This is primarily useful
+ #' for testing purposes.
+ #' - `scheme`: connection transport (default "https")
+ #' - `default_bucket_location`: the default location (or "region") to create new
+ #' buckets in.
+ #' - `retry_limit_seconds`: the maximum amount of time to spend retrying if
+ #' the filesystem encounters errors. Default is 15 seconds.
+ #' - `default_metadata`: default metadata to write in new objects.
+ #'
#' @section Methods:
#'
#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character
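Editor's aside (not part of the diff): the options documented above have rough counterparts in pyarrow's GcsFileSystem. The sketch below is illustrative only; the Python parameter names (`anonymous`, `scheme`, `endpoint_override`) are assumptions that may differ across releases, and "some-bucket" is a placeholder.

```python
from pyarrow import fs

# Anonymous access against a local GCS emulator, mirroring the
# anonymous, scheme, and endpoint_override options listed above.
gcs = fs.GcsFileSystem(
    anonymous=True,
    scheme="http",
    endpoint_override="localhost:9001",
)
print(gcs.get_file_info(fs.FileSelector("some-bucket", allow_not_found=True)))
```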
@@ -426,7 +446,7 @@ default_s3_options <- list(
#' relative path. Note that this function's success does not guarantee that you
#' are authorized to access the bucket's contents.
#' @examplesIf FALSE
- #' bucket <- s3_bucket("ursa-labs-taxi-data")
+ #' bucket <- s3_bucket("voltrondata-labs-datasets")
#' @export
s3_bucket <- function(bucket, ...) {
assert_that(is.string(bucket))
@@ -448,6 +468,28 @@ s3_bucket <- function(bucket, ...) {
SubTreeFileSystem$create(fs_and_path$path, fs)
}

+ #' Connect to a Google Cloud Storage (GCS) bucket
+ #'
+ #' `gs_bucket()` is a convenience function to create a `GcsFileSystem` object
+ #' that holds onto its relative path
+ #'
+ #' @param bucket string GCS bucket name or path
+ #' @param ... Additional connection options, passed to `GcsFileSystem$create()`
+ #' @return A `SubTreeFileSystem` containing a `GcsFileSystem` and the bucket's
+ #' relative path. Note that this function's success does not guarantee that you
+ #' are authorized to access the bucket's contents.
+ #' @examplesIf FALSE
+ #' bucket <- gs_bucket("voltrondata-labs-datasets")
+ #' @export
+ gs_bucket <- function(bucket, ...) {
+ assert_that(is.string(bucket))
+ args <- list2(...)
+
+ fs <- exec(GcsFileSystem$create, !!!args)
+
+ SubTreeFileSystem$create(bucket, fs)
+ }
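
Editor's aside (not part of the diff): for readers more familiar with the Python side, a rough pyarrow equivalent of what `gs_bucket()` assembles (a GCS filesystem wrapped in a bucket-rooted SubTreeFileSystem), assuming the bucket is publicly readable; the Python-side parameter names are assumptions.

```python
from pyarrow import fs

# gs_bucket("voltrondata-labs-datasets") roughly corresponds to wrapping a
# GcsFileSystem in a SubTreeFileSystem rooted at the bucket name.
gcs = fs.GcsFileSystem(anonymous=True)
bucket = fs.SubTreeFileSystem("voltrondata-labs-datasets", gcs)

# Paths are now relative to the bucket root.
print(bucket.get_file_info(fs.FileSelector("nyc-taxi")))
```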

#' @usage NULL
#' @format NULL
#' @rdname FileSystem
3 changes: 2 additions & 1 deletion r/_pkgdown.yml
@@ -90,7 +90,7 @@ navbar:
href: articles/install.html
- text: Working with Arrow Datasets and dplyr
href: articles/dataset.html
- - text: Working with Cloud Storage (S3)
+ - text: Working with Cloud Storage (S3, GCS)
href: articles/fs.html
- text: Apache Arrow in Python and R with reticulate
href: articles/python.html
@@ -198,6 +198,7 @@ reference:
- title: File systems
contents:
- s3_bucket
+ - gs_bucket
- FileSystem
- FileInfo
- FileSelector
21 changes: 21 additions & 0 deletions r/man/FileSystem.Rd
27 changes: 27 additions & 0 deletions r/man/gs_bucket.Rd
2 changes: 1 addition & 1 deletion r/man/s3_bucket.Rd

(Generated .Rd documentation files; diffs not rendered.)

21 changes: 17 additions & 4 deletions r/tests/testthat/test-filesystem.R
@@ -147,7 +147,7 @@ test_that("FileSystem$from_uri", {
skip_on_cran()
skip_if_not_available("s3")
skip_if_offline()
- fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data")
+ fs_and_path <- FileSystem$from_uri("s3://voltrondata-labs-datasets")
expect_r6_class(fs_and_path$fs, "S3FileSystem")
expect_identical(fs_and_path$fs$region, "us-east-2")
})
@@ -156,11 +156,11 @@ test_that("SubTreeFileSystem$create() with URI", {
skip_on_cran()
skip_if_not_available("s3")
skip_if_offline()
- fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+ fs <- SubTreeFileSystem$create("s3://voltrondata-labs-datasets")
expect_r6_class(fs, "SubTreeFileSystem")
expect_identical(
capture.output(print(fs)),
- "SubTreeFileSystem: s3://ursa-labs-taxi-data/"
+ "SubTreeFileSystem: s3://voltrondata-labs-datasets/"
)
})

@@ -187,6 +187,19 @@ test_that("s3_bucket", {
capture.output(print(bucket)),
"SubTreeFileSystem: s3://ursa-labs-r-test/"
)
skip_on_os("windows") # FIXME
expect_identical(bucket$base_path, "ursa-labs-r-test/")
})

+ test_that("gs_bucket", {
+ skip_on_cran()
+ skip_if_not_available("gcs")
+ skip_if_offline()
+ bucket <- gs_bucket("voltrondata-labs-datasets")
+ expect_r6_class(bucket, "SubTreeFileSystem")
+ expect_r6_class(bucket$base_fs, "GcsFileSystem")
+ expect_identical(
+ capture.output(print(bucket)),
+ "SubTreeFileSystem: gs://voltrondata-labs-datasets/"
+ )
+ expect_identical(bucket$base_path, "voltrondata-labs-datasets/")
+ })