-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-5512: [C++] Rough API skeleton for C++ Datasets API / framework #4483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
74bd283
01c4279
895a03e
20b8f4b
ceec07b
68712f8
2f6440a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
|
|
||
| add_custom_target(arrow_dataset) | ||
|
|
||
| # Headers: top level ("arrow/dataset" is passed to arrow_install_all_headers) | ||
| arrow_install_all_headers("arrow/dataset") | ||
|
|
||
| set(ARROW_DATASET_SRCS scanner.cc) | ||
|
|
||
| add_arrow_lib(arrow_dataset | ||
| OUTPUTS | ||
| ARROW_DATASET_LIBRARIES | ||
| SOURCES | ||
| ${ARROW_DATASET_SRCS} | ||
| SHARED_LINK_LIBS | ||
| arrow_shared | ||
| STATIC_LINK_LIBS | ||
| arrow_static) | ||
|
|
||
| if(ARROW_DATASET_TEST_LINKAGE STREQUAL "static") | ||
| set(ARROW_DATASET_TEST_LINK_LIBS arrow_dataset_static ${ARROW_TEST_STATIC_LINK_LIBS}) | ||
| else() | ||
| set(ARROW_DATASET_TEST_LINK_LIBS arrow_dataset_shared ${ARROW_TEST_SHARED_LINK_LIBS}) | ||
| endif() | ||
|
|
||
| foreach(LIB_TARGET ${ARROW_DATASET_LIBRARIES}) | ||
| target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_DS_EXPORTING) | ||
| endforeach() | ||
|
|
||
| if(NOT WIN32) | ||
| add_arrow_test(file_test | ||
| EXTRA_LINK_LIBS | ||
| ${ARROW_DATASET_TEST_LINK_LIBS} | ||
| PREFIX | ||
| "arrow-dataset" | ||
| LABELS | ||
| "arrow_dataset") | ||
| endif() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| <!--- | ||
| Licensed to the Apache Software Foundation (ASF) under one | ||
| or more contributor license agreements. See the NOTICE file | ||
| distributed with this work for additional information | ||
| regarding copyright ownership. The ASF licenses this file | ||
| to you under the Apache License, Version 2.0 (the | ||
| "License"); you may not use this file except in compliance | ||
| with the License. You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, | ||
| software distributed under the License is distributed on an | ||
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| KIND, either express or implied. See the License for the | ||
| specific language governing permissions and limitations | ||
| under the License. | ||
| --> | ||
|
|
||
| # Arrow C++ Datasets | ||
|
|
||
| The `arrow::dataset` subcomponent provides an API to read and write | ||
| semantic datasets stored in different locations and formats. It | ||
| facilitates parallel processing of datasets spread across different | ||
| physical files and serialization formats. Other concerns such as | ||
| partitioning, filtering (partition- and column-level), and schema | ||
| normalization are also addressed. | ||
|
|
||
| ## Development Status | ||
|
|
||
| Pre-alpha as of June 2019. API subject to change without notice. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include "arrow/dataset/dataset.h" | ||
| #include "arrow/dataset/discovery.h" | ||
| #include "arrow/dataset/file_base.h" | ||
| #include "arrow/dataset/file_csv.h" | ||
| #include "arrow/dataset/file_feather.h" | ||
| #include "arrow/dataset/file_parquet.h" | ||
| #include "arrow/dataset/scanner.h" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/dataset/type_fwd.h" | ||
| #include "arrow/dataset/visibility.h" | ||
|
|
||
| namespace arrow { | ||
| namespace dataset { | ||
|
|
||
| /// \brief A granular piece of a Dataset (for example an individual file) | ||
| /// that can be read/scanned separately from the other fragments | ||
| class ARROW_DS_EXPORT DataFragment { | ||
|
||
| public: | ||
| virtual ~DataFragment() = default; | ||
|
|
||
| /// \brief Return true if the fragment can benefit from parallel | ||
| /// scanning; pure virtual, so each concrete fragment type decides | ||
|
||
| virtual bool splittable() const = 0; | ||
|
|
||
| /// \brief Partition options, carried in a ScanOptions, to use when | ||
| /// scanning this fragment. The returned pointer may be nullptr | ||
| virtual std::shared_ptr<ScanOptions> scan_options() const = 0; | ||
| }; | ||
|
|
||
| /// \brief Conditions applied to a dataset at read time, e.g. to include or | ||
| /// exclude fragments or to filter out rows; currently holds only `filters` | ||
| struct DataSelector { | ||
| std::vector<std::shared_ptr<Filter>> filters; | ||
|
|
||
| // TODO(wesm): Select specific partition keys, file path globs, or | ||
| // other common desirable selections | ||
| }; | ||
|
|
||
| /// \brief A basic component of a Dataset which yields zero or more | ||
| /// DataFragments; GetFragments() receives the DataSelector to apply | ||
| class ARROW_DS_EXPORT DataSource { | ||
| public: | ||
| virtual ~DataSource() = default; | ||
|
|
||
| virtual std::string type() const = 0; | ||
|
|
||
| virtual std::unique_ptr<DataFragmentIterator> GetFragments( | ||
| const DataSelector& selector) = 0; | ||
| }; | ||
|
|
||
| /// \brief A DataSource consisting of a flat sequence of DataFragments. NOTE(review): type() is not overridden here, so this class is still abstract as declared - confirm | ||
|
||
| class ARROW_DS_EXPORT SimpleDataSource : public DataSource { | ||
| public: | ||
| std::unique_ptr<DataFragmentIterator> GetFragments( | ||
| const DataSelector& selector) override; | ||
|
|
||
| private: | ||
| DataFragmentVector fragments_; | ||
| }; | ||
|
|
||
| /// \brief Top-level interface for a Dataset with fragments coming | ||
| /// from possibly multiple sources, all conforming to one schema. NOTE(review): derives from enable_shared_from_this, so shared_from_this() is only usable once a shared_ptr owns the object - confirm against the unique_ptr returned by ReplaceSchema | ||
| class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> { | ||
| public: | ||
| /// \param[in] source a single input data source | ||
| /// \param[in] schema a known schema to conform to; may be nullptr (the default) | ||
| explicit Dataset(std::shared_ptr<DataSource> source, | ||
| std::shared_ptr<Schema> schema = NULLPTR); | ||
|
|
||
| /// \param[in] sources one or more input data sources | ||
| /// \param[in] schema a known schema to conform to; may be nullptr (the default) | ||
| explicit Dataset(const std::vector<std::shared_ptr<DataSource>>& sources, | ||
|
||
| std::shared_ptr<Schema> schema = NULLPTR); | ||
|
|
||
| virtual ~Dataset() = default; | ||
|
|
||
| /// \brief Begin to build a new Scan operation against this Dataset; the ScannerBuilder is returned by value | ||
| ScannerBuilder NewScan() const; | ||
|
|
||
| const std::vector<std::shared_ptr<DataSource>>& sources() const { return sources_; } | ||
|
|
||
| std::shared_ptr<Schema> schema() const { return schema_; } | ||
|
|
||
| /// \brief Compute the consensus schema from the input data sources (result passed back through `out`) | ||
|
||
| Status InferSchema(std::shared_ptr<Schema>* out); | ||
|
|
||
| /// \brief Produce a copy of this Dataset with a new target schema (copy passed back through `out`) | ||
| Status ReplaceSchema(std::shared_ptr<Schema> schema, std::unique_ptr<Dataset>* out); | ||
|
|
||
| protected: | ||
| // Schema that the data sources must conform their output to (with | ||
| // projections and filters taken into account) | ||
| std::shared_ptr<Schema> schema_; | ||
|
|
||
| std::vector<std::shared_ptr<DataSource>> sources_; | ||
| }; | ||
|
|
||
| } // namespace dataset | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| /// Logic for automatically determining the structure of a multi-file | ||
| /// dataset, with possible partitioning, according to the available | ||
| /// partition schemes | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
|
|
||
| #include "arrow/dataset/type_fwd.h" | ||
| #include "arrow/dataset/visibility.h" | ||
| #include "arrow/util/macros.h" | ||
|
|
||
| namespace arrow { | ||
| namespace dataset { | ||
|
|
||
| struct ARROW_DS_EXPORT DiscoveryOptions { | ||
| std::shared_ptr<FileFormat> format = NULLPTR; | ||
| std::shared_ptr<PartitionScheme> partition_scheme = NULLPTR; | ||
| }; | ||
|
|
||
| /// \brief Discover a DataSource using `path` as a root directory. NOTE(review): presumably `options` supplies the file format / partition scheme (both default to null) and the result is passed back through `out` - confirm | ||
|
||
| ARROW_DS_EXPORT | ||
| Status DiscoverSource(const std::string& path, fs::FileSystem* filesystem, | ||
| const DiscoveryOptions& options, std::shared_ptr<DataSource>* out); | ||
|
|
||
| } // namespace dataset | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
|
|
||
| #include "arrow/dataset/type_fwd.h" | ||
| #include "arrow/type_fwd.h" | ||
|
|
||
| namespace arrow { | ||
| namespace dataset { | ||
|
|
||
| /// \brief Loads a previously-written collection of Arrow protocol files | ||
| /// and exposes them in a way that can be consumed as a Dataset source. | ||
| /// NOTE(review): the base class is spelled DatasetSource, while the interface declared elsewhere in this change is DataSource - confirm which is intended | ||
| class ARROW_DS_EXPORT DiskStoreReader : public DatasetSource { | ||
| public: | ||
| DiskStoreReader(const std::string& path, fs::FileSystem* filesystem); | ||
|
|
||
| private: | ||
| class DiskStoreReaderImpl; | ||
| std::unique_ptr<DiskStoreReaderImpl> impl_; | ||
|
|
||
| std::string path_; | ||
| fs::FileSystem* filesystem_; | ||
|
|
||
| DiskStoreReader() {} | ||
| }; | ||
|
|
||
| /// \brief Writer exposing Write() for one RecordBatch per call. NOTE(review): the only constructor is private and no factory is declared here - confirm how instances are created | ||
| class ARROW_DS_EXPORT DiskStoreWriter { | ||
| public: | ||
| Status Write(const RecordBatch& batch); | ||
|
||
|
|
||
| private: | ||
| DiskStoreWriter() {} | ||
| }; | ||
|
|
||
| } // namespace dataset | ||
| } // namespace arrow | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't really understand what this class is used for. Can the user do something with it, apart from inspecting its properties?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a data structure for representing the internal structure of the dataset. Users in general will not have to interact with this data structure unless they wish to explore the physical topology of the dataset (e.g. iterating through partitions and listing files).