ArrowDataset(
*,
path,
version=None,
data_files=None,
load_args=None,
save_args=None,
credentials=None,
fs_args=None,
metadata=None
)
Bases: FilesystemDataset
ArrowDataset loads/saves Hugging Face Dataset and
DatasetDict objects to/from disk in
`Arrow <https://huggingface.co/docs/datasets/about_arrow>`_ format
using save_to_disk / load_from_disk.
Saving IterableDataset or IterableDatasetDict objects is not
supported and will raise a DatasetError. Materialize the iterable
dataset into a Dataset or DatasetDict before saving.
Examples:
Using the
YAML API:
reviews:
  type: huggingface.ArrowDataset
  path: data/01_raw/reviews
Using the
Python API:
>>> from datasets import Dataset
>>> from kedro_datasets.huggingface.arrow_dataset import (
... ArrowDataset,
... )
>>>
>>> data = Dataset.from_dict(
... {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
... )
>>>
>>> dataset = ArrowDataset(
... path=tmp_path / "test_hf_dataset"
... )
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>> assert reloaded.to_dict() == data.to_dict()
Source code in kedro_datasets/huggingface/_base.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
def __init__(  # noqa: PLR0913
    self,
    *,
    path: str | os.PathLike,
    version: Version | None = None,
    data_files: dict[str, str] | None = None,
    load_args: dict[str, Any] | None = None,
    save_args: dict[str, Any] | None = None,
    credentials: dict[str, Any] | None = None,
    fs_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Initialise a ``FilesystemDataset`` and its backing filesystem.

    Args:
        path: File or directory where Hugging Face datasets are
            persisted. May be a local path, an ``os.PathLike`` object
            or a remote URI (e.g. ``s3://bucket/data``).
        version: Optional versioning configuration
            (see :class:`~kedro.io.core.Version`).
        data_files: Split-name-to-filename mapping used when loading or
            saving a ``DatasetDict`` from a directory
            (e.g. ``{"train": "train.csv"}``). Keys must match the split
            names of the ``DatasetDict`` being saved, and filenames must
            carry the right extension for the format
            (e.g. ``.csv`` for ``CSVDataset``).
        load_args: Extra keyword arguments for the underlying load
            function. ``data_files`` is not allowed here; use the
            top-level ``data_files`` argument instead.
        save_args: Extra keyword arguments for the underlying save
            function. ``data_files`` is not allowed here; use the
            top-level ``data_files`` argument instead.
        credentials: Credentials for the underlying filesystem
            (e.g. ``key``/``secret`` for S3). Forwarded to the
            ``fsspec`` filesystem and included in the stored
            ``storage_options``.
        fs_args: Extra arguments for the ``fsspec`` filesystem
            initialiser, also included in the stored
            ``storage_options``.
        metadata: Arbitrary metadata. Ignored by Kedro but available to
            users and external plugins.

    Raises:
        DatasetError: If ``data_files`` is supplied inside ``load_args``
            or ``save_args``.
    """
    # Work on copies so the caller's dictionaries are never mutated.
    fs_options = deepcopy(fs_args) or {}
    auth = deepcopy(credentials) or {}

    self._protocol, resolved_path = get_protocol_and_path(path, version)
    if self._protocol == "file":
        # Default to creating missing directories on the local
        # filesystem unless the caller already chose a value.
        fs_options.setdefault("auto_mkdir", True)
    self._fs = fsspec.filesystem(self._protocol, **auth, **fs_options)

    self._load_args = deepcopy(load_args or {})
    self._save_args = deepcopy(save_args or {})
    if any(
        "data_files" in extra_args
        for extra_args in (self._load_args, self._save_args)
    ):
        msg = (
            f"{type(self).__name__} got ``data_files`` in ``load_args`` "
            "or ``save_args``. Pass it as a top-level argument instead."
        )
        raise DatasetError(msg)

    self._data_files = deepcopy(data_files)
    self.metadata = metadata

    # An empty merge collapses to ``None`` rather than an empty dict.
    merged_options = {**auth, **fs_options}
    self._storage_options = merged_options if merged_options else None

    super().__init__(
        filepath=PurePosixPath(resolved_path),
        version=version,
        exists_function=self._fs.exists,
        glob_function=self._fs.glob,
    )
|
EXTENSION
class-attribute
_exists
Source code in kedro_datasets/huggingface/arrow_dataset.py
75
76
77
78
79
80
81
82
83
def _exists(self) -> bool:
    """Report whether a saved Arrow dataset is present at the load path.

    Returns:
        ``False`` when resolving the load path raises ``DatasetError``;
        otherwise whether that path is a directory containing either a
        ``dataset_dict.json`` or a ``dataset_info.json`` file.
    """
    try:
        location = get_filepath_str(self._get_load_path(), self._protocol)
    except DatasetError:
        # An unresolvable load path is reported as "does not exist".
        return False

    marker_files = ("dataset_dict.json", "dataset_info.json")
    return self._fs.isdir(location) and any(
        self._fs.exists(f"{location}/{marker}") for marker in marker_files
    )
|
_load_dataset
Source code in kedro_datasets/huggingface/arrow_dataset.py
def _load_dataset(self, load_path: str) -> DatasetLike:
    """Read the dataset stored at ``load_path`` using ``load_from_disk``.

    Args:
        load_path: Location handed to ``load_from_disk``.

    Returns:
        Whatever ``load_from_disk`` returns for that location.
    """
    storage_options = self._storage_options
    # ``load_args`` are expanded last, exactly as configured by the user.
    return load_from_disk(
        load_path, storage_options=storage_options, **self._load_args
    )
|
_save_dataset
_save_dataset(data, save_path)
Source code in kedro_datasets/huggingface/arrow_dataset.py
| def _save_dataset(self, data: Dataset, save_path: str) -> None:
data.save_to_disk(
save_path,
storage_options=self._storage_options,
**self._save_args,
)
|
_save_dataset_dict
_save_dataset_dict(data, save_path)
Source code in kedro_datasets/huggingface/arrow_dataset.py
| def _save_dataset_dict(self, data: DatasetDict, save_path: str) -> None:
data.save_to_disk(
save_path,
storage_options=self._storage_options,
**self._save_args,
)
|