ArrowDataset¶

ArrowDataset loads and saves Hugging Face datasets in Arrow format using the datasets library.

kedro_datasets.huggingface.ArrowDataset ¶

ArrowDataset(
    *,
    path,
    version=None,
    data_files=None,
    load_args=None,
    save_args=None,
    credentials=None,
    fs_args=None,
    metadata=None
)

Bases: FilesystemDataset

ArrowDataset loads/saves Hugging Face Dataset and DatasetDict objects to/from disk in [Arrow](https://huggingface.co/docs/datasets/about_arrow) format using save_to_disk / load_from_disk.

Saving IterableDataset or IterableDatasetDict objects is not supported and will raise a DatasetError. Materialize the iterable dataset into a Dataset or DatasetDict before saving.

Examples:

Using the YAML API:

reviews:
  type: huggingface.ArrowDataset
  path: data/01_raw/reviews

Using the Python API:

>>> from datasets import Dataset
>>> from kedro_datasets.huggingface.arrow_dataset import (
...     ArrowDataset,
... )
>>>
>>> data = Dataset.from_dict(
...     {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
... )
>>>
>>> dataset = ArrowDataset(
...     path=tmp_path / "test_hf_dataset"
... )
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>> assert reloaded.to_dict() == data.to_dict()

Source code in kedro_datasets/huggingface/_base.py

def __init__(  # noqa: PLR0913
    self,
    *,
    path: str | os.PathLike,
    version: Version | None = None,
    data_files: dict[str, str] | None = None,
    load_args: dict[str, Any] | None = None,
    save_args: dict[str, Any] | None = None,
    credentials: dict[str, Any] | None = None,
    fs_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Set up a ``FilesystemDataset`` bound to ``path``.

    Args:
        path: Where the Hugging Face dataset is persisted: a file or
            directory given as a local path, an ``os.PathLike`` object,
            or a remote URI such as ``s3://bucket/data``.
        version: Optional versioning configuration
            (see :class:`~kedro.io.core.Version`).
        data_files: Split-name to filename mapping used when loading
            and saving a ``DatasetDict`` from a directory
            (e.g. ``{"train": "train.csv"}``). Keys must match the split
            names of the ``DatasetDict`` being saved, and filenames must
            carry the extension of the format in use
            (e.g. ``.csv`` for ``CSVDataset``).
        load_args: Extra keyword arguments forwarded to the underlying
            load function. Must not contain ``data_files``; use the
            top-level ``data_files`` argument instead.
        save_args: Extra keyword arguments forwarded to the underlying
            save function. Must not contain ``data_files``; use the
            top-level ``data_files`` argument instead.
        credentials: Credentials for the underlying filesystem
            (e.g. ``key``/``secret`` for S3). Also forwarded as part of
            ``storage_options`` to the ``datasets`` implementation.
        fs_args: Extra arguments for the ``fsspec`` filesystem
            initialiser. Also forwarded as part of ``storage_options``
            to the ``datasets`` implementation.
        metadata: Arbitrary metadata. Ignored by Kedro, but available to
            users and external plugins.

    Raises:
        DatasetError: If ``data_files`` appears in ``load_args`` or
            ``save_args``.
    """
    # Work on private copies so the caller's dictionaries are never mutated.
    fs_options = deepcopy(fs_args)
    if not fs_options:
        fs_options = {}
    auth_options = deepcopy(credentials)
    if not auth_options:
        auth_options = {}

    self._protocol, location = get_protocol_and_path(path, version)

    # Local targets get their parent directories created on demand,
    # unless the caller explicitly configured ``auto_mkdir``.
    if self._protocol == "file":
        fs_options.setdefault("auto_mkdir", True)

    self._fs = fsspec.filesystem(self._protocol, **auth_options, **fs_options)

    self._load_args = deepcopy(load_args if load_args else {})
    self._save_args = deepcopy(save_args if save_args else {})

    # ``data_files`` is only accepted as a top-level argument.
    if any(
        "data_files" in extra_args
        for extra_args in (self._load_args, self._save_args)
    ):
        raise DatasetError(
            f"{type(self).__name__} got ``data_files`` in ``load_args`` "
            "or ``save_args``. Pass it as a top-level argument instead."
        )

    self._data_files = deepcopy(data_files)
    self.metadata = metadata

    # Filesystem arguments override credentials on key clashes; an empty
    # mapping is normalised to ``None``.
    combined_options = dict(auth_options)
    combined_options.update(fs_options)
    self._storage_options = combined_options if combined_options else None

    super().__init__(
        filepath=PurePosixPath(location),
        version=version,
        exists_function=self._fs.exists,
        glob_function=self._fs.glob,
    )

BUILDER `class-attribute` ¶

BUILDER = 'arrow'

EXTENSION `class-attribute` ¶

EXTENSION = '.arrow'

_exists ¶

_exists()

Source code in kedro_datasets/huggingface/arrow_dataset.py

def _exists(self) -> bool:
    """Report whether a saved dataset directory is present at the load path.

    Returns:
        ``False`` when resolving the load path raises ``DatasetError`` or
        when the resolved path is not a directory. Otherwise ``True`` only
        if the directory contains a ``dataset_dict.json`` or a
        ``dataset_info.json`` file.
    """
    try:
        location = get_filepath_str(self._get_load_path(), self._protocol)
    except DatasetError:
        return False

    if not self._fs.isdir(location):
        return False

    # Either marker file is enough; they are probed in this order.
    marker_files = ("dataset_dict.json", "dataset_info.json")
    return any(
        self._fs.exists(f"{location}/{marker}") for marker in marker_files
    )

_load_dataset ¶

_load_dataset(load_path)

Source code in kedro_datasets/huggingface/arrow_dataset.py

def _load_dataset(self, load_path: str) -> DatasetLike:
    """Read the dataset stored at ``load_path`` via ``load_from_disk``.

    Args:
        load_path: Location handed to ``load_from_disk``.

    Returns:
        Whatever ``load_from_disk`` returns for ``load_path``.
    """
    storage = self._storage_options
    extra_kwargs = self._load_args
    return load_from_disk(load_path, storage_options=storage, **extra_kwargs)

_save_dataset ¶

_save_dataset(data, save_path)

Source code in kedro_datasets/huggingface/arrow_dataset.py

def _save_dataset(self, data: Dataset, save_path: str) -> None:
    """Write a single ``Dataset`` to ``save_path`` via ``save_to_disk``.

    Args:
        data: Dataset to persist.
        save_path: Destination handed to ``save_to_disk``.
    """
    persist = data.save_to_disk
    persist(save_path, storage_options=self._storage_options, **self._save_args)

_save_dataset_dict ¶

_save_dataset_dict(data, save_path)

Source code in kedro_datasets/huggingface/arrow_dataset.py

def _save_dataset_dict(self, data: DatasetDict, save_path: str) -> None:
    """Write a ``DatasetDict`` to ``save_path`` via ``save_to_disk``.

    Args:
        data: Dataset dictionary to persist.
        save_path: Destination handed to ``save_to_disk``.
    """
    persist = data.save_to_disk
    persist(save_path, storage_options=self._storage_options, **self._save_args)

ArrowDataset¶

kedro_datasets.huggingface.ArrowDataset ¶

BUILDER class-attribute ¶

EXTENSION class-attribute ¶

_exists ¶

_load_dataset ¶

_save_dataset ¶

_save_dataset_dict ¶

BUILDER `class-attribute` ¶

EXTENSION `class-attribute` ¶