Source code for kedro_datasets.huggingface.hugging_face_dataset

from __future__ import annotations

from typing import Any

from datasets import load_dataset
from huggingface_hub import HfApi
from kedro.io import AbstractVersionedDataset


class HFDataset(AbstractVersionedDataset):
    """``HFDataset`` loads Hugging Face datasets
    using the `datasets <https://pypi.org/project/datasets>`_ library.

    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:

    .. code-block:: yaml

        yelp_reviews:
          type: huggingface.HFDataset
          dataset_name: yelp_review_full

    Example usage for the :doc:`Python API <kedro:data/advanced_data_catalog_usage>`:

    .. code-block:: pycon

        >>> from datasets.utils.logging import disable_progress_bar, set_verbosity, ERROR
        >>> disable_progress_bar()  # for doctest to pass
        >>> set_verbosity(ERROR)  # for doctest to pass
        >>> from kedro_datasets.huggingface import HFDataset
        >>> dataset = HFDataset(dataset_name="openai_humaneval")
        >>> ds = dataset.load()  # doctest: +ELLIPSIS
        Downloading and preparing dataset ...
        Dataset ...
        >>> assert "test" in ds
        >>> assert len(ds["test"]) == 164
    """

    def __init__(
        self,
        *,
        dataset_name: str,
        dataset_kwargs: dict[str, Any] | None = None,
        metadata: dict[str, Any] | None = None,
    ):
        """Create a new ``HFDataset``.

        Args:
            dataset_name: Name of the dataset, passed as the first argument
                to ``datasets.load_dataset``.
            dataset_kwargs: Extra keyword arguments forwarded verbatim to
                ``datasets.load_dataset``. ``None`` is treated as ``{}``.
            metadata: Arbitrary metadata stored on the instance as-is; it is
                not read anywhere in this class.
        """
        # NOTE(review): the parent ``__init__`` is not called here, so any
        # state the versioned base class would normally set up is absent --
        # confirm this is intentional.
        self.dataset_name = dataset_name
        self._dataset_kwargs = dataset_kwargs or {}
        self.metadata = metadata

    def load(self) -> Any:
        """Load the dataset through ``datasets.load_dataset``.

        Returns:
            Whatever ``load_dataset`` returns for ``dataset_name`` and the
            configured ``dataset_kwargs``.
        """
        return load_dataset(self.dataset_name, **self._dataset_kwargs)

    def save(self, data: Any = None) -> None:
        """Saving is not supported.

        Args:
            data: Ignored. Accepted so that the conventional
                ``dataset.save(data)`` call reaches the explicit error below
                rather than failing with a ``TypeError`` about arity.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not yet implemented")

    def _describe(self) -> dict[str, Any]:
        """Describe the dataset using the first Hub search hit for its name.

        Returns:
            A dict with ``dataset_name``, ``dataset_tags`` and
            ``dataset_author``. The latter two are ``None`` when the search
            yields no result.
        """
        api = HfApi()
        matches = api.list_datasets(search=self.dataset_name)
        # Take only the first hit instead of materialising every result, and
        # fall back to ``None`` so an empty search cannot raise ``IndexError``.
        dataset_info = next(iter(matches), None)
        return {
            "dataset_name": self.dataset_name,
            "dataset_tags": getattr(dataset_info, "tags", None),
            "dataset_author": getattr(dataset_info, "author", None),
        }

    @staticmethod
    def list_datasets():
        """Return a list of everything ``HfApi().list_datasets()`` yields."""
        api = HfApi()
        return list(api.list_datasets())