Skip to content

HFDataset

HFDataset loads Hugging Face datasets using the datasets library.

kedro_datasets.huggingface.HFDataset

HFDataset(
    *, dataset_name, dataset_kwargs=None, metadata=None
)

Bases: AbstractDataset

HFDataset loads Hugging Face datasets using the datasets library (https://pypi.org/project/datasets).

Examples:

Using the YAML API:

yelp_reviews:
  type: kedro_datasets.huggingface.HFDataset
  dataset_name: yelp_review_full

Using the Python API:

>>> from datasets.utils.logging import ERROR, disable_progress_bar, set_verbosity
>>> from kedro_datasets.huggingface import HFDataset
>>>
>>> disable_progress_bar()  # for doctest to pass
>>> set_verbosity(ERROR)  # for doctest to pass
>>>
>>> dataset = HFDataset(dataset_name="openai_humaneval")
>>> ds = dataset.load()
>>> assert "test" in ds
>>> assert len(ds["test"]) == 164
Source code in kedro_datasets/huggingface/hugging_face_dataset.py
38
39
40
41
42
43
44
45
46
47
def __init__(
    self,
    *,
    dataset_name: str,
    dataset_kwargs: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
):
    self.dataset_name = dataset_name
    self._dataset_kwargs = dataset_kwargs or {}
    self.metadata = metadata

_dataset_kwargs instance-attribute

_dataset_kwargs = dataset_kwargs or {}

dataset_name instance-attribute

dataset_name = dataset_name

metadata instance-attribute

metadata = metadata

_describe

_describe()
Source code in kedro_datasets/huggingface/hugging_face_dataset.py
56
57
58
59
60
61
62
63
def _describe(self) -> dict[str, Any]:
    """Describe this dataset using metadata looked up on the Hugging Face Hub.

    The Hub is searched for ``self.dataset_name`` and the first search hit
    supplies the tags and author.

    Returns:
        A dict with the keys ``"dataset_name"``, ``"dataset_tags"`` and
        ``"dataset_author"``. The tags and author are ``None`` when the
        search yields no results.
    """
    api = HfApi()
    # Only the first hit is used, so pull a single item rather than building
    # a list of every search result; the ``None`` default also avoids an
    # IndexError when nothing matches the search.
    matches = iter(api.list_datasets(search=self.dataset_name))
    dataset_info = next(matches, None)
    if dataset_info is None:
        tags = author = None
    else:
        tags, author = dataset_info.tags, dataset_info.author
    return {
        "dataset_name": self.dataset_name,
        "dataset_tags": tags,
        "dataset_author": author,
    }

list_datasets staticmethod

list_datasets()
Source code in kedro_datasets/huggingface/hugging_face_dataset.py
65
66
67
68
@staticmethod
def list_datasets():
    """Return the result of an unfiltered ``HfApi().list_datasets()`` as a list.

    NOTE(review): with no search filter this presumably enumerates every
    dataset visible on the Hub, which may be slow — confirm before calling
    it on a hot path.
    """
    return [*HfApi().list_datasets()]

load

load()
Source code in kedro_datasets/huggingface/hugging_face_dataset.py
49
50
51
def load(self):
    """Load the dataset named by ``self.dataset_name``.

    Returns:
        Whatever ``load_dataset`` returns for the configured name and the
        keyword arguments stored at construction time.
    """
    extra_options = self._dataset_kwargs
    # TODO: Replace suppression with the solution from here: https://github.com/kedro-org/kedro-plugins/issues/1131
    return load_dataset(self.dataset_name, **extra_options)  # nosec

save

save()
Source code in kedro_datasets/huggingface/hugging_face_dataset.py
53
54
def save(self, data: object = None):
    """Reject any attempt to save; saving is not supported by this dataset.

    Args:
        data: The object a caller wants to save. It is accepted only so
            that a ``save(data)`` call reaches the intended error instead
            of failing with a ``TypeError`` about the argument count; the
            value is ignored.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Not yet implemented")