Skip to content

HFDataset

HFDataset loads Hugging Face datasets using the datasets library.

kedro_datasets.huggingface.HFDataset

HFDataset(*, dataset_name, dataset_kwargs=None, metadata=None)

Bases: AbstractDataset

HFDataset loads Hugging Face datasets using the `datasets <https://pypi.org/project/datasets>`_ library.

Examples:

Using the YAML API:

yelp_reviews:
  type: kedro_datasets.huggingface.HFDataset
  dataset_name: yelp_review_full

Using the Python API:

>>> from datasets.utils.logging import ERROR, disable_progress_bar, set_verbosity
>>> from kedro_datasets.huggingface import HFDataset
>>>
>>> disable_progress_bar()  # for doctest to pass
>>> set_verbosity(ERROR)  # for doctest to pass
>>>
>>> dataset = HFDataset(dataset_name="openai_humaneval")
>>> ds = dataset.load()
Downloading and preparing dataset ...
Dataset ...
>>> assert "test" in ds
>>> assert len(ds["test"]) == 164
Source code in kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
40
41
42
43
44
45
46
47
48
49
def __init__(
    self,
    *,
    dataset_name: str,
    dataset_kwargs: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
):
    self.dataset_name = dataset_name
    self._dataset_kwargs = dataset_kwargs or {}
    self.metadata = metadata

_dataset_kwargs instance-attribute

_dataset_kwargs = dataset_kwargs or {}

dataset_name instance-attribute

dataset_name = dataset_name

metadata instance-attribute

metadata = metadata

_describe

_describe()
Source code in kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
57
58
59
60
61
62
63
64
def _describe(self) -> dict[str, Any]:
    """Describe this dataset using the first Hugging Face Hub search hit.

    Searches the Hub for ``self.dataset_name`` via ``HfApi.list_datasets``
    and reports the tags and author of the first result.

    Returns:
        A dict with the keys ``"dataset_name"``, ``"dataset_tags"`` and
        ``"dataset_author"``. The tags and author are ``None`` when the
        search yields no result.
    """
    api = HfApi()
    # Only the first hit is needed, so pull a single item instead of
    # materialising every search result. The ``None`` default avoids an
    # IndexError when nothing matches the dataset name.
    dataset_info = next(iter(api.list_datasets(search=self.dataset_name)), None)
    if dataset_info is None:
        tags, author = None, None
    else:
        tags, author = dataset_info.tags, dataset_info.author
    return {
        "dataset_name": self.dataset_name,
        "dataset_tags": tags,
        "dataset_author": author,
    }

list_datasets staticmethod

list_datasets()
Source code in kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
66
67
68
69
@staticmethod
def list_datasets():
    """Return everything yielded by ``HfApi().list_datasets()`` as a list."""
    hub_listing = HfApi().list_datasets()
    return [*hub_listing]

load

load()
Source code in kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
51
52
def load(self):
    """Call ``load_dataset`` with the configured name and keyword arguments.

    Returns:
        Whatever ``load_dataset`` returns for ``self.dataset_name`` when
        given ``self._dataset_kwargs`` as keyword arguments.
    """
    name = self.dataset_name
    extra_kwargs = self._dataset_kwargs
    return load_dataset(name, **extra_kwargs)

save

save()
Source code in kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
54
55
def save(self, data: object = None) -> None:
    """Reject save requests; saving is not implemented for this dataset.

    Args:
        data: The object a caller asked to save. It is accepted so that
            calls of the form ``save(data)`` reach the intended error
            rather than failing with a ``TypeError`` about the argument
            count. It is otherwise ignored.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Not yet implemented")