Skip to content

HFTransformerPipelineDataset

HFTransformerPipelineDataset loads pretrained Hugging Face transformers using the transformers library.

kedro_datasets.huggingface.HFTransformerPipelineDataset

HFTransformerPipelineDataset(
    *,
    task=None,
    model_name=None,
    pipeline_kwargs=None,
    metadata=None
)

Bases: AbstractDataset

HFTransformerPipelineDataset loads pretrained Hugging Face transformers using the [transformers](https://pypi.org/project/transformers) library.

Examples:

Using the YAML API:

summarizer_model:
  type: huggingface.HFTransformerPipelineDataset
  task: summarization

fill_mask_model:
  type: huggingface.HFTransformerPipelineDataset
  task: fill-mask
  model_name: Twitter/twhin-bert-base

Using the Python API:

>>> from kedro_datasets.huggingface import HFTransformerPipelineDataset
>>>
>>> dataset = HFTransformerPipelineDataset(
...     task="text-classification", model_name="prajjwal1/bert-tiny"
... )
>>> model = dataset.load()
>>> assert model("Hello world")[0]["label"].startswith("LABEL_")
Source code in kedro_datasets/huggingface/transformer_pipeline_dataset.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def __init__(
    self,
    *,
    task: str | None = None,
    model_name: str | None = None,
    pipeline_kwargs: dict[str, t.Any] | None = None,
    metadata: dict[str, t.Any] | None = None,
):
    """Record which Hugging Face pipeline this dataset should load.

    Args:
        task: Task identifier passed to ``pipeline`` when loading, e.g.
            ``"summarization"``. An empty string is treated as ``None``.
        model_name: Model identifier passed to ``pipeline`` as ``model``.
        pipeline_kwargs: Extra keyword arguments forwarded to ``pipeline``.
            ``"task"`` and ``"model"`` entries are dropped with a
            ``UserWarning``; the dict passed in is never modified.
        metadata: Arbitrary metadata, stored unchanged on the instance.

    Raises:
        ValueError: If neither a non-empty ``task`` nor a ``model_name``
            is supplied.
    """
    # Normalise first: an empty task string means "not given", so it must
    # not be allowed to slip past the validation below.
    normalized_task = task if task else None
    if normalized_task is None and model_name is None:
        raise ValueError("At least 'task' or 'model_name' are needed")

    self._task = normalized_task
    self._model_name = model_name
    # Shallow-copy so that dropping reserved keys never mutates the
    # dictionary owned by the caller.
    self._pipeline_kwargs = dict(pipeline_kwargs) if pipeline_kwargs else {}
    self.metadata = metadata

    # 'task' and 'model' are supplied through their dedicated arguments;
    # duplicates inside pipeline_kwargs would collide when loading.
    reserved = [key for key in ("task", "model") if key in self._pipeline_kwargs]
    if reserved:
        warn(
            "Specifying 'task' or 'model' in 'pipeline_kwargs' is not allowed",
            UserWarning,
            stacklevel=2,
        )
        for key in reserved:
            del self._pipeline_kwargs[key]

_model_name instance-attribute

_model_name = model_name

_pipeline_kwargs instance-attribute

_pipeline_kwargs = pipeline_kwargs or {}

_task instance-attribute

_task = task if task else None

metadata instance-attribute

metadata = metadata

_describe

_describe()
Source code in kedro_datasets/huggingface/transformer_pipeline_dataset.py
71
72
73
74
75
76
def _describe(self) -> dict[str, t.Any]:
    """Summarise the configuration held by this dataset.

    Returns:
        A dict with the stored task, model name and pipeline keyword
        arguments under the keys ``"task"``, ``"model_name"`` and
        ``"pipeline_kwargs"``.
    """
    description: dict[str, t.Any] = dict(
        task=self._task,
        model_name=self._model_name,
        pipeline_kwargs=self._pipeline_kwargs,
    )
    return description

load

load()
Source code in kedro_datasets/huggingface/transformer_pipeline_dataset.py
65
66
def load(self) -> Pipeline:
    """Build the pipeline described by this dataset.

    Returns:
        Whatever ``pipeline`` returns for the stored task, model name and
        extra keyword arguments.
    """
    extra_kwargs = self._pipeline_kwargs
    return pipeline(self._task, model=self._model_name, **extra_kwargs)

save

save(pipeline)
Source code in kedro_datasets/huggingface/transformer_pipeline_dataset.py
68
69
def save(self, pipeline: Pipeline) -> None:
    """Reject any attempt to save; saving is not supported.

    Args:
        pipeline: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    message = "Not yet implemented"
    raise NotImplementedError(message)