Skip to content

PptxDataset

PptxDataset loads and saves PowerPoint presentations in the .pptx format.

kedro_datasets.openxml.PptxDataset

PptxDataset(
    *,
    filepath,
    version=None,
    credentials=None,
    fs_args=None,
    metadata=None
)

Bases: AbstractVersionedDataset[Presentation, Presentation]

PptxDataset loads/saves data from/to a .pptx file using an underlying filesystem (e.g.: local, S3, GCS). It uses Presentation from python-pptx to handle the .pptx file.

Examples:

Using the YAML API:

presentation:
  type: openxml.PptxDataset
  filepath: slides.pptx

Using the Python API:

>>> from pptx import Presentation
>>> from kedro_datasets.openxml import PptxDataset
>>>
>>> data = Presentation()
>>> slide = data.slides.add_slide(data.slide_layouts[0])
>>> title = slide.shapes.title
>>> title.text = "Hello, World!"
>>>
>>> dataset = PptxDataset(filepath=tmp_path / "test.pptx")
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>> assert reloaded.slides[0].shapes.title.text == "Hello, World!"

Parameters:

  • filepath (str) –

    Filepath in POSIX format to a .pptx file prefixed with a protocol like s3://. If prefix is not provided, file protocol (local filesystem) will be used. The prefix should be any protocol supported by fsspec. Note: http(s) doesn't support versioning.

  • version (Version | None, default: None ) –

    If specified, should be an instance of kedro.io.core.Version. If its load attribute is None, the latest version will be loaded. If its save attribute is None, save version will be autogenerated.

  • credentials (dict[str, Any] | None, default: None ) –

    Credentials required to get access to the underlying filesystem. E.g. for GCSFileSystem it should look like {"token": None}.

  • fs_args (dict[str, Any] | None, default: None ) –

    Extra arguments to pass into underlying filesystem class constructor (e.g. {"project": "my-project"} for GCSFileSystem), as well as to pass to the filesystem's open method through nested keys open_args_load and open_args_save. All available arguments for open are listed at https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open (all defaults are preserved, except mode, which is set to wb when saving).

  • metadata (dict[str, Any] | None, default: None ) –

    Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins.

Source code in kedro_datasets/openxml/pptx_dataset.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def __init__(  # noqa: PLR0913
    self,
    *,
    filepath: str,
    version: Version | None = None,
    credentials: dict[str, Any] | None = None,
    fs_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Builds a ``PptxDataset`` bound to one .pptx file on one filesystem.

    Args:
        filepath: POSIX-style path to the .pptx file, optionally prefixed
            with a protocol such as `s3://`. Without a prefix the `file`
            protocol (local filesystem) is used. Any protocol supported by
            ``fsspec`` is accepted. Note: `http(s)` doesn't support versioning.
        version: Optional ``kedro.io.core.Version`` instance. When its
            ``load`` attribute is None the latest version is loaded; when
            its ``save`` attribute is None the save version is autogenerated.
        credentials: Credentials needed to access the underlying filesystem,
            e.g. `{"token": None}` for ``GCSFileSystem``.
        fs_args: Extra keyword arguments for the filesystem class constructor
            (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). The
            nested keys `open_args_load` and `open_args_save` are removed
            from it and forwarded to the filesystem's `open` method instead.
            Available `open` arguments are listed at:
            https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
            All defaults are preserved, except `mode`, which is set to `wb` when saving.
        metadata: Arbitrary metadata. Kedro ignores it, but users or
            external plugins may consume it.
    """
    # Operate on private copies so the caller's dictionaries are never mutated
    # by the ``pop``/``setdefault`` calls below.
    fs_options = deepcopy(fs_args) or {}
    load_overrides = fs_options.pop("open_args_load", {})
    save_overrides = fs_options.pop("open_args_save", {})
    fs_credentials = deepcopy(credentials) or {}

    protocol, path = get_protocol_and_path(filepath, version)
    if protocol == "file":
        # Only a default: an explicit ``auto_mkdir`` from the caller wins.
        fs_options.setdefault("auto_mkdir", True)

    self._protocol = protocol
    # The filesystem must exist before ``super().__init__`` runs, because its
    # ``exists``/``glob`` methods are handed to the parent class below.
    self._fs = fsspec.filesystem(protocol, **fs_credentials, **fs_options)

    self.metadata = metadata

    super().__init__(
        filepath=PurePosixPath(path),
        version=version,
        exists_function=self._fs.exists,
        glob_function=self._fs.glob,
    )

    # Class-level defaults first, caller overrides second, so overrides win.
    # ``or {}`` tolerates an explicit ``None`` supplied for either nested key.
    load_defaults = self.DEFAULT_FS_ARGS.get("open_args_load", {})
    save_defaults = self.DEFAULT_FS_ARGS.get("open_args_save", {})
    self._fs_open_args_load = {**load_defaults, **(load_overrides or {})}
    self._fs_open_args_save = {**save_defaults, **(save_overrides or {})}

DEFAULT_FS_ARGS class-attribute instance-attribute

DEFAULT_FS_ARGS = {'open_args_save': {'mode': 'wb'}}

_fs instance-attribute

_fs = filesystem(_protocol, **_credentials, **_fs_args)

_fs_open_args_load instance-attribute

_fs_open_args_load = {
    **DEFAULT_FS_ARGS.get("open_args_load", {}),
    **(_fs_open_args_load or {}),
}

_fs_open_args_save instance-attribute

_fs_open_args_save = {
    **DEFAULT_FS_ARGS.get("open_args_save", {}),
    **(_fs_open_args_save or {}),
}

_protocol instance-attribute

_protocol = protocol

metadata instance-attribute

metadata = metadata

_describe

_describe()

Returns a dictionary with basic dataset information.

Returns:

  • dict[str, Any]

    A dictionary with the following keys:

      - "filepath" (PurePosixPath): Path to the .pptx file.
      - "protocol" (str): Filesystem protocol (e.g., 'file', 's3').
      - "version" (Version | None): Version information if specified.

Source code in kedro_datasets/openxml/pptx_dataset.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def _describe(self) -> dict[str, Any]:
    """Summarises this dataset as a plain dictionary.

    Returns:
        dict[str, Any]: A mapping with three entries, in this order:
            - "filepath": the value of ``self._filepath``.
            - "protocol": the value of ``self._protocol`` (e.g. 'file', 's3').
            - "version": the value of ``self._version``.
    """
    return dict(
        filepath=self._filepath,
        protocol=self._protocol,
        version=self._version,
    )

_exists

_exists()

Checks whether the file exists on the filesystem.

Returns:

  • bool

    True if the file exists, otherwise False.

Source code in kedro_datasets/openxml/pptx_dataset.py
158
159
160
161
162
163
164
165
166
167
168
169
def _exists(self) -> bool:
    """Reports whether the file to be loaded is present on the filesystem.

    Returns:
        bool: False when resolving the load path raises ``DatasetError``;
            otherwise whatever ``self._fs.exists`` reports for that path.
    """
    try:
        target = get_filepath_str(self._get_load_path(), self._protocol)
    except DatasetError:
        # An unresolvable load path is reported as "does not exist" rather
        # than propagated to the caller.
        found = False
    else:
        found = self._fs.exists(target)
    return found

_invalidate_cache

_invalidate_cache()

Invalidate underlying filesystem caches.

Source code in kedro_datasets/openxml/pptx_dataset.py
176
177
178
179
def _invalidate_cache(self) -> None:
    """Asks the underlying filesystem to drop cached state for this dataset's path."""
    self._fs.invalidate_cache(get_filepath_str(self._filepath, self._protocol))

_release

_release()

Releases resources and invalidates the filesystem cache.

Source code in kedro_datasets/openxml/pptx_dataset.py
171
172
173
174
def _release(self) -> None:
    """Releases resources and invalidates the filesystem cache.

    Delegates to the parent class's ``_release`` first, then calls
    ``_invalidate_cache`` on this dataset.
    """
    super()._release()
    self._invalidate_cache()

load

load()

Loads a .pptx file from the filesystem.

Returns:

  • Presentation

    A python-pptx Presentation instance containing the loaded content.

Source code in kedro_datasets/openxml/pptx_dataset.py
133
134
135
136
137
138
139
140
141
def load(self) -> Presentation:
    """Reads the `.pptx` file at the resolved load path.

    The file is opened through the dataset's filesystem using the configured
    `open_args_load` options and handed to `python-pptx` as a file object.

    Returns:
        Presentation: The `python-pptx` Presentation built from the file.
    """
    source = get_filepath_str(self._get_load_path(), self._protocol)
    with self._fs.open(source, **self._fs_open_args_load) as handle:
        presentation = Presentation(handle)
    return presentation

save

save(data)

Saves a Presentation object to the filesystem.

Parameters:

  • data (Presentation) –

    A python-pptx Presentation instance to be saved.

Source code in kedro_datasets/openxml/pptx_dataset.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def save(self, data: Presentation) -> None:
    """Writes a `Presentation` object to the resolved save path.

    The presentation is first serialised into an in-memory buffer; the
    resulting bytes are then written in a single call to a file opened
    through the dataset's filesystem with the configured `open_args_save`
    options. The filesystem cache is invalidated afterwards.

    Args:
        data (Presentation): The `python-pptx` Presentation instance to save.
    """
    staging = BytesIO()
    data.save(staging)
    # ``getvalue`` returns the whole buffer regardless of the stream position.
    payload = staging.getvalue()

    destination = get_filepath_str(self._get_save_path(), self._protocol)
    with self._fs.open(destination, **self._fs_open_args_save) as handle:
        handle.write(payload)

    self._invalidate_cache()