Skip to content

SVMLightDataset

SVMLightDataset loads and saves data in the SVMlight format.

kedro_datasets.svmlight.SVMLightDataset

SVMLightDataset(
    *,
    filepath,
    load_args=None,
    save_args=None,
    version=None,
    credentials=None,
    fs_args=None,
    metadata=None
)

Bases: AbstractVersionedDataset[_DI, _DO]

SVMLightDataset loads/saves data from/to a svmlight/libsvm file using an underlying filesystem (e.g. local, S3, GCS). It uses the scikit-learn functions dump_svmlight_file to save a file and load_svmlight_file to load one.

Data is loaded as a tuple of features and labels. The labels are a NumPy array, and the features are a Compressed Sparse Row (CSR) matrix.

This format is a text-based format, with one sample per line. It does not store zero valued features hence it is suitable for sparse datasets.

This format is used as the default format for both svmlight and the libsvm command line programs.

Examples:

Using the YAML API:

svm_dataset:
  type: svmlight.SVMLightDataset
  filepath: data/01_raw/location.svm
  load_args:
    zero_based: False
  save_args:
    zero_based: False

cars:
  type: svmlight.SVMLightDataset
  filepath: gcs://your_bucket/cars.svm
  fs_args:
    project: my-project
  credentials: my_gcp_credentials
  load_args:
    zero_based: False
  save_args:
    zero_based: False

Using the Python API:

>>> import numpy as np
>>> from kedro_datasets.svmlight import SVMLightDataset
>>>
>>> # Features and labels.
>>> data = (np.array([[0, 1], [2, 3.14159]]), np.array([7, 3]))
>>>
>>> dataset = SVMLightDataset(filepath=tmp_path / "test.svm")
>>> dataset.save(data)
>>> reloaded_features, reloaded_labels = dataset.load()
>>> assert (data[0] == reloaded_features).all()
>>> assert (data[1] == reloaded_labels).all()

Parameters:

  • filepath (str) –

    Filepath in POSIX format to a text file prefixed with a protocol like s3://. If prefix is not provided, file protocol (local filesystem) will be used. The prefix should be any protocol supported by fsspec.

  • load_args (dict[str, Any] | None, default: None ) –

    Arguments passed on to load_svmlight_file. See the details in https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html

  • save_args (dict[str, Any] | None, default: None ) –

    Arguments passed on to dump_svmlight_file. See the details in https://scikit-learn.org/stable/modules/generated/sklearn.datasets.dump_svmlight_file.html

  • version (Version | None, default: None ) –

    If specified, should be an instance of kedro.io.core.Version. If its load attribute is None, the latest version will be loaded. If its save attribute is None, save version will be autogenerated.

  • credentials (dict[str, Any] | None, default: None ) –

    Credentials required to get access to the underlying filesystem. E.g. for GCSFileSystem it should look like {"token": None}.

  • fs_args (dict[str, Any] | None, default: None ) –

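    Extra arguments to pass into underlying filesystem class constructor (e.g. {"project": "my-project"} for GCSFileSystem). The optional keys open_args_load and open_args_save are removed from fs_args and passed to the filesystem's open call when loading and saving respectively. All defaults are preserved, except mode, which defaults to rb when loading and to wb when saving unless overridden through those keys.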

  • metadata (dict[str, Any] | None, default: None ) –

    Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins.

Source code in kedro_datasets/svmlight/svmlight_dataset.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def __init__(  # noqa: PLR0913
    self,
    *,
    filepath: str,
    load_args: dict[str, Any] | None = None,
    save_args: dict[str, Any] | None = None,
    version: Version | None = None,
    credentials: dict[str, Any] | None = None,
    fs_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Creates a new instance of SVMLightDataset to load/save data from a svmlight/libsvm file.

    Args:
        filepath: Filepath in POSIX format to a text file, optionally prefixed
            with a protocol such as `s3://`. Without a prefix the `file`
            protocol (local filesystem) is used. Any protocol supported by
            ``fsspec`` is accepted.
        load_args: Arguments forwarded to ``load_svmlight_file``; they are
            layered on top of ``DEFAULT_LOAD_ARGS``. See
            https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html
        save_args: Arguments forwarded to ``dump_svmlight_file``; they are
            layered on top of ``DEFAULT_SAVE_ARGS``. See
            https://scikit-learn.org/stable/modules/generated/sklearn.datasets.dump_svmlight_file.html
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None, the
            latest version will be loaded. If its ``save`` attribute is None,
            the save version will be autogenerated.
        credentials: Credentials required to access the underlying filesystem,
            e.g. `{"token": None}` for ``GCSFileSystem``. They are passed as
            keyword arguments to the filesystem constructor.
        fs_args: Extra keyword arguments for the filesystem constructor
            (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). The
            optional keys `open_args_load` and `open_args_save` are removed
            from it and used when opening the file for loading and saving;
            `mode` defaults to `rb` for loading and `wb` for saving.
        metadata: Any arbitrary metadata.
            This is ignored by Kedro, but may be consumed by users or external plugins.
    """
    # Work on copies so the caller's dictionaries are never mutated.
    fs_kwargs = deepcopy(fs_args) or {}
    open_kwargs_load = fs_kwargs.pop("open_args_load", {})
    open_kwargs_save = fs_kwargs.pop("open_args_save", {})
    fs_credentials = deepcopy(credentials) or {}

    protocol, path = get_protocol_and_path(filepath, version)
    if protocol == "file":
        # Only applied when the caller has not set auto_mkdir themselves.
        fs_kwargs.setdefault("auto_mkdir", True)

    self._protocol = protocol
    self._fs = fsspec.filesystem(protocol, **fs_credentials, **fs_kwargs)
    self.metadata = metadata

    # The filesystem must exist before the parent initialiser runs, because
    # its exists/glob callables are handed over here.
    super().__init__(
        filepath=PurePosixPath(path),
        version=version,
        exists_function=self._fs.exists,
        glob_function=self._fs.glob,
    )

    # Class-level defaults first, caller-supplied values override them.
    user_load_args = load_args or {}
    user_save_args = save_args or {}
    self._load_args = {**self.DEFAULT_LOAD_ARGS, **user_load_args}
    self._save_args = {**self.DEFAULT_SAVE_ARGS, **user_save_args}

    default_open_load = self.DEFAULT_FS_ARGS.get("open_args_load", {})
    self._fs_open_args_load = {**default_open_load, **(open_kwargs_load or {})}

    default_open_save = self.DEFAULT_FS_ARGS.get("open_args_save", {})
    self._fs_open_args_save = {**default_open_save, **(open_kwargs_save or {})}

DEFAULT_FS_ARGS class-attribute instance-attribute

# Default keyword arguments used when opening the file: binary write mode for
# saving and binary read mode for loading. Values supplied through the
# "open_args_save" / "open_args_load" keys of fs_args are merged on top of these.
DEFAULT_FS_ARGS = {
    "open_args_save": {"mode": "wb"},
    "open_args_load": {"mode": "rb"},
}

DEFAULT_LOAD_ARGS class-attribute instance-attribute

# Baseline load arguments; user-supplied load_args are merged on top of these.
DEFAULT_LOAD_ARGS = {}

DEFAULT_SAVE_ARGS class-attribute instance-attribute

# Baseline save arguments; user-supplied save_args are merged on top of these.
DEFAULT_SAVE_ARGS = {}

_fs instance-attribute

_fs = filesystem(_protocol, **_credentials, **_fs_args)

_fs_open_args_load instance-attribute

_fs_open_args_load = {
    **DEFAULT_FS_ARGS.get("open_args_load", {}),
    **(_fs_open_args_load or {}),
}

_fs_open_args_save instance-attribute

_fs_open_args_save = {
    **DEFAULT_FS_ARGS.get("open_args_save", {}),
    **(_fs_open_args_save or {}),
}

_load_args instance-attribute

_load_args = {
    **DEFAULT_LOAD_ARGS,
    **(load_args or {}),
}

_protocol instance-attribute

_protocol = protocol

_save_args instance-attribute

_save_args = {
    **DEFAULT_SAVE_ARGS,
    **(save_args or {}),
}

metadata instance-attribute

metadata = metadata

_describe

_describe()
Source code in kedro_datasets/svmlight/svmlight_dataset.py
163
164
165
166
167
168
169
170
def _describe(self):
    """Return a dictionary describing this dataset instance.

    The result holds the file path, protocol, load/save arguments and
    version currently stored on the instance.
    """
    description = dict(
        filepath=self._filepath,
        protocol=self._protocol,
        load_args=self._load_args,
        save_args=self._save_args,
        version=self._version,
    )
    return description

_exists

_exists()
Source code in kedro_datasets/svmlight/svmlight_dataset.py
184
185
186
187
188
189
190
def _exists(self) -> bool:
    """Report whether the file at the resolved load path exists.

    Returns:
        False when resolving the load path raises ``DatasetError``;
        otherwise the result of the filesystem's ``exists`` check.
    """
    try:
        target = get_filepath_str(self._get_load_path(), self._protocol)
    except DatasetError:
        found = False
    else:
        found = self._fs.exists(target)
    return found

_invalidate_cache

_invalidate_cache()

Invalidate underlying filesystem caches.

Source code in kedro_datasets/svmlight/svmlight_dataset.py
196
197
198
199
def _invalidate_cache(self) -> None:
    """Invalidate underlying filesystem caches."""
    # Ask the filesystem to invalidate its cache for this dataset's file path.
    self._fs.invalidate_cache(
        get_filepath_str(self._filepath, self._protocol)
    )

_release

_release()
Source code in kedro_datasets/svmlight/svmlight_dataset.py
192
193
194
def _release(self) -> None:
    """Run the parent class's release step, then invalidate filesystem caches."""
    super()._release()
    # Invalidate the filesystem cache for this dataset's path after releasing.
    self._invalidate_cache()

load

load()
Source code in kedro_datasets/svmlight/svmlight_dataset.py
172
173
174
175
def load(self) -> _DO:
    """Load the svmlight/libsvm file from the resolved load path.

    The file is opened through the underlying filesystem with the configured
    open arguments and parsed by ``load_svmlight_file`` using the configured
    load arguments.

    Returns:
        Whatever ``load_svmlight_file`` returns for the opened file.
    """
    source = get_filepath_str(self._get_load_path(), self._protocol)
    open_kwargs = self._fs_open_args_load
    with self._fs.open(source, **open_kwargs) as handle:
        loaded = load_svmlight_file(handle, **self._load_args)
    return loaded

save

save(data)
Source code in kedro_datasets/svmlight/svmlight_dataset.py
177
178
179
180
181
182
def save(self, data: _DI) -> None:
    """Write data to the resolved save path in svmlight/libsvm format.

    Args:
        data: Indexable container whose element 0 and element 1 are passed,
            in that order, as the first two arguments of
            ``dump_svmlight_file``.
    """
    destination = get_filepath_str(self._get_save_path(), self._protocol)
    open_kwargs = self._fs_open_args_save
    with self._fs.open(destination, **open_kwargs) as handle:
        first, second = data[0], data[1]
        dump_svmlight_file(first, second, handle, **self._save_args)

    # Invalidate the filesystem cache for this dataset's path after writing.
    self._invalidate_cache()