Skip to content

ChromaDBDataset

ChromaDBDataset loads and saves data to ChromaDB collections.

kedro_datasets_experimental.chromadb.ChromaDBDataset

ChromaDBDataset(
    *,
    collection_name,
    client_type="ephemeral",
    client_settings=None,
    load_args=None,
    save_args=None,
    metadata=None
)

Bases: AbstractDataset[dict[str, Any], dict[str, Any]]

ChromaDBDataset loads and saves data from/to ChromaDB collections.

ChromaDB is a vector database for building AI applications. This dataset allows you to interact with ChromaDB collections for storing and retrieving documents with embeddings.

Examples:

Using the YAML API:

my_collection:
  type: chromadb.ChromaDBDataset
  collection_name: "documents"
  client_type: "persistent"
  client_settings:
    path: "./chroma_db"

Using the Python API:

>>> from kedro_datasets_experimental.chromadb import ChromaDBDataset
>>>
>>> # Save data to ChromaDB
>>> data = {
...     "documents": ["This is a document", "This is another document"],
...     "metadatas": [{"type": "text"}, {"type": "text"}],
...     "ids": ["doc1", "doc2"]
... }
>>> dataset = ChromaDBDataset(collection_name="test_collection")
>>> dataset.save(data)
>>>
>>> # Load data from ChromaDB
>>> loaded_data = dataset.load()
>>> print(loaded_data["documents"])  # ['This is a document', 'This is another document']
>>>
>>> # Query for similar vectors (efficient for large datasets)
>>> query_dataset = ChromaDBDataset(
...     collection_name="documents",
...     load_args={
...         "query_texts": ["machine learning"],
...         "n_results": 5,
...         "include": ["documents", "metadatas", "distances"]
...     }
... )
>>> results = query_dataset.load()  # Returns top-5 similar documents

Parameters:

  • collection_name (str) –

    The name of the ChromaDB collection.

  • client_type (str, default: 'ephemeral' ) –

    Type of ChromaDB client. Options: "ephemeral", "persistent", "http". Defaults to "ephemeral".

  • client_settings (dict[str, Any] | None, default: None ) –

    Settings for the ChromaDB client. For "persistent", use {"path": "/path/to/db"}. For "http", use {"host": "localhost", "port": 8000}.

  • load_args (dict[str, Any] | None, default: None ) –

    Additional arguments for loading data from the ChromaDB collection, such as "where", "where_document", "include" and "n_results". For vector similarity queries, supply: "query_embeddings" (list of embeddings to query for similarity), "query_texts" (list of texts to query for similarity), "n_results" (number of results to return; default: 10), "where" (metadata filter conditions) and "where_document" (document content filter conditions).

  • save_args (dict[str, Any] | None, default: None ) –

    Additional arguments for saving data to ChromaDB collection. Can include "embeddings" if you want to provide custom embeddings.

  • metadata (dict[str, Any] | None, default: None ) –

    Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def __init__(  # noqa: PLR0913
    self,
    *,
    collection_name: str,
    client_type: str = "ephemeral",
    client_settings: dict[str, Any] | None = None,
    load_args: dict[str, Any] | None = None,
    save_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Creates a new instance of ``ChromaDBDataset``.

    Args:
        collection_name: The name of the ChromaDB collection.
        client_type: Type of ChromaDB client. Options: "ephemeral", "persistent", "http".
            Defaults to "ephemeral".
        client_settings: Settings for the ChromaDB client. For "persistent", use {"path": "/path/to/db"}.
            For "http", use {"host": "localhost", "port": 8000}.
        load_args: Additional arguments for loading data from ChromaDB collection.
            Can include "where", "where_document", "include", "n_results", etc.
            For vector similarity queries, use:
            - "query_embeddings": List of embeddings to query for similarity
            - "query_texts": List of texts to query for similarity
            - "n_results": Number of results to return (default: 10)
            - "where": Metadata filter conditions
            - "where_document": Document content filter conditions
        save_args: Additional arguments for saving data to ChromaDB collection.
            Can include "embeddings" if you want to provide custom embeddings.
        metadata: Any arbitrary metadata.
            This is ignored by Kedro, but may be consumed by users or external plugins.
    """
    self._collection_name = collection_name
    self._client_type = client_type
    self._client_settings = client_settings or {}
    self._load_args = load_args or {}
    self._save_args = save_args or {}
    self.metadata = metadata
    # Initialize instance attributes (actual annotations are at class-level)
    self._client = None
    self._collection = None

_client instance-attribute

_client = None

_client_settings instance-attribute

_client_settings = client_settings or {}

_client_type instance-attribute

_client_type = client_type

_collection instance-attribute

_collection = None

_collection_name instance-attribute

_collection_name = collection_name

_load_args instance-attribute

_load_args = load_args or {}

_save_args instance-attribute

_save_args = save_args or {}

metadata instance-attribute

metadata = metadata

_create_client

_create_client()

Create ChromaDB client based on configuration.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def _create_client(self) -> chromadb.Client:
    """Build the ChromaDB client selected by ``client_type``.

    Returns:
        An ephemeral, persistent or HTTP client. For the persistent and HTTP
        variants, any settings other than the explicitly handled keys
        ("path", or "host"/"port") are forwarded as keyword arguments.

    Raises:
        DatasetError: If ``client_type`` is not one of the supported values.
    """
    kind = self._client_type

    if kind == "ephemeral":
        # client_settings are not consulted for the in-memory client.
        return chromadb.EphemeralClient()

    if kind == "persistent":
        # Work on a copy so the stored settings are left untouched.
        extra = dict(self._client_settings)
        db_path = extra.pop("path", "./chroma_db")
        return chromadb.PersistentClient(path=db_path, **extra)

    if kind == "http":
        extra = dict(self._client_settings)
        server_host = extra.pop("host", "localhost")
        server_port = extra.pop("port", 8000)
        return chromadb.HttpClient(host=server_host, port=server_port, **extra)

    raise DatasetError(
        f"Unsupported client_type: {self._client_type}. "
        f"Must be one of: 'ephemeral', 'persistent', 'http'"
    )

_describe

_describe()

Returns a dictionary describing the dataset configuration.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
155
156
157
158
159
160
161
162
163
def _describe(self) -> dict[str, Any]:
    """Summarise the dataset configuration as a plain dictionary.

    Returns:
        A mapping with the collection name, client type, client settings,
        load arguments and save arguments, in that order.
    """
    described_fields = (
        "collection_name",
        "client_type",
        "client_settings",
        "load_args",
        "save_args",
    )
    # Each reported key mirrors a private attribute of the same name.
    return {field: getattr(self, f"_{field}") for field in described_fields}

_get_client

_get_client()

Get or create the ChromaDB client.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
126
127
128
129
130
def _get_client(self) -> chromadb.Client:
    """Return the cached ChromaDB client, creating it on first use."""
    cached = self._client
    if cached is not None:
        return cached

    # First access: build the client once and remember it.
    self._client = self._create_client()
    return self._client

_get_collection

_get_collection(create_if_missing=True)

Get or create the ChromaDB collection.

Parameters:

  • create_if_missing (bool, default: True ) –

    If True, creates the collection if it doesn't exist. If False, returns None when collection is not found.

Returns:

  • Collection | None

    Collection object if found/created, None if not found and create_if_missing=False.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def _get_collection(self, create_if_missing: bool = True) -> Collection | None:
    """Return the cached collection, fetching or creating it when needed.

    Args:
        create_if_missing: When True, a collection that is not found is created.
            When False, a missing collection yields None instead.

    Returns:
        The collection if it was cached, found or created; None when it was
        not found and ``create_if_missing`` is False.
    """
    if self._collection is not None:
        return self._collection

    chroma_client = self._get_client()
    try:
        self._collection = chroma_client.get_collection(name=self._collection_name)
    except NotFoundError:
        if not create_if_missing:
            # Report absence to the caller without creating or caching anything.
            return None
        self._collection = chroma_client.create_collection(name=self._collection_name)
    return self._collection

exists

exists()

Checks if the collection exists and contains data.

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def exists(self) -> bool:
    """Checks if the collection exists and contains data.

    The collection is never created by this check.

    Returns:
        True if the collection can be obtained and holds at least one record.
        False if the collection is missing, is empty, or if any error occurs
        while looking it up or counting its records.
    """
    try:
        # Reuse the cached collection when there is one; otherwise look it up
        # without creating it.
        collection = self._collection
        if collection is None:
            collection = self._get_collection(create_if_missing=False)
        if collection is None:
            return False
        return collection.count() > 0
    except Exception:
        # Deliberately best-effort: any failure is reported as "does not exist".
        return False

load

load()

Loads data from the ChromaDB collection.

Returns:

  • dict[str, Any]

    A dictionary containing the collection data with the keys "documents" (list of document texts), "metadatas" (list of metadata dictionaries), "ids" (list of document IDs) and "embeddings" (list of embeddings, if included).

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def load(self) -> dict[str, Any]:
    """Loads data from the ChromaDB collection.

    The configured ``load_args`` decide how the collection is read:

    - If "query_embeddings" or "query_texts" is present, a similarity search
      is run with ``collection.query()``; "n_results" defaults to 10.
    - Otherwise records are fetched with ``collection.get()``. Plain "where" /
      "where_document" filters are handled here as well, and "n_results", if
      given, is passed to ``get()`` as "limit".

    A missing collection is not created; an empty result is returned instead.

    Returns:
        dict[str, Any]: A dictionary containing the collection data with keys:
            - "documents": List of document texts
            - "metadatas": List of metadata dictionaries
            - "ids": List of document IDs
            - "embeddings": List of embeddings (if included)
            - "distances": Query distances (only present when the result
              contains them, e.g. when requested via "include" in a query)

    Raises:
        DatasetError: If the underlying ChromaDB call fails.
    """
    collection = self._get_collection(create_if_missing=False)

    # If collection doesn't exist, return empty result rather than creating it
    if collection is None:
        return {"documents": [], "metadatas": [], "ids": [], "embeddings": []}

    # Build a fresh dict so the stored load_args are never mutated.
    load_args = {
        "include": ["documents", "metadatas", "embeddings"],
        **self._load_args
    }

    try:
        # query() is a similarity search and needs query inputs; filters alone
        # ("where" / "where_document") are served by get().
        is_similarity_query = any(
            key in load_args for key in ("query_embeddings", "query_texts")
        )
        if is_similarity_query:
            load_args.setdefault("n_results", 10)  # Default limit for queries
            result = collection.query(**load_args)
        else:
            # get() retrieves stored records (not recommended unfiltered for
            # large collections); it takes "limit" rather than "n_results".
            if "n_results" in load_args:
                load_args["limit"] = load_args.pop("n_results")
            result = collection.get(**load_args)

        loaded = {
            "documents": result.get("documents", []),
            "metadatas": result.get("metadatas", []),
            "ids": result.get("ids", []),
            "embeddings": result.get("embeddings", [])
        }
        # Pass through similarity distances when the result carries them, so
        # that requesting "distances" via "include" is not silently dropped.
        distances = result.get("distances")
        if distances is not None:
            loaded["distances"] = distances
        return loaded
    except Exception as e:
        raise DatasetError(
            f"Failed to load data from ChromaDB collection '{self._collection_name}': {e}"
        ) from e

save

save(data)

Saves data to the ChromaDB collection.

Parameters:

  • data (dict[str, Any]) –

    A dictionary containing the data to save. Expected keys: "documents" (list of document texts, required), "ids" (list of document IDs, required), "metadatas" (list of metadata dictionaries, optional) and "embeddings" (list of embeddings, optional; auto-generated if not provided).

Source code in kedro_datasets_experimental/chromadb/chromadb_dataset.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def save(self, data: dict[str, Any]) -> None:
    """Add the given records to the ChromaDB collection.

    The collection is created first if it does not exist yet.

    Args:
        data: Mapping describing the records to store. Recognised keys:
            - "documents": List of document texts (required)
            - "ids": List of document IDs (required)
            - "metadatas": List of metadata dictionaries (optional)
            - "embeddings": List of embeddings (optional, will be auto-generated if not provided)

    Raises:
        DatasetError: If ``data`` is not a dictionary, lacks a required key,
            the collection cannot be obtained, or adding the records fails.
    """
    if not isinstance(data, dict):
        raise DatasetError(f"Data must be a dictionary, got {type(data)}")

    missing_required = "documents" not in data or "ids" not in data
    if missing_required:
        raise DatasetError("Data must contain 'documents' and 'ids' keys")

    target = self._get_collection(create_if_missing=True)
    if target is None:
        raise DatasetError(f"Failed to access or create ChromaDB collection '{self._collection_name}'")

    try:
        # Required fields first; configured save_args may override them.
        payload = {"documents": data["documents"], "ids": data["ids"]}
        payload.update(self._save_args)

        # Optional fields supplied with the data win over save_args.
        for optional_key in ("metadatas", "embeddings"):
            if optional_key in data:
                payload[optional_key] = data[optional_key]

        target.add(**payload)
    except Exception as e:
        raise DatasetError(
            f"Failed to save data to ChromaDB collection '{self._collection_name}': {e}"
        ) from e