Skip to content

GBQTableDataset

GBQTableDataset loads and saves data to/from Google BigQuery tables using pandas-gbq.

kedro_datasets.pandas.GBQTableDataset

GBQTableDataset(
    *,
    dataset,
    table_name,
    project=None,
    credentials=None,
    load_args=None,
    save_args=None,
    metadata=None
)

Bases: ConnectionMixin, AbstractDataset[None, DataFrame]

GBQTableDataset loads data from and saves data to Google BigQuery. It uses pandas-gbq to read from and write to a BigQuery table.

Examples:

Using the YAML API:

vehicles:
  type: pandas.GBQTableDataset
  dataset: big_query_dataset
  table_name: big_query_table
  project: my-project
  credentials: gbq-creds
  load_args:
    reauth: True
  save_args:
    chunksize: 100

Using the Python API:

>>> import pandas as pd
>>> from kedro_datasets.pandas import GBQTableDataset
>>>
>>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
>>>
>>> dataset = GBQTableDataset(
...     dataset="dataset", table_name="table_name", project="my-project"
... )
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>> assert data.equals(reloaded)

Parameters:

  • dataset (str) –

    Google BigQuery dataset.

  • table_name (str) –

    Google BigQuery table name.

  • project (str | None, default: None ) –

    Google BigQuery Account project ID. Optional when available from the environment. https://cloud.google.com/resource-manager/docs/creating-managing-projects

  • credentials (dict[str, Any] | str | Credentials | None, default: None ) –

    Credentials for accessing Google APIs. Either a credentials object that subclasses google.auth.credentials.Credentials, a service account JSON given as a dictionary, or a path to a service account key JSON file. https://googleapis.dev/python/google-auth/latest/

  • load_args (dict[str, Any] | None, default: None ) –

    Pandas options for loading BigQuery table into DataFrame. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html All defaults are preserved.

  • save_args (dict[str, Any] | None, default: None ) –

    Pandas options for saving DataFrame to BigQuery table. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html All defaults are preserved except "progress_bar", which is set to False.

  • metadata (dict[str, Any] | None, default: None ) –

    Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins.

Raises:

  • DatasetError

    When load_args['location'] and save_args['location'] are different.

Source code in kedro_datasets/pandas/gbq_dataset.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def __init__(  # noqa: PLR0913
    self,
    *,
    dataset: str,
    table_name: str,
    project: str | None = None,
    credentials: dict[str, Any] | str | Credentials | None = None,
    load_args: dict[str, Any] | None = None,
    save_args: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Set up a ``GBQTableDataset`` bound to a single BigQuery table.

    Args:
        dataset: Name of the Google BigQuery dataset.
        table_name: Name of the Google BigQuery table.
        project: Google BigQuery Account project ID.
            May be omitted when it is available from the environment.
            https://cloud.google.com/resource-manager/docs/creating-managing-projects
        credentials: Credentials for accessing Google APIs. One of:
            an instance of ``google.auth.credentials.Credentials`` (used as is),
            a service account json as a dictionary, or
            a path to a service account key json file.
            Anything that is neither ``None`` nor a ``Credentials`` instance
            is passed through ``_get_credentials``.
            https://googleapis.dev/python/google-auth/latest/
        load_args: Pandas options for loading the BigQuery table into a
            DataFrame, layered on top of ``DEFAULT_LOAD_ARGS``.
            All available arguments:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html
        save_args: Pandas options for saving a DataFrame to the BigQuery
            table, layered on top of ``DEFAULT_SAVE_ARGS``.
            All available arguments:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html
        metadata: Any arbitrary metadata.
            This is ignored by Kedro, but may be consumed by users or external plugins.

    Raises:
        DatasetError: When ``load_args['location']`` and ``save_args['location']``
            are different.
    """
    # User-supplied options win over the class-level defaults.
    user_load_args = load_args or {}
    user_save_args = save_args or {}
    self._load_args = {**self.DEFAULT_LOAD_ARGS, **user_load_args}
    self._save_args = {**self.DEFAULT_SAVE_ARGS, **user_save_args}

    # Both checks run before any other attribute is stored; the location
    # check needs the merged argument dicts assigned above.
    self._validate_location()
    validate_on_forbidden_chars(dataset=dataset, table_name=table_name)

    self._dataset, self._table_name, self._project_id = dataset, table_name, project

    # Only raw inputs (dict / path string) need converting; ``None`` and
    # ready-made ``Credentials`` objects are kept untouched.
    needs_conversion = credentials is not None and not isinstance(
        credentials, Credentials
    )
    resolved_credentials = (
        _get_credentials(credentials) if needs_conversion else credentials
    )

    # The save-side location is used here; ``_validate_location`` has already
    # ensured it equals the load-side one.
    self._connection_config = {
        "project": project,
        "credentials": resolved_credentials,
        "location": self._save_args.get("location"),
    }

    self.metadata = metadata

DEFAULT_LOAD_ARGS class-attribute instance-attribute

DEFAULT_LOAD_ARGS = {}

DEFAULT_SAVE_ARGS class-attribute instance-attribute

DEFAULT_SAVE_ARGS = {'progress_bar': False}

_CONNECTION_GROUP class-attribute

_CONNECTION_GROUP = 'bigquery'

_connection_config instance-attribute

_connection_config = {
    "project": _project_id,
    "credentials": credentials,
    "location": _save_args.get("location"),
}

_dataset instance-attribute

_dataset = dataset

_load_args instance-attribute

_load_args = {**DEFAULT_LOAD_ARGS, **(load_args or {})}

_project_id instance-attribute

_project_id = project

_save_args instance-attribute

_save_args = {**DEFAULT_SAVE_ARGS, **(save_args or {})}

_table_name instance-attribute

_table_name = table_name

metadata instance-attribute

metadata = metadata

_connect

_connect()
Source code in kedro_datasets/pandas/gbq_dataset.py
148
149
150
151
152
153
def _connect(self) -> bigquery.Client:
    """Create a BigQuery client from the stored connection configuration.

    Returns:
        A ``bigquery.Client`` built with the ``project``, ``credentials`` and
        ``location`` entries of ``self._connection_config``.
    """
    settings = self._connection_config
    # Pick exactly the three entries the client is constructed with.
    client_kwargs = {
        key: settings[key] for key in ("project", "credentials", "location")
    }
    return bigquery.Client(**client_kwargs)

_describe

_describe()
Source code in kedro_datasets/pandas/gbq_dataset.py
140
141
142
143
144
145
146
def _describe(self) -> dict[str, Any]:
    """Summarise this dataset's configuration.

    Returns:
        A dict with the dataset name, the table name and the load/save
        argument dicts. The project and credentials are not included.
    """
    return dict(
        dataset=self._dataset,
        table_name=self._table_name,
        load_args=self._load_args,
        save_args=self._save_args,
    )

_exists

_exists()
Source code in kedro_datasets/pandas/gbq_dataset.py
173
174
175
176
177
178
179
def _exists(self) -> bool:
    """Report whether the configured table can be fetched from BigQuery.

    Returns:
        ``True`` when ``get_table`` succeeds for the table reference,
        ``False`` when it raises ``NotFound``. Any other exception propagates.
    """
    dataset_ref = self._connection.dataset(self._dataset)
    table_ref = dataset_ref.table(self._table_name)
    try:
        self._connection.get_table(table_ref)
    except NotFound:
        return False
    else:
        return True

_validate_location

_validate_location()
Source code in kedro_datasets/pandas/gbq_dataset.py
181
182
183
184
185
186
187
188
189
190
191
def _validate_location(self):
    """Ensure the load and save arguments agree on the BigQuery location.

    A missing ``"location"`` key counts as ``None`` on either side, so two
    argument dicts without a location are considered consistent.

    Raises:
        DatasetError: When ``load_args['location']`` and
            ``save_args['location']`` are different.
    """
    load_side = self._load_args.get("location")
    save_side = self._save_args.get("location")

    if save_side == load_side:
        return

    message = (
        '"load_args[\'location\']" is different from "save_args[\'location\']". '
        "The 'location' defines where BigQuery data is stored, therefore has "
        "to be the same for save and load args. "
        "Details: https://cloud.google.com/bigquery/docs/locations"
    )
    raise DatasetError(message)

load

load()
Source code in kedro_datasets/pandas/gbq_dataset.py
155
156
157
158
159
160
161
162
def load(self) -> pd.DataFrame:
    """Read the configured BigQuery table into a DataFrame via pandas-gbq.

    By default the whole table is selected with
    ``select * from <dataset>.<table_name>``. A ``"query_or_table"`` entry in
    the load arguments takes precedence over that generated statement.

    Returns:
        Whatever ``pd_gbq.read_gbq`` returns for the resulting arguments.
    """
    default_sql = f"select * from {self._dataset}.{self._table_name}"  # nosec
    # Merge into a fresh dict instead of calling ``setdefault`` on
    # ``self._load_args``: loading must not write the generated SQL back into
    # the stored configuration (which is also what ``_describe`` reports).
    read_kwargs = {"query_or_table": default_sql, **self._load_args}
    return pd_gbq.read_gbq(
        project_id=self._project_id,
        credentials=self._connection._credentials,
        **read_kwargs,
    )

save

save(data)
Source code in kedro_datasets/pandas/gbq_dataset.py
164
165
166
167
168
169
170
171
def save(self, data: pd.DataFrame) -> None:
    """Write ``data`` to the configured BigQuery table via pandas-gbq.

    Args:
        data: DataFrame handed to ``pd_gbq.to_gbq``; the destination is
            ``<dataset>.<table_name>`` and the stored save arguments are
            forwarded as extra keyword arguments.
    """
    target_table = f"{self._dataset}.{self._table_name}"
    gbq_credentials = self._connection._credentials
    pd_gbq.to_gbq(
        dataframe=data,
        destination_table=target_table,
        project_id=self._project_id,
        credentials=gbq_credentials,
        **self._save_args,
    )