"""This module provides ``kedro.config`` with the functionality to load one
or more configuration files from specified paths, and format template strings
with the values from the passed dictionary.
"""
from __future__ import annotations
import re
from copy import deepcopy
from pathlib import Path
from typing import Any, Iterable
import jmespath
from kedro.config.abstract_config import AbstractConfigLoader
from kedro.config.common import _get_config_from_patterns, _remove_duplicates
# Matches one template placeholder of the form ``${path}`` or
# ``${path|default}``. ``path`` is limited to letters, digits, underscores and
# dots; ``default`` is everything after the first ``|`` up to the closing brace
# (it may be empty). The pattern is compiled in verbose mode, so the whitespace
# and ``#`` comments inside the pattern text are not part of the match.
IDENTIFIER_PATTERN = re.compile(
    r"""\$\{
(?P<path>[A-Za-z0-9_\.]+) # identifier
(?:\|(?P<default>[^}]*))? # optional default value
\}""",
    flags=re.VERBOSE,
)

# Same placeholder, anchored at both ends, used to detect strings that consist
# of a single placeholder and nothing else.
# NOTE: without ``re.MULTILINE`` the ``$`` anchor also matches just before a
# single trailing newline, so a string such as "${key}\n" still matches here.
FULL_STRING_IDENTIFIER_PATTERN = re.compile(
    rf"^{IDENTIFIER_PATTERN.pattern}$", flags=re.VERBOSE
)
class TemplatedConfigLoader(AbstractConfigLoader):
    """
    Config loader, built on ``AbstractConfigLoader``, that allows for template
    values, wrapped in brackets like: ${...}, to be automatically formatted
    based on the configs.

    The easiest way to use this class is by setting the `CONFIG_LOADER_CLASS` constant
    in `settings.py`.

    Example:
    ::

        >>> # in settings.py
        >>> from kedro.config import TemplatedConfigLoader
        >>>
        >>> CONFIG_LOADER_CLASS = TemplatedConfigLoader
        >>> CONFIG_LOADER_ARGS = {
        >>>     "globals_pattern": "*globals.yml",
        >>> }

    The contents of the dictionary resulting from the `globals_pattern` get
    merged with the ``globals_dict``. In case of conflicts, the keys in
    ``globals_dict`` take precedence.
    If the formatting key is missing from the dictionary, the default template
    value is used (the format is "${key|default value}"). If no default is set,
    a ``ValueError`` will be raised.

    Global parameters can be namespaced as well. An example could work as follows:

    `globals.yml`
    ::

        bucket: "my_s3_bucket"

        environment: "dev"

        datasets:
            csv: "pandas.CSVDataSet"
            spark: "spark.SparkDataSet"

        folders:
            raw: "01_raw"
            int: "02_intermediate"
            pri: "03_primary"
            fea: "04_feature"

    `catalog.yml`
    ::

        raw_boat_data:
            type: "${datasets.spark}"
            filepath: "s3a://${bucket}/${environment}/${folders.raw}/boats.csv"
            file_format: parquet

        raw_car_data:
            type: "${datasets.csv}"
            filepath: "s3://${bucket}/data/${environment}/${folders.raw}/cars.csv"

    This uses ``jmespath`` in the background. For more information see:
    https://github.com/jmespath/jmespath.py and https://jmespath.org/.
    """

    def __init__(  # noqa: too-many-arguments
        self,
        conf_source: str,
        env: str | None = None,
        runtime_params: dict[str, Any] | None = None,
        config_patterns: dict[str, list[str]] | None = None,
        *,
        base_env: str = "base",
        default_run_env: str = "local",
        globals_pattern: str | None = None,
        globals_dict: dict[str, Any] | None = None,
    ):
        """Instantiates a ``TemplatedConfigLoader``.

        Args:
            conf_source: Path to use as root directory for loading configuration.
            env: Environment that will take precedence over base.
            runtime_params: Extra parameters passed to a Kedro run.
            config_patterns: Glob patterns, keyed by config name, that specify
                the naming convention for configuration files so they can be
                loaded. Entries supplied here are merged over the built-in
                defaults for "catalog", "parameters", "credentials" and
                "logging". Can be customised by supplying config_patterns as
                in `CONFIG_LOADER_ARGS` in `settings.py`.
            base_env: Name of the base environment. Defaults to `"base"`.
                Its directory under ``conf_source`` is the first
                configuration path.
            default_run_env: Name of the run environment used when no ``env``
                is set. Defaults to `"local"`. Its directory under
                ``conf_source`` is the second configuration path.
            globals_pattern: Optional keyword-only argument specifying a glob
                pattern. Files that match the pattern will be loaded as a
                formatting dictionary.
            globals_dict: Optional keyword-only argument specifying a formatting
                dictionary. This dictionary will get merged with the globals dictionary
                obtained from the globals_pattern. In case of duplicate keys, the
                ``globals_dict`` keys take precedence.
        """
        self.config_patterns = {
            "catalog": ["catalog*", "catalog*/**", "**/catalog*"],
            "parameters": ["parameters*", "parameters*/**", "**/parameters*"],
            "credentials": ["credentials*", "credentials*/**", "**/credentials*"],
            "logging": ["logging*", "logging*/**", "**/logging*"],
        }
        self.config_patterns.update(config_patterns or {})

        super().__init__(
            conf_source=conf_source, env=env, runtime_params=runtime_params
        )

        # ``conf_paths`` reads both attributes, so they must be set before the
        # globals files are looked up below.
        self.base_env = base_env
        self.default_run_env = default_run_env

        self._config_mapping = (
            _get_config_from_patterns(
                conf_paths=self.conf_paths,
                patterns=[globals_pattern],
                ac_template=False,
            )
            if globals_pattern
            else {}
        )
        # Work on a copy so the caller's dictionary is never shared with the
        # loader; explicit ``globals_dict`` entries win over file-based ones.
        globals_dict = deepcopy(globals_dict) or {}
        self._config_mapping = {**self._config_mapping, **globals_dict}

    def __getitem__(self, key):
        # Allow bypassing of loading config from patterns if a key and value have been set
        # explicitly on the ``TemplatedConfigLoader`` instance.
        if key in self:
            return super().__getitem__(key)
        return self.get(*self.config_patterns[key])

    def __repr__(self) -> str:  # pragma: no cover
        return (
            f"TemplatedConfigLoader(conf_source={self.conf_source}, env={self.env}, "
            f"config_patterns={self.config_patterns})"
        )

    @property
    def conf_paths(self):
        """Property method to return deduplicated configuration paths."""
        return _remove_duplicates(self._build_conf_paths())

    def get(self, *patterns: str) -> dict[str, Any]:  # type: ignore
        """Loads the configuration matching ``patterns`` from ``conf_paths``
        and resolves the template variables in it using the dictionary of
        replacement values assembled in the ``__init__`` method.

        Args:
            *patterns: Glob patterns to match. Files, which names match
                any of the specified patterns, will be processed.

        Returns:
            A Python dictionary with the combined configuration from all
            configuration files. **Note:** any keys that start with `_`
            will be ignored. String values wrapped in `${...}` will be
            replaced with the result of the corresponding JMESpath
            expression evaluated against globals.

        Raises:
            ValueError: malformed config found, or a ``${...}`` placeholder
                has neither a value in the globals nor a default.
        """
        config_raw = _get_config_from_patterns(
            conf_paths=self.conf_paths, patterns=patterns, ac_template=True
        )
        return _format_object(config_raw, self._config_mapping)

    def _build_conf_paths(self) -> Iterable[str]:
        """Returns the base and run environment directories under
        ``conf_source``, in that order."""
        run_env = self.env or self.default_run_env
        return [
            str(Path(self.conf_source) / self.base_env),
            str(Path(self.conf_source) / run_env),
        ]
def _format_object(val: Any, format_dict: dict[str, Any]) -> Any:
    """Recursively substitute ``${expr}`` placeholders found in ``val``.

    ``expr`` is evaluated as a JMESPath expression against ``format_dict``.
    Dictionaries and lists are walked recursively and rebuilt; strings are
    formatted; every other value is handed back untouched.

    Behaviour by input:
        * Not a dict, list or string: the same value is returned.
        * String without any ``${...}`` placeholder: returned unchanged.
        * String that is exactly one placeholder: the looked-up value is
          returned as is, so the result may be of any type.
        * String with placeholders embedded in other text: each looked-up
          value is converted with ``str`` and spliced into the text.
        * Placeholder whose lookup yields ``None``: the ``|default`` part is
          used if present (always as a string), otherwise ``ValueError``.
        * String dictionary keys are formatted too and must still be strings
          afterwards; non-string keys are kept as they are.

    Examples:
        val = "${test_key}" with format_dict = {'test_key': 'test_val'}
            returns 'test_val'
        val = 5 (i.e. not a dict, list or string) returns 5
        val = "test_key" (i.e. does not match the ${...} pattern) returns
            'test_key' (irrespective of `format_dict`)
        val = "${wrong_test_key}" with format_dict = {'test_key': 'test_val'}
            raises ``ValueError``
        val = "string-with-${test_key}" with format_dict = {'test_key': 1000}
            returns "string-with-1000"
        val = "${wrong_test_key|default_value}" with format_dict = {}
            returns 'default_value'

    Args:
        val: The object to format.
        format_dict: The lookup the JMESPath expressions are evaluated against.

    Returns:
        The formatted counterpart of ``val``.

    Raises:
        ValueError: A placeholder has no value and no default, or a formatted
            dictionary key is not a string.
    """

    def _resolve(placeholder):
        # Look the path up first; only fall back to the default when the
        # lookup produced nothing.
        found = jmespath.search(placeholder.group("path"), format_dict)
        if found is not None:
            return found
        fallback = placeholder.group("default")
        if fallback is None:
            raise ValueError(
                f"Failed to format pattern '{placeholder.group(0)}': "
                "no config value found, no default provided"
            )
        return fallback

    if isinstance(val, str):
        # A string that is nothing but one placeholder keeps the type of the
        # replacement value; anything else is assembled as text.
        whole = FULL_STRING_IDENTIFIER_PATTERN.match(val)
        if whole is not None:
            return _resolve(whole)
        return IDENTIFIER_PATTERN.sub(lambda part: str(_resolve(part)), val)

    if isinstance(val, list):
        return [_format_object(item, format_dict) for item in val]

    if not isinstance(val, dict):
        return val

    formatted = {}
    for raw_key, raw_value in val.items():
        new_key = raw_key
        if isinstance(raw_key, str):
            new_key = _format_object(raw_key, format_dict)
            if not isinstance(new_key, str):
                raise ValueError(
                    f"When formatting '{raw_key}' key, only string values can be used. "
                    f"'{new_key}' found"
                )
        formatted[new_key] = _format_object(raw_value, format_dict)
    return formatted