# Source code for tsdat.config.retriever

import re
from typing import Dict, List, Pattern, Union, cast
from pydantic import BaseModel, Extra, Field, validator
from .utils import ParameterizedConfigClass, YamlModel

__all__ = ["RetrieverConfig"]


# Configuration entry describing a single DataReader. Instances are the values of
# the ``RetrieverConfig.readers`` mapping, where each one is keyed by the regex
# pattern that selects it.
# The body is intentionally empty: this class declares nothing of its own, so every
# field and validator is inherited from ParameterizedConfigClass (imported from
# .utils; its definition is not visible in this module).
class DataReaderConfig(ParameterizedConfigClass):
    ...


# Configuration entry describing a single DataConverter. Instances are the elements
# of ``RetrievedVariableConfig.data_converters``.
# The body is intentionally empty: all declared fields are inherited from
# ParameterizedConfigClass (imported from .utils; not visible in this module).
# ``extra=Extra.allow`` is the pydantic model setting that accepts keys not declared
# on the model instead of rejecting them -- presumably so converter-specific options
# can be written alongside the inherited fields; confirm against the converters.
class DataConverterConfig(ParameterizedConfigClass, extra=Extra.allow):
    ...


class RetrievedVariableConfig(BaseModel, extra=Extra.allow):
    """Specifies how the variable should be retrieved from the raw dataset and the
    preprocessing steps (i.e. DataConverters) that should be applied."""

    # NOTE(review): the docstring above and the ``description`` strings below are
    # runtime values handed to pydantic (presumably surfaced in the generated json
    # schema -- confirm), so rewording them is a behavior change, not a comment edit.

    # Required: no default is passed to Field, so a value must always be supplied.
    name: str = Field(
        description="The exact name of the variable in the raw dataset returned by the"
        " DataReader."
    )
    # Optional: defaults to an empty list, i.e. no converters are configured for the
    # variable unless some are listed explicitly.
    data_converters: List[DataConverterConfig] = Field(
        [],
        description="A list of DataConverters to run for this variable. Common choices"
        " include the tsdat UnitsConverter (classname: "
        "'tsdat.io.converters.UnitsConverter') to convert the variable's data from its"
        " input units to specified output units, and the tsdat StringToDatetime"
        " converter (classname: 'tsdat.io.converters.StringToDatetime'), which takes"
        " dates/times formatted as strings and converts them into a datetime64 object"
        " that can be used throughout the rest of the pipeline. This property is"
        " optional and defaults to [].",
    )


class RetrieverConfig(ParameterizedConfigClass, YamlModel, extra=Extra.allow):
    """---------------------------------------------------------------------------------
    Contains configuration parameters for the tsdat retriever class.

    This class will ultimately be converted into a tsdat.io.base.Retriever subclass for
    use in tsdat pipelines.

    Provides methods to support yaml parsing and validation, including the generation of
    json schema for immediate validation. This class also provides a method to
    instantiate a tsdat.io.base.Retriever subclass from a parsed configuration file.

    Args:
        classname (str): The dotted module path to the pipeline that the specified
            configurations should apply to. To use the built-in IngestPipeline, for
            example, you would set 'tsdat.pipeline.pipelines.IngestPipeline' as the
            classname.
        readers (Dict[str, DataReaderConfig]): The DataReaders to use for reading input
            data.

    ---------------------------------------------------------------------------------"""

    # NOTE(review): the ``classname`` entry above talks about pipelines although this
    # model configures a Retriever -- looks copied from the pipeline config; confirm
    # the intended wording before changing it (the docstring is a runtime string).

    # HACK: Can't do Pattern[str]: https://github.com/samuelcolvin/pydantic/issues/2636
    # Required field (no default): regex pattern -> reader configuration.
    readers: Dict[Pattern, DataReaderConfig] = Field(  # type: ignore
        description="A dictionary mapping regex patterns to DataReaders that should be"
        " used to read the input data. For each input given to the Retriever, the"
        " mapping will be used to determine which DataReader to use. The patterns will"
        " be searched in the order they are defined and the DataReader corresponding"
        " with the first pattern that matches the input key will be used."
    )

    # Each value may be written either as a single retrieval rule or as a mapping of
    # regex pattern -> retrieval rule; ``coerce_to_patterned_retriever`` below
    # normalizes both spellings to the mapping form.
    coords: Dict[str, Union[Dict[Pattern, RetrievedVariableConfig], RetrievedVariableConfig]] = Field(  # type: ignore
        {},
        description="A dictionary mapping output coordinate variable names to the"
        " retrieval rules and preprocessing actions (i.e. DataConverters) that should"
        " be applied to each retrieved coordinate variable.",
    )

    # Same shape as ``coords``. The description previously ended with "coordinate
    # variable" (copied from ``coords``); it now correctly refers to data variables.
    data_vars: Dict[str, Union[Dict[Pattern, RetrievedVariableConfig], RetrievedVariableConfig]] = Field(  # type: ignore
        {},
        description="A dictionary mapping output data_variable variable names to the"
        " retrieval rules and preprocessing actions (i.e. DataConverters) that should"
        " be applied to each retrieved data variable.",
    )

    @validator("coords", "data_vars")
    @classmethod
    def coerce_to_patterned_retriever(cls, var_dict: Dict[str, Union[Dict[Pattern, RetrievedVariableConfig], RetrievedVariableConfig]]) -> Dict[str, Dict[Pattern[str], RetrievedVariableConfig]]:  # type: ignore
        """Normalize every entry of ``coords`` / ``data_vars`` to the patterned form.

        An entry given as a bare ``RetrievedVariableConfig`` is wrapped in a
        one-item mapping keyed by the catch-all regex ``.*``. An entry that is
        already a mapping is passed through unchanged.

        Args:
            var_dict: The field value being validated, keyed by output variable name.

        Returns:
            A new dictionary with the same keys in which every value is a mapping of
            compiled regex pattern to ``RetrievedVariableConfig``.
        """
        to_return: Dict[str, Dict[Pattern[str], RetrievedVariableConfig]] = {}  # type: ignore
        for name, var_retriever in var_dict.items():  # type: ignore
            if isinstance(var_retriever, RetrievedVariableConfig):
                # Shorthand form: the single rule applies whatever the input key is.
                patterned = {re.compile(r".*"): var_retriever}
            else:
                patterned = var_retriever
            # ``cast`` only informs the type checker; it performs no runtime check.
            to_return[name] = cast(
                Dict[Pattern[str], RetrievedVariableConfig], patterned
            )
        return to_return