Source code for tsdat.config.variables

from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import (
    BaseModel,
    Extra,
    root_validator,
    Field,
    StrictStr,
    validator,
)
from .attributes import AttributeModel


__all__ = [
    "VariableAttributes",
    "Variable",
    "Coordinate",
]


class VariableAttributes(AttributeModel):
    """Attributes that will be recorded in the output dataset.

    These metadata are used to record information about the data properties and related
    fields (e.g., units, ancillary_variables, etc), user-facing metadata (e.g.,
    long_name, comment), as well as attributes related to quality checks and controls
    (e.g., valid_*, fail_*, and warn_* properties)."""

    units: Optional[str] = Field(
        description="A string indicating the units the data are measured in. Tsdat uses"
        " pint to handle unit conversions, so this string must be compatible with the"
        " pint list of units, if provided. A complete list of compatible units can be"
        " found here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt."
        " If the property is unitless, then the string '1' should be used. If the units"
        " of the property are not known, then the units attribute should be omitted and"
        " the comment attribute should include a note indicating that units are not"
        " known. Doing so provides helpful context for data users."
    )
    long_name: Optional[StrictStr] = Field(
        description="A brief label for the name of the measured property. The xarray"
        " python library automatically searches for this attribute to use as an axes"
        " label in plots, so the value should be suitable for display."
    )
    standard_name: Optional[StrictStr] = Field(
        description="A string exactly matching a value in the CF Standard Name table"
        " which is used to provide a standardized way of identifying variables and"
        " measurements across heterogeneous datasets and domains. If a suitable match"
        " does not exist, then this attribute should be omitted. The full list of CF"
        " Standard Names is at: https://cfconventions.org/Data/cf-standard-names."
    )
    comment: Optional[StrictStr] = Field(
        description="A user-friendly description of what the variable represents, how"
        " it was measured or derived, or any other relevant information that increases"
        " the ability of users to understand and use this data. This field plays a"
        " considerable role in creating self-documenting data, so we highly recommend"
        " including this field, especially for any variables which are particularly"
        " important for your dataset. Additionally, if the units for an attribute are"
        " unknown, then this field must include the phrase: 'Unknown units.' so that"
        " users know there is some uncertainty around this property. Variables that are"
        " unitless (e.g., categorical data or ratios), should set the 'units' to '1'."
    )
    valid_range: Optional[List[float]] = Field(
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated as missing. If applying QC tests, then users should"
        " configure the quality managers to flag values outside of this range as having"
        " a 'Bad' assessment and replace those values with the variable's _FillValue.",
    )
    fail_range: Optional[List[float]] = Field(
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated with heavy skepticism. If applying QC tests, then users"
        " should configure the quality managers to flag values outside of this range as"
        " having a 'Bad' assessment.",
    )
    warn_range: Optional[List[float]] = Field(
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated with some skepticism. If applying QC tests, then users"
        " should configure the quality managers to flag values outside of this range as"
        " having an 'Indeterminate' assessment.",
    )
    valid_delta: Optional[float] = Field(
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated as missing. If applying QC tests,"
        " then users should configure the quality managers to flag values outside of"
        " this range as having a 'Bad' assessment and replace those values with the"
        " variable's _FillValue."
    )
    fail_delta: Optional[float] = Field(
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated with heavy skepticism. If"
        " applying QC tests, then users should configure the quality managers to flag"
        " values outside of this range as having a 'Bad' assessment."
    )
    warn_delta: Optional[float] = Field(
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated with some skepticism. If applying"
        " QC tests, then users should configure the quality managers to flag values"
        " outside of this range as having an 'Indeterminate' assessment."
    )
    fill_value: Optional[Any] = Field(
        alias="_FillValue",
        description="A value used to initialize the variable's data and indicate that"
        " the data is missing. Defaults to -9999 for numerical data. If choosing a"
        " different value, it is important to use a value that could not reasonably be"
        " mistaken for a physical value or data point.",
    )

    # TODO: Validate units using pint registry
    # ureg = pint.UnitRegistry(autoconvert_offset_to_baseunit=True)
    # ureg.define('percent = 0.01*count = %')
    # ureg.define('unitless = count = 1')
    # try: ureg(units) except: ValueError(units not valid)

    @root_validator
    @classmethod
    def validate_units_are_commented(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        if not values["units"]:
            if not values["comment"] or "Unknown units." not in values["comment"]:
                raise ValueError(
                    "The 'units' attr is required if known. If the units are not known,"
                    " then the 'comment' attr should include the phrase 'Unknown"
                    " units.' so that users are aware that the measurement's units are"
                    " not known. Note that 'unitless' quantities (e.g., categorical"
                    " data, ratios, etc) should set the 'units' attr to '1'."
                )
        return values
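
Illustrative only (not part of the tsdat source): a minimal sketch of how the
units/comment contract above behaves when VariableAttributes is constructed directly.
It assumes tsdat is installed so that tsdat.config.variables is importable; the field
values are made up.

    from pydantic import ValidationError
    from tsdat.config.variables import VariableAttributes

    # Units known: pass a pint-compatible string.
    attrs = VariableAttributes(units="m/s", long_name="Wind Speed")

    # Units unknown: omit 'units' but say so in the comment, which satisfies
    # validate_units_are_commented.
    attrs = VariableAttributes(comment="Unknown units. Sensor documentation missing.")

    # Omitting both raises, since the root validator requires the phrase
    # 'Unknown units.' in the comment whenever 'units' is not provided.
    try:
        VariableAttributes(long_name="Mystery Measurement")
    except ValidationError as err:
        print(err)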
# class RetrieverParameters(BaseModel, extra=Extra.allow):
#     required: bool = Field(
#         True,
#         description="If True (the default) then the pipeline will fail loudly if it is"
#         " unable to retrieve the variable from an input source.",
#     )
#     name: Optional[StrictStr] = Field(
#         title="Input Name",
#         description="The name of the variable as it appears in the input dataset, or"
#         " more accurately, this is the key tsdat will use to retrieve the variable"
#         " from the dataset returned by the input DataReader.",
#     )
#     retrieval_rules: Optional[Any] = Field(
#         description="Optional field used to specify how the variable should be"
#         " retrieved from the input source(s). The format of this field is dependent on"
#         " the type of retriever specified by the input classname. If not specified then"
#         " the 'tsdat.io.retrievers.SimpleRetriever' class is used, and this field is"
#         " not needed."
#     )
#     units: Optional[str] = Field(
#         description="This gives tsdat context about the units the input dataset is"
#         " measured in. If the 'units' property here differs from the 'units' property"
#         " under the 'attrs' section, then tsdat will automatically perform a unit"
#         " conversion on the input data."
#     )
#     converters: Optional[List[InputConverter]] = Field(
#         description="A list of converters that tsdat should use to transform the data"
#         " from the input source to the output source. Currently only two converters are"
#         " provided: the 'UnitsConverter', which converts input units to output units"
#         " using the Python libraries act-atmos and pint, and the 'StringTimeConverter',"
#         " which is used exclusively for converting string values into Python datetime"
#         " objects that are timezone-aware. If using the 'StringTimeConverter' class,"
#         " two parameters are required: 'timezone' - the timezone the data are recorded"
#         " in (default UTC), and 'time_format' - a string that is passed to the"
#         " strptime() function as the string format used to create a datetime object.",
#     )


# class InputVariable(ParameterizedConfigClass, extra=Extra.allow):
#     classname: StrictStr = "tsdat.io.retrievers.SimpleRetriever"
#     parameters: RetrieverParameters = RetrieverParameters()  # type: ignore
class Variable(BaseModel, extra=Extra.forbid):
    # name: str = Field(
    #     title="Output Variable Name",
    #     regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$",
    #     description="The name of the variable in the output file. Generally, we"
    #     " recommend only using lowercase alphanumeric and '_' characters to name"
    #     " variables, as uniformly-named variables are easier to sort through and read"
    #     " for users. Spaces and non-ascii characters are explicitly disallowed. The"
    #     " variable name should be concise, yet clear enough for users to know what the"
    #     " property measures. A more descriptive name for a variable (i.e. suitable for"
    #     " a plot title / axis label) should be provided via the 'long_name' attribute"
    #     " in the attrs section, if desired. The 'comment' attribute is also recommended"
    #     " to provide additional context about the variable, if needed.",
    # )

    name: str = Field("", regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$")
    """Should be left empty. This property will be set automatically by the data_vars or
    coords pydantic model upon instantiation."""

    data: Optional[Any] = Field(
        description="If the variable is not meant to be retrieved from an input dataset"
        " and the value is known in advance, then the 'data' property should specify"
        " its value exactly as it should appear in the output dataset. This is commonly"
        " used for latitude/longitude/altitude data for datasets measured from a"
        " specific geographical location."
    )
    dtype: StrictStr = Field(
        description="The numpy dtype of the underlying data. This is passed to numpy as"
        " the 'dtype' keyword argument used to initialize an array (e.g.,"
        " `numpy.array([1.0, 2.0], dtype='float')`). Commonly-used values include"
        " 'float', 'int', 'long'."
    )
    dims: List[StrictStr] = Field(
        unique_items=True,
        description="A list of coordinate variable names that dimension this data"
        " variable. Most commonly this will be set to ['time'], but for datasets with"
        " multiple dimensions (e.g., ADCP data measuring current velocities across time"
        " and several depths), it may look like ['time', 'depth'].",
    )
    attrs: VariableAttributes = Field(
        description="The attrs section is where variable-specific metadata are stored."
        " This metadata is incredibly important for data users, and we recommend"
        " including several properties for each variable in order to have the greatest"
        " impact. In particular, we recommend adding the 'units', 'long_name', and"
        " 'standard_name' attributes, if possible."
    )
    # @validator("name")
    # @classmethod
    # def validate_name_is_ascii(cls, v: str) -> str:
    #     if not v.isascii():
    #         raise ValueError(f"'{v}' contains a non-ascii character.")
    #     return v

    @validator("attrs")
    @classmethod
    def set_default_fill_value(
        cls, attrs: VariableAttributes, values: Dict[str, Any]
    ) -> VariableAttributes:
        dtype: str = values["dtype"]
        if (
            "fill_value" in attrs.__fields_set__  # Preserve _FillValues set explicitly
            or (dtype == "str")
            or ("datetime" in dtype)
        ):
            return attrs
        attrs.fill_value = np.array([-9999.0], dtype=dtype)[0]  # type: ignore
        return attrs
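
Illustrative only (not part of the module): a sketch showing the default _FillValue
behavior when a Variable is built directly. The variable names and attribute values are
hypothetical.

    from tsdat.config.variables import Variable, VariableAttributes

    # 'name' is left at its default; per its docstring it is normally filled in by the
    # parent data_vars/coords model.
    wind_speed = Variable(
        dtype="float",
        dims=["time"],
        attrs=VariableAttributes(units="m/s", long_name="Wind Speed"),
    )
    # No _FillValue was given, so set_default_fill_value casts -9999.0 to the dtype.
    print(wind_speed.attrs.fill_value)  # -9999.0

    # An explicitly provided _FillValue (note the alias) is preserved as-is.
    flag = Variable(
        dtype="int",
        dims=["time"],
        attrs=VariableAttributes(units="1", _FillValue=-1),
    )
    print(flag.attrs.fill_value)  # -1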
class Coordinate(Variable):
    @root_validator(skip_on_failure=True)
    @classmethod
    def coord_dimensioned_by_self(cls, values: Any) -> Any:
        name, dims = values["name"], values["dims"]
        if [name] != dims:
            raise ValueError(f"coord '{name}' must have dims ['{name}']. Found: {dims}")
        return values
# IDEA: Variables/Coordinates via __root__=Dict[str, Variable/Coordinate]
# TODO: Variables/Coordinates validators; name uniqueness, coords has time, etc
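
Illustrative only (not part of the module): a sketch of the Coordinate self-dimension
rule. Here 'name' is set explicitly so the check can be shown directly; in a pipeline
config it would normally be filled in by the parent coords model. The units strings are
placeholders.

    from pydantic import ValidationError
    from tsdat.config.variables import Coordinate, VariableAttributes

    # A coordinate must list itself as its only dimension.
    time_coord = Coordinate(
        name="time",
        dtype="datetime64[ns]",
        dims=["time"],
        attrs=VariableAttributes(units="Seconds since 1970-01-01 00:00:00"),
    )

    # Any other 'dims' value fails the coord_dimensioned_by_self validator.
    try:
        Coordinate(
            name="depth",
            dtype="float",
            dims=["time"],
            attrs=VariableAttributes(units="m"),
        )
    except ValidationError as err:
        print(err)  # coord 'depth' must have dims ['depth']. Found: ['time']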