Source code for tsdat.config.variables

import warnings
from typing import Any, Dict, List, Optional

import numpy as np
from pint import PintError, UnitRegistry
from pydantic import (
    BaseModel,
    Extra,
    Field,
    StrictStr,
    root_validator,
    validator,
)

from .attributes import AttributeModel

ureg = UnitRegistry()
ureg.define("unitless = count = 1")  # type: ignore

__all__ = [
    "VariableAttributes",
    "Variable",
    "Coordinate",
]


[docs]class VariableAttributes(AttributeModel): """Attributes that will be recorded in the output dataset. These metadata are to record information about the data properties and related fields (e.g., units, ancillary_variables, etc), user-facing metadata (e.g., long_name, comment), as well as attributes related to quality checks and controls (e.g., valid_*, fail_*, and warn_* properties)."""
[docs] units: Optional[str] = Field( description="A string indicating the units the data are measured in. Tsdat uses" " pint to handle unit conversions, so this string must be compatible with the" " pint list of units, if provided. A complete list of compatible units can be" " found here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt." " If the property is unitless, then the string '1' should be used. If the units" " of the property are not known, then the units attribute should be omitted and" " the comment attribute should include a note indicating that units are not" " known. Doing so provides helpful context for data users." )
[docs] long_name: Optional[StrictStr] = Field( default=None, description="A brief label for the name of the measured property. The xarray" " python library automatically searches for this attribute to use as an axes" " label in plots, so the value should be suitable for display.", )
[docs] standard_name: Optional[StrictStr] = Field( default=None, description="A string exactly matching a value in the CF Standard Name table" " which is used to provide a standardized way of identifying variables and" " measurements across heterogeneous datasets and domains. If a suitable match" " does not exist, then this attribute should be omitted. The full list of CF" " Standard Names is at: https://cfconventions.org/Data/cf-standard-names.", )
[docs] comment: Optional[StrictStr] = Field( default=None, description="A user-friendly description of what the variable represents, how" " it was measured or derived, or any other relevant information that increases" " the ability of users to understand and use this data. This field plays a" " considerable role in creating self-documenting data, so we highly recommend" " including this field, especially for any variables which are particularly" " important for your dataset. Additionally, if the units for an attribute are" " unknown, then this field must include the phrase: 'Unknown units.' so that" " users know there is some uncertainty around this property. Variables that are" " unitless (e.g., categorical data or ratios), should set the 'units' to '1'.", )
[docs] valid_range: Optional[List[float]] = Field( default=None, min_items=2, max_items=2, description="A two-element list of [min, max] values outside of which the data" " should be treated as missing. If applying QC tests, then users should" " configure the quality managers to flag values outside of this range as having" " a 'Bad' assessment and replace those values with the variable's _FillValue.", )
[docs] fail_range: Optional[List[float]] = Field( default=None, min_items=2, max_items=2, description="A two-element list of [min, max] values outside of which the data" " should be teated with heavy skepticism as missing. If applying QC tests, then" " users should configure the quality managers to flag values outside of this" " range as having a 'Bad' assessment.", )
[docs] warn_range: Optional[List[float]] = Field( default=None, min_items=2, max_items=2, description="A two-element list of [min, max] values outside of which the data" " should be teated with some skepticism as missing. If applying QC tests, then" " users should configure the quality managers to flag values outside of this" " range as having an 'Indeterminate' assessment.", )
[docs] valid_delta: Optional[float] = Field( default=None, description="The largest difference between consecutive values in the data" " outside of which the data should be treated as missing. If applying QC tests," " then users should configure the quality managers to flag values outside of" " this range as having a 'Bad' assessment and replace those values with the" " variable's _FillValue.", )
[docs] fail_delta: Optional[float] = Field( default=None, description="The largest difference between consecutive values in the data" " outside of which the data should be teated with heavy skepticism as missing." " If applying QC tests, then users should configure the quality managers to" " flag values outside of this range as having a 'Bad' assessment.", )
[docs] warn_delta: Optional[float] = Field( default=None, description="The largest difference between consecutive values in the data" " outside of which the data should be teated with some skepticism as missing." " If applying QC tests, then users should configure the quality managers to" " flag values outside of this range as having an 'Indeterminate' assessment.", )
[docs] fill_value: Optional[Any] = Field( default=None, alias="_FillValue", description="A value used to initialize the variable's data and indicate that" " the data is missing. Defaults to -9999 for numerical data. If choosing a" " different value, it is important to use a value that could not reasonably be" " mistaken for a physical value or data point.", )
@validator("units")
[docs] def validate_unit(cls, unit_str: str) -> str: # Not recognized by pint, but we want it to be valid if unit_str == "%" or unit_str.startswith("Seconds since"): return unit_str # Validate with pint unit registry try: ureg(unit_str) except PintError: warnings.warn( f"'{unit_str}' is not a valid unit or combination of units. The string" " will be kept as-is." ) return unit_str
@root_validator @classmethod
[docs] def validate_units_are_commented(cls, values: Dict[str, Any]) -> Dict[str, Any]: if not values["units"]: if not values["comment"] or "Unknown units." not in values["comment"]: raise ValueError( "The 'units' attr is required if known. If the units are not known," " then the 'comment' attr should include the phrase 'Unknown" " units.' so that users are aware that the measurement's units are" " not known. Note that 'unitless' quantities (e.g., categorical" " data, ratios, etc) should set the 'units' attr to '1'." ) return values
[docs]class Variable(BaseModel, extra=Extra.forbid):
[docs] name: str = Field("", regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$")
"""Should be left empty. This property will be set automatically by the data_vars or coords pydantic model upon instantiation."""
[docs] data: Optional[Any] = Field( description="If the variable is not meant to be retrieved from an input dataset" " and the value is known in advance, then the 'data' property should specify" " its value exactly as it should appear in the output dataset. This is commonly" " used for latitude/longitude/altitude data for datasets measured from a" " specific geographical location." )
[docs] dtype: StrictStr = Field( description="The numpy dtype of the underlying data. This is passed to numpy as" " the 'dtype' keyword argument used to initialize an array (e.g.," " `numpy.array([1.0, 2.0], dtype='float')`). Commonly-used values include" " 'float', 'int', 'long'." )
[docs] dims: List[StrictStr] = Field( unique_items=True, description="A list of coordinate variable names that dimension this data" " variable. Most commonly this will be set to ['time'], but for datasets where" " there are multiple dimensions (e.g., ADCP data measuring current velocities" " across time and several depths, it may look like ['time', 'depth']).", )
[docs] attrs: VariableAttributes = Field( description="The attrs section is where variable-specific metadata are stored." " This metadata is incredibly important for data users, and we recommend" " including several properties for each variable in order to have the greatest" " impact. In particular, we recommend adding the 'units', 'long_name', and" " 'standard_name' attributes, if possible." )
# @validator("name") # @classmethod # def validate_name_is_ascii(cls, v: str) -> str: # if not v.isascii(): # raise ValueError(f"'{v}' contains a non-ascii character.") # return v @validator("attrs") @classmethod
[docs] def set_default_fill_value( cls, attrs: VariableAttributes, values: Dict[str, Any] ) -> VariableAttributes: dtype: str = values["dtype"] if ( "fill_value" in attrs.__fields_set__ # Preserve _FillValues set explicitly or (dtype == "str") or ("datetime" in dtype) ): return attrs attrs.fill_value = np.array([-9999.0], dtype=dtype)[0] # type: ignore return attrs
[docs]class Coordinate(Variable): @root_validator(skip_on_failure=True) @classmethod
[docs] def coord_dimensioned_by_self(cls, values: Any) -> Any: name, dims = values["name"], values["dims"] if [name] != dims: raise ValueError(f"coord '{name}' must have dims ['{name}']. Found: {dims}") return values
# IDEA: Variables/Coordinates via __root__=Dict[str, Variable/Coordinate] # TODO: Variables/Coordinates validators; name uniqueness, coords has time, etc