import warnings
from typing import Any, Dict, List, Optional
import numpy as np
from pint import PintError, UnitRegistry
from pydantic import (
BaseModel,
Extra,
Field,
StrictStr,
root_validator,
validator,
)
from .attributes import AttributeModel
# Shared pint registry; the models in this module call it to parse unit strings.
ureg = UnitRegistry()
# Register an additional "unitless" definition with the registry.
ureg.define("unitless = count = 1")  # type: ignore

# Names exported by `from <this module> import *`.
__all__ = ["VariableAttributes", "Variable", "Coordinate"]
class VariableAttributes(AttributeModel):
    """Attributes that will be recorded in the output dataset.

    These metadata are to record information about the data properties and related
    fields (e.g., units, ancillary_variables, etc), user-facing metadata (e.g.,
    long_name, comment), as well as attributes related to quality checks and controls
    (e.g., valid_*, fail_*, and warn_* properties)."""

    units: Optional[str] = Field(
        description="A string indicating the units the data are measured in. Tsdat uses"
        " pint to handle unit conversions, so this string must be compatible with the"
        " pint list of units, if provided. A complete list of compatible units can be"
        " found here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt."
        " If the property is unitless, then the string '1' should be used. If the units"
        " of the property are not known, then the units attribute should be omitted and"
        " the comment attribute should include a note indicating that units are not"
        " known. Doing so provides helpful context for data users."
    )
    long_name: Optional[StrictStr] = Field(
        default=None,
        description="A brief label for the name of the measured property. The xarray"
        " python library automatically searches for this attribute to use as an axes"
        " label in plots, so the value should be suitable for display.",
    )
    standard_name: Optional[StrictStr] = Field(
        default=None,
        description="A string exactly matching a value in the CF Standard Name table"
        " which is used to provide a standardized way of identifying variables and"
        " measurements across heterogeneous datasets and domains. If a suitable match"
        " does not exist, then this attribute should be omitted. The full list of CF"
        " Standard Names is at: https://cfconventions.org/Data/cf-standard-names.",
    )
    valid_range: Optional[List[float]] = Field(
        default=None,
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated as missing. If applying QC tests, then users should"
        " configure the quality managers to flag values outside of this range as having"
        " a 'Bad' assessment and replace those values with the variable's _FillValue.",
    )
    fail_range: Optional[List[float]] = Field(
        default=None,
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated with heavy skepticism as missing. If applying QC tests, then"
        " users should configure the quality managers to flag values outside of this"
        " range as having a 'Bad' assessment.",
    )
    warn_range: Optional[List[float]] = Field(
        default=None,
        min_items=2,
        max_items=2,
        description="A two-element list of [min, max] values outside of which the data"
        " should be treated with some skepticism as missing. If applying QC tests, then"
        " users should configure the quality managers to flag values outside of this"
        " range as having an 'Indeterminate' assessment.",
    )
    valid_delta: Optional[float] = Field(
        default=None,
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated as missing. If applying QC tests,"
        " then users should configure the quality managers to flag values outside of"
        " this range as having a 'Bad' assessment and replace those values with the"
        " variable's _FillValue.",
    )
    fail_delta: Optional[float] = Field(
        default=None,
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated with heavy skepticism as missing."
        " If applying QC tests, then users should configure the quality managers to"
        " flag values outside of this range as having a 'Bad' assessment.",
    )
    warn_delta: Optional[float] = Field(
        default=None,
        description="The largest difference between consecutive values in the data"
        " outside of which the data should be treated with some skepticism as missing."
        " If applying QC tests, then users should configure the quality managers to"
        " flag values outside of this range as having an 'Indeterminate' assessment.",
    )
    fill_value: Optional[Any] = Field(
        default=None,
        alias="_FillValue",
        description="A value used to initialize the variable's data and indicate that"
        " the data is missing. Defaults to -9999 for numerical data. If choosing a"
        " different value, it is important to use a value that could not reasonably be"
        " mistaken for a physical value or data point.",
    )

    @validator("units")
    @classmethod
    def validate_unit(cls, unit_str: str) -> str:
        """Check a units string against the pint registry, warning if pint rejects it.

        The string is returned unchanged in every case. A ``PintError`` raised while
        parsing only produces a warning, so unrecognized units do not fail validation.

        Args:
            unit_str (str): The value supplied for the ``units`` attribute.

        Returns:
            str: ``unit_str``, exactly as provided.
        """
        # Not recognized by pint, but we want it to be valid
        if unit_str == "%" or unit_str.startswith("Seconds since"):
            return unit_str

        # Validate with pint unit registry
        try:
            ureg(unit_str)
        except PintError:
            warnings.warn(
                f"'{unit_str}' is not a valid unit or combination of units. The string"
                " will be kept as-is."
            )
        return unit_str
@root_validator
@classmethod
class Variable(BaseModel, extra=Extra.forbid):
    """Configuration model for a single variable.

    Holds the variable's name, optional predefined data, numpy dtype, dimension names,
    and metadata attributes. Keys other than the declared fields are rejected
    (``extra=Extra.forbid``).
    """

    name: str = Field("", regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$")
    """Should be left empty. This property will be set automatically by the data_vars or
    coords pydantic model upon instantiation."""

    data: Optional[Any] = Field(
        description="If the variable is not meant to be retrieved from an input dataset"
        " and the value is known in advance, then the 'data' property should specify"
        " its value exactly as it should appear in the output dataset. This is commonly"
        " used for latitude/longitude/altitude data for datasets measured from a"
        " specific geographical location."
    )
    dtype: StrictStr = Field(
        description="The numpy dtype of the underlying data. This is passed to numpy as"
        " the 'dtype' keyword argument used to initialize an array (e.g.,"
        " `numpy.array([1.0, 2.0], dtype='float')`). Commonly-used values include"
        " 'float', 'int', 'long'."
    )
    dims: List[StrictStr] = Field(
        unique_items=True,
        description="A list of coordinate variable names that dimension this data"
        " variable. Most commonly this will be set to ['time'], but for datasets where"
        " there are multiple dimensions (e.g., ADCP data measuring current velocities"
        " across time and several depths, it may look like ['time', 'depth']).",
    )
    attrs: VariableAttributes = Field(
        description="The attrs section is where variable-specific metadata are stored."
        " This metadata is incredibly important for data users, and we recommend"
        " including several properties for each variable in order to have the greatest"
        " impact. In particular, we recommend adding the 'units', 'long_name', and"
        " 'standard_name' attributes, if possible."
    )

    # @validator("name")
    # @classmethod
    # def validate_name_is_ascii(cls, v: str) -> str:
    #     if not v.isascii():
    #         raise ValueError(f"'{v}' contains a non-ascii character.")
    #     return v

    @validator("attrs")
    @classmethod
    def set_default_fill_value(
        cls, attrs: VariableAttributes, values: Dict[str, Any]
    ) -> VariableAttributes:
        """Fill in a default ``_FillValue`` of -9999 cast to the variable's dtype.

        The attributes are returned untouched when a fill value was set explicitly,
        when the dtype is ``"str"`` or contains ``"datetime"``, or when ``dtype`` did
        not pass its own validation.

        Args:
            attrs (VariableAttributes): The validated attributes of this variable.
            values (Dict[str, Any]): Previously validated fields of this model. Fields
                that failed validation are absent from this mapping.

        Returns:
            VariableAttributes: ``attrs``, with ``fill_value`` set when a default
            applies.
        """
        # 'dtype' is missing from `values` when it was omitted or failed validation.
        # pydantic already reports that error, so avoid raising a KeyError here, which
        # would escape as an unhandled exception instead of a ValidationError.
        dtype: Optional[str] = values.get("dtype")
        if dtype is None:
            return attrs

        if (
            "fill_value" in attrs.__fields_set__  # Preserve _FillValues set explicitly
            or (dtype == "str")
            or ("datetime" in dtype)
        ):
            return attrs

        # Cast through numpy so the fill value has the same dtype as the data.
        attrs.fill_value = np.array([-9999.0], dtype=dtype)[0]  # type: ignore
        return attrs
class Coordinate(Variable):
    """A ``Variable`` whose only dimension must be itself."""

    @root_validator(skip_on_failure=True)
    @classmethod
    def coord_dimensioned_by_self(cls, values: Any) -> Any:
        """Require that ``dims`` is exactly the one-element list ``[name]``.

        Skipped when an earlier field validation has already failed
        (``skip_on_failure=True``).

        Args:
            values (Any): Mapping of the model's validated field values.

        Returns:
            Any: ``values``, unchanged, when the check passes.

        Raises:
            ValueError: If ``dims`` differs from ``[name]``.
        """
        coord_name = values["name"]
        found_dims = values["dims"]

        # Guard clause: the only acceptable dims are the coordinate's own name.
        if [coord_name] == found_dims:
            return values

        raise ValueError(
            f"coord '{coord_name}' must have dims ['{coord_name}']. Found: {found_dims}"
        )
# IDEA: Variables/Coordinates via __root__=Dict[str, Variable/Coordinate]
# TODO: Variables/Coordinates validators; name uniqueness, coords has time, etc