from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import (
BaseModel,
Extra,
root_validator,
Field,
StrictStr,
validator,
)
from .attributes import AttributeModel
__all__ = [
"VariableAttributes",
"Variable",
"Coordinate",
]
class VariableAttributes(AttributeModel):
"""Attributes that will be recorded in the output dataset. These metadata are used to
record information about the data properties and related fields (e.g., units,
ancillary_variables, etc.), user-facing metadata (e.g., long_name, comment), as well as
attributes related to quality checks and controls (e.g., valid_*, fail_*, and warn_*
properties)."""
units: Optional[str] = Field(
description="A string indicating the units the data are measured in. Tsdat uses"
" pint to handle unit conversions, so this string must be compatible with the"
" pint list of units, if provided. A complete list of compatible units can be"
" found here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt."
" If the property is unitless, then the string '1' should be used. If the units"
" of the property are not known, then the units attribute should be omitted and"
" the comment attribute should include a note indicating that units are not"
" known. Doing so provides helpful context for data users."
)
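# For reference, pint-compatible unit strings look like "m/s", "degC", "km", or "1"
# for a unitless quantity (illustrative examples only, not an exhaustive list).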
long_name: Optional[StrictStr] = Field(
description="A brief label for the name of the measured property. The xarray"
" python library automatically searches for this attribute to use as an axis"
" label in plots, so the value should be suitable for display."
)
standard_name: Optional[StrictStr] = Field(
description="A string exactly matching a value in the CF Standard Name table"
" which is used to provide a standardized way of identifying variables and"
" measurements across heterogeneous datasets and domains. If a suitable match"
" does not exist, then this attribute should be omitted. The full list of CF"
" Standard Names is at: https://cfconventions.org/Data/cf-standard-names."
)
valid_range: Optional[List[float]] = Field(
min_items=2,
max_items=2,
description="A two-element list of [min, max] values outside of which the data"
" should be treated as missing. If applying QC tests, then users should"
" configure the quality managers to flag values outside of this range as having"
" a 'Bad' assessment and replace those values with the variable's _FillValue.",
)
fail_range: Optional[List[float]] = Field(
min_items=2,
max_items=2,
description="A two-element list of [min, max] values outside of which the data"
" should be treated with heavy skepticism. If applying QC tests, then"
" users should configure the quality managers to flag values outside of this"
" range as having a 'Bad' assessment.",
)
warn_range: Optional[List[float]] = Field(
min_items=2,
max_items=2,
description="A two-element list of [min, max] values outside of which the data"
" should be treated with some skepticism. If applying QC tests, then"
" users should configure the quality managers to flag values outside of this"
" range as having an 'Indeterminate' assessment.",
)
valid_delta: Optional[float] = Field(
description="The largest difference between consecutive values in the data"
" outside of which the data should be treated as missing. If applying QC tests,"
" then users should configure the quality managers to flag values outside of"
" this range as having a 'Bad' assessment and replace those values with the"
" variable's _FillValue."
)
fail_delta: Optional[float] = Field(
description="The largest difference between consecutive values in the data"
" outside of which the data should be treated with heavy skepticism."
" If applying QC tests, then users should configure the quality managers to"
" flag values outside of this range as having a 'Bad' assessment."
)
warn_delta: Optional[float] = Field(
description="The largest difference between consecutive values in the data"
" outside of which the data should be treated with some skepticism."
" If applying QC tests, then users should configure the quality managers to"
" flag values outside of this range as having an 'Indeterminate' assessment."
)
fill_value: Optional[Any] = Field(
alias="_FillValue",
description="A value used to initialize the variable's data and indicate that"
" the data is missing. Defaults to -9999 for numerical data. If choosing a"
" different value, it is important to use a value that could not reasonably be"
" mistaken for a physical value or data point.",
)
# TODO: Validate units using pint registry
# ureg = pint.UnitRegistry(autoconvert_offset_to_baseunit=True)
# ureg.define('percent = 0.01*count = %')
# ureg.define('unitless = count = 1')
# try: ureg(units) except: ValueError(units not valid)
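# A possible fleshed-out version of the check sketched above (a hedged sketch only;
# the validator name, placement, and error handling are illustrative assumptions and
# are not part of the current model):
#
# @validator("units")
# @classmethod
# def validate_units(cls, units: Optional[str]) -> Optional[str]:
#     import pint
#
#     ureg = pint.UnitRegistry(autoconvert_offset_to_baseunit=True)
#     ureg.define("percent = 0.01*count = %")
#     ureg.define("unitless = count = 1")
#     if units is not None:
#         try:
#             ureg(units)  # parse the unit string; raises if pint cannot interpret it
#         except Exception:
#             raise ValueError(f"'{units}' is not a valid pint unit string")
#     return units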
@root_validator
@classmethod
# class RetrieverParameters(BaseModel, extra=Extra.allow):
# required: bool = Field(
# True,
# description="If True (the default) then the pipeline will fail loudly if it is"
# " unable to retrieve the variable from an input source.",
# )
# name: Optional[StrictStr] = Field(
# title="Input Name",
# description="The name of the variable as it appears in the input dataset, or"
# " more accurately, this is the key tsdat will use to retrieve the variable"
# " from the dataset returned by the input DataReader.",
# )
# retrieval_rules: Optional[Any] = Field(
# description="Optional field used to specify how the variable should be"
# " retrieved from the input source(s). The format of this field is dependent on"
# " the type of retriever specified by the input classname. If not specified then"
# " the 'tsdat.io.retrievers.SimpleRetriever' class is used, and this field is"
# " not needed."
# )
# units: Optional[str] = Field(
# description="This gives tsdat context about the units the input dataset is"
# " measured in. If the 'units' property here differs from the 'units' property"
# " under the 'attrs' section, then tsdat will automatically perform a unit"
# " conversion on the input data."
# )
# converters: Optional[List[InputConverter]] = Field(
# description="A list of converters that tsdat should use to transform the data"
# " from the input source to the output source. Currently only two converters are"
# " provided: the 'UnitsConverter', which converts input units to output units"
# " using the Python libraries act-atmos and pint, and the 'StringTimeConverter',"
# " which is used exclusively for converting string values into Python datetime"
# " objects that are timezone-aware. If using the 'StringTimeConverter' class,"
# " two parameters are required: 'timezone' - the timezone the data are recorded"
# " in (default UTC), and 'time_format' - a string that is passed to the"
# " strptime() function as the string format used to create a datetime object.",
# )
# class InputVariable(ParameterizedConfigClass, extra=Extra.allow):
# classname: StrictStr = "tsdat.io.retrievers.SimpleRetriever"
# parameters: RetrieverParameters = RetrieverParameters() # type: ignore
class Variable(BaseModel, extra=Extra.forbid):
# name: str = Field(
# title="Output Variable Name",
# regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$",
# description="The name of the variable in the output file. Generally, we"
# " recommend only using lowercase alphanumeric and '_' characters to name"
# " variables, as uniformly-named variables are easier to sort through and read"
# " for users. Spaces and non-ascii characters are explicitly disallowed. The"
# " variable name should be concise, yet clear enough for users to know what the"
# " property measures. A more descriptive name for a variable (i.e. suitable for"
# " a plot title / axis label) should be provided via the 'long_name' attribute"
# " in the attrs section, if desired. The 'comment' attribute is also recommended"
# " to provide additional context about the variable, if needed.",
# )
name: str = Field("", regex=r"^[a-zA-Z0-9_\(\)\/\[\]\{\}\.]+$")
"""Should be left empty. This property will be set automatically by the data_vars or
coords pydantic model upon instantiation."""
data: Optional[Any] = Field(
description="If the variable is not meant to be retrieved from an input dataset"
" and the value is known in advance, then the 'data' property should specify"
" its value exactly as it should appear in the output dataset. This is commonly"
" used for latitude/longitude/altitude data for datasets measured from a"
" specific geographical location."
)
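# For example (illustrative only; the values and attribute choices below are made up),
# a fixed deployment latitude can be declared directly via 'data' instead of being
# retrieved from an input file:
#
# latitude = Variable(
#     name="latitude",
#     data=45.0,
#     dtype="float",
#     dims=[],
#     attrs=VariableAttributes(units="deg", long_name="North latitude"),
# )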
dtype: StrictStr = Field(
description="The numpy dtype of the underlying data. This is passed to numpy as"
" the 'dtype' keyword argument used to initialize an array (e.g.,"
" `numpy.array([1.0, 2.0], dtype='float')`). Commonly-used values include"
" 'float', 'int', 'long'."
)
dims: List[StrictStr] = Field(
unique_items=True,
description="A list of coordinate variable names that dimension this data"
" variable. Most commonly this will be set to ['time'], but for datasets where"
" there are multiple dimensions (e.g., ADCP data measuring current velocities"
" across time and several depths), it may look like ['time', 'depth'].",
)
attrs: VariableAttributes = Field(
description="The attrs section is where variable-specific metadata are stored."
" This metadata is incredibly important for data users, and we recommend"
" including several properties for each variable in order to have the greatest"
" impact. In particular, we recommend adding the 'units', 'long_name', and"
" 'standard_name' attributes, if possible."
)
# @validator("name")
# @classmethod
# def validate_name_is_ascii(cls, v: str) -> str:
# if not v.isascii():
# raise ValueError(f"'{v}' contains a non-ascii character.")
# return v
@validator("attrs")
@classmethod
def set_default_fill_value(
cls, attrs: VariableAttributes, values: Dict[str, Any]
) -> VariableAttributes:
dtype: str = values["dtype"]
if (
"fill_value" in attrs.__fields_set__ # Preserve _FillValues set explicitly
or (dtype == "str")
or ("datetime" in dtype)
):
return attrs
attrs.fill_value = np.array([-9999.0], dtype=dtype)[0] # type: ignore
return attrs
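# For example (illustrative only; the field values are made up, and 'name' is set here
# only for demonstration since it is normally filled in by the parent model), a
# Variable whose attrs omit _FillValue picks up the dtype-based default from
# set_default_fill_value above:
#
# var = Variable(
#     name="temperature",
#     dtype="float",
#     dims=["time"],
#     attrs=VariableAttributes(units="degC", long_name="Air Temperature"),
# )
# assert var.attrs.fill_value == -9999.0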
class Coordinate(Variable):
@root_validator(skip_on_failure=True)
@classmethod
def coord_dimensioned_by_self(cls, values: Any) -> Any:
name, dims = values["name"], values["dims"]
if [name] != dims:
raise ValueError(f"coord '{name}' must have dims ['{name}']. Found: {dims}")
return values
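# For example (illustrative values), a coordinate must list itself as its only
# dimension, so the first call below passes validation while the second fails with the
# "must have dims" error raised above:
#
# Coordinate(
#     name="time", dtype="datetime64[ns]", dims=["time"], attrs=VariableAttributes()
# )
# Coordinate(
#     name="time", dtype="datetime64[ns]", dims=["depth"], attrs=VariableAttributes()
# )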
# TODO: Variables/Coordinates via __root__=Dict[str, Variable/Coordinate]
# TODO: Variables/Coordinates validators; name uniqueness, coords has time, etc