Source code for tsdat.config.attributes

import warnings

from pydantic import (
    BaseModel,
    Extra,
    validator,
    root_validator,
    Field,
    StrictStr,
    HttpUrl,
)
from pydantic.fields import ModelField
from typing import Any, Dict, Optional
from .utils import get_code_version
from ..utils import get_datastream


class AttributeModel(BaseModel, extra=Extra.allow):
    # HACK: root is needed for now: https://github.com/samuelcolvin/pydantic/issues/515

    @root_validator(skip_on_failure=True)
    @classmethod
    def validate_all_ascii(cls, values: Dict[Any, Any]) -> Dict[str, str]:
        for key, value in values.items():
            if not isinstance(key, str) or not key.isascii():
                raise ValueError(f"'{key}' contains a non-ascii character.")
            if isinstance(value, str) and not value.isascii():
                raise ValueError(
                    f"attr '{key}' -> '{value}' contains a non-ascii character."
                )
        return values
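
# Illustrative sketch (not part of the original module): any attribute key or
# string value containing a non-ascii character is rejected by
# validate_all_ascii. The 'MyAttrs' model below is hypothetical:
#
#     class MyAttrs(AttributeModel):
#         note: str
#
#     MyAttrs(note="ok")      # validates
#     MyAttrs(note="façade")  # raises pydantic.ValidationError (non-ascii 'ç')
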
class GlobalAttributes(AttributeModel):
    """Global attributes that will be recorded in the output dataset. These metadata
    are used to record data provenance information (e.g., location, institution,
    etc.), construct datastream and file names (i.e., the location_id, dataset_name,
    qualifier, temporal, and data_level attributes), and provide metadata that is
    useful for data users (e.g., title, description, ...)."""
    title: str = Field(
        min_length=1,
        description="A succinct description of the dataset. This value may be similar"
        " to a publication title and should be suitable for use as a title in plots or"
        " other references to this dataset.",
    )
    description: str = Field(
        min_length=1,
        description="A user-friendly description of the dataset. It should provide"
        " enough context about the data for new users to quickly understand how the"
        " data can be used.",
    )
    code_url: Optional[HttpUrl] = Field(description="Where the code is hosted.")
    conventions: Optional[StrictStr] = Field(
        description="The data conventions the dataset follows."
    )
    doi: Optional[StrictStr] = Field(
        description="The DOI that has been registered for this dataset, if applicable."
    )
    institution: Optional[StrictStr] = Field(
        description="The institution or organization that produces or manages this"
        " data."
    )
    references: Optional[StrictStr] = Field(
        description="Optional attribute used to cite other data, algorithms, etc. as"
        " needed."
    )
    location_id: str = Field(
        min_length=1,
        regex=r"^[a-zA-Z0-9_]+$",  # alphanumeric and '_' characters
        description="A label or acronym for the location where the data were"
        " obtained. Only alphanumeric characters and '_' are allowed.",
    )
    dataset_name: str = Field(
        min_length=3,
        regex=r"^[a-z0-9_]+$",  # lowercase alphanumeric and '_' characters
        description="A string used to identify the data being produced. Ideally"
        " resembles a shortened lowercase version of the title. Only lowercase"
        " alphanumeric characters and '_' are allowed.",
    )
    qualifier: Optional[str] = Field(
        min_length=1,
        regex=r"^[a-zA-Z0-9_]+$",  # alphanumeric and '_' characters
        description="An optional string which distinguishes these data from other"
        " datasets produced by the same instrument. Only alphanumeric characters"
        " and '_' are allowed.",
    )
    temporal: Optional[str] = Field(
        min_length=2,
        regex=r"^[0-9]+[a-zA-Z]+$",
        description="An optional string which describes the temporal resolution of the"
        " data (if it is spaced at regular intervals). This string should be formatted"
        " as a number followed by a unit of measurement, e.g., '10m' would indicate"
        " the data is sampled every ten minutes. Only alphanumeric characters are"
        " allowed.",
    )
    data_level: str = Field(
        min_length=2,
        max_length=3,
        regex=r"^[a-z0-9]+$",  # lowercase alphanumeric characters
        description="A string used to indicate the level of processing of the output"
        " data. It should be formatted as a letter followed by a number. Typical"
        " values include: a1 - data is ingested (no qc); b1 - data is ingested and"
        " quality checks applied; c1 (or higher) - one or more a* or b* datastreams"
        " used to create a higher-level data product. Only lowercase alphanumeric"
        " characters are allowed.",
    )
    # Autogenerated attributes:
    datastream: StrictStr = Field(
        "",
        description="Typically used as a label that uniquely identifies this data"
        " product from any other data product. For file-based storage systems, the"
        " datastream attribute is typically used to generate directory structures as"
        " f'{location_id}/{datastream}/', with files in that directory typically named"
        " as f'{datastream}.{date}.{time}.{ext}'. This attribute is AUTO-GENERATED at"
        " run-time, unless it is explicitly set in the config file, in which case the"
        " value in the config file will override the default. The default value for"
        " 'datastream' is as follows:\n"
        ' f"{location_id}.{dataset_name}{_qualifier}{_temporal}.{data_level}",'
        " \nwhere '_qualifier' and '_temporal' are both prepended with a literal '-'"
        " character if they are provided. This gives some separation between the"
        " 'dataset_name', 'qualifier', and 'temporal' attributes and makes it possible"
        " to parse out these specific attributes given a complete datastream label.",
    )
    history: StrictStr = Field(
        "",
        description="Attribute that will be recorded automatically by the pipeline. A"
        " warning will be raised if this is set in the config file.",
    )
    code_version: StrictStr = Field(
        default_factory=get_code_version,
        description="Attribute that will be recorded automatically by the pipeline. A"
        " warning will be raised if this is set in the config file. The code_version"
        " attribute reads the 'CODE_VERSION' environment variable or parses the git"
        " history to determine the version of the code. Semantic versioning is used by"
        " default (v'major.minor.micro'; e.g., 1.2.3).",
    )
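
    # Illustrative note (an assumption based on the description above, assuming
    # get_code_version checks the environment when the model is instantiated):
    #
    #     os.environ["CODE_VERSION"] = "1.2.3"
    #     GlobalAttributes(...).code_version  # -> "1.2.3"
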
    @validator("history", "code_version", pre=True)
    @classmethod
    def warn_if_dynamic_properties_are_set(cls, v: str, field: ModelField) -> str:
        if v:
            warnings.warn(
                f"The '{field.name}' attribute should not be set explicitly. The current"
                f" value of '{v}' will be ignored."
            )
        return ""
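
    # Illustrative sketch: explicitly setting 'history' or 'code_version' in a
    # config file does not error, but emits a UserWarning and the supplied value
    # is discarded (other required fields omitted below):
    #
    #     GlobalAttributes(..., history="edited by hand")
    #     # UserWarning: The 'history' attribute should not be set explicitly.
    #     # The current value of 'edited by hand' will be ignored.
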
    @root_validator(skip_on_failure=True)
    @classmethod
    def add_datastream_field(
        cls, values: Dict[str, StrictStr]
    ) -> Dict[str, StrictStr]:
        if not values["datastream"]:
            values["datastream"] = get_datastream(**values)
        return values
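

# Illustrative usage sketch (appended for documentation purposes; not part of
# the original module, and the attribute values are hypothetical). When
# 'datastream' is not set explicitly, add_datastream_field builds it from the
# other attributes via get_datastream, following the format described on the
# 'datastream' field above:
if __name__ == "__main__":
    example_attrs = GlobalAttributes(
        title="Example Dataset",
        description="An example dataset used to illustrate these attributes.",
        location_id="sgp",
        dataset_name="example",
        qualifier="z01",
        temporal="10m",
        data_level="b1",
    )
    print(example_attrs.datastream)  # expected: "sgp.example-z01-10m.b1"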