import warnings
from pydantic import (
BaseModel,
Extra,
validator,
root_validator,
Field,
StrictStr,
HttpUrl,
)
from pydantic.fields import ModelField
from typing import Any, Dict, Optional
from .utils import get_code_version
class AttributeModel(BaseModel, extra=Extra.allow):
    # Base model for attribute collections. ``extra=Extra.allow`` lets callers supply
    # additional attributes beyond the declared fields; the root validator below
    # requires every attribute name and every string attribute value to be ASCII.
    #
    # NOTE(review): documented with comments rather than a class docstring so that no
    # description text is added to this model's generated schema -- confirm whether a
    # docstring is wanted here.

    # HACK: root is needed for now: https://github.com/samuelcolvin/pydantic/issues/515
    @root_validator(skip_on_failure=True)
    @classmethod
    def validate_all_ascii(cls, values: Dict[Any, Any]) -> Dict[Any, Any]:
        """Reject attribute names and string values containing non-ASCII characters.

        Args:
            values: Mapping of attribute name to attribute value for the model being
                validated.

        Returns:
            The same ``values`` mapping, unmodified.

        Raises:
            ValueError: If a key is not a ``str`` or is not pure ASCII, or if a
                string value is not pure ASCII. Non-string values are not inspected.
        """
        for key, value in values.items():
            if not isinstance(key, str) or not key.isascii():
                raise ValueError(f"'{key}' contains a non-ascii character.")
            if isinstance(value, str) and not value.isascii():
                raise ValueError(
                    f"attr '{key}' -> '{value}' contains a non-ascii character."
                )
        return values
class GlobalAttributes(AttributeModel):
    """Global attributes that will be recorded in the output dataset. These metadata are
    used to record data provenance information (e.g., location, institution, etc),
    construct datastream and file names (i.e., location_id, dataset_name, qualifier,
    temporal, and data_level attributes), as well as provide metadata that is useful for
    data users (e.g., title, description, ... ).
    """

    # Descriptive attributes:
    title: str = Field(
        min_length=1,
        description="A succinct description of the dataset. This value may be similar"
        " to a publication title and should be suitable for use as a title in plots or"
        " other references to this dataset.",
    )
    description: str = Field(
        min_length=1,
        description="A user-friendly description of the dataset. It should provide"
        " enough context about the data for new users to quickly understand how the"
        " data can be used.",
    )
    code_url: Optional[HttpUrl] = Field(description="Where the code is hosted.")
    conventions: Optional[StrictStr] = Field(
        description="The data conventions the dataset follows."
    )
    doi: Optional[StrictStr] = Field(
        description="The DOI that has been registered for this dataset, if applicable."
    )
    institution: Optional[StrictStr] = Field(
        description="The institution or organization that produces or manages this"
        " data."
    )
    references: Optional[StrictStr] = Field(
        description="Optional attribute used to cite other data, algorithms, etc. as"
        " needed."
    )

    # Attributes used to build the default 'datastream' label:
    location_id: str = Field(
        min_length=3,
        regex=r"^[a-z0-9_]+$",  # lowercase alphanumeric and '_' characters
        description="A label or acronym for the location where the data were obtained"
        " from. Only lowercase alphanumeric characters and '_' are allowed.",
    )
    dataset_name: str = Field(
        min_length=3,
        regex=r"^[a-z0-9_]+$",  # lowercase alphanumeric and '_' characters
        description="A string used to identify the data being produced. Ideally"
        " resembles a shortened lowercase version of the title. Only lowercase"
        " alphanumeric characters and '_' are allowed.",
    )
    qualifier: Optional[str] = Field(
        min_length=3,
        regex=r"^[a-z0-9_]+$",  # lowercase alphanumeric and '_' characters
        description="An optional string which distinguishes these data from other"
        " datasets produced by the same instrument. Only lowercase alphanumeric"
        " characters and '_' are allowed.",
    )
    # NOTE(review): the description below says only lowercase characters are allowed,
    # but the regex also accepts uppercase letters in the unit -- confirm which is
    # intended before tightening either one.
    temporal: Optional[str] = Field(
        min_length=2,
        regex=r"^[0-9]+[a-zA-Z]+$",
        description="An optional string which describes the temporal resolution of the"
        " data (if it spaced in regular intervals). This string should be formated as a"
        " number followed by a unit of measurement, e.g., '10m' would indicate the data"
        " is sampled every ten minutes. Only lowercase alphanumeric characters are"
        " allowed.",
    )
    # NOTE(review): the regex accepts any 2-3 lowercase alphanumeric characters; the
    # "letter followed by a number" format in the description is not enforced.
    data_level: str = Field(
        min_length=2,
        max_length=3,
        regex=r"^[a-z0-9]+$",  # lowercase alphanumeric characters
        description="A string used to indicate the level of processing of the output"
        " data. It should be formated as a letter followed by a number. Typical values"
        " for this include: a1 - data is ingested (no qc), b1 - data is ingested and"
        " quality checks applied, c1 (or higher) - one or more a* or b* datastreams"
        " used to create a higher-level data product. Only lowercase alphanumeric"
        " characters are allowed.",
    )

    # Autogenerated attributes:
    datastream: StrictStr = Field(
        "",
        description="Typically used as a label that uniquely identifies this data"
        " product from any other data product. For file-based storage systems, the"
        " datastream attribute is typically used to generate directory structures as"
        " f'{location_id}/{datastream}/', with files in that directory typically named"
        " as f'{datastream}.{date}.{time}.{ext}'. This attribute is AUTO-GENERATED at"
        " run-time, unless it is explicitly set in the config file, in which case the"
        " value in the config file will override the default. The default value for"
        " 'datastream' is as follows:\n"
        ' f"{location_id}.{dataset_name}{_qualifier}{_temporal}.{data_level}",'
        " \nwhere '_qualifier' and '_temporal' are both prepended with a literal '-'"
        " character if they are provided. This gives some separation between the"
        " 'dataset_name', 'qualifier', and 'temporal' attributes and makes it possible"
        " to parse out these specific attributes given a complete datastream label.",
    )
    history: StrictStr = Field(
        "",
        description="Attribute that will be recorded automatically by the pipeline. A"
        " warning will be raised if this is set in the config file.",
    )
    code_version: StrictStr = Field(
        default_factory=get_code_version,
        description="Attribute that will be recorded automatically by the pipeline. A"
        " warning will be raised if this is set in the config file. The code_version"
        " attribute reads the 'CODE_VERSION' environment variable or parses the git"
        " history to determine the version of the code. Semantic versioning is used by"
        " default (v'major.minor.micro'; e.g., 1.2.3).",
    )

    @validator("history", "code_version", pre=True)
    @classmethod
    def warn_if_dynamic_properties_are_set(cls, v: str, field: ModelField) -> str:
        """Warn about, and discard, explicitly-set auto-generated attributes.

        Runs before type validation for the 'history' and 'code_version' fields
        whenever a value is supplied for them.

        Args:
            v: The value that was supplied for the field.
            field: The pydantic field being validated; its name is used in the
                warning and to choose the replacement value.

        Returns:
            The value the field would have had if it had not been supplied:
            ``get_code_version()`` for 'code_version' and ``""`` for 'history'.
        """
        if v:
            warnings.warn(
                f"The '{field.name}' attribute should not be set explicitly. The current"
                f" value of '{v}' will be ignored."
            )
        # "Ignored" means falling back to the field's own default. Returning "" for
        # 'code_version' would silently replace the default_factory value with an
        # empty version string.
        if field.name == "code_version":
            return get_code_version()
        return ""

    @root_validator(skip_on_failure=True)
    @classmethod
    def add_datastream_field(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Fill in the default 'datastream' label when none was provided.

        The label is built as
        ``{location_id}.{dataset_name}{-qualifier}{-temporal}.{data_level}``, where
        the qualifier and temporal parts (and their leading '-') are omitted when
        those attributes are empty or ``None``. An explicitly provided, non-empty
        'datastream' is left untouched.

        Args:
            values: Mapping of field name to validated value. Because the validator
                is declared with ``skip_on_failure=True`` it only runs when the
                preceding field validation succeeded.

        Returns:
            The ``values`` mapping, with 'datastream' populated.
        """
        if not values["datastream"]:
            loc = values["location_id"]
            name = values["dataset_name"]
            qual = "-" + values["qualifier"] if values["qualifier"] else ""
            temp = "-" + values["temporal"] if values["temporal"] else ""
            lvl = values["data_level"]
            values["datastream"] = f"{loc}.{name}{qual}{temp}.{lvl}"
        return values