# Source code for metobs_toolkit.template

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:48:24 2024

@author: thoverga
"""

import os
import sys
import logging
import json


import pandas as pd
from pytz import all_timezones


logger = logging.getLogger(__name__)


# Blacklists hold column names that carry a special meaning for the toolkit
# (they are often generated by the toolkit itself). When such a column is
# found in an input file, _apply_blacklist() renames it by appending an
# "_original" suffix — unless the template explicitly maps that column.

column_data_blacklist = (
    []
)  # reserved data-column names (currently none)
column_meta_blacklist = [
    "geometry",
    "assumed_import_frequency",
    "dataset_resolution",
    "lcz",
    "altitude",
]


def _get_empty_templ_dict():
    templ_dict = {
        # data file
        "data_related": {
            "structure": None,  # long, wide or singl-station
            "timestamp": {
                "datetime_column": None,
                "datetime_fmt": None,
                "date_column": None,
                "date_fmt": None,
                "time_column": None,
                "time_fmt": None,
                "timezone": None,
            },
            "name_column": None,
            "obstype_mapping": [
                {
                    "tlk_obstype": None,
                    "columnname": None,
                    "unit": None,
                    "description": None,
                },
            ],
        },
        # Meta data file
        "metadata_related": {
            "name_column": None,
            "lat_column": None,
            "lon_column": None,
            "columns_to_include": [],
        },
        # extra settings
        "single_station_name": "dummy_station_name",
    }
    return templ_dict


def _pwrite_templdict_to_json(templdict, trgfile):
    j = json.dumps(templdict, indent=4)
    with open(trgfile, "w") as f:
        print(j, file=f)
    return


class Template:
    """Contains all info and methods to work with a template.

    A Template holds the mapping between the columns of raw data/metadata
    files and the toolkit's internal names (station name, timestamps and
    observation types).
    """

    def __init__(self):
        # to rename the columns
        self.data_namemap = {"name": None}  # name --> name column in data
        self.metadata_namemap = {"name": None}  # name --> name column in metadata

        # obstypes details
        self.obscolumnmap = {}  # toolkitname --> datacolumnname
        self.obsdetails = {}  # obsname(tlk) --> {unit: , description: ..}

        self.metacolmapname = {}  # toolkitname --> metadatacolumnname

        # Special always required
        self.dataformat = (
            "long"  # long or wide (single station is converted to long on import)
        )

        # For single stations data
        self.data_is_single_station = False  # datafmt is assumed to be long, but name column is not required in the data
        self.single_station_name = None

        # Timestamp mapping: either one datetime column, or separate
        # date + time columns, together with the format string(s).
        self.timestampinfo = {
            "datetimecolumn": None,
            "time_column": None,
            "date_column": None,
            "fmt": None,
        }

        # Extra options (not required)
        self.tz = None  # timezone name string — presumably a pytz name (see all_timezones import)

        # Not actively used attributes
        self.filepath = None
    def show(self):
        """Prints out an overview of Template. Alias of Template.get_info().

        Returns
        -------
        None.
        """
        self.get_info()
    def get_info(self):
        """
        Prints out an overview of Template.

        Returns
        -------
        None.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> #Create your Dataset
            >>> dataset = metobs_toolkit.Dataset() #empty Dataset
            >>>
            >>> #Add observations to the Dataset
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ...     )
            >>> dataset.import_data_from_file()
            >>>
            >>> # Print out details
            >>> dataset.template.get_info()
            ------ Data obstypes map ---------
             * temp            <---> Temperatuur
                 (raw data in Celsius)
                 (description: 2mT passive)
            <BLANKLINE>
             * humidity        <---> Vochtigheid
                 (raw data in %)
                 (description: 2m relative humidity passive)
            <BLANKLINE>
             * wind_speed      <---> Windsnelheid
                 (raw data in km/h)
                 (description: Average 2m 10-min windspeed)
            <BLANKLINE>
             * wind_direction  <---> Windrichting
                 (raw data in ° from north (CW))
                 (description: Average 2m 10-min windspeed)
            <BLANKLINE>
            ...
        """
        key_len = 15
        print("------ Data obstypes map ---------")
        for key, val in self.obscolumnmap.items():
            print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
            print(f'     (raw data in {self.obsdetails[key]["unit"]})')
            descr = self.obsdetails[key]["description"]
            # long descriptions are truncated to keep the overview readable
            if len(descr) > 30:
                print(f"     (description: {descr[:30]} ...)")
            else:
                print(f"     (description: {descr})")
            print("")

        print("\n------ Data extra mapping info ---------")
        print(
            f" * {'name column (data)'.ljust(key_len)} <---> {str(self.data_namemap['name'])}"
        )
        if self.data_is_single_station:
            print(
                f" * {'single station name'.ljust(key_len)} <---> {self.single_station_name}"
            )

        print("\n------ Data timestamp map ---------")
        for key, val in self.timestampinfo.items():
            print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
        print(f" * {'Timezone'.ljust(key_len)} <---> {self.tz}")

        print("\n------ Metadata map ---------")
        for key, val in self.metacolmapname.items():
            print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
# ============================================================================= # Setters # ============================================================================= def _set_dataname(self, datanamecolumn): """Overwrite the data name column""" self.data_namemap["name"] = str(datanamecolumn) def _set_dataformat(self, datafmt): """Overwrite the dataformat""" if str(datafmt) == "long": self.dataformat = "long" elif str(datafmt) == "wide": self.dataformat = "wide" elif str(datafmt) == "single_station": # Data format is long, but indicate that data represents a single station self.dataformat = "long" self.data_is_single_station = True else: sys.exit(f"{datafmt} is not a known dataformat.") # ============================================================================= # Getters (used by other classes to extract specific data from a template) # ============================================================================= def _is_data_long(self): return self.dataformat == "long" def _is_data_single_station(self): return self.data_is_single_station def _get_single_station_default_name(self): return str(self.single_station_name) def _get_wide_obstype(self): """Get the name of the wide obstype, in tlk space""" return list(self.obsdetails.keys())[0] def _get_tz(self): """Get the timezone string""" return self.tz def _get_data_name_map(self): """Get the name-map for the data file""" return {self.data_namemap["name"]: "name"} def _get_metadata_name_map(self): """Get the name-map for the metadata file""" return {self.metadata_namemap["name"]: "name"} def _get_metadata_column_map(self): """Get the mapper for all metadata columns (name, lat, lon included)""" return {val: key for key, val in self.metacolmapname.items()} def _get_obs_column_map(self): """Get mapper for al the observations columns of the data (for long format)""" # Check if datetime mapping is valid self._check_if_datetime_is_mapped() columnmmap = {} if self.dataformat == "long": # add all obstype columns for 
key, val in self.obscolumnmap.items(): columnmmap[val] = key else: pass # no mapping done on wide return columnmmap def _get_all_mapped_data_cols_in_tlk_space(self): # all mapped columns are: name, datetime and all mapped obstypes mapped_cols = ["name", "datetime"] if self.dataformat == "long": mapped_cols.extend(list(self.obscolumnmap.keys())) else: # wide only represents one obstype mapped_cols.append(list(self.obsdetails.keys())[0]) return mapped_cols def _get_original_obstype_columnname(self, obstypename): return str(self.obscolumnmap[obstypename]) def _get_input_unit_of_tlk_obstype(self, obstypename): return str(self.obsdetails[obstypename]["unit"]) def _get_description_of_tlk_obstype(self, obstypename): return str(self.obsdetails[obstypename]["description"]) # ============================================================================= # Validity checkers # ============================================================================= def _check_if_datetime_is_mapped(self): """ Test if the required template details are present to construct a timestamp column. """ ts_info = self.timestampinfo # situation 1: datetime column is present if ts_info["datetimecolumn"] is not None: assert ( ts_info["fmt"] is not None ), f"Datetimes are assumed to be present in ONE column, but no datetime format is specified." if ts_info["time_column"] is not None: self.timestampinfo["time_column"] = None logger.warning( f"The mapping of the time column ({ts_info['time_column']}) is ignored because of the presence of a datetime column." ) if ts_info["date_column"] is not None: self.timestampinfo["date_column"] = None logger.warning( f"The mapping of the date column ({ts_info['date_column']}) is ignored because of the presence of a datetime column." ) return # Situation 2: a seperate date and time columns is present. 
if (ts_info["time_column"] is not None) & (ts_info["date_column"] is not None): assert ( ts_info["fmt"] is not None ), f"Datetimes are assumed to be present as a date and time column, but no formats are specified." return sys.exit( "The timestamps are not correctly mapped (either by using a datetime column, or by a time and date column)" ) def _data_template_compatibility_test(self, datacolumns): """Check the compatibility of the template and the columns of the data""" # check datetime self._check_if_datetime_is_mapped() if self.timestampinfo["datetimecolumn"] is not None: if not (self.timestampinfo["datetimecolumn"] in datacolumns): raise MetobsTemplateError( f'The column {self.timestampinfo["datetimecolumn"]} is incorrectly mapped in the template as the "datetime" column. The template is therefore not valid with the data.' ) if self.timestampinfo["time_column"] is not None: if not (self.timestampinfo["time_column"] in datacolumns): raise MetobsTemplateError( f'The column {self.timestampinfo["time_column"]} is incorrectly mapped in the template as the "_time" column. The template is therefore not valid with the data.' ) if self.timestampinfo["date_column"] is not None: if not (self.timestampinfo["date_column"] in datacolumns): raise MetobsTemplateError( f'The column {self.timestampinfo["date_column"]} is incorrectly mapped in the template as the "_date" column. The template is therefore not valid with the data.' ) if self._is_data_long(): # check name column if not self._is_data_single_station(): if not (self.data_namemap["name"] in datacolumns): raise MetobsTemplateError( f'The column {self.data_namemap["name"]} is not (or incorrectly) mapped in the template as the "name" column. The template is therefore not valid with the data.' 
) # check of templates has obstypes not present in the data for mapped_obscol in self.obscolumnmap.values(): if mapped_obscol not in datacolumns: msg = f"{mapped_obscol} is a mapped observation, present in the template but not found in the data! This obstype will be ignored." logger.warning(msg) # check if data has obstypes not present in the template for data_col in datacolumns: if data_col in [ self.data_namemap["name"], self.timestampinfo["datetimecolumn"], self.timestampinfo["time_column"], self.timestampinfo["date_column"], ]: continue if data_col not in self.obscolumnmap.values(): msg = f"{data_col} is present in the datafile, but not found in the template! This column will be ignored." logger.warning(msg) else: # wide tests" # all columns are assumed to be station names pass def _metadata_template_compatibility_test(self, metadatacolumns): """Check the compatibility of the template and the columns of the metadata""" # check name column (must be present if multiple station are in the data) if not self._is_data_single_station(): if not (self.metadata_namemap["name"] in metadatacolumns): raise MetobsTemplateError( f'The column {self.metadata_namemap["name"]} is not (or incorrectly) mapped in the template as the "name" column. The template is therefore not valid with the metadata.' ) # check if templates contains mapped columns not present in the metadata for mapped_col in self.metacolmapname.values(): if mapped_col not in metadatacolumns: msg = f"{mapped_col} is a mapped metadata column, present in the template but not found in the metadata! This column will be ignored." 
logger.warning(msg) # check if metadata has columns which are not mapped by the template unmapped = ( set(metadatacolumns) - set(self.metacolmapname.values()) - set(["name"]) ) if not bool(unmapped): msg = f"The following columns are found in the metadata, but not in the template and are therefore ignored: \n{list(unmapped)}" logger.warning(msg) def _apply_blacklist(self, columns, on_data): """Create a mapper for columns in the data or metadata file, which have a specific meaning by the toolkit and are thus present in the blacklists. If a column, is found in the blacklist, but will also be mapped, than it it will not be included in the mapper. The mapper adds a "_original" postifix to the columnnames. """ if on_data: blacklist = column_data_blacklist else: blacklist = column_meta_blacklist to_rename = [col for col in columns if col in column_data_blacklist] if on_data: # if the columns is mapped by the template, remove it from the to_rename mapped_set = set( [ self.data_namemap["name"], # name col self.timestampinfo["datetimecolumn"], self.timestampinfo["date_column"], self.timestampinfo["time_column"], ] ) mapped_set.union(set(self._get_obs_column_map().keys())) else: # on metadata mapped_set = set([self.metadata_namemap["name"]]) mapped_set.union(set(self._get_metadata_column_map().keys())) mapped_set = mapped_set - set([None]) to_rename = set(to_rename) - mapped_set blacklist_mapper = {col: f"{col}_original" for col in to_rename} if not bool(blacklist_mapper): if on_data: msg = f"The following data columns are renamed because of special meaning by the toolkit: {blacklist_mapper}" else: msg = f"The following metadata columns are renamed because of special meaning by the toolkit: {blacklist_mapper}" logger.warning(msg) return blacklist_mapper # ============================================================================= # Other methods # ============================================================================= def read_template_from_file(self, jsonpath): 
"""Read the templatefile (json), and update the attributes of this Template.""" if not str(jsonpath).endswith(".json"): raise MetobsTemplateError(f"{jsonpath}, is not a json file.") with open(jsonpath, "r") as f: tml_dict = json.load(f) # set attributes self.data_namemap = {"name": tml_dict["data_related"]["name_column"]} self.metadata_namemap = {"name": tml_dict["metadata_related"]["name_column"]} self._set_dataformat(tml_dict["data_related"]["structure"]) self.single_station_name = str(tml_dict["single_station_name"]) if tml_dict["data_related"]["timestamp"]["datetime_column"] is None: dt_fmt = f'{tml_dict["data_related"]["timestamp"]["date_fmt"]} {tml_dict["data_related"]["timestamp"]["time_fmt"]}' else: dt_fmt = f'{tml_dict["data_related"]["timestamp"]["datetime_fmt"]}' self.timestampinfo = { "datetimecolumn": tml_dict["data_related"]["timestamp"]["datetime_column"], "time_column": tml_dict["data_related"]["timestamp"]["time_column"], "date_column": tml_dict["data_related"]["timestamp"]["date_column"], "fmt": dt_fmt, } for obsdict in tml_dict["data_related"]["obstype_mapping"]: self.obscolumnmap[obsdict["tlk_obstype"]] = obsdict["columnname"] self.obsdetails[obsdict["tlk_obstype"]] = { "unit": obsdict["unit"], "description": obsdict["description"], } self.metacolmapname["name"] = tml_dict["metadata_related"]["name_column"] if tml_dict["metadata_related"]["lat_column"] is not None: self.metacolmapname["lat"] = tml_dict["metadata_related"]["lat_column"] if tml_dict["metadata_related"]["lon_column"] is not None: self.metacolmapname["lon"] = tml_dict["metadata_related"]["lon_column"] for extra_col in tml_dict["metadata_related"]["columns_to_include"]: self.metacolmapname[extra_col] = extra_col def _create_datetime_column(df, template): """Use the template to construct a tz-naive "datetime" column.""" template._check_if_datetime_is_mapped() if template.timestampinfo["datetimecolumn"] is not None: if not (template.timestampinfo["datetimecolumn"] in df.columns): raise 
MetobsTemplateError( f'The {template.timestampinfo["datetimecolumn"]} is not found in the columns of the data file: {df.columns}' ) df = df.rename(columns={template.timestampinfo["datetimecolumn"]: "datetime"}) try: df["datetime"] = pd.to_datetime( df["datetime"], format=template.timestampinfo["fmt"] ) except Exception as e: raise MetobsTemplateError( "The timestamps could not be converted to datetimes, check the timestamp format(s) in your template." ) else: # by date and time column if not (template.timestampinfo["time_column"] in df.columns): raise MetobsTemplateError( f'The {template.timestampinfo["time_column"]} is not found in the columns of the data file: {df.columns}' ) if not (template.timestampinfo["date_column"] in df.columns): raise MetobsTemplateError( f'The {template.timestampinfo["date_column"]} is not found in the columns of the data file: {df.columns}' ) df = df.rename( columns={ template.timestampinfo["time_column"]: "_time", template.timestampinfo["date_column"]: "_date", } ) try: df["datetime"] = pd.to_datetime( df["_date"] + " " + df["_time"], format=template.timestampinfo["fmt"] ) except Exception as e: raise MetobsTemplateError( "The timestamps could not be converted to datetimes, check the timestamp format(s) in your template." ) # raise Exception('The timestamps could not be converted to datetimes, check the timestamp format(s) in your template. \n').with_traceback(e.__traceback__) df = df.drop(columns=["_date", "_time"]) return df # ============================================================================= # Exceptions # ============================================================================= class MetobsTemplateError(Exception): """Exception raised for errors in the template.""" pass