#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:48:24 2024
@author: thoverga
"""
import os
import sys
import logging
import json
import pandas as pd
from pytz import all_timezones
logger = logging.getLogger(__name__)
# Blacklists contain column names that have a specific meaning to the toolkit,
# because the toolkit itself often generates a column with that name.
column_data_blacklist = (
    []
)  # When such a column is found in the data, an "_original" suffix is added to its name.
column_meta_blacklist = [
"geometry",
"assumed_import_frequency",
"dataset_resolution",
"lcz",
"altitude",
]
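# Illustrative sketch (comments only, not executed): if a user's metadata file
# contains a blacklisted column such as "lcz" that is not mapped by the
# template, Template._apply_blacklist() returns a mapper like
#
#     {"lcz": "lcz_original"}
#
# which is applied with pandas.DataFrame.rename(columns=...) so that the user
# column does not clash with the "lcz" column generated by the toolkit.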
def _get_empty_templ_dict():
templ_dict = {
        # data file
        "data_related": {
            "structure": None,  # "long", "wide" or "single_station"
"timestamp": {
"datetime_column": None,
"datetime_fmt": None,
"date_column": None,
"date_fmt": None,
"time_column": None,
"time_fmt": None,
"timezone": None,
},
"name_column": None,
"obstype_mapping": [
{
"tlk_obstype": None,
"columnname": None,
"unit": None,
"description": None,
},
],
},
# Meta data file
"metadata_related": {
"name_column": None,
"lat_column": None,
"lon_column": None,
"columns_to_include": [],
},
# extra settings
"single_station_name": "dummy_station_name",
}
return templ_dict
def _pwrite_templdict_to_json(templdict, trgfile):
j = json.dumps(templdict, indent=4)
with open(trgfile, "w") as f:
print(j, file=f)
return
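# Minimal usage sketch of the helpers above (illustrative only, never called by
# the toolkit; the target path and the column names "station_id", "timestamp",
# "temperature", "lat" and "lon" are hypothetical, not toolkit defaults).
def _example_write_minimal_template(trgfile="template.json"):
    """Build a minimal long-format template dict and write it to a JSON file."""
    templ = _get_empty_templ_dict()
    templ["data_related"]["structure"] = "long"
    templ["data_related"]["name_column"] = "station_id"
    templ["data_related"]["timestamp"]["datetime_column"] = "timestamp"
    templ["data_related"]["timestamp"]["datetime_fmt"] = "%Y-%m-%d %H:%M:%S"
    templ["data_related"]["timestamp"]["timezone"] = "UTC"
    templ["data_related"]["obstype_mapping"] = [
        {
            "tlk_obstype": "temp",
            "columnname": "temperature",
            "unit": "Celsius",
            "description": "2m temperature",
        }
    ]
    templ["metadata_related"]["name_column"] = "station_id"
    templ["metadata_related"]["lat_column"] = "lat"
    templ["metadata_related"]["lon_column"] = "lon"
    _pwrite_templdict_to_json(templ, trgfile)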
class Template:
"""Contains all info and methods to work with a template."""
def __init__(self):
        # to rename the columns
self.data_namemap = {"name": None} # name --> name column in data
self.metadata_namemap = {"name": None} # name --> name column in metadata
# obstypes details
self.obscolumnmap = {} # toolkitname --> datacolumnname
self.obsdetails = {} # obsname(tlk) --> {unit: , description: ..}
self.metacolmapname = {} # toolkitname --> metadatacolumnname
# Special always required
self.dataformat = (
"long" # long or wide (single station is converted to long on import)
)
# For single stations data
self.data_is_single_station = False # datafmt is assumed to be long, but name column is not required in the data
self.single_station_name = None
self.timestampinfo = {
"datetimecolumn": None,
"time_column": None,
"date_column": None,
"fmt": None,
}
# Extra options (not required)
self.tz = None
        # Not actively used attributes
self.filepath = None
def show(self):
"""Prints out an overview of Template.
Alias of Template.get_info().
Returns
-------
None.
"""
self.get_info()
def get_info(self):
"""
Prints out an overview of Template.
Returns
-------
None.
Examples
--------
.. code-block:: python
>>> import metobs_toolkit
>>>
>>> #Create your Dataset
>>> dataset = metobs_toolkit.Dataset() #empty Dataset
>>>
>>> #Add observations to the Dataset
>>> dataset.update_settings(
... input_data_file=metobs_toolkit.demo_datafile,
... input_metadata_file=metobs_toolkit.demo_metadatafile,
... template_file=metobs_toolkit.demo_template,
... )
>>> dataset.import_data_from_file()
>>>
>>> # Print out details
>>> dataset.template.get_info()
------ Data obstypes map ---------
* temp <---> Temperatuur
(raw data in Celsius)
(description: 2mT passive)
<BLANKLINE>
* humidity <---> Vochtigheid
(raw data in %)
(description: 2m relative humidity passive)
<BLANKLINE>
* wind_speed <---> Windsnelheid
(raw data in km/h)
(description: Average 2m 10-min windspeed)
<BLANKLINE>
* wind_direction <---> Windrichting
(raw data in ° from north (CW))
(description: Average 2m 10-min windspeed)
<BLANKLINE>
<BLANKLINE>
...
"""
key_len = 15
print("------ Data obstypes map ---------")
for key, val in self.obscolumnmap.items():
print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
print(f' (raw data in {self.obsdetails[key]["unit"]})')
descr = self.obsdetails[key]["description"]
if len(descr) > 30:
print(f" (description: {descr[:30]} ...)")
else:
print(f" (description: {descr})")
print("")
print("\n------ Data extra mapping info ---------")
print(
f" * {'name column (data)'.ljust(key_len)} <---> {str(self.data_namemap['name'])}"
)
if self.data_is_single_station:
print(
f" * {'single station name'.ljust(key_len)} <---> {self.single_station_name}"
)
print("\n------ Data timestamp map ---------")
for key, val in self.timestampinfo.items():
print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
print(f" * {'Timezone'.ljust(key_len)} <---> {self.tz}")
print("\n------ Metadata map ---------")
for key, val in self.metacolmapname.items():
print(f" * {key.ljust(key_len)} <---> {str(val).ljust(key_len)}")
# =============================================================================
# Setters
# =============================================================================
def _set_dataname(self, datanamecolumn):
"""Overwrite the data name column"""
self.data_namemap["name"] = str(datanamecolumn)
def _set_dataformat(self, datafmt):
"""Overwrite the dataformat"""
if str(datafmt) == "long":
self.dataformat = "long"
elif str(datafmt) == "wide":
self.dataformat = "wide"
elif str(datafmt) == "single_station":
# Data format is long, but indicate that data represents a single station
self.dataformat = "long"
self.data_is_single_station = True
else:
sys.exit(f"{datafmt} is not a known dataformat.")
# =============================================================================
# Getters (used by other classes to extract specific data from a template)
# =============================================================================
def _is_data_long(self):
return self.dataformat == "long"
def _is_data_single_station(self):
return self.data_is_single_station
def _get_single_station_default_name(self):
return str(self.single_station_name)
def _get_wide_obstype(self):
"""Get the name of the wide obstype, in tlk space"""
return list(self.obsdetails.keys())[0]
def _get_tz(self):
"""Get the timezone string"""
return self.tz
def _get_data_name_map(self):
"""Get the name-map for the data file"""
return {self.data_namemap["name"]: "name"}
def _get_metadata_name_map(self):
"""Get the name-map for the metadata file"""
return {self.metadata_namemap["name"]: "name"}
def _get_metadata_column_map(self):
"""Get the mapper for all metadata columns (name, lat, lon included)"""
return {val: key for key, val in self.metacolmapname.items()}
    def _get_obs_column_map(self):
        """Get the mapper for all observation columns of the data (long format only)."""
# Check if datetime mapping is valid
self._check_if_datetime_is_mapped()
columnmmap = {}
if self.dataformat == "long":
# add all obstype columns
for key, val in self.obscolumnmap.items():
columnmmap[val] = key
else:
pass # no mapping done on wide
return columnmmap
def _get_all_mapped_data_cols_in_tlk_space(self):
# all mapped columns are: name, datetime and all mapped obstypes
mapped_cols = ["name", "datetime"]
if self.dataformat == "long":
mapped_cols.extend(list(self.obscolumnmap.keys()))
else:
# wide only represents one obstype
mapped_cols.append(list(self.obsdetails.keys())[0])
return mapped_cols
def _get_original_obstype_columnname(self, obstypename):
return str(self.obscolumnmap[obstypename])
def _get_input_unit_of_tlk_obstype(self, obstypename):
return str(self.obsdetails[obstypename]["unit"])
def _get_description_of_tlk_obstype(self, obstypename):
return str(self.obsdetails[obstypename]["description"])
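    # Usage sketch for the getters above (illustrative comments only; "df" and
    # "template" are hypothetical objects). The name map and the obs column map
    # are plain {original_column_name: toolkit_name} dicts, so they can be
    # applied directly with pandas:
    #
    #     df = df.rename(columns=template._get_data_name_map())
    #     df = df.rename(columns=template._get_obs_column_map())
    #
    # after which the data columns carry the toolkit names ("name", "temp", ...).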
# =============================================================================
# Validity checkers
# =============================================================================
def _check_if_datetime_is_mapped(self):
"""
Test if the required template details are present to construct a timestamp column.
"""
ts_info = self.timestampinfo
# situation 1: datetime column is present
if ts_info["datetimecolumn"] is not None:
            assert (
                ts_info["fmt"] is not None
            ), "Datetimes are assumed to be present in ONE column, but no datetime format is specified."
if ts_info["time_column"] is not None:
self.timestampinfo["time_column"] = None
logger.warning(
f"The mapping of the time column ({ts_info['time_column']}) is ignored because of the presence of a datetime column."
)
if ts_info["date_column"] is not None:
self.timestampinfo["date_column"] = None
logger.warning(
f"The mapping of the date column ({ts_info['date_column']}) is ignored because of the presence of a datetime column."
)
return
        # Situation 2: separate date and time columns are present.
        if (ts_info["time_column"] is not None) and (ts_info["date_column"] is not None):
            assert (
                ts_info["fmt"] is not None
            ), "Datetimes are assumed to be present as a date and a time column, but no formats are specified."
return
sys.exit(
"The timestamps are not correctly mapped (either by using a datetime column, or by a time and date column)"
)
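    # Illustrative examples of valid timestamp mappings (hypothetical column
    # names and formats). Either a single datetime column with its format:
    #
    #     {"datetimecolumn": "Timestamp", "time_column": None,
    #      "date_column": None, "fmt": "%Y-%m-%d %H:%M:%S"}
    #
    # or separate date and time columns, where "fmt" is the date format and the
    # time format joined by a space (as built in read_template_from_file):
    #
    #     {"datetimecolumn": None, "time_column": "Time",
    #      "date_column": "Date", "fmt": "%Y-%m-%d %H:%M:%S"}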
def _data_template_compatibility_test(self, datacolumns):
"""Check the compatibility of the template and the columns of the data"""
# check datetime
self._check_if_datetime_is_mapped()
if self.timestampinfo["datetimecolumn"] is not None:
if not (self.timestampinfo["datetimecolumn"] in datacolumns):
raise MetobsTemplateError(
f'The column {self.timestampinfo["datetimecolumn"]} is incorrectly mapped in the template as the "datetime" column. The template is therefore not valid with the data.'
)
if self.timestampinfo["time_column"] is not None:
if not (self.timestampinfo["time_column"] in datacolumns):
raise MetobsTemplateError(
f'The column {self.timestampinfo["time_column"]} is incorrectly mapped in the template as the "_time" column. The template is therefore not valid with the data.'
)
if self.timestampinfo["date_column"] is not None:
if not (self.timestampinfo["date_column"] in datacolumns):
raise MetobsTemplateError(
f'The column {self.timestampinfo["date_column"]} is incorrectly mapped in the template as the "_date" column. The template is therefore not valid with the data.'
)
if self._is_data_long():
# check name column
if not self._is_data_single_station():
if not (self.data_namemap["name"] in datacolumns):
raise MetobsTemplateError(
f'The column {self.data_namemap["name"]} is not (or incorrectly) mapped in the template as the "name" column. The template is therefore not valid with the data.'
)
            # check if the template has obstypes that are not present in the data
for mapped_obscol in self.obscolumnmap.values():
if mapped_obscol not in datacolumns:
msg = f"{mapped_obscol} is a mapped observation, present in the template but not found in the data! This obstype will be ignored."
logger.warning(msg)
# check if data has obstypes not present in the template
for data_col in datacolumns:
if data_col in [
self.data_namemap["name"],
self.timestampinfo["datetimecolumn"],
self.timestampinfo["time_column"],
self.timestampinfo["date_column"],
]:
continue
if data_col not in self.obscolumnmap.values():
msg = f"{data_col} is present in the datafile, but not found in the template! This column will be ignored."
logger.warning(msg)
else:
            # wide tests
# all columns are assumed to be station names
pass
def _metadata_template_compatibility_test(self, metadatacolumns):
"""Check the compatibility of the template and the columns of the metadata"""
        # check name column (must be present if multiple stations are in the data)
if not self._is_data_single_station():
if not (self.metadata_namemap["name"] in metadatacolumns):
raise MetobsTemplateError(
f'The column {self.metadata_namemap["name"]} is not (or incorrectly) mapped in the template as the "name" column. The template is therefore not valid with the metadata.'
)
# check if templates contains mapped columns not present in the metadata
for mapped_col in self.metacolmapname.values():
if mapped_col not in metadatacolumns:
msg = f"{mapped_col} is a mapped metadata column, present in the template but not found in the metadata! This column will be ignored."
logger.warning(msg)
# check if metadata has columns which are not mapped by the template
unmapped = (
set(metadatacolumns) - set(self.metacolmapname.values()) - set(["name"])
)
        if unmapped:
msg = f"The following columns are found in the metadata, but not in the template and are therefore ignored: \n{list(unmapped)}"
logger.warning(msg)
    def _apply_blacklist(self, columns, on_data):
        """Create a mapper for columns in the data or metadata file that have
        a specific meaning to the toolkit and are therefore present in the blacklists.
        If a column is found in the blacklist but is also mapped by the template,
        it will not be included in the mapper.
        The mapper adds an "_original" postfix to the column names.
        """
if on_data:
blacklist = column_data_blacklist
else:
blacklist = column_meta_blacklist
        to_rename = [col for col in columns if col in blacklist]
if on_data:
            # if a column is mapped by the template, remove it from to_rename
mapped_set = set(
[
self.data_namemap["name"], # name col
self.timestampinfo["datetimecolumn"],
self.timestampinfo["date_column"],
self.timestampinfo["time_column"],
]
)
            mapped_set = mapped_set.union(set(self._get_obs_column_map().keys()))
else:
# on metadata
mapped_set = set([self.metadata_namemap["name"]])
            mapped_set = mapped_set.union(set(self._get_metadata_column_map().keys()))
mapped_set = mapped_set - set([None])
to_rename = set(to_rename) - mapped_set
blacklist_mapper = {col: f"{col}_original" for col in to_rename}
        if blacklist_mapper:
if on_data:
msg = f"The following data columns are renamed because of special meaning by the toolkit: {blacklist_mapper}"
else:
msg = f"The following metadata columns are renamed because of special meaning by the toolkit: {blacklist_mapper}"
logger.warning(msg)
return blacklist_mapper
# =============================================================================
# Other methods
# =============================================================================
def read_template_from_file(self, jsonpath):
"""Read the templatefile (json), and update the attributes of this Template."""
if not str(jsonpath).endswith(".json"):
raise MetobsTemplateError(f"{jsonpath}, is not a json file.")
with open(jsonpath, "r") as f:
tml_dict = json.load(f)
# set attributes
self.data_namemap = {"name": tml_dict["data_related"]["name_column"]}
self.metadata_namemap = {"name": tml_dict["metadata_related"]["name_column"]}
self._set_dataformat(tml_dict["data_related"]["structure"])
self.single_station_name = str(tml_dict["single_station_name"])
if tml_dict["data_related"]["timestamp"]["datetime_column"] is None:
dt_fmt = f'{tml_dict["data_related"]["timestamp"]["date_fmt"]} {tml_dict["data_related"]["timestamp"]["time_fmt"]}'
else:
dt_fmt = f'{tml_dict["data_related"]["timestamp"]["datetime_fmt"]}'
self.timestampinfo = {
"datetimecolumn": tml_dict["data_related"]["timestamp"]["datetime_column"],
"time_column": tml_dict["data_related"]["timestamp"]["time_column"],
"date_column": tml_dict["data_related"]["timestamp"]["date_column"],
"fmt": dt_fmt,
}
for obsdict in tml_dict["data_related"]["obstype_mapping"]:
self.obscolumnmap[obsdict["tlk_obstype"]] = obsdict["columnname"]
self.obsdetails[obsdict["tlk_obstype"]] = {
"unit": obsdict["unit"],
"description": obsdict["description"],
}
self.metacolmapname["name"] = tml_dict["metadata_related"]["name_column"]
if tml_dict["metadata_related"]["lat_column"] is not None:
self.metacolmapname["lat"] = tml_dict["metadata_related"]["lat_column"]
if tml_dict["metadata_related"]["lon_column"] is not None:
self.metacolmapname["lon"] = tml_dict["metadata_related"]["lon_column"]
for extra_col in tml_dict["metadata_related"]["columns_to_include"]:
self.metacolmapname[extra_col] = extra_col
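# Usage sketch (illustrative only, never called by the toolkit; the path
# "template.json" is hypothetical): a Template is typically filled by reading a
# template JSON file, after which the getters above expose the mapping.
def _example_read_template(jsonpath="template.json"):
    """Read a template file into a Template instance and print an overview."""
    template = Template()
    template.read_template_from_file(jsonpath=jsonpath)
    template.get_info()
    return template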
def _create_datetime_column(df, template):
"""Use the template to construct a tz-naive "datetime" column."""
template._check_if_datetime_is_mapped()
if template.timestampinfo["datetimecolumn"] is not None:
if not (template.timestampinfo["datetimecolumn"] in df.columns):
raise MetobsTemplateError(
f'The {template.timestampinfo["datetimecolumn"]} is not found in the columns of the data file: {df.columns}'
)
df = df.rename(columns={template.timestampinfo["datetimecolumn"]: "datetime"})
try:
df["datetime"] = pd.to_datetime(
df["datetime"], format=template.timestampinfo["fmt"]
)
        except Exception as e:
            raise MetobsTemplateError(
                "The timestamps could not be converted to datetimes, check the timestamp format(s) in your template."
            ) from e
else:
# by date and time column
if not (template.timestampinfo["time_column"] in df.columns):
raise MetobsTemplateError(
f'The {template.timestampinfo["time_column"]} is not found in the columns of the data file: {df.columns}'
)
if not (template.timestampinfo["date_column"] in df.columns):
raise MetobsTemplateError(
f'The {template.timestampinfo["date_column"]} is not found in the columns of the data file: {df.columns}'
)
df = df.rename(
columns={
template.timestampinfo["time_column"]: "_time",
template.timestampinfo["date_column"]: "_date",
}
)
try:
df["datetime"] = pd.to_datetime(
df["_date"] + " " + df["_time"], format=template.timestampinfo["fmt"]
)
        except Exception as e:
            raise MetobsTemplateError(
                "The timestamps could not be converted to datetimes, check the timestamp format(s) in your template."
            ) from e
df = df.drop(columns=["_date", "_time"])
return df
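# Small sketch of the date + time parsing used above (illustrative only, never
# called by the toolkit; the values and the format are hypothetical). The date
# and time strings are concatenated and parsed with the combined format, just
# as _create_datetime_column does when separate date and time columns are mapped.
def _example_combine_date_and_time():
    """Combine a date and a time column into a tz-naive datetime column."""
    df = pd.DataFrame(
        {"_date": ["2024-04-30", "2024-04-30"], "_time": ["09:00:00", "10:00:00"]}
    )
    df["datetime"] = pd.to_datetime(
        df["_date"] + " " + df["_time"], format="%Y-%m-%d %H:%M:%S"
    )
    return df.drop(columns=["_date", "_time"])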
# =============================================================================
# Exceptions
# =============================================================================
class MetobsTemplateError(Exception):
"""Exception raised for errors in the template."""
pass