#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 16 13:44:49 2024
@author: thoverga
"""
import logging
from metobs_toolkit import Dataset
from metobs_toolkit.qc_checks import (
gross_value_check,
persistance_check,
repetitions_check,
# duplicate_timestamp_check,
step_check,
window_variation_check,
# invalid_input_check,
toolkit_buddy_check,
titan_buddy_check,
titan_sct_resistant_check,
)
from metobs_toolkit.plotting_functions import qc_stats_pie
from metobs_toolkit.qc_statistics import get_freq_statistics
from metobs_toolkit.df_helpers import (
multiindexdf_datetime_subsetting,
fmt_datetime_argument,
init_multiindex,
init_multiindexdf,
init_triple_multiindexdf,
metadf_to_gdf,
conv_applied_qc_to_df,
get_freqency_series,
value_labeled_doubleidxdf_to_triple_idxdf,
xs_save,
concat_save,
)
logger = logging.getLogger(__name__)

class Dataset(Dataset):
"""Extension on the metobs_toolkit.Dataset class with QC related methods"""
def get_qc_stats(self, obstype="temp", stationname=None, make_plot=True):
"""Get quality control statistics.
Compute frequency statistics on the qc labels for an observationtype.
The output is a dataframe containing the frequency statistics presented
as percentages.
These frequencies can also be presented as a collection of piecharts
per check.
With stationnames you can subset the data to one ore multiple stations.
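        Conceptually, the reported numbers are simple relative label
        frequencies. A minimal standalone pandas sketch of the idea
        (illustrative labels, not the toolkit's internal code):

        .. code-block:: python

            import pandas as pd

            # hypothetical QC labels for one observation type
            labels = pd.Series(["ok", "ok", "ok", "gross value outlier"])
            # relative frequencies expressed as percentages
            freqs = labels.value_counts(normalize=True) * 100.0
            print(freqs.to_dict())  # {'ok': 75.0, 'gross value outlier': 25.0}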
Parameters
-----------
obstype : str, optional
Observation type to analyse the QC labels on. The default is
'temp'.
stationname : str, optional
Stationname to subset the quality labels on. If None, all
stations are used. The default is None.
        make_plot : bool, optional
            If True, a plot with pie charts is generated. The default is True.
Returns
---------
        qc_stats : tuple
            A tuple of three frequency statistics (the overall label
            frequencies, the outlier-label frequencies, and the frequencies
            per specific check), presented as percentages. None is returned
            when no statistics could be computed.
Examples
--------
.. code-block:: python
>>> import metobs_toolkit
>>>
>>> # Import data into a Dataset
>>> dataset = metobs_toolkit.Dataset()
>>> dataset.update_settings(
... input_data_file=metobs_toolkit.demo_datafile,
... input_metadata_file=metobs_toolkit.demo_metadatafile,
... template_file=metobs_toolkit.demo_template,
... )
>>> dataset.import_data_from_file()
>>> dataset.coarsen_time_resolution(freq='1h')
>>>
>>> # Apply quality control on the temperature observations
>>> dataset.apply_quality_control(obstype='temp') #Using the default QC settings
>>> dataset
Dataset instance containing:
*28 stations
*['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
*10080 observation records
*1676 records labeled as outliers
*0 gaps
*3 missing observations
*records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
*time zone of the records: UTC
*Coordinates are available for all stations.
>>>
>>> #Get quality control statistics
>>> stats = dataset.get_qc_stats(make_plot=False)
>>> stats
({'ok': 83.37301587301587, 'QC outliers': 16.6269841269...
"""
        # combine all observations and get the final label per record
        comb_df = self.combine_all_to_obsspace()
        # subset to the relevant column
        comb_df = xs_save(comb_df, obstype, level="obstype")[["label"]]
# subset to stationnames
if stationname is not None:
            assert stationname in comb_df.index.get_level_values(
                "name"
            ), f"stationname: {stationname} is not present in the dataset."
comb_df = comb_df.loc[stationname]
# compute freq statistics
final_freq, outl_freq, specific_freq = get_freq_statistics(
comb_df=comb_df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
gaps_info=self.settings.gap["gaps_info"],
applied_qc_order=self._applied_qc,
)
        if any(stat is None for stat in (final_freq, outl_freq, specific_freq)):
return None
# make title
orig_obstype = self.obstypes[obstype].get_orig_name()
if stationname is None:
title = f"Label frequency statistics on all stations for {orig_obstype}."
else:
title = f"Label frequency statistics for {stationname} for {orig_obstype}."
if make_plot:
# make pie plots
qc_stats_pie(
final_stats=final_freq,
outlier_stats=outl_freq,
specific_stats=specific_freq,
plot_settings=self.settings.app["plot_settings"],
qc_check_info=self.settings.qc["qc_checks_info"],
title=title,
)
return (final_freq, outl_freq, specific_freq)
def apply_quality_control(
self,
obstype="temp",
gross_value=True,
persistance=True,
repetitions=True,
step=True,
window_variation=True,
):
"""Apply quality control methods to the dataset.
The default settings are used, and can be changed in the
settings_files/qc_settings.py
The checks are performed in a sequence: gross_vallue -->
persistance --> ..., Outliers by a previous check are ignored in the
following checks!
The dataset is updated inline.
Parameters
----------
        obstype : str, optional
            Name of the observation type you want to apply the checks on. The
            default is 'temp'.
        gross_value : bool, optional
            If True, the gross_value check is applied. The default is True.
        persistance : bool, optional
            If True, the persistance check is applied. The default is True.
        repetitions : bool, optional
            If True, the repetitions check is applied. The default is True.
        step : bool, optional
            If True, the step check is applied. The default is True.
        window_variation : bool, optional
            If True, the window_variation check is applied. The default is
            True.
Returns
---------
None.
Notes
-----
A schematic description of the quality control checks.
Gross value check
==================
        This check looks for outliers based on unrealistic values.
        1. Find observations that fall below a minimum or above a maximum value threshold.
2. These observations are labeled as outliers.
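        A minimal standalone sketch of this thresholding rule (the thresholds
        are illustrative, not the toolkit's defaults):

        .. code-block:: python

            import pandas as pd

            temp = pd.Series([18.2, 19.1, -40.0, 20.3, 65.0])
            vmin, vmax = -15.0, 39.0  # hypothetical thresholds
            # True for values below the minimum or above the maximum
            outliers = (temp < vmin) | (temp > vmax)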
Persistence check
=================
        Test whether the observations change over a specific period.
1. Find the stations that have a maximum assumed observation frequency
that does not exceed the minimum number of records for moving window
size. The window size is defined by a duration.
2. Subset to those stations.
3. For each station, a moving window scan is applied that validates if
there is variation in the observations (NaN's are excluded). The
validation is only applied when a sufficient amount of records are
found in the window specified by a threshold.
4. After the scan, all records found in the windows without variation
are labeled as outliers.
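        A minimal standalone pandas sketch of the moving-window variation test
        (window size and data are illustrative, not the toolkit's internal
        code):

        .. code-block:: python

            import pandas as pd

            # hypothetical hourly record of a temporarily stuck sensor
            temp = pd.Series(
                [18.0, 18.0, 18.0, 18.0, 19.2, 19.5],
                index=pd.date_range("2022-09-01", periods=6, freq="h"),
            )
            # a window without any variation has equal min and max;
            # min_periods mimics the minimum-records threshold
            no_variation = temp.rolling("3h", min_periods=3).apply(
                lambda w: w.max() == w.min()
            )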
Repetitions check
=================
        Test whether the observation value changes within a given number of records.
1. For each station, make a group of consecutive records for which
the values do not change.
2. Filter those groups that have more records than the maximum valid
repetitions.
        3. All the records in these groups are labeled as outliers.
Note
-----
        The repetitions check is similar to the persistence check, but not identical.
        The persistence check uses thresholds that are meteorologically based (e.g. the moving window is defined by a duration),
        in contrast to the repetitions check, whose thresholds are instrumentally based (e.g. the "window" is defined by a number of records).
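        A minimal standalone sketch of the consecutive-value grouping (the
        maximum number of valid repetitions is illustrative):

        .. code-block:: python

            import pandas as pd

            temp = pd.Series([20.1, 20.1, 20.1, 20.1, 20.4, 20.4])
            max_valid_repetitions = 3  # hypothetical setting
            # consecutive identical values share a group id
            groups = (temp != temp.shift()).cumsum()
            group_sizes = temp.groupby(groups).transform("size")
            outliers = group_sizes > max_valid_repetitions  # flags the four 20.1's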
Step check
============
        Test that the observations do not produce unphysical spikes in the time series.
        1. Iterate over all the stations.
        2. Get the observations of the station (i.e. drop the previously labeled outliers, which are represented by NaN's).
3. Find the observations for which:
* The increase between two consecutive records is larger than the
threshold. This threshold is defined by a maximum increase per second
multiplied by the timedelta (in seconds) between the consecutive
records.
* Similar filter for a decrease.
4. The found observations are labeled as outliers.
Note
-----
        In general, for temperatures, the decrease threshold is set less stringent than the increase
        threshold. This is because a temperature drop is meteorologically more
        common than a sudden increase, which is often the result of a radiation error.
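        A minimal standalone sketch of the spike rule (the per-second rates
        are illustrative, not the toolkit's defaults):

        .. code-block:: python

            import pandas as pd

            temp = pd.Series(
                [18.0, 18.3, 28.0, 19.0],
                index=pd.date_range("2022-09-01", periods=4, freq="h"),
            )
            max_increase_per_s = 8.0 / 3600.0    # hypothetical rate
            max_decrease_per_s = -10.0 / 3600.0  # less stringent, see the note

            diff = temp.diff()  # change between consecutive records
            dt = temp.index.to_series().diff().dt.total_seconds()
            # a record is a spike when its change exceeds rate * timedelta
            spike = (diff > max_increase_per_s * dt) | (diff < max_decrease_per_s * dt)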
Window Variation check
=======================
        Test if the variation inside a moving window exceeds a threshold.
1. Find the stations that have a maximum assumed observation frequency
that does not exceed the minimum number of records for moving window
size. The window size is defined by a duration.
2. Compute the maximum increase and decrease thresholds for a window.
This is done by multiplying the maximum increase per second by the
window size in seconds.
3. For each station, a moving window scan is applied that validates if
the maximum increase/decrease thresholds are exceeded. This is done
by comparison of the minimum and maximum values inside the window. The
validation is only applied when a sufficient amount of records are
found in the window specified by a threshold.
4. After the scan, *all* records found in the window that exceed one
of these thresholds are labeled as outliers.
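        A minimal standalone sketch of the window test (window size and rate
        are illustrative; the toolkit distinguishes increase and decrease
        thresholds, this sketch uses a single one):

        .. code-block:: python

            import pandas as pd

            temp = pd.Series(
                [18.0, 18.5, 50.0, 18.8, 19.0],
                index=pd.date_range("2022-09-01", periods=5, freq="h"),
            )
            window = "3h"
            max_increase_per_s = 8.0 / 3600.0  # hypothetical rate
            # the threshold scales with the window size in seconds
            threshold = max_increase_per_s * pd.Timedelta(window).total_seconds()
            # compare the extremes found inside each window
            w_range = temp.rolling(window, min_periods=3).apply(
                lambda w: w.max() - w.min()
            )
            exceeding = w_range > threshold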
Examples
--------
.. code-block:: python
>>> import metobs_toolkit
>>>
>>> # Import data into a Dataset
>>> dataset = metobs_toolkit.Dataset()
>>> dataset.update_settings(
... input_data_file=metobs_toolkit.demo_datafile,
... input_metadata_file=metobs_toolkit.demo_metadatafile,
... template_file=metobs_toolkit.demo_template,
... )
>>> dataset.import_data_from_file()
>>> dataset.coarsen_time_resolution(freq='1h')
>>>
>>> #Update some temperature QC settings
>>> dataset.update_qc_settings(obstype='temp',
... gross_value_max_value=42.,
... persis_time_win_to_check='4h',
... buddy_min_std = 1.5)
>>> # Apply quality control on the temperature observations
>>> dataset.apply_quality_control(obstype='temp')
>>> dataset
Dataset instance containing:
*28 stations
*['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
*10080 observation records
*1676 records labeled as outliers
*0 gaps
*3 missing observations
*records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
*time zone of the records: UTC
*Coordinates are available for all stations.
"""
if repetitions:
            applicable = _can_qc_be_applied(self, obstype, "repetitions")
            if applicable:
logger.info("Applying repetitions check.")
obsdf, outl_df = repetitions_check(
obsdf=self.df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["qc_check_settings"],
)
# update the dataset and outliers
self.df = obsdf
if not outl_df.empty:
self.outliersdf = concat_save([self.outliersdf, outl_df])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames="repetitions"
),
],
ignore_index=True,
)
if gross_value:
            applicable = _can_qc_be_applied(self, obstype, "gross_value")
            if applicable:
logger.info("Applying gross value check.")
obsdf, outl_df = gross_value_check(
obsdf=self.df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["qc_check_settings"],
)
# update the dataset and outliers
self.df = obsdf
if not outl_df.empty:
self.outliersdf = concat_save([self.outliersdf, outl_df])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames="gross_value"
),
],
ignore_index=True,
)
if persistance:
apliable = _can_qc_be_applied(self, obstype, "persistance")
if apliable:
logger.info("Applying persistance check.")
obsdf, outl_df = persistance_check(
station_frequencies=self.metadf["dataset_resolution"],
obsdf=self.df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["qc_check_settings"],
)
# update the dataset and outliers
self.df = obsdf
if not outl_df.empty:
self.outliersdf = concat_save([self.outliersdf, outl_df])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames="persistance"
),
],
ignore_index=True,
)
if step:
            applicable = _can_qc_be_applied(self, obstype, "step")
            if applicable:
logger.info("Applying step-check.")
obsdf, outl_df = step_check(
obsdf=self.df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["qc_check_settings"],
)
# update the dataset and outliers
self.df = obsdf
if not outl_df.empty:
self.outliersdf = concat_save([self.outliersdf, outl_df])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames="step"
),
],
ignore_index=True,
)
if window_variation:
            applicable = _can_qc_be_applied(self, obstype, "window_variation")
            if applicable:
logger.info("Applying window variation-check.")
obsdf, outl_df = window_variation_check(
station_frequencies=self.metadf["dataset_resolution"],
obsdf=self.df,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["qc_check_settings"],
)
# update the dataset and outliers
self.df = obsdf
if not outl_df.empty:
self.outliersdf = concat_save([self.outliersdf, outl_df])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype,
ordered_checknames="window_variation",
),
],
ignore_index=True,
)
self._qc_checked_obstypes.append(obstype)
self._qc_checked_obstypes = list(set(self._qc_checked_obstypes))
self.outliersdf = self.outliersdf.sort_index()
def apply_buddy_check(
self,
obstype="temp",
use_constant_altitude=False,
haversine_approx=True,
metric_epsg="31370",
):
"""Apply the buddy check on the observations.
The buddy check compares an observation against its neighbours (i.e.
buddies). The check looks for buddies in a neighbourhood specified by
a certain radius. The buddy check flags observations if the
(absolute value of the) difference between the observations and the
average of the neighbours normalized by the standard deviation in the
circle is greater than a predefined threshold.
This check is based on the buddy check from titanlib. Documentation on
the titanlib buddy check can be found
`here <https://github.com/metno/titanlib/wiki/Buddy-check>`_.
The observation and outliers attributes will be updated accordingly.
Parameters
----------
        obstype : str, optional
            Name of the observation type you want to apply the checks on. The
            default is 'temp'.
use_constant_altitude : bool, optional
Use a constant altitude for all stations. The default is False.
haversine_approx : bool, optional
Use the haversine approximation (earth is a sphere) to calculate
distances between stations. The default is True.
        metric_epsg : str, optional
            EPSG code of the metric CRS used to calculate distances. Only used
            when the haversine approximation is set to False; this gives a
            better distance approximation but is not globally applicable. The
            default is '31370' (which is suitable for Belgium).
Returns
-------
None.
Notes
-----
A schematic step-by-step description of the buddy check:
        1. A distance matrix is constructed with all inter-station distances. This is done using the haversine approximation, or by first converting the Coordinate Reference System (CRS) to a metric one, specified by an EPSG code.
2. A set of all (spatial) buddies per station is created by filtering out all stations that are too far.
3. The buddies are further filtered based on altitude differences with respect to the reference station.
4. For each station:
* Observations of buddies are extracted from all observations.
* These observations are corrected for altitude differences by assuming a constant lapse rate.
* For each reference record, the mean, standard deviation (std), and sample size of the corrected buddies’ observations are computed.
* If the std is lower than the minimum std, it is replaced by the minimum std.
        * Chi values are calculated for all reference records.
        * If the chi value is larger than the std_threshold, the record is labeled as an outlier; otherwise it is accepted (a sketch of this rule is given below).
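        A minimal standalone sketch of the chi rule for one record and its
        buddies (values and settings are illustrative, not the toolkit's
        internal code):

        .. code-block:: python

            import numpy as np

            obs = 24.6  # observation under test (already altitude-corrected)
            buddies = np.array([18.9, 19.4, 19.1, 19.8])  # buddy observations
            min_std, std_threshold = 1.0, 3.0  # hypothetical settings

            std = max(buddies.std(), min_std)  # never divide by a tiny std
            chi = abs(obs - buddies.mean()) / std
            is_outlier = chi > std_threshold  # True here: chi = 5.3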
Examples
--------
.. code-block:: python
>>> import metobs_toolkit
>>>
>>> # Import data into a Dataset
>>> dataset = metobs_toolkit.Dataset()
>>> dataset.update_settings(
... input_data_file=metobs_toolkit.demo_datafile,
... input_metadata_file=metobs_toolkit.demo_metadatafile,
... template_file=metobs_toolkit.demo_template,
... )
>>> dataset.import_data_from_file()
>>> dataset.coarsen_time_resolution(freq='1h')
>>>
>>> #Update some temperature QC settings
>>> dataset.update_qc_settings(obstype='temp',
... buddy_min_std=1.5,
... buddy_threshold=3.2)
>>> # Apply buddy check on the temperature observations
>>> dataset.apply_buddy_check(obstype='temp',
... use_constant_altitude=True)
>>> dataset
Dataset instance containing:
*28 stations
*['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
*10080 observation records
*69 records labeled as outliers
*0 gaps
*3 missing observations
*records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
*time zone of the records: UTC
*Coordinates are available for all stations.
"""
logger.info("Applying the toolkit buddy check")
checkname = "buddy_check"
# 1. coordinates are available?
if self.metadf["lat"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
if self.metadf["lon"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
# set constant altitude if needed:
# if altitude is already available, save it to restore it after this check
restore_altitude = False
        if use_constant_altitude:
            if "altitude" in self.metadf.columns:
                self.metadf["altitude_backup"] = self.metadf["altitude"]
                restore_altitude = True
            self.metadf["altitude"] = 2.0  # the absolute value does not matter
# 2. altitude available?
if (not use_constant_altitude) & ("altitude" not in self.metadf.columns):
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
            )
return
if (not use_constant_altitude) & (self.metadf["altitude"].isnull().any()):
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
logger.info(
'(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
)
return
        applicable = _can_qc_be_applied(self, obstype, checkname)
        if applicable:
buddy_set = self.settings.qc["qc_check_settings"][checkname][obstype]
outl_flag = self.settings.qc["qc_checks_info"][checkname]["outlier_flag"]
obsdf, outliersdf = toolkit_buddy_check(
obsdf=self.df,
metadf=self.metadf,
obstype=obstype,
buddy_radius=buddy_set["radius"],
min_sample_size=buddy_set["num_min"],
max_alt_diff=buddy_set["max_elev_diff"],
min_std=buddy_set["min_std"],
std_threshold=buddy_set["threshold"],
metric_epsg=metric_epsg,
lapserate=buddy_set["elev_gradient"],
outl_flag=outl_flag,
haversine_approx=haversine_approx,
)
# update the dataset and outliers
self.df = obsdf
if not outliersdf.empty:
self.outliersdf = concat_save([self.outliersdf, outliersdf])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames=checkname
),
],
ignore_index=True,
)
else:
logger.warning(
f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
)
# Revert artificial data that has been added if needed
if restore_altitude: # altitude was overwritten, thus revert it
self.metadf["altitude"] = self.metadf["altitude_backup"]
self.metadf = self.metadf.drop(columns=["altitude_backup"])
elif use_constant_altitude:
            # when no altitude was available a priori, remove the fake constant altitude column
self.metadf = self.metadf.drop(columns=["altitude"])
def apply_titan_buddy_check(self, obstype="temp", use_constant_altitude=False):
"""Apply the TITAN buddy check on the observations.
The buddy check compares an observation against its neighbours (i.e. buddies). The check looks for
buddies in a neighbourhood specified by a certain radius. The buddy check flags observations if the
(absolute value of the) difference between the observations and the average of the neighbours
normalized by the standard deviation in the circle is greater than a predefined threshold.
See the `titanlib documentation on the buddy check <https://github.com/metno/titanlib/wiki/Buddy-check>`_
        for further details.
The observation and outliers attributes will be updated accordingly.
Parameters
----------
        obstype : str, optional
            Name of the observation type you want to apply the checks on. The
            default is 'temp'.
use_constant_altitude : bool, optional
Use a constant altitude for all stations. The default is False.
Returns
-------
None.
Note
-------
To update the check settings, use the update_titan_qc_settings method
of the Dataset class.
Warning
--------
        To use this method, you must install titanlib. Windows users must have
        a C++ compiler installed. See the titanlib documentation: https://github.com/metno/titanlib/wiki/Installation.
Examples
--------
.. code-block:: python
>>> import metobs_toolkit
>>>
>>> # Import data into a Dataset
>>> dataset = metobs_toolkit.Dataset()
>>> dataset.update_settings(
... input_data_file=metobs_toolkit.demo_datafile,
... input_metadata_file=metobs_toolkit.demo_metadatafile,
... template_file=metobs_toolkit.demo_template,
... )
>>> dataset.import_data_from_file()
>>> dataset.coarsen_time_resolution(freq='1h')
>>>
>>> #Update some temperature QC settings
>>> dataset.update_titan_qc_settings(obstype='temp',
... buddy_min_std=1.5,
... buddy_threshold=3.2,
... buddy_num_min=5)
buddy num min for the TITAN buddy check updated: 2--> 5
buddy threshold for the TITAN buddy check updated: 1.5--> 3.2
buddy min std for the TITAN buddy check updated: 1.0--> 1.5
>>> # Apply buddy check on the temperature observations
>>> dataset.apply_titan_buddy_check(obstype='temp',
... use_constant_altitude=True)
>>> dataset
Dataset instance containing:
*28 stations
*['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
*10080 observation records
*35 records labeled as outliers
*0 gaps
*3 missing observations
*records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
*time zone of the records: UTC
*Coordinates are available for all stations.
"""
logger.info("Applying the titan buddy check")
try:
import titanlib
# Add version restrictions??
except ModuleNotFoundError:
            logger.warning(
                "Titanlib is not installed; install it manually if you want to use this functionality."
            )
return
checkname = "titan_buddy_check"
# 1. coordinates are available?
if self.metadf["lat"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
if self.metadf["lon"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
# set constant altitude if needed:
# if altitude is already available, save it to restore it after this check
restore_altitude = False
        if use_constant_altitude:
            if "altitude" in self.metadf.columns:
                self.metadf["altitude_backup"] = self.metadf["altitude"]
                restore_altitude = True
            self.metadf["altitude"] = 2.0  # the absolute value does not matter
# 2. altitude available?
if (not use_constant_altitude) & ("altitude" not in self.metadf.columns):
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
            )
return
if (not use_constant_altitude) & (self.metadf["altitude"].isnull().any()):
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
logger.info(
'(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
)
return
        applicable = _can_qc_be_applied(self, obstype, checkname)
        if applicable:
obsdf, outliersdf = titan_buddy_check(
obsdf=self.df,
metadf=self.metadf,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["titan_check_settings"][checkname][
obstype
],
titan_specific_labeler=self.settings.qc["titan_specific_labeler"][
checkname
],
)
# update the dataset and outliers
self.df = obsdf
if not outliersdf.empty:
self.outliersdf = concat_save([self.outliersdf, outliersdf])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames=checkname
),
],
ignore_index=True,
)
else:
logger.warning(
f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
)
# Revert artificial data that has been added if needed
if restore_altitude: # altitude was overwritten, thus revert it
self.metadf["altitude"] = self.metadf["altitude_backup"]
self.metadf = self.metadf.drop(columns=["altitude_backup"])
elif use_constant_altitude:
            # when no altitude was available a priori, remove the fake constant altitude column
self.metadf = self.metadf.drop(columns=["altitude"])
def apply_titan_sct_resistant_check(self, obstype="temp"):
"""Apply the TITAN spatial consistency test (resistant).
        The SCT resistant check is a spatial consistency check which compares each observation to what is expected given the other observations in the
nearby area. If the deviation is large, the observation is removed. The SCT uses optimal interpolation
(OI) to compute an expected value for each observation. The background for the OI is computed from
a general vertical profile of observations in the area.
See the `titanlib documentation on the sct check <https://github.com/metno/titanlib/wiki/Spatial-consistency-test-resistant>`_
        for further details.
The observation and outliers attributes will be updated accordingly.
Parameters
----------
        obstype : str, optional
            Name of the observation type you want to apply the checks on. The
            default is 'temp'.
Returns
-------
None.
Note
-------
To update the check settings, use the update_titan_qc_settings method
of the Dataset class.
Warning
--------
        To use this method, you must install titanlib. Windows users must have
        a C++ compiler installed. See the titanlib documentation: https://github.com/metno/titanlib/wiki/Installation.
Warning
-------
        This method is a Python wrapper around titanlib C++ code, and it is
        prone to segmentation faults. The performance of this check is thus
        not guaranteed!
Examples
--------
.. code-block:: python
import metobs_toolkit
# Import data into a Dataset
dataset = metobs_toolkit.Dataset()
dataset.update_settings(
input_data_file=metobs_toolkit.demo_datafile,
input_metadata_file=metobs_toolkit.demo_metadatafile,
template_file=metobs_toolkit.demo_template,
)
dataset.import_data_from_file()
dataset.coarsen_time_resolution(freq='1h')
#Get altitude of all stations
dataset.get_altitude()
#Update some temperature QC settings
dataset.update_titan_qc_settings(obstype='temp',
sct_outer_radius=25000)
            # Apply the SCT check on the temperature observations
dataset.apply_titan_sct_resistant_check(obstype='temp')
"""
logger.info("Applying the titan SCT check")
try:
import titanlib
# Add version restrictions??
except ModuleNotFoundError:
            logger.warning(
                "Titanlib is not installed; install it manually if you want to use this functionality."
            )
return
checkname = "titan_sct_resistant_check"
# check if required metadata is available:
# 1. coordinates are available?
if self.metadf["lat"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
if self.metadf["lon"].isnull().any():
logger.warning(
f"Not all coordinates are available, the {checkname} cannot be executed!"
)
return
# 2. altitude available?
if "altitude" not in self.metadf.columns:
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
            )
return
if self.metadf["altitude"].isnull().any():
logger.warning(
f"The altitude is not known for all stations. The {checkname} cannot be executed!"
)
logger.info(
'(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
)
return
        applicable = _can_qc_be_applied(self, obstype, checkname)
        if applicable:
obsdf, outliersdf = titan_sct_resistant_check(
obsdf=self.df,
metadf=self.metadf,
obstype=obstype,
checks_info=self.settings.qc["qc_checks_info"],
checks_settings=self.settings.qc["titan_check_settings"][checkname][
obstype
],
titan_specific_labeler=self.settings.qc["titan_specific_labeler"][
checkname
],
)
# update the dataset and outliers
self.df = obsdf
if not outliersdf.empty:
self.outliersdf = concat_save([self.outliersdf, outliersdf])
# add this check to the applied checks
self._applied_qc = concat_save(
[
self._applied_qc,
conv_applied_qc_to_df(
obstypes=obstype, ordered_checknames=checkname
),
],
ignore_index=True,
)
else:
logger.warning(
f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
)
# =============================================================================
# Helpers
# =============================================================================
def _can_qc_be_applied(dataset, obstype, checkname):
"""Test if a qc check can be applied."""
# test if check is already applied on the obstype
applied_df = dataset._applied_qc
can_be_applied = (
not applied_df[
(applied_df["obstype"] == obstype) & (applied_df["checkname"] == checkname)
].shape[0]
> 0
)
if not can_be_applied:
logger.warning(
f"The {checkname} check can NOT be applied on {obstype} because it was already applied on this observation type!"
)
return False
    # test if all settings are present for the check on the obstype
if checkname not in [
"duplicated_timestamp",
"titan_buddy_check",
"titan_sct_resistant_check",
]:
        # the settings of these checks are defined per obstype
required_keys = list(
dataset.settings.qc["qc_check_settings"][checkname]["temp"].keys()
) # use temp to find all required settings
if obstype not in dataset.settings.qc["qc_check_settings"][checkname].keys():
logger.warning(
f"The {checkname} check can NOT be applied on {obstype} because none of the required check settings are found. The following are missing: {required_keys}"
)
return False
if not all(
[
req_key
in dataset.settings.qc["qc_check_settings"][checkname][obstype].keys()
for req_key in required_keys
]
):
# not all required settings are available
missing_settings = [
req_key
for req_key in required_keys
if req_key
not in dataset.settings.qc["qc_check_settings"][checkname][
obstype
].keys()
]
logger.warning(
f"The {checkname} check can NOT be applied on {obstype} because not all required check settings ar found. The following are missing: {missing_settings}"
)
return False
return True