Source code for metobs_toolkit.dataset_qc_handling

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 16 13:44:49 2024

@author: thoverga
"""

import logging
from metobs_toolkit import Dataset


logger = logging.getLogger(__name__)

from metobs_toolkit.qc_checks import (
    gross_value_check,
    persistance_check,
    repetitions_check,
    # duplicate_timestamp_check,
    step_check,
    window_variation_check,
    # invalid_input_check,
    toolkit_buddy_check,
    titan_buddy_check,
    titan_sct_resistant_check,
)

from metobs_toolkit.plotting_functions import qc_stats_pie
from metobs_toolkit.qc_statistics import get_freq_statistics
from metobs_toolkit.df_helpers import (
    multiindexdf_datetime_subsetting,
    fmt_datetime_argument,
    init_multiindex,
    init_multiindexdf,
    init_triple_multiindexdf,
    metadf_to_gdf,
    conv_applied_qc_to_df,
    get_freqency_series,
    value_labeled_doubleidxdf_to_triple_idxdf,
    xs_save,
    concat_save,
)



[docs]
class Dataset(Dataset):
    """Extension on the metobs_toolkit.Dataset class with QC related methods"""


[docs]
    def get_qc_stats(self, obstype="temp", stationname=None, make_plot=True):
        """Get quality control statistics.

        Compute frequency statistics on the qc labels for an observationtype.
        The frequencies are presented as percentages.

        These frequencies can also be presented as a collection of piecharts
        per check.

        With stationname you can subset the data to a single station.

        Parameters
        -----------
        obstype : str, optional
            Observation type to analyse the QC labels on. The default is
            'temp'.
        stationname : str, optional
            Stationname to subset the quality labels on. If None, all
            stations are used. The default is None.
        make_plot : Bool, optional
            If True, a plot with piecharts is generated. The default is True.

        Returns
        ---------
        (final_freq, outl_freq, specific_freq) : tuple or None
            The three frequency statistics as computed by
            ``get_freq_statistics``: the frequencies of the final labels,
            of the outlier labels, and the frequencies per specific check
            (presented as percentages). None is returned when at least one
            of these statistics could not be computed.

        Raises
        ------
        AssertionError
            If stationname is given but is not found in the 'name' level of
            the observations.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...                         input_data_file=metobs_toolkit.demo_datafile,
            ...                         input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...                         template_file=metobs_toolkit.demo_template,
            ...                         )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp') #Using the default QC settings
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *1676 records labeled as outliers
                 *0 gaps
                 *3 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration:  14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.
            >>>
            >>> #Get quality control statistics
            >>> stats = dataset.get_qc_stats(make_plot=False)
            >>> stats
            ({'ok': 83.37301587301587, 'QC outliers': 16.6269841269...

        """
        # combine all records and get the final label
        comb_df = self.combine_all_to_obsspace()

        # subset to the relevant obstype, only the label column is needed
        comb_df = xs_save(comb_df, obstype, level="obstype")[["label"]]

        # subset to the requested station
        if stationname is not None:
            assert stationname in comb_df.index.get_level_values(
                "name"
            ), f"{stationname} is not a known station name in the dataset."

            comb_df = comb_df.loc[stationname]

        # compute freq statistics
        final_freq, outl_freq, specific_freq = get_freq_statistics(
            comb_df=comb_df,
            obstype=obstype,
            checks_info=self.settings.qc["qc_checks_info"],
            gaps_info=self.settings.gap["gaps_info"],
            applied_qc_order=self._applied_qc,
        )

        # without a complete set of statistics there is nothing to return/plot
        if any(stat is None for stat in (final_freq, outl_freq, specific_freq)):
            return None

        if make_plot:
            # the title is only needed for the plot
            orig_obstype = self.obstypes[obstype].get_orig_name()
            if stationname is None:
                title = (
                    f"Label frequency statistics on all stations for {orig_obstype}."
                )
            else:
                title = (
                    f"Label frequency statistics for {stationname} for {orig_obstype}."
                )

            # make pie plots
            qc_stats_pie(
                final_stats=final_freq,
                outlier_stats=outl_freq,
                specific_stats=specific_freq,
                plot_settings=self.settings.app["plot_settings"],
                qc_check_info=self.settings.qc["qc_checks_info"],
                title=title,
            )

        return (final_freq, outl_freq, specific_freq)



[docs]
    def apply_quality_control(
        self,
        obstype="temp",
        gross_value=True,
        persistance=True,
        repetitions=True,
        step=True,
        window_variation=True,
    ):
        """Apply quality control methods to the dataset.

        The default settings are used, and can be changed in the
        settings_files/qc_settings.py

        The checks are performed in a fixed sequence: repetitions -->
        gross_value --> persistance --> step --> window_variation. Outliers
        by a previous check are ignored in the following checks!

        A check is skipped when it is switched off by its argument, or when
        it cannot be applied on the obstype (as decided by
        ``_can_qc_be_applied``).

        The dataset is updated inline.

        Parameters
        ----------
        obstype : String, optional
            Name of the observationtype you want to apply the checks on. The
            default is 'temp'.
        gross_value : Bool, optional
            If True the gross_value check is applied if False not. The default
            is True.
        persistance : Bool, optional
            If True the persistance check is applied if False not. The default
            is True.
        repetitions : Bool, optional
            If True the repetitions check is applied if False not. The default
            is True.
        step : Bool, optional
            If True the step check is applied if False not. The default is True.
        window_variation : Bool, optional
            If True the window_variation check is applied if False not. The
            default is True.

        Returns
        ---------
        None.

        Notes
        -----
        A schematic description of the quality control checks.

        Gross value check
        ==================
        This check looks for outliers based on unrealistic values

        1. Find observations that exceed a minimum and maximum value threshold.
        2. These observations are labeled as outliers.

        Persistence check
        =================
        Test observations to change over a specific period.

        1. Find the stations that have a maximum assumed observation frequency
           that does not exceed the minimum number of records for moving window
           size. The window size is defined by a duration.
        2. Subset to those stations.
        3. For each station, a moving window scan is applied that validates if
           there is variation in the observations (NaN's are excluded). The
           validation is only applied when a sufficient amount of records are
           found in the window specified by a threshold.
        4. After the scan, all records found in the windows without variation
           are labeled as outliers.

        Repetitions check
        =================
        Test if observation changes after a number of records.

        1. For each station, make a group of consecutive records for which
           the values do not change.
        2. Filter those groups that have more records than the maximum valid
           repetitions.
        3. All the records in these groups are labeled as outliers

        Note
        -----
          The repetitions check is similar to the persistence check, but not identical.
          The persistence check uses thresholds that are meteorologically based (i.e. the moving window is defined by a duration),
          in contrast to the repetitions check whose thresholds are instrumentally based (i.e. the "window" is defined by a number of records.)

        Step check
        ============
        Test if observations do not produce unphysical spikes in time series.

        1. Iterate over all the stations.
        2. Get the observations of the stations (i.e. drop the previously labeled outliers represented by NaN's).
        3. Find the observations for which:

           * The increase between two consecutive records is larger than the
             threshold. This threshold is defined by a maximum increase per second
             multiplied by the timedelta (in seconds) between the consecutive
             records.
           * Similar filter for a decrease.
        4. The found observations are labeled as outliers.

        Note
        -----
          In general, for temperatures,  the decrease threshold is set less stringent than the increase
          threshold. This is because a temperature drop is meteorologically more
          common than a sudden increase which is often the result of a radiation error.

        Window Variation check
        =======================
        Test if the variation is found in a moving window.

        1. Find the stations that have a maximum assumed observation frequency
           that does not exceed the minimum number of records for moving window
           size. The window size is defined by a duration.
        2. Compute the maximum increase and decrease thresholds for a window.
           This is done by multiplying the maximum increase per second by the
           window size in seconds.
        3. For each station, a moving window scan is applied that validates if
           the maximum increase/decrease thresholds are exceeded. This is done
           by comparison of the minimum and maximum values inside the window. The
           validation is only applied when a sufficient amount of records are
           found in the window specified by a threshold.
        4. After the scan, *all* records found in the window that exceed one
           of these thresholds are labeled as outliers.


        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...                         input_data_file=metobs_toolkit.demo_datafile,
            ...                         input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...                         template_file=metobs_toolkit.demo_template,
            ...                         )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> #Update some temperature QC settings
            >>> dataset.update_qc_settings(obstype='temp',
            ...                            gross_value_max_value=42.,
            ...                            persis_time_win_to_check='4h',
            ...                            buddy_min_std = 1.5)

            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *1676 records labeled as outliers
                 *0 gaps
                 *3 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration:  14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

        """
        # The checks in execution order. Each entry holds:
        # (switch, checkname, check function, needs station frequencies,
        #  log message).
        check_sequence = (
            (
                repetitions,
                "repetitions",
                repetitions_check,
                False,
                "Applying repetitions check.",
            ),
            (
                gross_value,
                "gross_value",
                gross_value_check,
                False,
                "Applying gross value check.",
            ),
            (
                persistance,
                "persistance",
                persistance_check,
                True,
                "Applying persistance check.",
            ),
            (step, "step", step_check, False, "Applying step-check."),
            (
                window_variation,
                "window_variation",
                window_variation_check,
                True,
                "Applying window variation-check.",
            ),
        )

        for enabled, checkname, check_func, needs_freq, log_msg in check_sequence:
            if not enabled:
                continue
            if not _can_qc_be_applied(self, obstype, checkname):
                continue

            logger.info(log_msg)

            # self.df is read inside the loop on purpose: every check must
            # work on the observations as updated by the previous check.
            check_kwargs = {
                "obsdf": self.df,
                "obstype": obstype,
                "checks_info": self.settings.qc["qc_checks_info"],
                "checks_settings": self.settings.qc["qc_check_settings"],
            }
            if needs_freq:
                # the moving-window checks also need the station resolutions
                check_kwargs["station_frequencies"] = self.metadf[
                    "dataset_resolution"
                ]

            obsdf, outl_df = check_func(**check_kwargs)

            # update the dataset and outliers
            self.df = obsdf
            if not outl_df.empty:
                self.outliersdf = concat_save([self.outliersdf, outl_df])

            # add this check to the applied checks
            self._applied_qc = concat_save(
                [
                    self._applied_qc,
                    conv_applied_qc_to_df(
                        obstypes=obstype, ordered_checknames=checkname
                    ),
                ],
                ignore_index=True,
            )

        # register the obstype as checked (without duplicates)
        self._qc_checked_obstypes.append(obstype)
        self._qc_checked_obstypes = list(set(self._qc_checked_obstypes))
        self.outliersdf = self.outliersdf.sort_index()



[docs]
    def apply_buddy_check(
        self,
        obstype="temp",
        use_constant_altitude=False,
        haversine_approx=True,
        metric_epsg="31370",
    ):
        """Apply the buddy check on the observations.

        The buddy check compares an observation against its neighbours (i.e.
        buddies). The check looks for buddies in a neighbourhood specified by
        a certain radius. The buddy check flags observations if the
        (absolute value of the) difference between the observations and the
        average of the neighbours normalized by the standard deviation in the
        circle is greater than a predefined threshold.

        This check is based on the buddy check from titanlib. Documentation on
        the titanlib buddy check can be found
        `here <https://github.com/metno/titanlib/wiki/Buddy-check>`_.


        The observation and outliers attributes will be updated accordingly.

        The check is not executed (a warning is logged) when coordinates are
        missing, when the altitude is missing while use_constant_altitude is
        False, or when the check cannot be applied on the obstype.

        Parameters
        ----------
        obstype : String, optional
            Name of the observationtype you want to apply the checks on. The
            default is 'temp'.
        use_constant_altitude : bool, optional
            Use a constant altitude for all stations. The altitudes in the
            metadf (if present) are only replaced for the duration of the
            check and are restored afterwards. The default is False.
        haversine_approx : bool, optional
            Use the haversine approximation (earth is a sphere) to calculate
            distances between stations. The default is True.
        metric_epsg : str, optional
            EPSG code for the metric CRS to calculate distances in. Only used when
            haversine approximation is set to False. Thus becoming a better
            distance approximation but not global applicable The default is '31370'
            (which is suitable for Belgium).

        Returns
        -------
        None.

        Notes
        -----
        A schematic step-by-step description of the buddy check:

        1. A distance matrix is constructed for all inter distances between the stations. This is done using the haversine approximation, or by first converting the Coordinate Reference System (CRS) to a metric one, specified by an EPSG code.
        2. A set of all (spatial) buddies per station is created by filtering out all stations that are too far.
        3. The buddies are further filtered based on altitude differences with respect to the reference station.
        4. For each station:

           * Observations of buddies are extracted from all observations.
           * These observations are corrected for altitude differences by assuming a constant lapse rate.
           * For each reference record, the mean, standard deviation (std), and sample size of the corrected buddies’ observations are computed.
           * If the std is lower than the minimum std, it is replaced by the minimum std.
           * Chi values are calculated for all reference records.
           * If the Chi value is larger than the std_threshold, the record is marked as an outlier, otherwise it is accepted.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...                         input_data_file=metobs_toolkit.demo_datafile,
            ...                         input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...                         template_file=metobs_toolkit.demo_template,
            ...                         )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> #Update some temperature QC settings
            >>> dataset.update_qc_settings(obstype='temp',
            ...                            buddy_min_std=1.5,
            ...                            buddy_threshold=3.2)

            >>> # Apply buddy check on the temperature observations
            >>> dataset.apply_buddy_check(obstype='temp',
            ...                           use_constant_altitude=True)
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *69 records labeled as outliers
                 *0 gaps
                 *3 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration:  14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.
        """

        logger.info("Applying the toolkit buddy check")

        checkname = "buddy_check"

        # 1. coordinates are available?
        if self.metadf["lat"].isnull().any():
            logger.warning(
                f"Not all coordinates are available, the {checkname} cannot be executed!"
            )
            return
        if self.metadf["lon"].isnull().any():
            logger.warning(
                f"Not all coordinates are available, the {checkname} cannot be executed!"
            )
            return

        # 2. altitude available? (only relevant when the real altitudes are used)
        if not use_constant_altitude:
            if "altitude" not in self.metadf.columns:
                logger.warning(
                    f"The altitude is not known for all stations. The {checkname} cannot be executed!"
                )
                logger.info(
                    '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n update the "altitude" column in the metadf attribute of your Dataset.'
                )
                return
            if self.metadf["altitude"].isnull().any():
                logger.warning(
                    f"The altitude is not known for all stations. The {checkname} cannot be executed!"
                )
                logger.info(
                    '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
                )
                return

        # set constant altitude if needed:

        # if altitude is already available, save it to restore it after this check
        restore_altitude = False
        if use_constant_altitude:
            if "altitude" in self.metadf.columns:
                self.metadf["altitude_backup"] = self.metadf["altitude"]
                restore_altitude = True

            self.metadf["altitude"] = 2.0  # absolute value does not matter

        try:
            if _can_qc_be_applied(self, obstype, checkname):
                buddy_set = self.settings.qc["qc_check_settings"][checkname][obstype]
                outl_flag = self.settings.qc["qc_checks_info"][checkname][
                    "outlier_flag"
                ]
                obsdf, outliersdf = toolkit_buddy_check(
                    obsdf=self.df,
                    metadf=self.metadf,
                    obstype=obstype,
                    buddy_radius=buddy_set["radius"],
                    min_sample_size=buddy_set["num_min"],
                    max_alt_diff=buddy_set["max_elev_diff"],
                    min_std=buddy_set["min_std"],
                    std_threshold=buddy_set["threshold"],
                    metric_epsg=metric_epsg,
                    lapserate=buddy_set["elev_gradient"],
                    outl_flag=outl_flag,
                    haversine_approx=haversine_approx,
                )

                # update the dataset and outliers
                self.df = obsdf
                if not outliersdf.empty:
                    self.outliersdf = concat_save([self.outliersdf, outliersdf])

                # add this check to the applied checks
                self._applied_qc = concat_save(
                    [
                        self._applied_qc,
                        conv_applied_qc_to_df(
                            obstypes=obstype, ordered_checknames=checkname
                        ),
                    ],
                    ignore_index=True,
                )

            else:
                logger.warning(
                    f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
                )
        finally:
            # Revert artificial data that has been added, also when the check
            # itself failed, so the metadata is never left in a modified state.
            if restore_altitude:  # altitude was overwritten, thus revert it
                self.metadf["altitude"] = self.metadf["altitude_backup"]
                self.metadf = self.metadf.drop(columns=["altitude_backup"])

            elif use_constant_altitude:
                # when no altitude was available a priori, remove the fake
                # constant altitude column
                self.metadf = self.metadf.drop(columns=["altitude"])



[docs]
    def apply_titan_buddy_check(self, obstype="temp", use_constant_altitude=False):
        """Apply the TITAN buddy check on the observations.

        The buddy check compares an observation against its neighbours (i.e. buddies). The check looks for
        buddies in a neighbourhood specified by a certain radius. The buddy check flags observations if the
        (absolute value of the) difference between the observations and the average of the neighbours
        normalized by the standard deviation in the circle is greater than a predefined threshold.

        See the `titanlib documentation on the buddy check <https://github.com/metno/titanlib/wiki/Buddy-check>`_
        for futher details.

        The observation and outliers attributes will be updated accordingly.

        Parameters
        ----------
        obstype : String, optional
            Name of the observationtype you want to apply the checks on. The
            default is 'temp'.
        use_constant_altitude : bool, optional
            Use a constant altitude for all stations. The default is False.

        Returns
        -------
        None.

        Note
        -------
        To update the check settings, use the update_titan_qc_settings method
        of the Dataset class.

        Warning
        --------
        To use this method, you must install titanlib. Windows users must have
        a c++ compiler installed. See the titanlib documentation: https://github.com/metno/titanlib/wiki/Installation.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...                         input_data_file=metobs_toolkit.demo_datafile,
            ...                         input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...                         template_file=metobs_toolkit.demo_template,
            ...                         )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> #Update some temperature QC settings
            >>> dataset.update_titan_qc_settings(obstype='temp',
            ...                                  buddy_min_std=1.5,
            ...                                  buddy_threshold=3.2,
            ...                                  buddy_num_min=5)
            buddy num min for the TITAN buddy check updated:  2--> 5
            buddy threshold for the TITAN buddy check updated:  1.5--> 3.2
            buddy min std for the TITAN buddy check updated:  1.0--> 1.5
            >>> # Apply buddy check on the temperature observations
            >>> dataset.apply_titan_buddy_check(obstype='temp',
            ...                                 use_constant_altitude=True)
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *35 records labeled as outliers
                 *0 gaps
                 *3 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration:  14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

        """
        logger.info("Applying the titan buddy check")

        try:
            import titanlib

            # Add version restrictions??
        except ModuleNotFoundError:
            logger.warning(
                "Titanlib is not installed, install it manually if you want to use this functionallity."
            )
            return

        checkname = "titan_buddy_check"

        # 1. coordinates are available?
        if self.metadf["lat"].isnull().any():
            logger.warning(
                f"Not all coordinates are available, the {checkname} cannot be executed!"
            )
            return
        if self.metadf["lon"].isnull().any():
            logger.warning(
                f"Not all coordinates are available, the {checkname} cannot be executed!"
            )
            return

        # set constant altitude if needed:

        # if altitude is already available, save it to restore it after this check
        restore_altitude = False
        if use_constant_altitude:
            if "altitulde" in self.metadf.columns:
                self.metadf["altitude_backup"] = self.metadf["altitude"]
                restore_altitude = True

            self.metadf["altitude"] = 2.0  # absolut value does not matter

        # 2. altitude available?
        if (not use_constant_altitude) & ("altitude" not in self.metadf.columns):
            logger.warning(
                f"The altitude is not known for all stations. The {checkname} cannot be executed!"
            )
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n update the "altitude" column in the metadf attribute of your Dataset.'
            )
            return
        if (not use_constant_altitude) & (self.metadf["altitude"].isnull().any()):
            logger.warning(
                f"The altitude is not known for all stations. The {checkname} cannot be executed!"
            )
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
            )
            return

        apliable = _can_qc_be_applied(self, obstype, checkname)
        if apliable:
            obsdf, outliersdf = titan_buddy_check(
                obsdf=self.df,
                metadf=self.metadf,
                obstype=obstype,
                checks_info=self.settings.qc["qc_checks_info"],
                checks_settings=self.settings.qc["titan_check_settings"][checkname][
                    obstype
                ],
                titan_specific_labeler=self.settings.qc["titan_specific_labeler"][
                    checkname
                ],
            )

            # update the dataset and outliers
            self.df = obsdf
            if not outliersdf.empty:
                self.outliersdf = concat_save([self.outliersdf, outliersdf])

            # add this check to the applied checks
            self._applied_qc = concat_save(
                [
                    self._applied_qc,
                    conv_applied_qc_to_df(
                        obstypes=obstype, ordered_checknames=checkname
                    ),
                ],
                ignore_index=True,
            )

        else:
            logger.warning(
                f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
            )

        # Revert artificial data that has been added if needed
        if restore_altitude:  # altitude was overwritten, thus revert it
            self.metadf["altitude"] = self.metadf["altitude_backup"]
            self.metadf = self.metadf.drop(columns=["altitude_backup"])

        elif use_constant_altitude:
            # when no alitude was available apriori, remove the fake constant altitude column
            self.metadf = self.metadf.drop(columns=["altitude"])



[docs]
    def apply_titan_sct_resistant_check(self, obstype="temp"):
        """Apply the TITAN spatial consistency test (resistant).

        The SCT resistant check is a spatial consistency check which compares each observations to what is expected given the other observations in the
        nearby area. If the deviation is large, the observation is removed. The SCT uses optimal interpolation
        (OI) to compute an expected value for each observation. The background for the OI is computed from
        a general vertical profile of observations in the area.

        See the `titanlib documentation on the sct check <https://github.com/metno/titanlib/wiki/Spatial-consistency-test-resistant>`_
        for further details.

        The observation and outliers attributes will be updated accordingly.

        The check is skipped (a warning is logged and nothing is modified)
        when titanlib cannot be imported, when a "lat" or "lon" value is
        missing in the metadf, when the "altitude" column is absent or
        contains missing values, or when the check is not applicable on the
        obstype (e.g. it was already applied).

        Parameters
        ----------
        obstype : String, optional
            Name of the observationtype you want to apply the checks on. The
            default is 'temp'.

        Returns
        -------
        None.

        Note
        ----
        To update the check settings, use the update_titan_qc_settings method
        of the Dataset class.

        Warning
        -------
        To use this method, you must install titanlib. Windows users must have
        a c++ compiler installed. See the titanlib documentation: https://github.com/metno/titanlib/wiki/Installation.

        Warning
        -------
        This method is a python wrapper on titanlib c++ scripts, and it is prone
        to segmentation faults. The performance of this check is thus not
        guaranteed!

        Examples
        --------
        .. code-block:: python

             import metobs_toolkit

             # Import data into a Dataset
             dataset = metobs_toolkit.Dataset()
             dataset.update_settings(
                                     input_data_file=metobs_toolkit.demo_datafile,
                                     input_metadata_file=metobs_toolkit.demo_metadatafile,
                                     template_file=metobs_toolkit.demo_template,
                                     )
             dataset.import_data_from_file()
             dataset.coarsen_time_resolution(freq='1h')

             #Get altitude of all stations
             dataset.get_altitude()

             #Update some temperature QC settings
             dataset.update_titan_qc_settings(obstype='temp',
                                              sct_outer_radius=25000)


             # Apply the SCT resistant check on the temperature observations
             dataset.apply_titan_sct_resistant_check(obstype='temp')

        """
        logger.info("Applying the titan SCT check")

        try:
            # The import only probes that titanlib is available; the module
            # itself is not used directly in this method.
            import titanlib  # noqa: F401

            # Add version restrictions??
        except ModuleNotFoundError:
            logger.warning(
                "Titanlib is not installed, install it manually if you want to use this functionallity."
            )
            return

        checkname = "titan_sct_resistant_check"

        # check if required metadata is available:

        # 1. coordinates are available?
        for coordinate in ("lat", "lon"):
            if self.metadf[coordinate].isnull().any():
                logger.warning(
                    f"Not all coordinates are available, the {checkname} cannot be executed!"
                )
                return

        # 2. altitude available? (column present and without missing values)
        if (
            "altitude" not in self.metadf.columns
            or self.metadf["altitude"].isnull().any()
        ):
            logger.warning(
                f"The altitude is not known for all stations. The {checkname} cannot be executed!"
            )
            # This method has no constant-altitude option, so only the
            # remedies that actually exist for it are suggested.
            logger.info(
                '(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Update the "altitude" column in the metadf attribute of your Dataset.)'
            )
            return

        # 3. check applicable on this obstype?
        if not _can_qc_be_applied(self, obstype, checkname):
            logger.warning(
                f"The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!"
            )
            return

        qc_settings = self.settings.qc
        obsdf, outliersdf = titan_sct_resistant_check(
            obsdf=self.df,
            metadf=self.metadf,
            obstype=obstype,
            checks_info=qc_settings["qc_checks_info"],
            checks_settings=qc_settings["titan_check_settings"][checkname][obstype],
            titan_specific_labeler=qc_settings["titan_specific_labeler"][checkname],
        )

        # update the dataset and outliers
        self.df = obsdf
        if not outliersdf.empty:
            self.outliersdf = concat_save([self.outliersdf, outliersdf])

        # add this check to the applied checks
        self._applied_qc = concat_save(
            [
                self._applied_qc,
                conv_applied_qc_to_df(obstypes=obstype, ordered_checknames=checkname),
            ],
            ignore_index=True,
        )




# =============================================================================
# Helpers
# =============================================================================


def _can_qc_be_applied(dataset, obstype, checkname):
    """Test if a qc check can be applied on an observation type.

    A check is rejected when it is already registered for ``obstype`` in the
    applied-qc table of the dataset, or (for checks that depend on obstype
    specific settings) when the settings of ``obstype`` are absent or
    incomplete. Each rejection is reported as a warning in the log.

    Parameters
    ----------
    dataset : Dataset
        Object exposing ``_applied_qc`` (a table with an "obstype" and a
        "checkname" column) and ``settings.qc["qc_check_settings"]``.
    obstype : str
        Name of the observation type the check would be applied on.
    checkname : str
        Name of the quality control check.

    Returns
    -------
    bool
        True if the check can be applied, else False.

    """
    # test if check is already applied on the obstype
    applied_df = dataset._applied_qc
    already_applied = bool(
        (
            (applied_df["obstype"] == obstype) & (applied_df["checkname"] == checkname)
        ).any()
    )
    if already_applied:
        logger.warning(
            f"The {checkname} check can NOT be applied on {obstype} because it was already applied on this observation type!"
        )
        return False

    # These checks are not validated against the obstype depending settings.
    if checkname in (
        "duplicated_timestamp",
        "titan_buddy_check",
        "titan_sct_resistant_check",
    ):
        return True

    # test if all settings are present for the check on the obstype
    check_settings = dataset.settings.qc["qc_check_settings"][checkname]
    # use temp to find all required settings
    required_keys = list(check_settings["temp"].keys())

    if obstype not in check_settings:
        logger.warning(
            f"The {checkname} check can NOT be applied on {obstype} because none of the required check settings are found. The following are missing: {required_keys}"
        )
        return False

    obstype_settings = check_settings[obstype]
    missing_settings = [
        req_key for req_key in required_keys if req_key not in obstype_settings
    ]
    if missing_settings:
        # not all required settings are available
        logger.warning(
            f"The {checkname} check can NOT be applied on {obstype} because not all required check settings are found. The following are missing: {missing_settings}"
        )
        return False

    return True