Source code for metobs_toolkit.dataset_gap_handling

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 16 12:33:07 2024

@author: thoverga
"""

import logging
import sys
import pandas as pd

from metobs_toolkit import Dataset

from metobs_toolkit.missingobs import Missingob_collection

from metobs_toolkit.gap import (
    Gap,
    remove_gaps_from_obs,
    remove_gaps_from_outliers,
    missing_timestamp_and_gap_check,
    get_gaps_indx_in_obs_space,
    get_station_gaps,
    apply_interpolate_gaps,
    make_gapfill_df,
    apply_debias_era5_gapfill,
    gaps_to_df,
)


from metobs_toolkit.df_helpers import (
    multiindexdf_datetime_subsetting,
    fmt_datetime_argument,
    init_multiindex,
    init_multiindexdf,
    init_triple_multiindexdf,
    metadf_to_gdf,
    conv_applied_qc_to_df,
    get_freqency_series,
    value_labeled_doubleidxdf_to_triple_idxdf,
    xs_save,
    concat_save,
)

logger = logging.getLogger(__name__)


class Dataset(Dataset):
    """Extension on the metobs_toolkit.Dataset class with gap related methods"""

    # =============================================================================
    # Get info and details
    # =============================================================================

    # =============================================================================
    # Update gaps
    # =============================================================================

    def update_gaps_and_missing_from_outliers(self, obstype="temp", n_gapsize=None):
        """Interpret the outliers as missing observations.

        If there is a sequence of these outliers for a station that is longer
        than n_gapsize, it is interpreted as a gap. The outliers are not
        removed.

        Parameters
        ----------
        obstype : str, optional
            Use the outliers on this observation type to update the gaps and
            missing timestamps. The default is 'temp'.
        n_gapsize : int, optional
            The minimum number of consecutive missing observations to define
            as a gap. If None, n_gapsize is taken from the settings definition
            of gaps. The default is None.

        Returns
        -------
        None.

        Note
        ----
        Gaps and missing observations resulting from an outlier on a specific
        obstype are assumed to be gaps/missing observations for all obstypes.

        Note
        ----
        Be aware that n_gapsize is applied to the current resolution of the
        Dataset. If the dataset is coarsened, this differs from the gap check
        applied on the imported data.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *1676 records labeled as outliers
                 *0 gaps
                 *3 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

        """
        if n_gapsize is None:
            n_gapsize = self.settings.gap["gaps_settings"]["gaps_finder"]["gapsize_n"]
            if (
                not self.metadf["assumed_import_frequency"]
                .eq(self.metadf["dataset_resolution"])
                .all()
            ):
                logger.info(
                    f"The definition of the gapsize (n_gapsize = {n_gapsize}) \
will have another effect on the update of the gaps and missing \
timestamps because coarsening is applied and the definition \
of the gapsize is not changed."
                )

        # combine to one dataframe
        mergedf = self.combine_all_to_obsspace()
        mergedf = xs_save(mergedf, obstype, level="obstype")

        # ignore labels
        possible_outlier_labels = [
            vals["outlier_flag"] for vals in self.settings.qc["qc_checks_info"].values()
        ]

        # create groups when the final label changes (runs of identical labels)
        persistance_filter = ((mergedf["label"].shift() != mergedf["label"])).cumsum()
        grouped = mergedf.groupby(["name", persistance_filter])

        # locate new gaps by the size of consecutive records with the same final label per station
        group_sizes = grouped.size()
        large_groups = group_sizes[group_sizes > n_gapsize]

        # keep only groups whose final label is an outlier label
        gaps = []
        # new_gapsdf = pd.DataFrame()
        new_gaps_idx = init_multiindex()
        for group_idx in large_groups.index:
            groupdf = grouped.get_group(group_idx)
            group_final_label = groupdf["label"].iloc[0]
            if group_final_label not in possible_outlier_labels:
                # no gap candidates
                continue
            else:
                gap = Gap(
                    name=groupdf.index.get_level_values("name")[0],
                    startdt=groupdf.index.get_level_values("datetime").min(),
                    enddt=groupdf.index.get_level_values("datetime").max(),
                )
                gaps.append(gap)
                new_gaps_idx = new_gaps_idx.union(groupdf.index, sort=False)

        # add all the outliers that are not in the new gaps to the new missing obs
        new_missing_obs = mergedf[mergedf["label"].isin(possible_outlier_labels)].index
        new_missing_obs = new_missing_obs.drop(new_gaps_idx.to_numpy(), errors="ignore")

        # to series
        missing_obs_series = (
            new_missing_obs.to_frame()
            .reset_index(drop=True)
            .set_index("name")["datetime"]
        )
        # Create missing obs
        new_missing_collection = Missingob_collection(missing_obs_series)

        # update self
        self.gaps.extend(gaps)
        self.missing_obs = self.missing_obs + new_missing_collection

        # remove outliers that are converted to gaps
        self.outliersdf = remove_gaps_from_outliers(
            gaplist=gaps, outldf=self.outliersdf
        )
        # remove outliers that are converted to missing obs
        self.outliersdf = self.missing_obs.remove_missing_from_outliers(self.outliersdf)
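
    # Illustrative sketch (not part of the toolkit API): the outlier-to-gap
    # conversion above detects runs of identical labels with a
    # shift/compare/cumsum pattern. On a toy pandas Series (assuming only
    # pandas is available), the same idea looks like this:
    #
    #     >>> import pandas as pd
    #     >>> labels = pd.Series(["ok", "out", "out", "out", "ok", "out"])
    #     >>> run_id = (labels.shift() != labels).cumsum()
    #     >>> labels.groupby(run_id).size()
    #     1    1
    #     2    3
    #     3    1
    #     4    1
    #     dtype: int64
    #
    # Runs of an outlier label longer than n_gapsize become Gap objects;
    # shorter runs become missing observations.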

    # =============================================================================
    # Fill gaps
    # =============================================================================

    def fill_gaps_era5(
        self, modeldata, method="debias", obstype="temp", overwrite_fill=False
    ):
        """Fill the gaps using a diurnal debiased modeldata approach.

        Parameters
        ----------
        modeldata : metobs_toolkit.Modeldata
            The modeldata to use for the gapfill. This model data should
            contain the required timeseries to fill all gaps present in the
            dataset.
        method : 'debias', optional
            Specify which method to use. The default is 'debias'.
        obstype : String, optional
            Name of the observationtype you want to apply gap filling on. The
            modeldata must contain this observation type as well. The default
            is 'temp'.
        overwrite_fill : bool, optional
            If a gap has already filled values, the fill of this gap is
            skipped if overwrite_fill is False. If set to True, the gapfill
            values and info will be overwritten. The default is False.

        Returns
        -------
        Gapfilldf : pandas.DataFrame
            A dataframe containing all gap-filled values and the used method.

        Notes
        -----
        A schematic description of the fill_gaps_era5 method:

        1. Modeldata is converted to the timezone of the observations.
        2. Iterate over all gaps.

           * The gap is converted into a set of missing records (depending on the time resolution of the observations).
           * Find a leading and trailing period. These periods are a subset of observations respectively before and after the gap. The size of these subsets is set by a target size (in records) and a minimum size (in records). If the subset of observations is smaller than the corresponding minimum size, the gap cannot be filled.
           * Modeldata, for the corresponding station and observation type, is extracted for the leading and trailing period.
           * By comparing the model data with the observations of the leading and trailing period, and grouping all records by their timestamp (i.e. diurnal categories), biases are computed.
           * Modeldata for the missing records is extracted.
           * Weights ([0;1]) are computed for each gap record, representing the normalized distance (in time) to the beginning and end of the gap.
           * The modeldata at the missing records is then corrected by a weighted sum of the leading and trailing biases at the corresponding timestamp. In general, this means that the diurnal trend of the observations is restored as well as possible.

        3. The gap is updated with the interpolated values (metobs_toolkit.Gap.gapfill_df).

        Note
        ----
        A scientific publication on the performance of this technique is
        expected.

        Examples
        --------
        .. code-block:: python

            import metobs_toolkit

            your_dataset = metobs_toolkit.Dataset()
            your_dataset.update_settings(
                input_data_file=metobs_toolkit.demo_datafile,  # path to the data file
                input_metadata_file=metobs_toolkit.demo_metadatafile,
                template_file=metobs_toolkit.demo_template,
            )

            # Specify the gap definition
            your_dataset.update_qc_settings(gapsize_in_records=20)  # Update the gapsize BEFORE importing the data

            your_dataset.import_data_from_file()

            # Update the settings (definition of the period to calculate biases for)
            your_dataset.update_gap_and_missing_fill_settings(
                gap_debias_prefered_leading_period_hours=24,
                gap_debias_prefered_trailing_period_hours=24,
                gap_debias_minimum_leading_period_hours=6,
                gap_debias_minimum_trailing_period_hours=6,
            )

            # (As a demonstration, we will fill the gaps of a single station. The following
            # functions can also be directly applied to the dataset.)
            your_station = your_dataset.get_station('vlinder05')

            # Get ERA5 modeldata at the location of your stations and period.
            ERA5_modeldata = your_station.get_modeldata(modelname='ERA5_hourly', obstype='temp')

            # Use the debias method to fill the gaps
            gapfill_df = your_station.fill_gaps_era5(modeldata=ERA5_modeldata, obstype='temp')

        """
        # check if modeldata is available
        if modeldata is None:
            logger.warning(
                "The dataset has no modeldata. Use the set_modeldata() function to add modeldata."
            )
            return None
        # check if obstype is present in eramodel
        assert (
            obstype in modeldata.df.columns
        ), f"{obstype} is not present in the modeldata: {modeldata}"
        # check if all stations are present in the era modeldata
        # stations = self.gaps.to_df().index.unique().to_list()
        stations = list(set([gap.name for gap in self.gaps]))
        assert all(
            [sta in modeldata.df.index.get_level_values("name") for sta in stations]
        ), "Not all stations with gaps are in the modeldata!"

        if method == "debias":
            fill_settings_debias = self.settings.gap["gaps_fill_settings"][
                "model_debias"
            ]

            apply_debias_era5_gapfill(
                gapslist=self.gaps,
                dataset=self,
                eraModelData=modeldata,
                obstype=obstype,
                debias_settings=fill_settings_debias,
                overwrite_fill=overwrite_fill,
            )

            # get fill df
            filldf = make_gapfill_df(self.gaps)
        else:
            sys.exit(f"{method} not implemented yet")

        # update attribute
        self.gapfilldf = filldf

        return filldf
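
    # Illustrative sketch (not part of the toolkit API): the debias correction
    # described in the Notes above combines a leading and a trailing bias per
    # diurnal timestamp category with time-based weights. Assuming hypothetical
    # precomputed hourly biases (obs - model) and a gap record at hour `h`:
    #
    #     >>> lead_bias = {12: -0.5}    # bias of the leading period, per hour
    #     >>> trail_bias = {12: -0.25}  # bias of the trailing period, per hour
    #     >>> w_lead = 0.75             # record close to the start of the gap
    #     >>> w_trail = 1.0 - w_lead
    #     >>> model_value, h = 20.0, 12
    #     >>> model_value + w_lead * lead_bias[h] + w_trail * trail_bias[h]
    #     19.5625
    #
    # The real computation, including how the leading/trailing periods and the
    # diurnal grouping are chosen, lives in apply_debias_era5_gapfill and the
    # 'model_debias' gapfill settings.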

    def fill_gaps_automatic(
        self,
        modeldata,
        obstype="temp",
        max_interpolate_duration_str=None,
        overwrite_fill=False,
    ):
        """Fill the gaps by using linear interpolation or debiased modeldata.

        This method serves as a triage to select the gaps to be filled with
        linear interpolation and those to be filled using a diurnal debias
        gapfill.

        When the duration of a gap is smaller than or equal to
        max_interpolate_duration_str, the linear interpolation method is
        applied; otherwise the debiased modeldata method is used.

        For a detailed description of these methods, we refer to the
        corresponding metobs_toolkit.Dataset.fill_gaps_linear() and
        metobs_toolkit.Dataset.fill_gaps_era5().

        Parameters
        ----------
        modeldata : metobs_toolkit.Modeldata
            The modeldata to use for the gapfill. This model data should
            contain the required timeseries to fill all gaps present in the
            dataset.
        obstype : String, optional
            Name of the observationtype you want to apply gap filling on. The
            modeldata must contain this observation type as well. The default
            is 'temp'.
        max_interpolate_duration_str : Timedelta or str, optional
            Maximum duration to apply interpolation for gapfill when using the
            automatic gapfill method. Gaps with longer durations will be
            filled using debiased modeldata. The default is None.
        overwrite_fill : bool, optional
            If a gap has already filled values, the fill of this gap is
            skipped if overwrite_fill is False. If set to True, the gapfill
            values and info will be overwritten. The default is False.

        Returns
        -------
        comb_df : pandas.DataFrame
            A dataframe containing all the filled records.

        Examples
        --------
        .. code-block:: python

            import metobs_toolkit

            your_dataset = metobs_toolkit.Dataset()
            your_dataset.update_settings(
                input_data_file=metobs_toolkit.demo_datafile,  # path to the data file
                input_metadata_file=metobs_toolkit.demo_metadatafile,
                template_file=metobs_toolkit.demo_template,
            )

            # Specify the gap definition
            your_dataset.update_qc_settings(gapsize_in_records=20)  # Update the gapsize BEFORE importing the data

            your_dataset.import_data_from_file()

            # Update the settings (definition of the period to calculate biases for)
            your_dataset.update_gap_and_missing_fill_settings(
                gap_debias_prefered_leading_period_hours=24,
                gap_debias_prefered_trailing_period_hours=24,
                gap_debias_minimum_leading_period_hours=6,
                gap_debias_minimum_trailing_period_hours=6,
            )

            # (As a demonstration, we will fill the gaps of a single station. The following
            # functions can also be directly applied to the dataset.)
            your_station = your_dataset.get_station('vlinder05')

            # Get ERA5 modeldata at the location of your stations and period.
            ERA5_modeldata = your_station.get_modeldata(modelname='ERA5_hourly', obstype='temp')

            # Use the automatic method to fill the gaps
            gapfill_df = your_station.fill_gaps_automatic(modeldata=ERA5_modeldata,
                                                          max_interpolate_duration_str='6h',  # <6 hours will be interpolated
                                                          obstype='temp')

        """
        # ----------- Validate ----------------------------------------

        # check if modeldata is available
        if modeldata is None:
            logger.warning(
                "The dataset has no modeldata. Use the set_modeldata() function to add modeldata."
            )
            return None
        # check if obstype is present in eramodel
        assert (
            obstype in modeldata.df.columns
        ), f"{obstype} is not present in the modeldata: {modeldata}"
        # check if all stations are present in the era modeldata
        # stations = self.gaps.to_df().index.unique().to_list()
        stations = list(set([gap.name for gap in self.gaps]))
        assert all(
            [sta in modeldata.df.index.get_level_values("name") for sta in stations]
        ), "Not all stations with gaps are in the modeldata!"

        if max_interpolate_duration_str is None:
            max_interpolate_duration_str = self.settings.gap["gaps_fill_settings"][
                "automatic"
            ]["max_interpolation_duration_str"]

        # ------------ Select the method to apply gapfill per gap ----------
        interpolate_gaps = []
        debias_gaps = []

        for gap in self.gaps:
            if gap.duration <= pd.to_timedelta(max_interpolate_duration_str):
                interpolate_gaps.append(gap)
            else:
                debias_gaps.append(gap)

        # 1 --------------- Fill by interpolation ---------------------
        fill_settings_interp = self.settings.gap["gaps_fill_settings"]["linear"]

        apply_interpolate_gaps(
            gapslist=interpolate_gaps,
            obsdf=self.df,
            outliersdf=self.outliersdf,
            dataset_res=self.metadf["dataset_resolution"],
            gapfill_settings=self.settings.gap["gaps_fill_info"],
            obstype=obstype,
            method=fill_settings_interp["method"],
            max_consec_fill=fill_settings_interp["max_consec_fill"],
            overwrite_fill=overwrite_fill,
        )

        filldf_interp = make_gapfill_df(interpolate_gaps)

        # 2 -------------- Fill by debias -----------------------------
        fill_settings_debias = self.settings.gap["gaps_fill_settings"]["model_debias"]

        apply_debias_era5_gapfill(
            gapslist=debias_gaps,
            dataset=self,
            eraModelData=modeldata,
            obstype=obstype,
            debias_settings=fill_settings_debias,
            overwrite_fill=overwrite_fill,
        )

        # add label column
        filldf_debias = make_gapfill_df(debias_gaps)

        # combine both fill df's
        comb_df = concat_save([filldf_interp, filldf_debias])

        # update attr
        self.gapfilldf = comb_df

        return comb_df
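
    # Illustrative sketch (not part of the toolkit API): the triage above is a
    # plain duration comparison. Assuming objects with a pandas.Timedelta
    # `duration` attribute, the split could be reproduced as:
    #
    #     >>> import pandas as pd
    #     >>> durations = [pd.Timedelta("3h"), pd.Timedelta("2 days")]
    #     >>> threshold = pd.to_timedelta("6h")
    #     >>> [d <= threshold for d in durations]
    #     [True, False]
    #
    # Gaps on the True side are interpolated linearly; the others are filled
    # with the debiased modeldata method.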

    def fill_gaps_linear(self, obstype="temp", overwrite_fill=False):
        """Fill the gaps using linear interpolation.

        The gapfilldf attribute of the Dataset instance will be updated if the
        gaps are not filled yet or if overwrite_fill is set to True.

        Parameters
        ----------
        obstype : string, optional
            Name of the observation type to apply the gap fill on. The default
            is 'temp'.
        overwrite_fill : bool, optional
            If a gap has already filled values, the interpolation of this gap
            is skipped if overwrite_fill is False. If set to True, the gapfill
            values and info will be overwritten. The default is False.

        Returns
        -------
        gapfilldf : pandas.DataFrame
            A dataframe containing all the filled records.

        Notes
        -----
        A schematic description of the linear gap fill:

        1. Iterate over all gaps.
        2. The gap is converted into a set of missing records (depending on the time resolution of the observations).
        3. Find a leading record (the last observation before the gap) and a trailing record (the first observation after the gap).
        4. By using the leading and trailing record, an interpolation is applied to fill the missing records. A maximum consecutive fill threshold is applied; if it is exceeded, the fill values are NaN's.
        5. The gap is updated with the interpolated values (metobs_toolkit.Gap.gapfill_df).

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>>
            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>>
            >>> # Update the gapfill settings (else the defaults are used)
            >>> dataset.update_gap_and_missing_fill_settings(gap_interpolation_max_consec_fill=35)
            >>>
            >>> # Fill the gaps
            >>> dataset.fill_gaps_linear(obstype='temp')
                                                      temp   temp_final_label
            name      datetime
            vlinder05 2022-09-06 21:00:00+00:00  21.378710  gap_interpolation
                      2022-09-06 22:00:00+00:00  21.357419  gap_interpolation
                      2022-09-06 23:00:00+00:00  21.336129  gap_interpolation
                      2022-09-07 00:00:00+00:00  21.314839  gap_interpolation
                      2022-09-07 01:00:00+00:00  21.293548  gap_interpolation
                      2022-09-07 02:00:00+00:00  21.272258  gap_interpolation
                      2022-09-07 03:00:00+00:00  21.250968  gap_interpolation
                      2022-09-07 04:00:00+00:00  21.229677  gap_interpolation
                      2022-09-07 05:00:00+00:00  21.208387  gap_interpolation
                      2022-09-07 06:00:00+00:00  21.187097  gap_interpolation
                      2022-09-07 07:00:00+00:00  21.165806  gap_interpolation
                      2022-09-07 08:00:00+00:00  21.144516  gap_interpolation
                      2022-09-07 09:00:00+00:00  21.123226  gap_interpolation
                      2022-09-07 10:00:00+00:00  21.101935  gap_interpolation
                      2022-09-07 11:00:00+00:00  21.080645  gap_interpolation
                      2022-09-07 12:00:00+00:00  21.059355  gap_interpolation
                      2022-09-07 13:00:00+00:00  21.038065  gap_interpolation
                      2022-09-07 14:00:00+00:00  21.016774  gap_interpolation
                      2022-09-07 15:00:00+00:00  20.995484  gap_interpolation
                      2022-09-07 16:00:00+00:00  20.974194  gap_interpolation
                      2022-09-07 17:00:00+00:00  20.952903  gap_interpolation
                      2022-09-07 18:00:00+00:00  20.931613  gap_interpolation
                      2022-09-07 19:00:00+00:00  20.910323  gap_interpolation
                      2022-09-07 20:00:00+00:00  20.889032  gap_interpolation
                      2022-09-07 21:00:00+00:00  20.867742  gap_interpolation
                      2022-09-07 22:00:00+00:00  20.846452  gap_interpolation
                      2022-09-07 23:00:00+00:00  20.825161  gap_interpolation
                      2022-09-08 00:00:00+00:00  20.803871  gap_interpolation
                      2022-09-08 01:00:00+00:00  20.782581  gap_interpolation
                      2022-09-08 02:00:00+00:00  20.761290  gap_interpolation
                      2022-09-08 03:00:00+00:00  20.740000  gap_interpolation
                      2022-09-08 04:00:00+00:00  20.718710  gap_interpolation
                      2022-09-08 05:00:00+00:00  20.697419  gap_interpolation
                      2022-09-08 06:00:00+00:00  20.676129  gap_interpolation
                      2022-09-08 07:00:00+00:00  20.654839  gap_interpolation
            >>> dataset.get_gaps_info()
            Gap for vlinder05 with:...

        """
        # TODO logging
        fill_settings = self.settings.gap["gaps_fill_settings"]["linear"]

        # fill gaps
        apply_interpolate_gaps(
            gapslist=self.gaps,
            obsdf=self.df,
            outliersdf=self.outliersdf,
            dataset_res=self.metadf["dataset_resolution"],
            gapfill_settings=self.settings.gap["gaps_fill_info"],
            obstype=obstype,
            method=fill_settings["method"],
            max_consec_fill=fill_settings["max_consec_fill"],
            overwrite_fill=overwrite_fill,
        )

        # get gapfilldf
        gapfilldf = make_gapfill_df(self.gaps)

        # update attr
        self.gapfilldf = gapfilldf

        return gapfilldf
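
    # Illustrative sketch (not part of the toolkit API): step 4 of the linear
    # gap fill described above corresponds, in plain pandas, to a time-based
    # interpolation with a cap on the number of consecutively filled records:
    #
    #     >>> import pandas as pd
    #     >>> import numpy as np
    #     >>> idx = pd.date_range("2022-09-07 00:00", periods=6, freq="h")
    #     >>> s = pd.Series([20.0, np.nan, np.nan, np.nan, np.nan, 10.0], index=idx)
    #     >>> s.interpolate(method="time", limit=2)
    #     2022-09-07 00:00:00    20.0
    #     2022-09-07 01:00:00    18.0
    #     2022-09-07 02:00:00    16.0
    #     2022-09-07 03:00:00     NaN
    #     2022-09-07 04:00:00     NaN
    #     2022-09-07 05:00:00    10.0
    #     Freq: h, dtype: float64
    #
    # In the toolkit, the interpolation method and the maximum consecutive
    # fill are read from the 'linear' gapfill settings.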

    def fill_missing_obs_linear(self, obstype="temp"):
        """Interpolate missing observations.

        Fill in the missing observation records using interpolation. The
        missing_fill_df attribute of the Dataset will be updated.

        Parameters
        ----------
        obstype : string, optional
            Name of the observation type to apply the fill on. The default is
            'temp'.

        Returns
        -------
        None.

        Notes
        -----
        A schematic description of the linear fill of missing observations:

        1. Iterate over all missing observations.
        2. The missing observations are converted into a set of missing records (depending on the time resolution of the observations).
        3. Find a leading record (the last observation before the missing observation) and a trailing record (the first observation after the missing observation).
        4. By using the leading and trailing records, interpolation is applied to fill the missing records.
        5. The missing record is updated with the interpolated values (metobs_toolkit.Gap.gapfill_df).

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>>
            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>>
            >>> # Fill the missing observations
            >>> dataset.fill_missing_obs_linear(obstype='temp')
            >>> dataset.missing_obs.get_info()
            -------- Missing observations info --------
            (Note: missing observations are defined on the frequency estimation of the native dataset.)
              * 1473 missing observations
              * For 28 stations
              * Missing observations are filled with interpolate for:
                temp:
                                                           temp
            name      datetime
            vlinder01 2022-09-08 08:00:00+00:00  18.630303
                      2022-09-07 23:00:00+00:00  17.512121
                      2022-09-08 00:00:00+00:00  17.636364
                      2022-09-08 02:00:00+00:00  17.884848
                      2022-09-08 03:00:00+00:00  18.009091
            ...

        """
        # TODO logging
        fill_settings = self.settings.missing_obs["missing_obs_fill_settings"]["linear"]
        fill_info = self.settings.missing_obs["missing_obs_fill_info"]

        # fill missing obs
        self.missing_obs.interpolate_missing(
            obsdf=self.df,
            resolutionseries=self.metadf["dataset_resolution"],
            obstype=obstype,
            method=fill_settings["method"],
        )

        missing_fill_df = self.missing_obs.fill_df
        missing_fill_df[obstype + "_" + fill_info["label_columnname"]] = fill_info[
            "label"
        ]["linear"]

        # Update attribute
        self.missing_fill_df = missing_fill_df

    def get_gaps_df(self):
        """List all gaps into an overview dataframe.

        Returns
        -------
        pandas.DataFrame
            A DataFrame with station names as index, and the start, end and
            duration of the gaps as columns.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>>
            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>> dataset.get_gaps_df()
                                       start_gap                   end_gap        duration
            name
            vlinder05  2022-09-06 21:00:00+00:00 2022-09-13 06:00:00+00:00 6 days 09:00:00
            vlinder05  2022-09-13 20:00:00+00:00 2022-09-15 23:00:00+00:00 2 days 03:00:00

        """
        return gaps_to_df(self.gaps)

    def get_gaps_info(self):
        """Print out detailed information of the gaps.

        Returns
        -------
        None.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>>
            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>> dataset.get_gaps_info()
            Gap for vlinder05 with:
                 ---- Gap info -----
                 (Note: gaps are defined on the frequency estimation of the native dataset.)
                 * Start gap: 2022-09-06 21:00:00+00:00
                 * End gap: 2022-09-13 06:00:00+00:00
                 * Duration gap: 6 days 09:00:00
                 ---- Gap fill info -----
                 (No gapfill applied)
            Gap for vlinder05 with:
                 ---- Gap info -----
                 (Note: gaps are defined on the frequency estimation of the native dataset.)
                 * Start gap: 2022-09-13 20:00:00+00:00
                 * End gap: 2022-09-15 23:00:00+00:00
                 * Duration gap: 2 days 03:00:00
                 ---- Gap fill info -----
                 (No gapfill applied)

        """
        if bool(self.gaps):
            # there are gaps
            for gap in self.gaps:
                gap.get_info()
        else:
            # no gaps
            print("There are no gaps.")

    def get_missing_obs_info(self):
        """Print out detailed information of the missing observations.

        Returns
        -------
        None.

        Examples
        --------
        .. code-block:: python

            >>> import metobs_toolkit
            >>>
            >>> # Import data into a Dataset
            >>> dataset = metobs_toolkit.Dataset()
            >>> dataset.update_settings(
            ...     input_data_file=metobs_toolkit.demo_datafile,
            ...     input_metadata_file=metobs_toolkit.demo_metadatafile,
            ...     template_file=metobs_toolkit.demo_template,
            ... )
            >>> dataset.import_data_from_file()
            >>> dataset.coarsen_time_resolution(freq='1h')
            >>>
            >>> # Apply quality control on the temperature observations
            >>> dataset.apply_quality_control(obstype='temp')  # Using the default QC settings
            >>>
            >>> # Interpret the outliers as missing/gaps
            >>> dataset.update_gaps_and_missing_from_outliers(obstype='temp')
            >>> dataset
            Dataset instance containing:
                 *28 stations
                 *['temp', 'humidity', 'wind_speed', 'wind_direction'] observation types
                 *10080 observation records
                 *0 records labeled as outliers
                 *2 gaps
                 *1473 missing observations
                 *records range: 2022-09-01 00:00:00+00:00 --> 2022-09-15 23:00:00+00:00 (total duration: 14 days 23:00:00)
                 *time zone of the records: UTC
                 *Coordinates are available for all stations.

            >>> dataset.get_missing_obs_info()
            -------- Missing observations info --------
            (Note: missing observations are defined on the frequency estimation of the native dataset.)
              * 1473 missing observations
              * For 28 stations
              * The missing observations are not filled.
            (More details on the missing observation can be found in the .series and .fill_df attributes.)

        """
        # empty obs protector in the .get_info method.
        self.missing_obs.get_info()