# Source code for operational_analysis.toolkits.filters

"""
This module provides functions for flagging pandas data series based on a range of criteria. The functions are largely
intended for application in wind plant operational energy analysis, particularly wind speed vs. power curves.
"""

import numpy as np
import scipy as sp
import pandas as pd


def range_flag(data_col, below=-1.0 * np.inf, above=np.inf):
    """Flag data that fall outside the closed interval [below, above].

    Values smaller than ``below`` or larger than ``above`` are flagged. Values exactly equal to either
    bound are not flagged. Entries for which neither comparison holds (e.g. NaN) are flagged as well.

    Args:
        data_col (:obj:`pandas.Series`): data to be flagged
        below (:obj:`float`): lower threshold (inclusive); data below this value are flagged; default -np.inf
        above (:obj:`float`): upper threshold (inclusive); data above this value are flagged; default np.inf

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.
    """

    # Negate the "inside" comparisons rather than testing "outside" directly, so that
    # entries that fail both comparisons (NaN) end up flagged.
    not_at_or_above_lower = ~(data_col >= below)
    not_at_or_below_upper = ~(data_col <= above)

    return not_at_or_below_upper | not_at_or_above_lower


def unresponsive_flag(data_col, threshold=3):
    """Flag runs of at least <threshold> consecutive entries whose reported value does not change.

    Every entry belonging to such a run is flagged, not only the entry at which the run reaches
    <threshold> repeats.

    Args:
        data_col(:obj:`pandas.Series`): data to be flagged
        threshold(:obj:`int`): number of consecutive intervals over which the measurement does not change

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.
    """

    # True wherever the value differs from the previous entry (the first entry has no
    # predecessor, so its difference is NaN and it counts as "changed")
    changed = data_col.diff() != 0

    # Number of changes seen in each trailing window of <threshold> - 1 differences
    n_changes = changed.rolling(threshold - 1).sum()

    # No change in the window means <threshold> identical values ending at this entry
    frozen = n_changes == 0

    # Propagate each flag backwards so the preceding <threshold> - 1 entries of the run are flagged too
    for _ in range(threshold - 1):
        frozen = frozen | frozen.shift(-1)

    return frozen


def std_range_flag(data_col, threshold=2.0):
    """Flag entries that lie at least <threshold> standard deviations away from the mean of the series.

    The mean and standard deviation are computed once over the whole of <data_col>. The comparison is
    inclusive: an entry exactly <threshold> standard deviations from the mean is flagged.

    Args:
        data_col(:obj:`pandas.Series`): data to be flagged
        threshold(:obj:`float`): multiplicative factor on standard deviation to use in flagging

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.
    """

    center = data_col.mean()
    half_width = threshold * data_col.std()

    # Entries on or beyond either edge of the band [center - half_width, center + half_width]
    at_or_below_lower = data_col <= center - half_width
    at_or_above_upper = data_col >= center + half_width

    return at_or_below_lower | at_or_above_upper


def window_range_flag(
    window_col, window_start, window_end, value_col, value_min=-np.inf, value_max=np.inf
):
    """Flag time stamps for which the measurement in <window_col> is within [window_start, window_end] and the
    measurement in <value_col> is outside of [value_min, value_max]

    Both window bounds are inclusive. A value exactly equal to <value_min> or <value_max> is considered
    inside the allowed range and is not flagged. Leaving <value_min> or <value_max> at its default
    disables that side of the check.

    Args:
        window_col(:obj:`pandas.Series`): data used to define the window
        window_start(:obj:`float`): minimum value for window
        window_end(:obj:`float`): maximum value for window
        value_col(:obj:`pandas.Series`): data to be flagged
        value_min(:obj:`float`): lower threshold for data; default -np.inf
        value_max(:obj:`float`): upper threshold for data; default np.inf

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.
    """

    in_window = (window_col >= window_start) & (window_col <= window_end)
    out_of_range = (value_col < value_min) | (value_col > value_max)

    return in_window & out_of_range


def bin_filter(
    bin_col,
    value_col,
    bin_width,
    threshold=2,
    center_type="mean",
    bin_min=None,
    bin_max=None,
    threshold_type="std",
    direction="all",
):
    """Flag time stamps for which data in <value_col>, when binned by data in <bin_col> into bins of <bin_width>,
    lie further than <threshold> from the center of their bin. The <center_type> of each bin can be either the
    median or mean, and flagging can be applied directionally (i.e. above or below the center, or both)

    Each bin covers the half-open interval (lower edge, upper edge], so entries whose <bin_col> value is
    exactly <bin_min> (or below it, or above <bin_max>) belong to no bin and are never flagged.

    Args:
        bin_col(:obj:`pandas.Series`): data to be used for binning
        value_col(:obj:`pandas.Series`): data to be flagged
        bin_width(:obj:`float`): width of bin in units of bin_col; must be positive
        threshold(:obj:`float`): outlier threshold; a multiplicative factor on the std of <value_col> within the
            bin when <threshold_type> is 'std', or an absolute distance from the bin center when it is 'scalar'
        center_type(:obj:`str`): option to use a 'mean' or 'median' center for each bin
        bin_min(:obj:`float`): minimum bin value below which flag should not be applied; defaults to the
            minimum of <bin_col>
        bin_max(:obj:`float`): maximum bin value above which flag should not be applied; defaults to the
            maximum of <bin_col>
        threshold_type(:obj:`str`): option to apply a 'std' or 'scalar' based threshold
        direction(:obj:`str`): option to apply flag only to data 'above' or 'below' the center, otherwise the
            default is 'all'

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.

    Raises:
        ValueError: if <center_type>, <threshold_type> or <direction> is not one of the supported options, or
            if <bin_width> is not positive.
    """

    # Reject unsupported options up front; otherwise the loop below would fail with an
    # unrelated NameError (or silently reuse values from a previous bin).
    if center_type not in ("mean", "median"):
        raise ValueError("incorrect center type specified")
    if threshold_type not in ("std", "scalar"):
        raise ValueError("incorrect threshold type specified")
    if direction not in ("all", "above", "below"):
        raise ValueError("incorrect direction specified")
    if bin_width <= 0:
        raise ValueError("bin_width must be positive")

    # Set bin min and max values if not passed to function
    if bin_min is None:
        bin_min = bin_col.min()
    if bin_max is None:
        bin_max = bin_col.max()

    # Define flag of 'False' values with indices matching value_col
    flag = pd.Series(False, index=value_col.index, dtype=bool)

    # Without a non-empty range (constant, empty or all-NaN bin_col) there are no bins, so nothing is flagged
    if not (bin_min < bin_max):
        return flag

    # Define bin edges
    bin_edges = np.arange(bin_min, bin_max, bin_width)

    # Ensure the last bin edge value is bin_max
    if bin_edges[-1] < bin_max:
        bin_edges = np.append(bin_edges, bin_max)
    elif bin_edges[-1] > bin_max:
        bin_edges[-1] = bin_max

    # Loop through consecutive pairs of edges and apply flagging within each bin
    for lower_edge, upper_edge in zip(bin_edges[:-1], bin_edges[1:]):
        # Get data that fall within the bin (lower edge exclusive, upper edge inclusive)
        y_bin = value_col.loc[(bin_col <= upper_edge) & (bin_col > lower_edge)]
        if y_bin.empty:
            continue

        # Get center of binned data
        cent = y_bin.mean() if center_type == "mean" else y_bin.median()

        # Define threshold of data flag
        ran = y_bin.std() * threshold if threshold_type == "std" else threshold

        # Perform flagging depending on specified direction
        if direction == "all":
            flag_bin = (y_bin > (cent + ran)) | (y_bin < (cent - ran))
        elif direction == "above":
            flag_bin = y_bin > (cent + ran)
        else:
            flag_bin = y_bin < (cent - ran)

        # Record flags in final flag column
        flag.loc[flag_bin.index] = flag_bin

    return flag


def cluster_mahalanobis_2d(data_col1, data_col2, n_clusters=13, dist_thresh=3.0):
    """K-means clustering of the two data columns into <n_clusters> clusters; the Mahalanobis distance of every
    point to the centroid of its own cluster is evaluated and points with a distance greater than <dist_thresh>
    are flagged

    Args:
        data_col1(:obj:`pandas.Series`): first data column in 2D cluster analysis
        data_col2(:obj:`pandas.Series`): second data column in 2D cluster analysis
        n_clusters(:obj:`int`): number of clusters to use
        dist_thresh(:obj:`float`): maximum Mahalanobis distance within each cluster for data to remain unflagged

    Returns:
        :obj:`pandas.Series(bool)`: Array-like object with boolean entries.
    """

    # Imported here so that scikit-learn is only required when this filter is used
    from sklearn.cluster import KMeans

    # Assemble the two columns into the 2D frame expected by the clustering algorithm
    points = pd.DataFrame({"d1": data_col1, "d2": data_col2})

    # Fit the clusters
    model = KMeans(n_clusters=n_clusters).fit(points)

    # Start from all-False flags indexed like the first data column
    flag = pd.Series(index=data_col1.index, data=False)

    # Visit each cluster together with its centroid
    for label, center in enumerate(model.cluster_centers_):
        # Points assigned to this cluster
        members = points.loc[model.labels_ == label]

        # Inverse of the covariance matrix of the cluster members
        inv_cov = sp.linalg.inv(members.cov())

        # Mahalanobis distance of each member from the cluster centroid
        distances = members.apply(
            lambda row: sp.spatial.distance.mahalanobis(row.values, center, inv_cov), axis=1
        )

        # Members further away than the threshold are flagged
        outside = distances > dist_thresh

        # Record flags in final flag column
        flag.loc[outside.index] = outside

    return flag