# Source code for operational_analysis.toolkits.timeseries

"""
This module provides useful functions for processing timeseries data
"""

from datetime import datetime

import numpy as np
import pandas as pd
from pytz import timezone


def convert_local_to_utc(d, tz_string):
    """
    Convert timestamps in local time to UTC. The function can only act on a single timestamp at a time, so
    for example use the .apply function in Pandas:

        date_utc = df['time'].apply(convert_local_to_utc, args = ('US/Pacific',))

    Also note that this function doesn't solve the end of DST when times between 1:00-2:00 are repeated
    in November. Those dates are left repeated in UTC time and need to be shifted manually.

    The function does address the missing 2:00-3:00 times at the start of DST in March

    Args:
        d(:obj:`datetime.datetime`): the local date, tzinfo must not be set
        tz_string(:obj:`str`): the local timezone

    Returns:
        :obj:`datetime.datetime`: the local date converted to UTC time

    Raises:
        ValueError: if 'd' already carries timezone information

    """
    if d.tzinfo is not None:
        raise ValueError("d parameter must not have a timezone")

    # Rebuild the input as a plain datetime object (the input may be a datetime subclass),
    # keeping seconds and microseconds so that no time resolution is lost in the conversion.
    d_obj = datetime(
        d.year, d.month, d.day, d.hour, d.minute, d.second, d.microsecond
    )

    tz = timezone(tz_string)  # define timezone
    # is_dst=True resolves ambiguous or non-existent local times as daylight saving time
    d_local = tz.localize(d_obj, is_dst=True)
    utc = timezone("UTC")  # define UTC timezone

    return d_local.astimezone(utc)  # calculate UTC time


[docs]def find_time_gaps(t_series, freq):
    """
    Find data gaps in input data and report them

    Args:
        t_series(:obj:`pandas.Series`): Pandas series of datetime objects
        freq(:obj:`string`): time series frequency

    Returns:
        :obj:`pandas.Series`: Series of missing time stamps in datetime format
    """

    # Convert 't_series' to Pandas series in case a time index is passed
    t_series = pd.Series(t_series)

    if t_series.size == 0:
        return t_series

    range_dt = pd.Series(
        data=pd.date_range(t_series.min(), end=t_series.max(), freq=freq)
    )  # Full range of timestamps

    # Find missing time stamps by concatenating full timestamps and actual and removing duplicates
    # What remains is those timestamps not found in the data
    missing_dt = (pd.concat([range_dt, t_series])).drop_duplicates(keep=False)

    return missing_dt


def find_duplicate_times(t_series, freq=None):
    """
    Find duplicate input data and report them. The first duplicated item is not reported, only subsequent duplicates.

    Args:
        t_series(:obj:`pandas.Series`): Pandas series (or time index) of datetime objects
        freq(:obj:`string`): time series frequency; not used by this function, accepted only so the
            signature matches the other time series checks

    Returns:
        :obj:`pandas.Series`: Duplicates from input data
    """

    # Convert 't_series' to Pandas series in case a time index is passed
    t_series = pd.Series(t_series)

    # 'duplicated' flags every occurrence after the first one
    repeated_steps = t_series[t_series.duplicated()]

    return repeated_steps


def gap_fill_data_frame(df, time_col, freq):
    """
    Find missing timestamps in the input data frame and add rows with NaN values for those missing rows.
    Return a new data frame that has no missing timestamps and that is sorted by time.

    Args:
        df(:obj:`pandas.DataFrame`): the input data frame
        time_col(:obj:`str`): name of the column in 'df' with time data
        freq(:obj:`string`): time series frequency used to determine which timestamps are missing

    Returns:
        :obj:`pandas.DataFrame`: output data frame with NaN data for the data gaps

    """
    # If the dataframe is empty, just return it.
    if df[time_col].size == 0:
        return df

    timestamp_gaps = find_time_gaps(df[time_col], freq)  # Find gaps in timestep

    # Nothing to fill: only the time ordering is needed
    if timestamp_gaps.empty:
        return df.sort_values(time_col)

    # Rows for the gaps carry only the timestamp; every other column is filled with NaN
    # when the two frames are concatenated.
    gap_df = pd.DataFrame({time_col: timestamp_gaps})

    # 'DataFrame.append' no longer exists in current pandas; 'concat' is the equivalent.
    return pd.concat([df, gap_df], sort=False).sort_values(time_col)


def percent_nan(s):
    """
    Return the fraction of data that are NaN, or 1 if the series is empty. The value is expressed
    on a 0 to 1 scale (not 0 to 100).

    Args:
        s(:obj:`pandas.Series`): The data to be checked for 'na' values

    Returns:
        :obj:`float`: Fraction of NaN data in the data series
    """
    # An empty series is treated as entirely missing; this also avoids dividing by zero
    if len(s) == 0:
        return np.float64(1)

    return np.float64(s.isnull().sum()) / np.float64(len(s))


def num_days(s):
    """
    Return number of days in 's', counted as the number of daily bins between the first and the
    last timestamp of the data (days without any data inside that span are counted too).

    Args:
        s(:obj:`pandas.Series`): The data to be checked for number of days; it is resampled, so it
            needs a datetime-like index.

    Returns:
        :obj:`int`: Number of days in the data
    """
    # 'size' yields one entry per daily bin, including bins that hold no data
    n_days = len(s.resample("D").size())

    return n_days


def num_hours(s):
    """
    Return number of hours in 's', counted as the number of hourly bins between the first and the
    last timestamp of the data (hours without any data inside that span are counted too).

    Args:
        s(:obj:`pandas.Series`): The data to be checked for number of hours; it is resampled, so it
            needs a datetime-like index.

    Returns:
        :obj:`int`: Number of hours in the data
    """
    # An offset object is used instead of the "H" string alias, which newer pandas deprecates.
    # 'size' yields one entry per hourly bin, including bins that hold no data.
    n_hours = len(s.resample(pd.offsets.Hour()).size())

    return n_hours