Source code for debiased_sd.utils

"""
Utility helpers
"""

# Modules
import numpy as np
from typing import Callable, Tuple
from scipy.stats import kurtosis
from scipy.special import gamma, gammaln

##########################
# --- COMMON HELPERS --- #


[docs]
def check_x_axis_ddof(
        x: np.ndarray, 
        axis: int | None = None,
        ddof:int = 1, 
        ) -> Tuple[np.ndarray, np.ndarray, int]:
    """
    Checks that x aligns with with the stated axis, and that ddof is valid

    Args
    ====
    x: np.ndarray
        An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk)
    axis: int | None = None
        Axis to calculate the SD over
    ddof: int = 1
        The degrees of freedom for the sample SD

    Returns
    =======
    (x, std, n) or (np.array(x), sd_array[-axis] , x.shape[-axis])
    """
    # Input checks
    assert axis >= 0, f'Axis must be at least 0, not {axis}'
    assert ddof >= 0, f'Degrees of freedom must be >= 0, not {ddof}'
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    shape_len = len(x.shape)
    assert shape_len >= 1, f'You must provide at least a vector'
    if shape_len >= 2:
        valid_axes = list(range(shape_len))
        assert axis in valid_axes, f'axis must be one of {valid_axes}, not {axis}'
    # Calculate sample size
    sample_size = x.shape[axis]
    assert sample_size > ddof, f'If you have {sample_size} observations, then ddof must be at most {sample_size-1}, not {ddof}'
    # Calculate the sample SD
    sighat = np.std(x, axis=axis, ddof=ddof)
    return x, sighat, sample_size



###############################
# --- GAUSSIAN ADJUSTMENT --- #


[docs]
def sd_gaussian(
        x: np.ndarray, 
        axis: int | None = None,
        ddof: int = 1,
        approx:bool = True,
        **kwargs,
        ) -> np.ndarray:
    """
    Calculate the de-biased sample SD when the data is drawn from a normal distribution
    
    Args
    ====
    x : np.ndarray
        An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk)
    axis : int, optional
        Axis to calculate the SD over
    ddof : int, optional
        The degrees of freedom for the sample SD
    approx : bool, optional
        Should a log-approximation be used for the Gamma function calculation? Recommended if n is large
    """
    # Input checks
    x, sighat, n = check_x_axis_ddof(x, axis, ddof)
    # Calculate adjustment ratio
    if approx:
        gamma_ratio = np.exp(gammaln((n-1)/2) - gammaln(n/2))
    else:
        gamma_ratio = gamma((n-1)/2) / gamma(n / 2)
    C_n = np.sqrt((n - 1)/2) * gamma_ratio
    # Return adjusted value
    sighat *= C_n
    return sighat



###############################
# --- KURTOSIS ADJUSTMENT --- #


[docs]
def sd_kappa(
            x: np.ndarray, 
            axis: int | None = None,
            ddof:int = 1, 
            debias_cumulants: bool = True,
            kurtosis_fun: Callable | None = None,
            **kwargs
            ) -> np.ndarray:
    """
    Adjust the vanilla SD estimator with the first-order adjustment from a Maclaurin expansion: sigma*[1-(kappa-1+2(n-1))/(8n)]^{-1}. See Giles (2021): http://web.uvic.ca/~dgiles/downloads/working_papers/std_dev.pdf

    Args
    ====
    x : np.ndarray
        An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk)
    axis : int | None, optional
        Axis to calculate the SD over
    ddof : int, optional
        The degrees of freedom for the sample SD
    debias_cumulants : bool, optional
        Should the "debiased" versions of the cumulants to be used? See https://mathworld.wolfram.com/Cumulant.html (this is the default in pandas.kurtosis btw)
    kurtosis_fun : Callable | None, optional
        Should an alternative kurtosis calculation be used? Will pass in kwargs & axis.
    """
    # Input checks
    x, sighat, n = check_x_axis_ddof(x, axis, ddof)
    # Calculate kurtosis (kappa)
    if kurtosis_fun:
        assert isinstance(kurtosis_fun, callable), 'if you supply an other_method, it must be callabte'
        kappa = kurtosis_fun(x, axis=axis, **kwargs)
    if debias_cumulants:
        kappa = kurtosis(a=x, fisher=False, bias=False, **kwargs)
    else:
        kappa = kurtosis(a=x, fisher=False, bias=True, **kwargs)
    # Calculate C_n
    C_n_kappa = 1 / ( 1 - (kappa - 1 + 2/(n-1)) / (8*n) ) 
    # Return adjusted value
    sighat *= C_n_kappa
    return sighat



################################
# --- JACKKNIFE ADJUSTMENT --- #


[docs]
def sd_jackknife(
        x: np.ndarray, 
        axis: int = 0,
        ddof: int = 0,
        **kwargs,
        ) -> np.ndarray:
    """
    Calculates the jackknife-adjusted sample SD by calculating the (LOO) bias, where the final estimator is: n*sighat - (n-1)*mean(sighat_loo)

    Args
    ====
    x : np.ndarray
        An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk)
    axis : int, optional
        Axis to calculate the SD over
    ddof : int, optional
        The degrees of freedom for the sample SD
    """
    # Input checks
    x, sighat, n = check_x_axis_ddof(x, axis, ddof)
    # Calculat the LOO sample SD
    xbar = np.mean(x, axis = axis)
    xbar_loo = (n*xbar - x) / (n-1)
    mu_x2 = np.mean(x ** 2, axis=axis)
    # Unadjuasted LOO variance
    sigma2_loo = (n / (n - 1)) * (mu_x2 - x**2 / n - (n - 1) * xbar_loo**2 / n)
    # Apply DOF adjustment, if any
    n_adj = (n-1) / (n - ddof - 1)
    # Clip is needed for ≈ 0 values which are negative
    sighat_loo = np.sqrt(n_adj * np.clip(sigma2_loo, 0, None))
    sighat_loo_mu = np.mean(sighat_loo, axis=axis)
    # Calculate the bias and offset
    bias_jackknife = (n - 1) * (sighat_loo_mu - sighat)
    sighat -= bias_jackknife
    return sighat



################################
# --- BOOTSTRAP ADJUSTMENT --- #



[docs]
def sd_bootstrap(
        x:np.ndarray, 
        axis: int = 0,
        ddof: int = 0,
        num_boot: int = 1000,
        random_state: int | None = None,
        **kwargs,
        ) -> np.ndarray:
    """
    Generates {num_boot} bootstrap replicates for a 1-d array

    Args
    ====
    x : np.ndarray
        An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk)
    axis : int | None, optional
        Axis to calculate the SD over
    ddof : int, optional
        The degrees of freedom for the sample SD
    num_boot : int, optional
        If method=='bootstrap', how many bootstrap sample to draw? Note that this approach will broadcast the original array with an addition {num_boot} rows in the axis=-1 dimension, so keep that in mind for memorary consideration
    random_state : int | None, optional
        Reproducability seed for the bootstrap method
    """
    # Input checks
    x, sighat, n = check_x_axis_ddof(x, axis, ddof)
    num_dim = len(x.shape)
    # Generate {num_boot} sample SDs
    np.random.seed(random_state)
    if num_dim == 1:  # Fast to do random choice 
        idx = np.random.randint(low=0, high=n, size=n*num_boot)
        sighat_star = x[idx].reshape([n, num_boot]).std(ddof=ddof, axis=axis)
    else:  # Otherwise need to take along axis of index
        idx = np.random.randint(low=0, high=n, size=x.shape+(num_boot,))
        sighat_star = np.take_along_axis(np.expand_dims(x, -1), idx, axis=axis).\
                        std(ddof=ddof, axis=axis)
    # Calculate the bootstrapped means  
    sighat_star_mu = np.mean(sighat_star, axis=-1)
    # Calculate bias and offset
    bias_bs = sighat_star_mu - sighat
    sighat -= bias_bs
    return sighat