Source code for debiased_sd.utils

"""
Utility helpers
"""

# Modules
import numpy as np
from typing import Callable, Tuple
from scipy.stats import kurtosis
from scipy.special import gamma, gammaln

##########################
# --- COMMON HELPERS --- #

[docs] def check_x_axis_ddof( x: np.ndarray, axis: int | None = None, ddof:int = 1, ) -> Tuple[np.ndarray, np.ndarray, int]: """ Checks that x aligns with with the stated axis, and that ddof is valid Args ==== x: np.ndarray An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk) axis: int | None = None Axis to calculate the SD over ddof: int = 1 The degrees of freedom for the sample SD Returns ======= (x, std, n) or (np.array(x), sd_array[-axis] , x.shape[-axis]) """ # Input checks assert axis >= 0, f'Axis must be at least 0, not {axis}' assert ddof >= 0, f'Degrees of freedom must be >= 0, not {ddof}' if not isinstance(x, np.ndarray): x = np.array(x) shape_len = len(x.shape) assert shape_len >= 1, f'You must provide at least a vector' if shape_len >= 2: valid_axes = list(range(shape_len)) assert axis in valid_axes, f'axis must be one of {valid_axes}, not {axis}' # Calculate sample size sample_size = x.shape[axis] assert sample_size > ddof, f'If you have {sample_size} observations, then ddof must be at most {sample_size-1}, not {ddof}' # Calculate the sample SD sighat = np.std(x, axis=axis, ddof=ddof) return x, sighat, sample_size
############################### # --- GAUSSIAN ADJUSTMENT --- #
[docs] def sd_gaussian( x: np.ndarray, axis: int | None = None, ddof: int = 1, approx:bool = True, **kwargs, ) -> np.ndarray: """ Calculate the de-biased sample SD when the data is drawn from a normal distribution Args ==== x : np.ndarray An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk) axis : int, optional Axis to calculate the SD over ddof : int, optional The degrees of freedom for the sample SD approx : bool, optional Should a log-approximation be used for the Gamma function calculation? Recommended if n is large """ # Input checks x, sighat, n = check_x_axis_ddof(x, axis, ddof) # Calculate adjustment ratio if approx: gamma_ratio = np.exp(gammaln((n-1)/2) - gammaln(n/2)) else: gamma_ratio = gamma((n-1)/2) / gamma(n / 2) C_n = np.sqrt((n - 1)/2) * gamma_ratio # Return adjusted value sighat *= C_n return sighat
############################### # --- KURTOSIS ADJUSTMENT --- #
[docs] def sd_kappa( x: np.ndarray, axis: int | None = None, ddof:int = 1, debias_cumulants: bool = True, kurtosis_fun: Callable | None = None, **kwargs ) -> np.ndarray: """ Adjust the vanilla SD estimator with the first-order adjustment from a Maclaurin expansion: sigma*[1-(kappa-1+2(n-1))/(8n)]^{-1}. See Giles (2021): http://web.uvic.ca/~dgiles/downloads/working_papers/std_dev.pdf Args ==== x : np.ndarray An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk) axis : int | None, optional Axis to calculate the SD over ddof : int, optional The degrees of freedom for the sample SD debias_cumulants : bool, optional Should the "debiased" versions of the cumulants to be used? See https://mathworld.wolfram.com/Cumulant.html (this is the default in pandas.kurtosis btw) kurtosis_fun : Callable | None, optional Should an alternative kurtosis calculation be used? Will pass in kwargs & axis. """ # Input checks x, sighat, n = check_x_axis_ddof(x, axis, ddof) # Calculate kurtosis (kappa) if kurtosis_fun: assert isinstance(kurtosis_fun, callable), 'if you supply an other_method, it must be callabte' kappa = kurtosis_fun(x, axis=axis, **kwargs) if debias_cumulants: kappa = kurtosis(a=x, fisher=False, bias=False, **kwargs) else: kappa = kurtosis(a=x, fisher=False, bias=True, **kwargs) # Calculate C_n C_n_kappa = 1 / ( 1 - (kappa - 1 + 2/(n-1)) / (8*n) ) # Return adjusted value sighat *= C_n_kappa return sighat
################################ # --- JACKKNIFE ADJUSTMENT --- #
[docs] def sd_jackknife( x: np.ndarray, axis: int = 0, ddof: int = 0, **kwargs, ) -> np.ndarray: """ Calculates the jackknife-adjusted sample SD by calculating the (LOO) bias, where the final estimator is: n*sighat - (n-1)*mean(sighat_loo) Args ==== x : np.ndarray An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk) axis : int, optional Axis to calculate the SD over ddof : int, optional The degrees of freedom for the sample SD """ # Input checks x, sighat, n = check_x_axis_ddof(x, axis, ddof) # Calculat the LOO sample SD xbar = np.mean(x, axis = axis) xbar_loo = (n*xbar - x) / (n-1) mu_x2 = np.mean(x ** 2, axis=axis) # Unadjuasted LOO variance sigma2_loo = (n / (n - 1)) * (mu_x2 - x**2 / n - (n - 1) * xbar_loo**2 / n) # Apply DOF adjustment, if any n_adj = (n-1) / (n - ddof - 1) # Clip is needed for ≈ 0 values which are negative sighat_loo = np.sqrt(n_adj * np.clip(sigma2_loo, 0, None)) sighat_loo_mu = np.mean(sighat_loo, axis=axis) # Calculate the bias and offset bias_jackknife = (n - 1) * (sighat_loo_mu - sighat) sighat -= bias_jackknife return sighat
################################ # --- BOOTSTRAP ADJUSTMENT --- #
[docs] def sd_bootstrap( x:np.ndarray, axis: int = 0, ddof: int = 0, num_boot: int = 1000, random_state: int | None = None, **kwargs, ) -> np.ndarray: """ Generates {num_boot} bootstrap replicates for a 1-d array Args ==== x : np.ndarray An array or array-like object of arbitrary dimension: x.shape = (d1, d2, ..., dk) axis : int | None, optional Axis to calculate the SD over ddof : int, optional The degrees of freedom for the sample SD num_boot : int, optional If method=='bootstrap', how many bootstrap sample to draw? Note that this approach will broadcast the original array with an addition {num_boot} rows in the axis=-1 dimension, so keep that in mind for memorary consideration random_state : int | None, optional Reproducability seed for the bootstrap method """ # Input checks x, sighat, n = check_x_axis_ddof(x, axis, ddof) num_dim = len(x.shape) # Generate {num_boot} sample SDs np.random.seed(random_state) if num_dim == 1: # Fast to do random choice idx = np.random.randint(low=0, high=n, size=n*num_boot) sighat_star = x[idx].reshape([n, num_boot]).std(ddof=ddof, axis=axis) else: # Otherwise need to take along axis of index idx = np.random.randint(low=0, high=n, size=x.shape+(num_boot,)) sighat_star = np.take_along_axis(np.expand_dims(x, -1), idx, axis=axis).\ std(ddof=ddof, axis=axis) # Calculate the bootstrapped means sighat_star_mu = np.mean(sighat_star, axis=-1) # Calculate bias and offset bias_bs = sighat_star_mu - sighat sighat -= bias_bs return sighat