Source code for chromosight.utils.stats

"""Chromosight's stats submodule contains helper function to compute statistical estimators from distributions"""

import numpy as np
import scipy.stats as ss


[docs]def fdr_correction(pvals):
    """
    Applies false discovery rate correction via the
    Benjamini-Hochberg procedure to adjust input p-values
    for multiple testing. .

    Parameters
    ----------
    pvals : numpy.array of floats
        Array of uncorrected p-values.

    Returns
    -------
    fdr : numpy.array of floats
        Array of corrected p-values (q-values).
    """
    if pvals is None:
        qvals = None
    else:
        pvals = np.array(pvals)
        # get descending indices
        by_descend = pvals.argsort()[::-1]
        # Get mapping from sorted to original indices
        by_orig = by_descend.argsort()
        # Compute the ratio of n_pval by ranks, which can then be
        # multiplied by p-values to get the critical value
        steps = float(len(pvals)) / np.arange(len(pvals), 0, -1)
        # Compute critical values and find last p-value below above it
        # Set all other to the same value and back to 1 if higher
        qvals = np.minimum(1, np.minimum.accumulate(steps * pvals[by_descend]))
        # Unsort q-values to get original order
        qvals = qvals[by_orig]

    return qvals


[docs]def corr_to_pval(corr, n, rho0=0):
    """
    Given a list of Pearson correlation coefficient,
    convert them to two-sided log10 p-values. The p-values
    are computed via the fisher transformation described
    on: https://w.wiki/Ksu

    Parameters
    ----------
    corr : numpy.array
        The Pearson correlation coefficients.
    n : int or numpy.array
        The number of observations used to compute correlation
        coefficients. Can be given as an array of the same size as corr
        to give the number of sample in each coefficient.
    rho0 : float
        The correlation value under h0. We test if corr is
        significantly different from rho0.

    Returns
    -------
    numpy.array
        The array of log10-transformed two-sided p-values,
        same size as corr.
    """
    if isinstance(n, int):
        n = np.repeat(n, corr.shape)
    elif isinstance(n, np.ndarray):
        if n.shape != corr.shape:
            raise ValueError("corr and n must have identical shapes.")
    # Apply Fisher z-transformation on coefficients
    z_score = np.arctanh(corr) - np.arctanh(rho0)
    # Get values of the cumulative standard distribution for each zscore
    # Zscores are all set to negative so as to obtain the tail, resulting
    # p-values are multiplied by two to obtain the two-sided values
    # (distribution is symmetric)
    pvals = 2 * ss.norm().cdf(-np.abs(z_score * np.sqrt(n - 3)))

    return np.log10(pvals)