Source code for cytoflow.utility.util_functions

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.utility.util_functions
-------------------------------

Useful (mostly numeric) utility functions.

`iqr` -- calculate the interquartile range for an array of numberes

`num_hist_bins` -- calculate the number of histogram bins using Freedman-Diaconis

`geom_mean` -- compute the geometric mean

`geom_sd` -- compute the geometric standard deviation

`geom_sd_range` --  compute [geom_mean / geom_sd, geom_mean * geom_sd]

`geom_sem` -- compute the geometric standard error of the mean

`geom_sem_range` -- compute [geom_mean / geom_sem, geom_mean * geom_sem]

`cartesian` -- generate the cartesian product of input arrays

`sanitize_identifier` -- makes a string a valid Python identifier by replacing all non-safe characters with '_'

`random_string` -- Makes a random string of ascii digits and lowercase letters

`is_numeric` -- determine if a `pandas.Series` or `numpy.ndarray` is numeric from its dtype

`cov2corr` -- compute the correlation matrix from the covariance matric
"""

import random, string

import numpy as np
from scipy import stats
from .cytoflow_errors import CytoflowError


[docs]
def iqr(a):
    """
    Calculate the inter-quartile range for an array of numbers.
    
    Parameters
    ----------
    a : array_like
        The array of numbers to compute the IQR for.
        
    Returns
    -------
    float
        The IQR of the data.
    """
    a = np.asarray(a)
    q1 = np.nanpercentile(a, 25)
    q3 = np.nanpercentile(a, 75)
    return q3 - q1



[docs]
def num_hist_bins(a):
    """
    Calculate number of histogram bins using Freedman-Diaconis rule.
    
    From http://stats.stackexchange.com/questions/798/
    
    Parameters
    ----------
    a : array_like
        The data to make a histogram of.
        
    Returns
    -------
    int
        The number of bins in the histogram
    """
    a = np.asarray(a)
    h = 2 * iqr(a) / (len(a) ** (1 / 3))
      
    # fall back to 10 bins if iqr is 0
    if h == 0:
        return 10.
    else:
        return np.ceil((np.nanpercentile(a, 99) - 
                        np.nanpercentile(a, 1)) / h)

    

[docs]
def geom_mean(a):
    """
    Compute the geometric mean for an "arbitrary" data set, ie one that
    contains zeros and negative numbers.
    
    Parameters
    ----------
    
    a : array-like
        A numpy.ndarray, or something that can be converted to an ndarray
        
    Returns
    -------
    The geometric mean of the input array
    
    Notes
    -----
    The traditional geometric mean can not be computed on a mixture of positive
    and negative numbers.  The approach here, validated rigorously in the
    cited paper[1], is to compute the geometric mean of the absolute value of
    the negative numbers separately, and then take a weighted arithmetic mean
    of that and the geometric mean of the positive numbers.  We're going to 
    discard 0 values, operating under the assumption that in this context
    there are going to be few or no observations with a value of exactly 0.
    
    References
    ----------
    [1] Geometric mean for negative and zero values
        Elsayed A. E. Habib
        International Journal of Research and Reviews in Applied Sciences
        11:419 (2012)
        http://www.arpapress.com/Volumes/Vol11Issue3/IJRRAS_11_3_08.pdf

    """
    
    a = np.array(a)
    pos = a[a > 0]
    pos_mean = stats.gmean(pos) if pos.size > 0 else 0
    pos_prop = pos.size / a.size
    
    neg = a[a < 0]
    neg = np.abs(neg)
    neg_mean = stats.gmean(neg) if neg.size > 0 else 0
    neg_prop = neg.size / a.size
    
    return (pos_mean * pos_prop) - (neg_mean * neg_prop)



[docs]
def geom_sd(a):
    """
    Compute the geometric standard deviation for an "abitrary" data set, ie one
    that contains zeros and negative numbers.  Since we're in log space, this
    gives a *dimensionless scaling factor*, not a measure.  If you want 
    traditional "error bars", don't plot ``[geom_mean - geom_sd, geom_mean + geom_sd]``;
    rather, plot ``[geom_mean / geom_sd, geom_mean * geom_sd]``.
    
    Parameters
    ----------
    
    a : array-like
        A numpy.ndarray, or something that can be converted to an ndarray
        
    Returns
    -------
    The geometric standard deviation of the distribution.
    
    Notes
    -----
    As with `geom_mean`, non-positive numbers pose a problem.  The approach
    here, though less rigorously validated than the one above, is to replace
    negative numbers with their absolute value plus 2 * geometric mean, then
    go about our business as per the Wikipedia page for geometric sd[1].
    
    References
    ----------
    [1] https://en.wikipedia.org/wiki/Geometric_standard_deviation
    """
    
    a = np.array(a)
    u = geom_mean(a)
    a[a <= 0] = np.abs(a[a <= 0]) + 2 * u
    
    return np.exp(np.std(np.log(a)))

    

[docs]
def geom_sd_range(a):
    """
    Used to compute [geom_mean / geom_sd, geom_mean * geom_sd].
    However, now statistics can't have tuples.
    """
    
    raise CytoflowError("Statistics have changed, and can't contain tuples, "
                        "so geom_sd_range is no longer useful. See the docs "
                        "for details.")



[docs]
def geom_sem(a):
    """
    Compute the geometric standard error of the mean for an "arbirary" data set,
    ie one that contains zeros and negative numbers.
    
    Parameters
    ----------
    
    a : array-like
        A numpy.ndarray, or something that can be converted to an ndarray
        
    Returns
    -------
    The geometric mean of the distribution.
    
    Notes
    -----
    As with `geom_mean`, non-positive numbers pose a problem.  The approach
    here, though less rigorously validated than the one above, is to replace
    negative numbers with their absolute value plus 2 * geometric mean.  The
    geometric SEM is computed as in [1].
    
    References
    ----------
    [1] The Standard Errors of the Geometric and Harmonic Means and Their Application to Index Numbers
        Nilan Norris
        The Annals of Mathematical Statistics
        Vol. 11, No. 4 (Dec., 1940), pp. 445-448
        
        http://www.jstor.org/stable/2235723?seq=1#page_scan_tab_contents
    """
    
    a = np.array(a)
    u = geom_mean(a)
    a[a <= 0] = np.abs(a[a <= 0]) + 2 * u
    
    return u * np.std(np.log(a)) / np.sqrt(a.size)


    

[docs]
def geom_sem_range(a):
    """
    Used to compute [geom_mean / geom_sem, geom_mean * geom_sem].
    No longer useful because statistics can't contain tuples.
    """
    
    raise CytoflowError("Statistics have changed, and can't contain tuples, "
                        "so geom_sem_range is no longer useful. See the docs "
                        "for details.")


    

[docs]
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray
        Array to place the cartesian product in.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays.

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
           
    References
    ----------
    Originally from http://stackoverflow.com/a/1235363/4755587
    """

    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    n = np.prod([x.size for x in arrays])
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    m = n // arrays[0].size
    out[:,0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        cartesian(arrays[1:], out=out[0:m,1:])
        for j in range(1, arrays[0].size):
            out[j*m:(j+1)*m,1:] = out[0:m,1:]
    return out



[docs]
def sanitize_identifier(name):
    """
    Makes name a Python identifier by replacing all nonsafe characters with '_'
    """
    
    new_name = list(name)
    for i, c in enumerate(list(name)): # character by character
        if i == 0 and not (c.isalpha() or c == '_'):
            new_name[i] = '_'
        if i > 0 and not (c.isalnum() or c == '_'):
            new_name[i] = '_'

    return  "".join(new_name)




[docs]
def random_string(n):
    """
    Makes a random string of ascii digits and lowercase letters of length ``n``
    
    from http://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python
    """
    return ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(n))



[docs]
def is_numeric(s):
    """
    Determine if a `pandas.Series` or `numpy.ndarray` is numeric
    from its dtype.
    """
    return s.dtype.kind in 'iufc'


#     return issubclass(s.dtype.type, np.number)


[docs]
def cov2corr(covariance):
    ''' 
    Compute the correlation matrix from the covariance matrix.

    From https://github.com/AndreaCensi/procgraph/blob/master/src/procgraph_statistics/cov2corr.py
    '''

    sigma = np.sqrt(covariance.diagonal())
    M = np.multiply.outer(sigma, sigma)
    correlation = covariance / M

    return sigma, correlation