Source code for cytoflow.utility.util_functions

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.utility.util_functions
-------------------------------

Useful (mostly numeric) utility functions.

`iqr` -- calculate the interquartile range for an array of numberes

`num_hist_bins` -- calculate the number of histogram bins using Freedman-Diaconis

`geom_mean` -- compute the geometric mean

`geom_sd` -- compute the geometric standard deviation

`geom_sd_range` --  compute [geom_mean / geom_sd, geom_mean * geom_sd]

`geom_sem` -- compute the geometric standard error of the mean

`geom_sem_range` -- compute [geom_mean / geom_sem, geom_mean * geom_sem]

`cartesian` -- generate the cartesian product of input arrays

`sanitize_identifier` -- makes a string a valid Python identifier by replacing all non-safe characters with '_'

`random_string` -- Makes a random string of ascii digits and lowercase letters

`is_numeric` -- determine if a `pandas.Series` or `numpy.ndarray` is numeric from its dtype

`cov2corr` -- compute the correlation matrix from the covariance matric
"""

import random, string

import numpy as np
from scipy import stats

[docs]def iqr(a): """ Calculate the inter-quartile range for an array of numbers. Parameters ---------- a : array_like The array of numbers to compute the IQR for. Returns ------- float The IQR of the data. """ a = np.asarray(a) q1 = np.nanpercentile(a, 25) q3 = np.nanpercentile(a, 75) return q3 - q1
[docs]def num_hist_bins(a): """ Calculate number of histogram bins using Freedman-Diaconis rule. From http://stats.stackexchange.com/questions/798/ Parameters ---------- a : array_like The data to make a histogram of. Returns ------- int The number of bins in the histogram """ a = np.asarray(a) h = 2 * iqr(a) / (len(a) ** (1 / 3)) # fall back to 10 bins if iqr is 0 if h == 0: return 10. else: return np.ceil((np.nanpercentile(a, 99) - np.nanpercentile(a, 1)) / h)
[docs]def geom_mean(a): """ Compute the geometric mean for an "arbitrary" data set, ie one that contains zeros and negative numbers. Parameters ---------- a : array-like A numpy.ndarray, or something that can be converted to an ndarray Returns ------- The geometric mean of the input array Notes ----- The traditional geometric mean can not be computed on a mixture of positive and negative numbers. The approach here, validated rigorously in the cited paper[1], is to compute the geometric mean of the absolute value of the negative numbers separately, and then take a weighted arithmetic mean of that and the geometric mean of the positive numbers. We're going to discard 0 values, operating under the assumption that in this context there are going to be few or no observations with a value of exactly 0. References ---------- [1] Geometric mean for negative and zero values Elsayed A. E. Habib International Journal of Research and Reviews in Applied Sciences 11:419 (2012) http://www.arpapress.com/Volumes/Vol11Issue3/IJRRAS_11_3_08.pdf """ a = np.array(a) pos = a[a > 0] pos_mean = stats.gmean(pos) pos_prop = pos.size / a.size neg = a[a < 0] neg = np.abs(neg) neg_mean = stats.gmean(neg) if neg.size > 0 else 0 neg_prop = neg.size / a.size return (pos_mean * pos_prop) - (neg_mean * neg_prop)
[docs]def geom_sd(a): """ Compute the geometric standard deviation for an "abitrary" data set, ie one that contains zeros and negative numbers. Since we're in log space, this gives a *dimensionless scaling factor*, not a measure. If you want traditional "error bars", don't plot ``[geom_mean - geom_sd, geom_mean + geom_sd]``; rather, plot ``[geom_mean / geom_sd, geom_mean * geom_sd]``. Parameters ---------- a : array-like A numpy.ndarray, or something that can be converted to an ndarray Returns ------- The geometric standard deviation of the distribution. Notes ----- As with `geom_mean`, non-positive numbers pose a problem. The approach here, though less rigorously validated than the one above, is to replace negative numbers with their absolute value plus 2 * geometric mean, then go about our business as per the Wikipedia page for geometric sd[1]. References ---------- [1] https://en.wikipedia.org/wiki/Geometric_standard_deviation """ a = np.array(a) u = geom_mean(a) a[a <= 0] = np.abs(a[a <= 0]) + 2 * u return np.exp(np.std(np.log(a)))
[docs]def geom_sd_range(a): """ A convenience function to compute [geom_mean / geom_sd, geom_mean * geom_sd]. Parameters ---------- a : array-like A numpy.ndarray, or something that can be converted to an ndarray Returns ------- A tuple, with ``(geom_mean / geom_sd, geom_mean * geom_sd)`` """ u = geom_mean(a) sd = geom_sd(a) return (u / sd, u * sd)
[docs]def geom_sem(a): """ Compute the geometric standard error of the mean for an "arbirary" data set, ie one that contains zeros and negative numbers. Parameters ---------- a : array-like A numpy.ndarray, or something that can be converted to an ndarray Returns ------- The geometric mean of the distribution. Notes ----- As with `geom_mean`, non-positive numbers pose a problem. The approach here, though less rigorously validated than the one above, is to replace negative numbers with their absolute value plus 2 * geometric mean. The geometric SEM is computed as in [1]. References ---------- [1] The Standard Errors of the Geometric and Harmonic Means and Their Application to Index Numbers Nilan Norris The Annals of Mathematical Statistics Vol. 11, No. 4 (Dec., 1940), pp. 445-448 http://www.jstor.org/stable/2235723?seq=1#page_scan_tab_contents """ a = np.array(a) u = geom_mean(a) a[a <= 0] = np.abs(a[a <= 0]) + 2 * u return u * np.std(np.log(a)) / np.sqrt(a.size)
[docs]def geom_sem_range(a): """ A convenience function to compute [geom_mean / geom_sem, geom_mean * geom_sem]. Parameters ---------- a : array-like A numpy.ndarray, or something that can be converted to an ndarray Returns ------- A tuple, with ``(geom_mean / geom_sem, geom_mean * geom_sem)`` """ u = geom_mean(a) sem = geom_sem(a) return (u / sem, u * sem)
[docs]def cartesian(arrays, out=None): """ Generate a cartesian product of input arrays. Parameters ---------- arrays : list of array-like 1-D arrays to form the cartesian product of. out : ndarray Array to place the cartesian product in. Returns ------- out : ndarray 2-D array of shape (M, len(arrays)) containing cartesian products formed of input arrays. Examples -------- >>> cartesian(([1, 2, 3], [4, 5], [6, 7])) array([[1, 4, 6], [1, 4, 7], [1, 5, 6], [1, 5, 7], [2, 4, 6], [2, 4, 7], [2, 5, 6], [2, 5, 7], [3, 4, 6], [3, 4, 7], [3, 5, 6], [3, 5, 7]]) References ---------- Originally from http://stackoverflow.com/a/1235363/4755587 """ arrays = [np.asarray(x) for x in arrays] dtype = arrays[0].dtype n = np.prod([x.size for x in arrays]) if out is None: out = np.zeros([n, len(arrays)], dtype=dtype) m = n // arrays[0].size out[:,0] = np.repeat(arrays[0], m) if arrays[1:]: cartesian(arrays[1:], out=out[0:m,1:]) for j in range(1, arrays[0].size): out[j*m:(j+1)*m,1:] = out[0:m,1:] return out
[docs]def sanitize_identifier(name): """ Makes name a Python identifier by replacing all nonsafe characters with '_' """ new_name = list(name) for i, c in enumerate(list(name)): # character by character if i == 0 and not (c.isalpha() or c == '_'): new_name[i] = '_' if i > 0 and not (c.isalnum() or c == '_'): new_name[i] = '_' return "".join(new_name)
[docs]def random_string(n): """ Makes a random string of ascii digits and lowercase letters of length ``n`` from http://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python """ return ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(n))
[docs]def is_numeric(s): """ Determine if a `pandas.Series` or `numpy.ndarray` is numeric from its dtype. """ return s.dtype.kind in 'iufc'
# return issubclass(s.dtype.type, np.number)
[docs]def cov2corr(covariance): ''' Compute the correlation matrix from the covariance matrix. From https://github.com/AndreaCensi/procgraph/blob/master/src/procgraph_statistics/cov2corr.py ''' sigma = np.sqrt(covariance.diagonal()) M = np.multiply.outer(sigma, sigma) correlation = covariance / M return sigma, correlation