Source code for cytoflow.operations.xform_stat

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.operations.xform_stat
------------------------------

Transforms a statistic. `xform_stat` has one class:

`TransformStatisticOp` -- apply a function to a statistic, making a new statistic.
"""

import pandas as pd
import numpy as np 

from traits.api import (HasStrictTraits, Str, List, Constant, provides,
                        Callable, Bool)

import cytoflow
import cytoflow.utility as util

from .i_operation import IOperation


[docs]
@provides(IOperation)
class TransformStatisticOp(HasStrictTraits):
    """
    Apply a function to a feature of a statistic, creating a new statistic.  
    
    If you set `by`, then calling `apply` will group the input statistic by 
    unique combinations of the conditions in `by`, then call `function` on the
    column specified by `feature` in each group. The `function` should take a 
    `pandas.Series` and it can return a ``float``, a value that can be cast to 
    a ``float``, or `pandas.Series` whose `dtype` is a floating-point. 
    
    If `function` returns a ``float``, then the resulting statistic
    will have one column with the name set to `feature` and levels that are the
    same as the conditions in `by`.
    
    If `function` returns a `pandas.Series`, then the names of the rows will
    become the names of the columns in the new statistic and the levels will
    be the same as the conditions in `by`.
    
    .. note::
        If `function` returns a `pandas.Series`, it must have an index with only
        one level -- no hierarchical indexing, please!
    
    .. note::
        If `function` returns a `pandas.Series`, it must return a series with the
        same index each time! 
    
    Finally, if `by` is left empty, then `function` must be a transformation.
    `function` must take a `pandas.Series` as an argument and return a `pandas.Series`
    with exactly the same index. The new statistic will contain that `pandas.Series`
    as its only column, with the column name set to `feature`.

    Attributes
    ----------
    name : Str
        The operation name.  Becomes the name of the new statistic.
    
    statistic : Str
        The statistic to apply `function` to.
        
    feature : Str
        The feature to apply `function` to.
        
    function : Callable
        The function used to transform the statistic.  `function` must 
        take a `pandas.Series` as its only parameter and return a ``float``,
        a value that can be cast to ``float``, or a `pandas.Series` whose 
        ``dtype`` is ``float``.
        
    by : List(Str)
        A list of metadata attributes to aggregate the input statistic before 
        applying the function.  For example, if the statistic has two indices
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will apply 
        `function` separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
        
    ignore_incomplete_groups : Bool (default = False)
        Sometimes, a statistic doesn't have a row for every possible group of
        labels. If this flag is true, groups that don't have all possible
        labels of the non-grouped levels won't have `function` called -- this
        can make writing `function` easier, at the cost of losing some data.

    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import pandas as pd
        >>> import numpy as np
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> ch_op = flow.ChannelStatisticOp(name = 'MeanByDox',
        ...                                 channel = 'Y2-A',
        ...                                 function = flow.geom_mean,
        ...                                 by = ['Dox'])
        >>> ex2 = ch_op.apply(ex)
        
    View the new statistic
    
    .. plot::
        :context: close-figs
    
        >>> print(ex2.statistics.keys())
        dict_keys(['MeanByDox'])
        
        >>> print(ex2.statistics['MeanByDox'])
                    Y2-A    
        Dox                        
        1.0    19.805601  
        10.0  446.981927  
        
    Transform the statistic
    
    .. plot::
        :context: close-figs
        
        >>> xform_op = flow.TransformStatisticOp(name = 'LogMean',
        ...                                      statistic = 'MeanByDox',
        ...                                      feature = 'Y2-A',
        ...                                      function = np.log)
        
        >>> ex_3 = xform_op.apply(ex2)
        >>> ex_3.statistics['LogMean']
            Y2-A
        Dox    
        1.0    2.985965
        10.0    6.102518

    """
    
    id = Constant('cytoflow.operations.transform_statistic')
    friendly_id = Constant("Transform Statistic")

    name = Str
    statistic = Str
    feature = Str
    function = Callable
    by = List(Str)    
    ignore_incomplete_groups = Bool(False)


[docs]
    def apply(self, experiment):
        """
        Applies `function` to a statistic.
        
        Parameters
        ----------
        experiment : `Experiment`
            The `Experiment` to apply the operation to
        
        Returns
        -------
        Experiment
            The same as the old experiment, but with a new statistic that
            results from applying `function` to the statistic specified
            in `statistic`.
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")
        
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores."
                                       .format(self.name)) 
        
        if not self.statistic:
            raise util.CytoflowOpError('statistic',
                                         "Statistic not set")
        
        if self.statistic not in experiment.statistics:
            raise util.CytoflowOpError('statistic',
                                         "Can't find the statistic {} in the experiment"
                                         .format(self.statistic))
        else:
            stat = experiment.statistics[self.statistic]
            
        if not self.feature:
            raise util.CytoflowOpError('feature',
                                       "Must set a feature")
            
        if self.feature not in stat:
            raise util.CytoflowOpError('feature',
                                       "Can't find feature {} in statistic {}"
                                       .format(self.feature, self.statistic))

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")

        if self.name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(self.name))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "{} must be in the experiment's conditions")
            if b not in stat.index.names:
                raise util.CytoflowOpError('by',
                                           "{} is not a statistic index; "
                                           " must be one of {}"
                                           .format(b, stat.index.names))
                
        if set(self.by) == set(stat.index.names):
            raise util.CytoflowOpError('by',
                                       "You can't set all of the statistic levels in 'by'!")
            
        if cytoflow.RUNNING_IN_GUI and not self.by:
            raise util.CytoflowOpError('by',
                                       "Must set a value for 'by'")
                
        new_stat = None
                    
        if self.by: 
            for group in stat.index.to_frame()[self.by].itertuples(index = False, name = None):  
                s = stat.xs(group, level = self.by, drop_level = True)[self.feature]

                if len(s) == 0:
                    continue
                
                if isinstance(s.index, pd.MultiIndex):
                    idx = s.index.remove_unused_levels()
                    idx_incomplete = [set(idx.levels[li]) != set(stat.index.to_frame()[level].unique())
                                     for li, level in enumerate(idx.names)]
                    if any(idx_incomplete) and self.ignore_incomplete_groups:
                        continue 
                else:
                    idx = s.index
                    if set(idx.values) != set(stat.index.to_frame()[idx.name].unique()) and self.ignore_incomplete_groups:
                        continue

                try:
                    v = self.function(s)
                except Exception as e:
                    raise util.CytoflowOpError('function',
                                               "Your function threw an error in group {}".format(group)) from e
                           
                if isinstance(v, pd.Series):
                    if v.dtype.kind != 'f':
                        raise util.CytoflowOpError('function',
                                                   "Your function returned a pandas.Series with dtype {}. "
                                                   "If it returns a Series, the data must be floating point."
                                                   .format(v.dtype))
                        
                    # check for, and warn about, NaNs.
                    if np.any(np.isnan(v)):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {}, which had NaNs that aren't allowed"
                                                   .format(group, v))
                        
                    # check for, and warn about, NaNs.
                    if np.any(np.isinf(v)):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {}, which had infs that aren't allowed"
                                                   .format(group, v))
                else:
                    try:
                        v = float(v)
                    except (TypeError, ValueError) as e:
                        if not isinstance(v, pd.Series):
                            raise util.CytoflowOpError('function',
                                                       "Your function returned a {}. It must return "
                                                       "a float, a value that can be cast to float, "
                                                       "or a pandas.Series (with type float)"
                                                       .format(type(v))) from e
                                                       
                    if np.isnan(v):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {} and NaNs aren't allowed"
                                                   .format(group, v))
                        
                    if np.isinf(v):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {} and infs aren't allowed"
                                                   .format(group, v))
                    
                if new_stat is None:
                    if isinstance(v, float):
                        new_stat = pd.DataFrame(index = pd.MultiIndex.from_tuples([], names = self.by),
                                                columns = [self.feature],
                                                dtype = 'float' ).sort_index()
                    else:
                        if v.index.nlevels > 1:
                            raise util.CytoflowOpError('function',
                                                       "Your function returned a Series with a multi-level index!")
                            
                        new_stat = pd.DataFrame(index = pd.MultiIndex.from_tuples([], names = self.by),
                                                columns = v.index.tolist(),
                                                dtype = 'float').sort_index()
                        first_v = v
                elif isinstance(v, pd.Series):
                    if not v.index.equals(first_v.index):
                        raise util.CytoflowOpError('function',
                                                   "The first call of 'function' returned series with index of {}, "
                                                   "but the call on group {} returned a series with index {}. "
                                                   "All returned series must have the same index!"
                                                   .format(first_v.index, group, v.index))
                new_stat.loc[group] = v
                                        
                # # check for, and warn about, NaNs.
                # if np.any(np.isnan(new_stat.loc[group])):
                #     raise util.CytoflowOpError('function',
                #                                "Category {} returned {}, which had NaNs that aren't allowed"
                #                                .format(group, new_stat.loc[group]))
                
        else:
            idx = stat.index.copy()
            new_stat = pd.DataFrame(columns = [self.feature],
                                    index = idx, 
                                    dtype = 'float').sort_index()

            v = self.function(stat[self.feature])
            
            if not isinstance(v, pd.Series):
                raise util.CytoflowOpError('function',
                                           "If you don't specify 'by', your function must return a pandas.Series. "
                                           "Instead, the function returned {} ({})".format(v, type(v)))
            new_stat[self.feature] = v
        
        # sort the index, for performance
        new_stat = new_stat.sort_index()
        
        # make sure the new statistic's column index is a type 'string'
        new_stat.rename(columns = str, inplace = True)
        
        new_experiment = experiment.clone(deep = False)
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        new_experiment.statistics[self.name] = new_stat

        return new_experiment