Source code for cytoflow.operations.xform_stat

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.operations.xform_stat
------------------------------

Transforms a statistic. `xform_stat` has one class:

`TransformStatisticOp` -- apply a function to a statistic, making a new statistic.
"""

import pandas as pd
import numpy as np 

from traits.api import (HasStrictTraits, Str, List, Constant, provides,
                        Callable, Bool)

import cytoflow
import cytoflow.utility as util

from .i_operation import IOperation

@provides(IOperation)
class TransformStatisticOp(HasStrictTraits):
    """
    Apply a function to a feature of a statistic, creating a new statistic.

    If you set `by`, then calling `apply` will group the input statistic by
    unique combinations of the conditions in `by`, then call `function` on
    the column specified by `feature` in each group. The `function` should
    take a `pandas.Series` and it can return a ``float``, a value that can
    be cast to a ``float``, or `pandas.Series` whose `dtype` is a
    floating-point.

    If `function` returns a ``float``, then the resulting statistic will
    have one column with the name set to `feature` and levels that are the
    same as the conditions in `by`. If `function` returns a `pandas.Series`,
    then the names of the rows will become the names of the columns in the
    new statistic and the levels will be the same as the conditions in `by`.

    .. note::
        If `function` returns a `pandas.Series`, it must have an index with
        only one level -- no hierarchical indexing, please!

    .. note::
        If `function` returns a `pandas.Series`, it must return a series
        with the same index each time!

    Finally, if `by` is left empty, then `function` must be a
    transformation. `function` must take a `pandas.Series` as an argument
    and return a `pandas.Series` with exactly the same index. The new
    statistic will contain that `pandas.Series` as its only column, with
    the column name set to `feature`.

    Attributes
    ----------
    name : Str
        The operation name.  Becomes the name of the new statistic.

    statistic : Str
        The statistic to apply `function` to.

    feature : Str
        The feature to apply `function` to.

    function : Callable
        The function used to transform the statistic. `function` must take
        a `pandas.Series` as its only parameter and return a ``float``, a
        value that can be cast to ``float``, or a `pandas.Series` whose
        ``dtype`` is ``float``.

    by : List(Str)
        A list of metadata attributes to aggregate the input statistic
        before applying the function. For example, if the statistic has two
        indices ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will
        apply `function` separately to each subset of the data with a
        unique combination of ``Time`` and ``Dox``.

    ignore_incomplete_groups : Bool (default = False)
        Sometimes, a statistic doesn't have a row for every possible group
        of labels. If this flag is true, groups that don't have all
        possible labels of the non-grouped levels won't have `function`
        called -- this can make writing `function` easier, at the cost of
        losing some data.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import pandas as pd
        >>> import numpy as np
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> ch_op = flow.ChannelStatisticOp(name = 'MeanByDox',
        ...                                 channel = 'Y2-A',
        ...                                 function = flow.geom_mean,
        ...                                 by = ['Dox'])
        >>> ex2 = ch_op.apply(ex)

    View the new statistic

    .. plot::
        :context: close-figs

        >>> print(ex2.statistics.keys())
        dict_keys(['MeanByDox'])

        >>> print(ex2.statistics['MeanByDox'])
                    Y2-A
        Dox
        1.0    19.805601
        10.0  446.981927

    Transform the statistic

    .. plot::
        :context: close-figs

        >>> xform_op = flow.TransformStatisticOp(name = 'LogMean',
        ...                                      statistic = 'MeanByDox',
        ...                                      feature = 'Y2-A',
        ...                                      function = np.log)
        >>> ex_3 = xform_op.apply(ex2)
        >>> ex_3.statistics['LogMean']
                  Y2-A
        Dox
        1.0   2.985965
        10.0  6.102518
    """

    id = Constant('cytoflow.operations.transform_statistic')
    friendly_id = Constant("Transform Statistic")

    name = Str
    statistic = Str
    feature = Str
    function = Callable
    by = List(Str)
    ignore_incomplete_groups = Bool(False)

    def apply(self, experiment):
        """
        Applies `function` to a statistic.

        Parameters
        ----------
        experiment : `Experiment`
            The `Experiment` to apply the operation to

        Returns
        -------
        Experiment
            The same as the old experiment, but with a new statistic that
            results from applying `function` to the statistic specified in
            `statistic`.

        Raises
        ------
        util.CytoflowOpError
            If a required attribute is unset or inconsistent with the
            experiment, if `function` raises or returns an unusable value
            while `by` is set, or if no group produced a value.
        """

        # ---- validate the operation's parameters against the experiment ----
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if not self.statistic:
            raise util.CytoflowOpError('statistic',
                                       "Statistic not set")

        if self.statistic not in experiment.statistics:
            raise util.CytoflowOpError('statistic',
                                       "Can't find the statistic {} in the experiment"
                                       .format(self.statistic))

        stat = experiment.statistics[self.statistic]

        if not self.feature:
            raise util.CytoflowOpError('feature',
                                       "Must set a feature")

        if self.feature not in stat:
            raise util.CytoflowOpError('feature',
                                       "Can't find feature {} in statistic {}"
                                       .format(self.feature, self.statistic))

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")

        if self.name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(self.name))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "{} must be in the experiment's conditions"
                                           .format(b))
            if b not in stat.index.names:
                raise util.CytoflowOpError('by',
                                           "{} is not a statistic index; "
                                           " must be one of {}"
                                           .format(b, stat.index.names))

        # grouping by every level would leave nothing for `function` to see
        if set(self.by) == set(stat.index.names):
            raise util.CytoflowOpError('by',
                                       "You can't set all of the statistic levels in 'by'!")

        if cytoflow.RUNNING_IN_GUI and not self.by:
            raise util.CytoflowOpError('by',
                                       "Must set a value for 'by'")

        new_stat = None

        if self.by:
            # Loop invariants: the index as a frame, and the complete set of
            # labels found in each index level (used to spot incomplete groups).
            index_frame = stat.index.to_frame()
            all_labels = {level: set(index_frame[level].unique())
                          for level in index_frame.columns}

            # The Series returned for the first processed group, if the
            # function returns Series; every later Series must match its index.
            first_v = None

            # Every row of the statistic carries its group's labels, so
            # de-duplicate to call `function` once per group, not once per row.
            groups = index_frame[self.by].drop_duplicates()

            for group in groups.itertuples(index = False, name = None):
                s = stat.xs(group, level = self.by, drop_level = True)[self.feature]

                if len(s) == 0:
                    continue

                if self.ignore_incomplete_groups:
                    if isinstance(s.index, pd.MultiIndex):
                        idx = s.index.remove_unused_levels()
                        incomplete = any(set(idx.levels[li]) != all_labels[level]
                                         for li, level in enumerate(idx.names))
                    else:
                        incomplete = set(s.index.values) != all_labels[s.index.name]

                    if incomplete:
                        continue

                try:
                    v = self.function(s)
                except Exception as e:
                    raise util.CytoflowOpError('function',
                                               "Your function threw an error in group {}".format(group)) from e

                if isinstance(v, pd.Series):
                    if v.dtype.kind != 'f':
                        raise util.CytoflowOpError('function',
                                                   "Your function returned a pandas.Series with dtype {}. "
                                                   "If it returns a Series, the data must be floating point."
                                                   .format(v.dtype))

                    # NaNs are not allowed in a statistic
                    if np.any(np.isnan(v)):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {}, which had NaNs that aren't allowed"
                                                   .format(group, v))

                    # neither are infs
                    if np.any(np.isinf(v)):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {}, which had infs that aren't allowed"
                                                   .format(group, v))
                else:
                    try:
                        v = float(v)
                    except (TypeError, ValueError) as e:
                        raise util.CytoflowOpError('function',
                                                   "Your function returned a {}. It must return "
                                                   "a float, a value that can be cast to float, "
                                                   "or a pandas.Series (with type float)"
                                                   .format(type(v))) from e

                    if np.isnan(v):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {} and NaNs aren't allowed"
                                                   .format(group, v))

                    if np.isinf(v):
                        raise util.CytoflowOpError('function',
                                                   "Category {} returned {} and infs aren't allowed"
                                                   .format(group, v))

                if new_stat is None:
                    # The first result fixes the new statistic's columns.
                    if isinstance(v, float):
                        new_stat = pd.DataFrame(index = pd.MultiIndex.from_tuples([], names = self.by),
                                                columns = [self.feature],
                                                dtype = 'float').sort_index()
                    else:
                        if v.index.nlevels > 1:
                            raise util.CytoflowOpError('function',
                                                       "Your function returned a Series with a multi-level index!")
                        new_stat = pd.DataFrame(index = pd.MultiIndex.from_tuples([], names = self.by),
                                                columns = v.index.tolist(),
                                                dtype = 'float').sort_index()
                        first_v = v
                elif isinstance(v, pd.Series):
                    if first_v is None:
                        # an earlier group returned a float, so there is no
                        # reference index to compare this Series against
                        raise util.CytoflowOpError('function',
                                                   "The first call of 'function' returned a float, "
                                                   "but the call on group {} returned a pandas.Series. "
                                                   "All calls must return the same kind of value!"
                                                   .format(group))
                    if not v.index.equals(first_v.index):
                        raise util.CytoflowOpError('function',
                                                   "The first call of 'function' returned series with index of {}, "
                                                   "but the call on group {} returned a series with index {}. "
                                                   "All returned series must have the same index!"
                                                   .format(first_v.index, group, v.index))

                new_stat.loc[group] = v

            if new_stat is None:
                # Either the statistic has no rows, or every group was skipped.
                raise util.CytoflowOpError('by',
                                           "No groups were transformed: the statistic is empty, "
                                           "or every group was skipped as incomplete")

        else:
            # No grouping: `function` is a transformation of the whole column.
            idx = stat.index.copy()
            new_stat = pd.DataFrame(columns = [self.feature],
                                    index = idx,
                                    dtype = 'float').sort_index()

            v = self.function(stat[self.feature])

            if not isinstance(v, pd.Series):
                raise util.CytoflowOpError('function',
                                           "If you don't specify 'by', your function must return a pandas.Series. "
                                           "Instead, the function returned {} ({})".format(v, type(v)))

            new_stat[self.feature] = v

        # sort the index, for performance
        new_stat = new_stat.sort_index()

        # make sure the new statistic's column index is a type 'string'
        new_stat.rename(columns = str, inplace = True)

        new_experiment = experiment.clone(deep = False)
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        new_experiment.statistics[self.name] = new_stat

        return new_experiment