Source code for cytoflow.operations.xform_stat

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.operations.xform_stat
------------------------------

Transforms a statistic. `xform_stat` has one class:

`TransformStatisticOp` -- apply a function to a statistic, making a new statistic.
"""

from warnings import warn
import pandas as pd
import numpy as np 

from traits.api import (HasStrictTraits, Str, List, Constant, provides,
                        Callable, Tuple, Any)

import cytoflow.utility as util

from .i_operation import IOperation

@provides(IOperation)
class TransformStatisticOp(HasStrictTraits):
    """
    Apply a function to a statistic, creating a new statistic.  The function
    can be applied to the entire statistic, or it can be applied individually
    to groups of the statistic.  The function should take a `pandas.Series`
    as its only argument.  Return type is arbitrary, but to be used with the
    rest of `cytoflow` it should probably be a numeric type or an iterable of
    numeric types.

    As a special case, if the function returns a `pandas.Series` *with the
    same index that it was passed*, it is interpreted as a transformation.
    The resulting statistic will have the same length, index names and index
    levels as the original statistic.

    Attributes
    ----------
    name : Str
        The operation name.  Becomes the first element in the
        `Experiment.statistics` key tuple.

    statistic : Tuple(Str, Str)
        The statistic to apply the function to.

    function : Callable
        The function used to transform the statistic.  `function` must take a
        `pandas.Series` as its only parameter.  The return type is arbitrary,
        but to work with the rest of `cytoflow` it should probably be a
        numeric type or an iterable of numeric types.  If `statistic_name` is
        unset, the name of the function becomes the second element in the
        `Experiment.statistics` key tuple.

    statistic_name : Str
        The name of the function; if present, becomes the second element in
        the `Experiment.statistics` key tuple.

    by : List(Str)
        A list of metadata attributes to aggregate the input statistic before
        applying the function.  For example, if the statistic has two indices
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will apply
        `function` separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    fill : Any (default = 0)
        Value to use in the statistic if a slice of the data is empty.

    Examples
    --------

    >>> stats_op = ChannelStatisticOp(name = "Mean",
    ...                               channel = "Y2-A",
    ...                               function = np.mean,
    ...                               by = ["Dox"])
    >>> ex2 = stats_op.apply(ex)
    >>> log_op = TransformStatisticOp(name = "LogMean",
    ...                               statistic = ("Mean", "mean"),
    ...                               function = np.log)
    >>> ex3 = log_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.transform_statistic')
    friendly_id = Constant("Transform Statistic")

    name = Str
    statistic = Tuple(Str, Str)
    function = Callable
    statistic_name = Str
    by = List(Str)
    fill = Any(0)

    def _select_group(self, stat, group):
        """
        Return the subset of ``stat`` that belongs to one ``by`` group.

        ``group`` is a tuple holding one value per entry of `by`.  Index
        levels are kept (``drop_level = False``) so that the slice's index can
        later be compared with the index of whatever `function` returned.
        """
        if isinstance(stat.index, pd.MultiIndex):
            return stat.xs(group, level = self.by, drop_level = False)

        return stat.loc[list(group)]

    def apply(self, experiment):
        """
        Applies `function` to a statistic.

        Parameters
        ----------
        experiment : `Experiment`
            The `Experiment` to apply the operation to

        Returns
        -------
        Experiment
            The same as the old experiment, but with a new statistic that
            results from applying `function` to the statistic specified in
            `statistic`.

        Raises
        ------
        CytoflowOpError
            If `experiment`, `name` or `function` is missing, `name` is not a
            valid identifier, the new statistic already exists, an entry of
            `by` is not an index level of the statistic, `function` raises, or
            `by` is unset and `function` does not return a `pandas.Series`.

        CytoflowViewError
            If `statistic` is unset or is not in the experiment.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        # NOTE(review): these two checks raise CytoflowViewError although this
        # is an operation; the type is kept so existing callers that catch it
        # keep working.
        if not self.statistic:
            raise util.CytoflowViewError('statistic',
                                         "Statistic not set")

        if self.statistic not in experiment.statistics:
            raise util.CytoflowViewError('statistic',
                                         "Can't find the statistic {} in the experiment"
                                         .format(self.statistic))

        stat = experiment.statistics[self.statistic]

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")

        # key of the new statistic: (operation name, explicit name or the
        # function's own name)
        stat_name = (self.name,
                     self.statistic_name or self.function.__name__)

        if stat_name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(stat_name))

        for b in self.by:
            if b not in stat.index.names:
                raise util.CytoflowOpError('by',
                                           "{} is not a statistic index; "
                                           " must be one of {}"
                                           .format(b, stat.index.names))

        data = stat.reset_index()

        if self.by:
            # one tuple per *unique* combination of the `by` values, so the
            # function is evaluated (and warned about) once per group rather
            # than once per row of the statistic
            groups = list(data[self.by]
                          .drop_duplicates()
                          .itertuples(index = False, name = None))

            idx = pd.MultiIndex.from_product([data[x].unique() for x in self.by],
                                             names = self.by)

            # object dtype so each cell can hold whatever `function` returns;
            # combinations that never occur in the data keep `fill`
            new_stat = pd.Series(data = self.fill,
                                 index = idx,
                                 dtype = np.dtype(object)).sort_index()

            for group in groups:
                s = self._select_group(stat, group)

                if len(s) == 0:
                    continue

                try:
                    new_stat[group] = self.function(s)
                except Exception as e:
                    raise util.CytoflowOpError('function',
                                               "Your function threw an error in group {}".format(group)) from e

                # check for, and warn about, NaNs.  pd.isna (unlike np.isnan)
                # does not raise on non-numeric results, which the function is
                # allowed to return.
                stored = new_stat.loc[group]
                if np.any(pd.isna(stored)):
                    warn("Category {} returned {}".format(group, stored),
                         util.CytoflowOpWarning)
        else:
            groups = []
            new_stat = self.function(stat)

            if not isinstance(new_stat, pd.Series):
                raise util.CytoflowOpError('by',
                                           "Transform function {} does not return a Series; "
                                           "in this case, you must set 'by'"
                                           .format(self.function))

            # a function that hands back its argument would otherwise have the
            # experiment's original statistic renamed in place below
            if new_stat is stat:
                new_stat = stat.copy()

        new_stat.name = "{} : {}".format(stat_name[0], stat_name[1])

        # Special case: if every group produced a Series carrying the same
        # index as the slice it was given, the function is a transformation
        # and the per-group results are stitched back into one flat statistic.
        pieces = []
        for group in groups:
            s = self._select_group(stat, group)
            result = new_stat.loc[group]

            if isinstance(result, pd.Series) and s.index.equals(result.index):
                pieces.append(result)
            else:
                pieces = []
                break

        if pieces:
            # concatenate only the per-group results: cells for `by`
            # combinations absent from the data still hold `fill` and cannot
            # be concatenated
            new_stat = pd.concat(pieces)

        # try to convert to numeric, but if there are non-numeric bits keep
        # the statistic as it is
        try:
            new_stat = pd.to_numeric(new_stat)
        except (ValueError, TypeError):
            pass

        # sort the index, for performance
        new_stat = new_stat.sort_index()

        new_experiment = experiment.clone(deep = False)
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        new_experiment.statistics[stat_name] = new_stat

        return new_experiment