Source code for cytoflow.operations.xform_stat

#!/usr/bin/env python3.4
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2021
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

'''
cytoflow.operations.xform_stat
------------------------------
'''

from warnings import warn
import pandas as pd
import numpy as np 

from traits.api import (HasStrictTraits, Str, List, Constant, provides, CStr,
                        Callable, Tuple, Any)

import cytoflow.utility as util

from .i_operation import IOperation

@provides(IOperation)
class TransformStatisticOp(HasStrictTraits):
    """
    Apply a function to a statistic, creating a new statistic.  The function can
    be applied to the entire statistic, or it can be applied individually to 
    groups of the statistic.  The function should take a :class:`pandas.Series` 
    as its only argument.  Return type is arbitrary, but to be used with the 
    rest of :class:`cytoflow` it should probably be a numeric type or an 
    iterable of numeric types.
    
    As a special case, if the function returns a :class:`pandas.Series` *with 
    the same index that it was passed*, it is interpreted as a transformation.  
    The resulting statistic will have the same length, index names and index 
    levels as the original statistic.

    Attributes
    ----------
    name : Str
        The operation name.  Becomes the first element in the
        :attr:`~Experiment.statistics` key tuple.
    
    statistic : Tuple(Str, Str)
        The statistic to apply the function to.
        
    function : Callable
        The function used to transform the statistic.  :attr:`function` must 
        take a :class:`pandas.Series` as its only parameter.  The return type 
        is arbitrary, but to work with the rest of :class:`cytoflow` it should 
        probably be a numeric type or an iterable of numeric types.  If 
        :attr:`statistic_name` is unset, the name of the function becomes the 
        second element in the :attr:`~Experiment.statistics` key tuple.
        
    statistic_name : Str
        The name of the function; if present, becomes the second element in
        the :attr:`~Experiment.statistics` key tuple.
        
    by : List(Str)
        A list of metadata attributes to aggregate the input statistic before 
        applying the function.  For example, if the statistic has two indices
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will apply 
        :attr:`function` separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
        
    fill : Any (default = 0)
        Value to use in the statistic if a slice of the data is empty.
   
    Examples
    --------
    
    >>> stats_op = ChannelStatisticOp(name = "Mean",
    ...                               channel = "Y2-A",
    ...                               function = np.mean,
    ...                               by = ["Dox"])
    >>> ex2 = stats_op.apply(ex)
    >>> log_op = TransformStatisticOp(name = "LogMean",
    ...                               statistic = ("Mean", "mean"),
    ...                               function = np.log)
    >>> ex3 = log_op.apply(ex2)  
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.transform_statistic')
    friendly_id = Constant("Transform Statistic")

    name = CStr
    statistic = Tuple(Str, Str)
    function = Callable
    statistic_name = Str
    by = List(Str)    
    fill = Any(0)

    def apply(self, experiment):
        """
        Applies :attr:`function` to a statistic.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to apply the operation to
        
        Returns
        -------
        Experiment
            The same as the old experiment, but with a new statistic that
            results from applying :attr:`function` to the statistic specified
            in :attr:`statistic`.
            
        Raises
        ------
        util.CytoflowOpError
            If ``experiment`` is ``None``; if :attr:`name` is empty or is not
            a sanitized identifier; if :attr:`function` is unset; if the new 
            statistic's key is already in the experiment; if an entry of 
            :attr:`by` is not an index level of the statistic; if 
            :attr:`function` raises for some group; or if :attr:`by` is empty 
            and :attr:`function` does not return a :class:`pandas.Series`.
            
        util.CytoflowViewError
            If :attr:`statistic` is unset or is not in the experiment's
            statistics.
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")
        
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.") 
        
        # NOTE(review): these two checks raise CytoflowViewError although this
        # is an operation; left unchanged because callers may catch that type.
        if not self.statistic:
            raise util.CytoflowViewError('statistic',
                                         "Statistic not set")
        
        if self.statistic not in experiment.statistics:
            raise util.CytoflowViewError('statistic',
                                         "Can't find the statistic {} in the experiment"
                                         .format(self.statistic))

        stat = experiment.statistics[self.statistic]

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")
            
        # key of the new statistic: (operation name, statistic or function name)
        stat_name = (self.name, self.statistic_name) \
                     if self.statistic_name \
                     else (self.name, self.function.__name__)
                     
        if stat_name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(stat_name))

        for b in self.by:
            if b not in stat.index.names:
                raise util.CytoflowOpError('by',
                                           "{} is not a statistic index; "
                                           " must be one of {}"
                                           .format(b, stat.index.names))
                
        data = stat.reset_index()
        
        stat_is_multi = isinstance(stat.index, pd.MultiIndex)
        
        def group_slice(group):
            # the subset of the input statistic that belongs to ``group``
            if stat_is_multi:
                return stat.xs(group, level = self.by, drop_level = False)
            return stat.loc[list(group)]
                
        if self.by:
            # Each distinct combination of the ``by`` levels is one group.
            # De-duplicate so the function is applied once per group instead
            # of once per row of the statistic.
            groups = list(data[self.by]
                          .drop_duplicates()
                          .itertuples(index = False, name = None))
            
            idx = pd.MultiIndex.from_product([data[x].unique() for x in self.by], 
                                             names = self.by)
            
            # object dtype, because the function's return type is arbitrary;
            # combinations that never get a result keep the fill value
            new_stat = pd.Series(data = self.fill,
                                 index = idx, 
                                 dtype = np.dtype(object)).sort_index()
                        
            for group in groups:
                s = group_slice(group)
                                    
                if len(s) == 0:
                    continue
    
                try:
                    new_stat[group] = self.function(s)
                except Exception as e:
                    raise util.CytoflowOpError('function',
                                               "Your function threw an error in group {}".format(group)) from e
                                        
                # check for, and warn about, NaNs.  pd.isnull (unlike np.isnan)
                # does not raise on non-numeric results.
                if np.any(pd.isnull(new_stat.loc[group])):
                    warn("Category {} returned {}".format(group, new_stat.loc[group]), 
                         util.CytoflowOpWarning)
                    
        else:
            groups = []
            new_stat = self.function(stat)
            
            if not isinstance(new_stat, pd.Series):
                raise util.CytoflowOpError('by',
                                           "Transform function {} does not return a Series; "
                                           "in this case, you must set 'by'"
                                           .format(self.function))
                
        new_stat.name = "{} : {}".format(stat_name[0], stat_name[1])
        
        # Special case: if every group's result is a Series with the same
        # index as the slice the function was given, treat the function as a
        # transformation and stitch the per-group results back together.
        matched_series = bool(self.by) and all(
            isinstance(new_stat.loc[group], pd.Series)
            and group_slice(group).index.equals(new_stat.loc[group].index)
            for group in groups)
            
        if matched_series:
            new_stat = pd.concat(new_stat.values)
            
        # try to convert to numeric, but if there are non-numeric bits leave
        # the statistic as it is.  (errors = 'ignore' is deprecated in pandas;
        # catching the exceptions explicitly is the equivalent.)
        try:
            new_stat = pd.to_numeric(new_stat)
        except (ValueError, TypeError):
            pass
        
        # sort the index, for performance
        new_stat = new_stat.sort_index()
        
        new_experiment = experiment.clone()
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        new_experiment.statistics[stat_name] = new_stat

        return new_experiment