Source code for cytoflow.operations.channel_stat

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.operations.channel_stat
--------------------------------

Creates a new statistic. `channel_stat` has one class:

`ChannelStatisticOp` -- applies a function to subsets of a data set,
and adds the resulting statistic to the `Experiment`
"""

from warnings import warn
import pandas as pd
import numpy as np

from traits.api import HasStrictTraits, Str, List, Constant, provides, Callable

import cytoflow.utility as util

from .i_operation import IOperation

@provides(IOperation)
class ChannelStatisticOp(HasStrictTraits):
    """
    Apply a function to subsets of a data set, and add it as a statistic
    to the experiment.
    
    The `apply` function groups the data by the variables in `by`, then 
    applies the `function` callable to each group in the channel specified 
    by `channel`. The `function` callable should take a single 
    `pandas.Series` of ``float`` as an argument and return a ``float``, 
    a value that can be cast to ``float``, or a `pandas.Series` of ``float``.
    
    If `function` returns a ``float`` or a value that can be cast to 
    ``float``, then the resulting statistic has one column and its name is 
    set to `channel`. If `function` returns a `pandas.Series`, then the 
    ``Series``' index labels become the column names. (If used this way, 
    each call to `function` must **always** return a `pandas.Series` with 
    the same index.)
    
    Attributes
    ----------
    name : Str
        The operation name.  Becomes the name of the new statistic.
    
    channel : Str
        The channel to apply the function to. By default, the channel name
        becomes the column (feature) name in the new statistic.
        
    function : Callable
        The function used to compute the statistic.  `function` must take a 
        `pandas.Series` as its only parameter and return either a ``float``, 
        a value that can be cast to ``float``, or a `pandas.Series`. 
        
        .. warning::
            Be careful!  Sometimes this function is called with an empty 
            input!  If this is the case, poorly-behaved functions can return 
            ``NaN`` or throw an error.  If this happens, it will be reported.
        
    by : List(Str)
        A list of metadata attributes to aggregate the data before applying 
        the function.  For example, if the experiment has two pieces of 
        metadata, ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` 
        will apply `function` separately to each subset of the data with a 
        unique combination of ``Time`` and ``Dox``.
        
    subset : Str
        A Python expression sent to `Experiment.query` to subset the 
        data before computing the statistic.
   
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import pandas as pd
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
        
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> ch_op = flow.ChannelStatisticOp(name = 'MeanByDox',
        ...                                 channel = 'Y2-A',
        ...                                 function = flow.geom_mean,
        ...                                 by = ['Dox'])
        >>> ex2 = ch_op.apply(ex)
        
    View the new statistic
    
    .. plot::
        :context: close-figs
        
        >>> print(ex2.statistics.keys())
        dict_keys(['MeanByDox'])

        >>> print(ex2.statistics['MeanByDox'])
                    Y2-A
        Dox             
        1.0    19.805601
        10.0  446.981927
    """
    
    id = Constant('cytoflow.operations.channel_statistic')
    friendly_id = Constant("Channel Statistic")
    
    name = Str
    channel = Str
    function = Callable
    by = List(Str)
    subset = Str
    
    def apply(self, experiment):
        """
        Apply the operation to an `Experiment`.
        
        Parameters
        ----------
        experiment
            The `Experiment` to apply this operation to.
            
        Returns
        -------
        Experiment
            A new `Experiment`, containing a new entry in 
            `Experiment.statistics`.  The key of the new entry is 
            ``name``.
            
        Raises
        ------
        CytoflowOpError
            If a required attribute is missing or invalid, if `subset` 
            can't be evaluated, if no groups are left to compute the 
            statistic on, if `function` throws, or if `function` returns 
            something other than a float (or float-castable value) or a 
            floating-point `pandas.Series`, returns inconsistent types 
            between groups, or produces a ``NaN``.
        """
        
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")
        
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")
        
        # errors are attributed to the trait that caused them, so the 
        # trait name must match the attribute declared on this class
        if not self.channel:
            raise util.CytoflowOpError('channel',
                                       "Must specify a channel")

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Channel {} not found in the experiment"
                                       .format(self.channel))
            
        if not self.by:
            raise util.CytoflowOpError('by',
                                       "Must specify some grouping conditions "
                                       "in 'by'")
            
        if self.name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(self.name))

        # clone BEFORE subsetting: the returned experiment keeps all of the
        # events, only the statistic is computed on the subset
        new_experiment = experiment.clone(deep = False)
        
        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except Exception as exc:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(self.subset)) from exc
       
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
            unique = experiment.data[b].unique()

            if len(unique) == 1:
                warn("Only one category for {}".format(b), util.CytoflowOpWarning)

        groupby = experiment.data.groupby(self.by, observed = True)
        
        # group keys are scalars when grouping by one variable on some
        # pandas versions; normalize them to tuples for the MultiIndex
        keys = [x if isinstance(x, tuple) else (x,) for x in groupby.groups.keys()]
        idx = pd.MultiIndex.from_tuples(keys, names = self.by)

        # the statistic's shape depends on what the function returns, so it
        # is allocated lazily on the first call
        stat = None
        first_v = None
        
        for group, data_subset in groupby:
            try:
                v = self.function(data_subset[self.channel])
            except Exception as e:
                raise util.CytoflowOpError(None,
                                           "Your function threw an error in group {}"
                                           .format(group)) from e
            
            # try the scalar interpretation first; only fall back to a 
            # Series if the value can't be cast to a float
            try:
                v = float(v)
            except (TypeError, ValueError) as e:
                if not isinstance(v, pd.Series):
                    raise util.CytoflowOpError(None,
                                               "Your function returned a {}. It must return "
                                               "a float, a value that can be cast to float, "
                                               "or a pandas.Series (with type float)"
                                               .format(type(v))) from e
                    
            if isinstance(v, pd.Series) and v.dtype.kind != 'f':
                raise util.CytoflowOpError(None,
                                           "Your function returned a pandas.Series with dtype {}. "
                                           "If it returns a Series, the data must be floating point."
                                           .format(v.dtype))
                    
            if stat is None:
                # at this point v is either a float or a pandas.Series
                if isinstance(v, float):
                    columns = [self.channel]
                else:
                    columns = v.index.tolist()
                    
                stat = pd.DataFrame(data = np.full((len(idx), len(columns)), np.nan),
                                    index = idx,
                                    columns = columns,
                                    dtype = 'float').sort_index()
                first_v = v

            if type(v) != type(first_v):
                raise util.CytoflowOpError(None,
                                           "The first call to your function returned a {}, "
                                           "but calling it on group {} returned a {}"
                                           .format(type(first_v), group, type(v)))
                
            stat.loc[group] = v
 
            # fail on NaNs.
            if stat.loc[group].isna().any():
                raise util.CytoflowOpError(None,
                                           "Calling function on category {} returned {} "
                                           "which contains NaN".format(group, stat.loc[group]))
                
        # with no groups the loop never ran and there is nothing to store;
        # fail loudly instead of adding ``None`` to the statistics
        if stat is None:
            raise util.CytoflowOpError('subset' if self.subset else None,
                                       "There were no events to compute the statistic on")

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        new_experiment.statistics[self.name] = stat

        return new_experiment