# Source code for cytoflow.operations.frame_stat

#!/usr/bin/env python3.8
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
cytoflow.operations.frame_stat
------------------------------

The `frame_stat` module contains one class:

`FrameStatisticOp` -- applies a function to subsets of a data set,
and adds the resulting statistic to the `Experiment`.  Unlike
`ChannelStatisticOp`, which operates on a single channel, this operation
operates on an entire `pandas.DataFrame`.
"""

from warnings import warn
import pandas as pd
import numpy as np

from traits.api import (HasStrictTraits, Str, List, Constant, provides, 
                        Callable)
import cytoflow.utility as util

from .i_operation import IOperation

@provides(IOperation)
class FrameStatisticOp(HasStrictTraits):
    """
    Apply a function to subsets of a data set, and add it as a statistic
    to the experiment.

    The `apply` function groups the data by the variables in `by`, then
    applies the `function` callable to each `pandas.DataFrame` subset.  The
    callable should take a `pandas.DataFrame` as its only parameter and
    return a `pandas.Series` whose values are ``float``.  The columns of the
    resulting statistic come from the index (ie, the row names) of the
    first `pandas.Series` to be returned.

    Attributes
    ----------
    name : Str
        The operation name.  Becomes the key of the new statistic in
        `Experiment.statistics`.

    function : Callable
        The function used to compute the statistic.  Must take a
        `pandas.DataFrame` as its only argument and return a `pandas.Series`
        containing ``float`` values, none of which may be NA.  The row names
        of this series will become the column names of the new statistic.

    by : List(Str)
        A list of metadata attributes to aggregate the data before applying
        the function.  For example, if the experiment has two pieces of
        metadata, ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will
        apply `function` separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    subset : Str
        A Python expression sent to Experiment.query() to subset the
        data before computing the statistic.

    Examples
    --------

    >>> stats_op = FrameStatisticOp(name = "MeanByDox",
    ...                             function = lambda x: x.mean(numeric_only = True),
    ...                             by = ["Dox"])
    >>> ex2 = stats_op.apply(ex)
    """

    id = Constant('cytoflow.operations.frame_statistic')
    friendly_id = Constant("Frame Statistic")

    name = Str
    function = Callable
    by = List(Str)
    subset = Str

    def apply(self, experiment):
        """
        Compute the statistic and add it to a copy of the experiment.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose data the statistic is computed from.

        Returns
        -------
        Experiment
            A shallow clone of ``experiment`` (made before any subsetting)
            with this operation appended to its history and the new
            statistic stored in ``statistics[name]``.  The statistic is a
            `pandas.DataFrame` indexed by a `pandas.MultiIndex` over the
            observed combinations of the `by` conditions.

        Raises
        ------
        CytoflowOpError
            If ``experiment`` is ``None``; if `name`, `function` or `by` is
            unset; if `name` is not a valid identifier; if `subset` is
            invalid or selects no events; if an entry of `by` is not one of
            the experiment's conditions; or if `function` raises, does not
            return a `pandas.Series`, or returns a series containing NAs.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Must specify a name")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if not self.function:
            raise util.CytoflowOpError('function', "Must specify a function")

        if not self.by:
            raise util.CytoflowOpError('by',
                                       "Must specify some grouping conditions "
                                       "in 'by'")

        # Clone before subsetting: the returned experiment keeps all the
        # events, only the statistic is computed on the subset.
        new_experiment = experiment.clone(deep = False)

        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(self.subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(self.subset))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           " must be one of {}"
                                           .format(b, experiment.conditions))
            unique = experiment.data[b].unique()
            if len(unique) == 1:
                warn("Only one category for {}".format(b), util.CytoflowOpWarning)

        groupby = experiment.data.groupby(self.by, observed = True)

        # Normalize scalar group keys to 1-tuples so that the index is always
        # a MultiIndex, regardless of how many conditions are in `by`.
        keys = [x if isinstance(x, tuple) else (x,)
                for x in groupby.groups.keys()]
        idx = pd.MultiIndex.from_tuples(keys, names = self.by)

        stat = None
        for group, data_subset in groupby:
            # Only the user-supplied callable is wrapped here, so that the
            # validation errors below reach the caller with their own message
            # instead of being re-reported as "your function threw an error".
            try:
                v = self.function(data_subset)
            except Exception as e:
                raise util.CytoflowOpError('function',
                                           "Your function threw an error in group {}"
                                           .format(group)) from e

            # Check the type first: the NA check below needs a Series.
            if not isinstance(v, pd.Series):
                raise util.CytoflowOpError('function',
                                           "'function' must return a pandas.Series")

            if v.isna().any():
                # Report the offending return value itself; the statistic
                # row for this group has not been filled in yet.
                raise util.CytoflowOpError('function',
                                           "`function` must not return any NAs! \n"
                                           "Category {} returned {}"
                                           .format(group, v))

            try:
                if stat is None:
                    # The first returned Series determines the columns.
                    stat = pd.DataFrame(np.full((len(idx), len(v.index)), np.nan),
                                        index = idx,
                                        columns = v.index.to_list(),
                                        dtype = 'float').sort_index()

                # If the statistic still has no columns (earlier results were
                # empty), take the columns from this result.
                if len(stat.columns) == 0:
                    for col in v.index:
                        stat.insert(len(stat.columns), col, value = np.nan)

                stat.loc[group] = v
            except Exception as e:
                # Keep the contract that callers only see CytoflowOpError.
                raise util.CytoflowOpError('function',
                                           "Could not add the result for group {} "
                                           "to the statistic"
                                           .format(group)) from e

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        new_experiment.statistics[self.name] = stat

        return new_experiment