#!/usr/bin/env python3.8
# coding: latin-1
# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
cytoflow.operations.frame_stat
------------------------------
The `frame_stat` module contains one class:
`FrameStatisticOp` -- applies a function to subsets of a data set,
and adds the resulting statistic to the `Experiment`. Unlike
`ChannelStatisticOp`, which operates on a single channel, this operation
operates on entire `pandas.DataFrame`.
"""
from warnings import warn
import pandas as pd
import numpy as np
from traits.api import (HasStrictTraits, Str, List, Constant, provides,
Callable)
import cytoflow.utility as util
from .i_operation import IOperation
[docs]
@provides(IOperation)
class FrameStatisticOp(HasStrictTraits):
"""
Apply a function to subsets of a data set, and add it as a statistic
to the experiment.
The `apply` function groups the data by the variables in `by`,
then applies the `function` callable to each `pandas.DataFrame`
subset. The callable should take a `pandas.DataFrame` as its only
parameter and return a `pandas.Series` whose values are ``float``.
The columns of the resulting statistic come from the index (ie, the
row names) of the first `pandas.Series` to be returned.
Attributes
----------
name : Str
The operation name. Becomes the first element in the
`Experiment.statistics` key tuple.
function : Callable
The function used to compute the statistic. Must take a
`pandas.DataFrame` as its only argument and return a
`pandas.Series` containing ``float`` values. The row names
of this series will become the column names of the new statistic.
by : List(Str)
A list of metadata attributes to aggregate the data before applying the
function. For example, if the experiment has two pieces of metadata,
``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will apply
`function` separately to each subset of the data with a unique
combination of ``Time`` and ``Dox``.
subset : Str
A Python expression sent to Experiment.query() to subset the data before
computing the statistic.
Examples
--------
>>> stats_op = FrameStatisticOp(name = "MeanByDox",
... function = lambda x: x.mean,
... by = ["Dox"])
>>> ex2 = stats_op.apply(ex)
"""
id = Constant('cytoflow.operations.frame_statistic')
friendly_id = Constant("Frame Statistic")
name = Str
function = Callable
by = List(Str)
subset = Str
[docs]
def apply(self, experiment):
if experiment is None:
raise util.CytoflowOpError('experiment',
"No experiment specified")
if not self.name:
raise util.CytoflowOpError('name',
"Must specify a name")
if self.name != util.sanitize_identifier(self.name):
raise util.CytoflowOpError('name',
"Name can only contain letters, numbers and underscores."
.format(self.name))
if not self.function:
raise util.CytoflowOpError('function',
"Must specify a function")
if not self.by:
raise util.CytoflowOpError('by',
"Must specify some grouping conditions "
"in 'by'")
new_experiment = experiment.clone(deep = False)
if self.subset:
try:
experiment = experiment.query(self.subset)
except Exception as e:
raise util.CytoflowOpError('subset',
"Subset string '{0}' isn't valid"
.format(self.subset)) from e
if len(experiment) == 0:
raise util.CytoflowOpError('subset',
"Subset string '{0}' returned no events"
.format(self.subset))
for b in self.by:
if b not in experiment.conditions:
raise util.CytoflowOpError('by',
"Aggregation metadata {} not found, "
" must be one of {}"
.format(b, experiment.conditions))
unique = experiment.data[b].unique()
if len(unique) == 1:
warn("Only one category for {}".format(b), util.CytoflowOpWarning)
groupby = experiment.data.groupby(self.by, observed = True)
keys = [x if isinstance(x, tuple)
else (x,)
for x in groupby.groups.keys()]
idx = pd.MultiIndex.from_tuples(keys, names = self.by)
stat = None
for group, data_subset in groupby:
try:
v = self.function(data_subset)
if v.isna().any():
raise util.CytoflowOpError('function',
"`function` must not return any NAs! Category {} returned {}".format(group, stat.loc[group]))
if stat is None:
stat = pd.DataFrame(np.full((len(idx), len(v.index)), np.nan),
index = idx,
columns = v.index.to_list(),
dtype = 'float').sort_index()
if not isinstance(v, pd.Series):
raise util.CytoflowOpError('function',
"'function' must return a pandas.Series")
if len(stat.columns) == 0:
for col in v.index:
stat.insert(len(stat.columns), col, value = np.nan)
stat.loc[group] = v
except Exception as e:
raise util.CytoflowOpError('function',
"Your function threw an error in group {}"
.format(group)) from e
new_experiment.history.append(self.clone_traits(transient = lambda _: True))
new_experiment.statistics[self.name] = stat
return new_experiment