#!/usr/bin/env python3.8
# coding: latin-1
# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2022
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
cytoflow.operations.channel_stat
--------------------------------
Creates a new statistic. `channel_stat` has one class:
`ChannelStatisticOp` -- applies a function to subsets of a data set,
and adds the resulting statistic to the `Experiment`
"""
from warnings import warn
import pandas as pd
import numpy as np
from traits.api import HasStrictTraits, Str, List, Constant, provides, Callable
import cytoflow.utility as util
from .i_operation import IOperation
@provides(IOperation)
class ChannelStatisticOp(HasStrictTraits):
    """
    Apply a function to subsets of a data set, and add it as a
    statistic to the experiment.

    The `apply` function groups the data by the variables in `by`,
    then applies the `function` callable to each group in the channel
    specified by `channel`.

    The `function` callable should take a single `pandas.Series` of ``float``
    as an argument and return a ``float``, a value that can be cast to
    ``float``, or a `pandas.Series` of ``float``. If `function` returns a
    ``float`` or a value that can be cast to ``float``, then the resulting
    statistic has one column and its name is set to `channel`. If `function`
    returns a `pandas.Series`, then the ``Series``' index labels become the
    column names. (If used this way, each call to `function` must **always**
    return a `pandas.Series` with the same index.)

    Attributes
    ----------
    name : Str
        The operation name. Becomes the name of the new statistic.

    channel : Str
        The channel to apply the function to. By default, the channel name
        becomes the column (feature) name in the new statistic.

    function : Callable
        The function used to compute the statistic. `function` must take
        a `pandas.Series` as its only parameter and return either a ``float``,
        a value that can be cast to ``float``, or a `pandas.Series`.

        .. warning::
            Be careful! Sometimes this function is called with an empty input!
            If this is the case, poorly-behaved functions can return ``NaN`` or
            throw an error. If this happens, it will be reported.

    by : List(Str)
        A list of metadata attributes to aggregate the data before applying the
        function. For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will apply
        `function` separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    subset : Str
        A Python expression sent to `Experiment.query` to subset the
        data before computing the statistic.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import pandas as pd
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> ch_op = flow.ChannelStatisticOp(name = 'MeanByDox',
        ...                                 channel = 'Y2-A',
        ...                                 function = flow.geom_mean,
        ...                                 by = ['Dox'])
        >>> ex2 = ch_op.apply(ex)

    View the new statistic

    .. plot::
        :context: close-figs

        >>> print(ex2.statistics.keys())
        dict_keys(['MeanByDox'])

        >>> print(ex2.statistics['MeanByDox'])
                    Y2-A
        Dox
        1.0    19.805601
        10.0  446.981927
    """

    id = Constant('cytoflow.operations.channel_statistic')
    friendly_id = Constant("Channel Statistic")

    name = Str
    channel = Str
    function = Callable
    by = List(Str)
    subset = Str

    def apply(self, experiment):
        """
        Apply the operation to an `Experiment`.

        Parameters
        ----------
        experiment
            The `Experiment` to apply this operation to.

        Returns
        -------
        Experiment
            A new `Experiment`, containing a new entry in
            `Experiment.statistics`. The key of the new entry
            is ``name``.

        Raises
        ------
        cytoflow.utility.CytoflowOpError
            If `name`, `channel`, `function` or `by` is unset or invalid; if
            `name` is already a statistic of ``experiment``; if `subset` is
            not a valid query or selects no events; if `function` raises,
            returns something other than a float / float-castable value /
            float `pandas.Series`, returns different types for different
            groups, or produces a ``NaN``.

        Warns
        -----
        cytoflow.utility.CytoflowOpWarning
            If one of the conditions in `by` has only one unique value.
        """

        # ---- validate the operation's parameters ---------------------------
        # The first argument of CytoflowOpError is the name of the offending
        # trait, so it must match the trait names declared on this class.
        if not self.name:
            raise util.CytoflowOpError('name', "Must specify a name")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name '{}' can only contain letters, "
                                       "numbers and underscores."
                                       .format(self.name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "Must specify a channel")

        if not self.function:
            raise util.CytoflowOpError('function', "Must specify a function")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Channel {} not found in the experiment"
                                       .format(self.channel))

        if not self.by:
            raise util.CytoflowOpError('by',
                                       "Must specify some grouping conditions "
                                       "in 'by'")

        if self.name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(self.name))

        # Clone *before* subsetting: the returned experiment is derived from
        # the input as passed in, while `subset` only restricts which events
        # are fed to `function`.
        new_experiment = experiment.clone(deep = False)

        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except Exception as exc:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(self.subset)) from exc

            # Without events there are no groups, and the statistic would
            # silently end up as ``None``.
            if len(experiment.data) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(self.subset))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

            unique = experiment.data[b].unique()
            if len(unique) == 1:
                warn("Only one category for {}".format(b), util.CytoflowOpWarning)

        # ---- compute the statistic -----------------------------------------
        groupby = experiment.data.groupby(self.by, observed = True)

        # Group keys may be scalars or tuples; MultiIndex.from_tuples needs
        # tuples throughout.
        keys = [x if isinstance(x, tuple) else (x,)
                for x in groupby.groups.keys()]
        idx = pd.MultiIndex.from_tuples(keys, names = self.by)

        # `stat` is allocated lazily, because its columns depend on what the
        # first call to `function` returns.
        stat = None
        first_v = None

        for group, data_subset in groupby:
            try:
                v = self.function(data_subset[self.channel])
            except Exception as e:
                raise util.CytoflowOpError(None,
                                           "Your function threw an error in group {}"
                                           .format(group)) from e

            # Normalize scalar-like results (numpy scalars, ints, ...) to a
            # plain float; anything that can't be cast must be a Series.
            try:
                v = float(v)
            except (TypeError, ValueError) as e:
                if not isinstance(v, pd.Series):
                    raise util.CytoflowOpError(None,
                                               "Your function returned a {}. It must return "
                                               "a float, a value that can be cast to float, "
                                               "or a pandas.Series (with type float)"
                                               .format(type(v))) from e

            if isinstance(v, pd.Series) and v.dtype.kind != 'f':
                raise util.CytoflowOpError(None,
                                           "Your function returned a pandas.Series with dtype {}. "
                                           "If it returns a Series, the data must be floating point."
                                           .format(v.dtype))

            if stat is None:
                # At this point `v` is either a float or a float Series.
                if isinstance(v, float):
                    columns = [self.channel]
                else:
                    columns = v.index.tolist()

                # Pre-fill with NaN so a missing or misaligned value is
                # caught by the NaN check below.
                stat = pd.DataFrame(data = np.full((len(idx), len(columns)), np.nan),
                                    index = idx,
                                    columns = columns,
                                    dtype = 'float').sort_index()
                first_v = v

            if type(v) != type(first_v):
                raise util.CytoflowOpError(None,
                                           "The first call to your function returned a {}, "
                                           "but calling it on group {} returned a {}"
                                           .format(type(first_v), group, type(v)))

            stat.loc[group] = v

            # fail on NaNs.
            if stat.loc[group].isna().any():
                raise util.CytoflowOpError(None,
                                           "Calling function on category {} returned {} "
                                           "which contains NaN".format(group, stat.loc[group]))

        # The loop body never ran (e.g. no events, or no usable group keys):
        # refuse to store ``None`` as a statistic.
        if stat is None:
            raise util.CytoflowOpError(None,
                                       "No groups were found when grouping by {}, "
                                       "so the statistic could not be computed"
                                       .format(self.by))

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        new_experiment.statistics[self.name] = stat

        return new_experiment