Source code for cytoflow.experiment

#!/usr/bin/env python3.4
# coding: latin-1

# (c) Massachusetts Institute of Technology 2015-2018
# (c) Brian Teague 2018-2019
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import pandas as pd
from pandas.api.types import CategoricalDtype, is_categorical_dtype
from traits.api import (HasStrictTraits, Dict, List, Instance, Str, Any,
                       Property, Tuple)

import cytoflow.utility as util

class Experiment(HasStrictTraits):
    """
    An Experiment manages all the data and metadata for a flow experiment.

    An :class:`Experiment` is the central data structure in :mod:`cytoflow`:
    it wraps a :class:`pandas.DataFrame` containing all the data from a flow
    experiment.  Each row in the table is an event.  Each column is either a
    measurement from one of the detectors (or a "derived" measurement such as
    a transformed value or a ratio), or a piece of metadata associated with
    that event: which tube it came from, what the experimental conditions for
    that tube were, gate membership, etc.  The :class:`Experiment` object lets
    you:

    - Add additional metadata to define subpopulations
    - Get events that match a particular metadata signature.

    Additionally, the :class:`Experiment` object manages channel- and
    experiment-level metadata in the :attr:`metadata` attribute, which is a
    dictionary.  This allows the rest of the :mod:`cytoflow` package to track
    and enforce other constraints that are important in doing quantitative
    flow cytometry: for example, every tube must be collected with the same
    channel parameters (such as PMT voltage.)

    .. note::

        :class:`Experiment` is not responsible for enforcing the constraints;
        :class:`.ImportOp` and the other modules are.

    Attributes
    ----------
    data : pandas.DataFrame
        All the events and metadata represented by this experiment.  Each
        event is a row; each column is either a measured channel (eg. a
        fluorescence measurement), a derived channel (eg. the ratio between
        two channels), or a piece of metadata.  Metadata can be either
        experimental conditions (eg. induction level, timepoint) or added by
        operations (eg. gate membership).

    metadata : Dict(Str : Dict(Str : Any))
        Each column in :attr:`data` has an entry in :attr:`metadata` whose
        key is the column name and whose value is a dict of column-specific
        metadata.  Metadata is added by operations, and is occasionally useful
        if modules are expected to work together.  See individual operations'
        documentation for a list of the metadata that operation adds.  The
        only "required" metadata is ``type``, which can be ``channel`` (if the
        column is a measured channel, or derived from one) or ``condition``
        (if the column is an experimental condition, gate membership, etc.)

        .. warning::

            There may also be experiment-wide entries in :attr:`metadata`
            that are *not* columns in :attr:`data`!

    history : List(IOperation)
        The :class:`.IOperation` operations that have been applied to the raw
        data to result in this :class:`Experiment`.

    statistics : Dict((Str, Str) : pandas.Series)
        The statistics and parameters computed by models that were fit to the
        data.  The key is an ``(Str, Str)`` tuple, where the first ``Str`` is
        the name of the operation that supplied the statistic, and the second
        ``Str`` is the name of the statistic.  The value is a multi-indexed
        :class:`pandas.Series`: each level of the index is a facet, and each
        combination of indices is a subset for which the statistic was
        computed.  The values of the series, of course, are the values of the
        computed parameters or statistics for each subset.

    channels : List(String)
        The channels that this experiment tracks (read-only).

    conditions : Dict(String : pandas.Series)
        The experimental conditions and analysis groups (gate membership, etc)
        that this experiment tracks.  The key is the name of the condition,
        and the value is a :class:`pandas.Series` with that condition's
        possible values.

    Notes
    -----
    The OOP programmer in me desperately wanted to subclass
    :class:`pandas.DataFrame`, add some flow-specific stuff, and move on with
    my life.  (I may still, with something like the ``pandas-composition``
    package.)  A few things get in the way of directly subclassing
    :class:`pandas.DataFrame`:

    - First, to enable some of the delicious syntactic sugar for accessing
      its contents, :class:`pandas.DataFrame` redefines attribute access
      (:meth:`__getattr__` and :meth:`__setattr__`), and making it recognize
      (and maintain across copies) additional attributes is an unsupported
      (non-public) API feature and introduces other subclassing weirdness.

    - Second, many of the operations (like appending!) don't happen
      in-place; they return copies instead.  It's cleaner to simply manage
      that copying ourselves instead of making the client deal with it.  We
      can pretend to operate on the data in-place.

    To maintain the ease of use, we'll override :meth:`__getitem__` and pass
    it to the wrapped :class:`pandas.DataFrame`.  We'll do the same with some
    of the more useful :class:`~pandas.DataFrame` API pieces (like
    :meth:`query`); and of course, you can just get the data frame itself
    with :attr:`data`.

    Examples
    --------
    >>> import cytoflow as flow
    >>> tube1 = flow.Tube(file = 'cytoflow/tests/data/Plate01/RFP_Well_A3.fcs',
    ...                   conditions = {"Dox" : 10.0})
    >>> tube2 = flow.Tube(file='cytoflow/tests/data/Plate01/CFP_Well_A4.fcs',
    ...                   conditions = {"Dox" : 1.0})
    >>>
    >>> import_op = flow.ImportOp(conditions = {"Dox" : "float"},
    ...                           tubes = [tube1, tube2])
    >>>
    >>> ex = import_op.apply()
    >>> ex.data.shape
    (20000, 17)
    >>> ex.data.groupby(['Dox']).size()
    Dox
    1     10000
    10    10000
    dtype: int64
    """

    # this doesn't play nice with copy.copy(); clone it ourselves.
    data = Instance(pd.DataFrame, args=())

    # potentially mutable.  deep copy required
    metadata = Dict(Str, Any, copy="deep")

    # statistics.  mutable, deep copy required
    statistics = Dict(Tuple(Str, Str), pd.Series, copy="deep")

    history = List(Any)

    # read-only views computed from `data` and `metadata`
    channels = Property(List)
    conditions = Property(Dict)

    def __getitem__(self, key):
        """Forward ``ex[key]`` to the wrapped :class:`pandas.DataFrame`."""
        return self.data.__getitem__(key)

    def __setitem__(self, key, value):
        """
        Forward ``ex[key] = value`` to the wrapped
        :class:`pandas.DataFrame`, replacing any existing column ``key``.
        """
        # Drop an existing column first so the assignment creates it afresh
        # (presumably so the old column's dtype does not linger -- TODO
        # confirm the original motivation).
        if key in self.data:
            self.data.drop(key, axis='columns', inplace=True)

        return self.data.__setitem__(key, value)

    def __len__(self):
        """Return the number of events (rows) in :attr:`data`."""
        return len(self.data)

    def _get_channels(self):
        """Getter for the `channels` property: sorted channel column names."""
        return sorted(x for x in self.data
                      if self.metadata[x]['type'] == "channel")

    def _get_conditions(self):
        """
        Getter for the `conditions` property: maps each condition column
        name to a sorted :class:`pandas.Series` of its unique values.
        """
        return {x: pd.Series(self.data[x].unique().copy()).sort_values()
                for x in self.data
                if self.metadata[x]['type'] == "condition"}

    def subset(self, conditions, values):
        """
        Returns a subset of this experiment including only the events where
        each condition in ``conditions`` equals the corresponding value in
        ``values``.

        Parameters
        ----------
        conditions : Str or Tuple(Str)
            A condition or list of conditions

        values : Any or Tuple(Any)
            The value(s) of the condition(s)

        Returns
        -------
        Experiment
            A new :class:`Experiment` containing only the events specified in
            ``conditions`` and ``values``.

        Raises
        ------
        :exc:`.CytoflowError`
            If a name in ``conditions`` is not a condition of this
            experiment, or a value is not one of that condition's values.
        """
        single = isinstance(conditions, str)
        pairs = [(conditions, values)] if single else list(zip(conditions, values))

        # the `conditions` property is rebuilt on every access; read it once
        known = self.conditions
        for c, v in pairs:
            if c not in known:
                raise util.CytoflowError("{} is not a condition".format(c))
            if v not in list(known[c]):
                raise util.CytoflowError("{} is not a value of condition {}"
                                         .format(v, c))

        # Recent pandas treats a tuple passed to groupby() as ONE key, so
        # multiple conditions are passed as a list and the group is looked up
        # with a tuple.
        if single:
            by, group_key = conditions, values
        else:
            by, group_key = list(conditions), tuple(values)

        ret = self.clone()
        ret.data = self.data.groupby(by).get_group(group_key)
        ret.data.reset_index(drop=True, inplace=True)

        return ret

    def query(self, expr, **kwargs):
        """
        Return an experiment whose data is a subset of this one where
        ``expr`` evaluates to ``True``.

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore ``_``.  So,
        the column name ``a column`` becomes ``a_column``, and can be queried
        with an ``a_column == True`` or such.

        Parameters
        ----------
        expr : string
            The expression to pass to :meth:`pandas.DataFrame.query`.  Must be
            a valid Python expression, something you could pass to
            :func:`eval`.

        **kwargs : dict
            Other named parameters to pass to
            :meth:`pandas.DataFrame.query`.

        Returns
        -------
        Experiment
            A new :class:`Experiment`, a clone of this one with the data
            returned by :meth:`pandas.DataFrame.query()`

        Raises
        ------
        :exc:`.CytoflowError`
            If two column names sanitize to the same identifier, or if no
            events match ``expr``.
        """
        resolvers = {}
        for name, col in self.data.items():
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                raise util.CytoflowError("Tried to sanitize column name {0} to "
                                         "{1} but it already existed in the "
                                         "DataFrame."
                                         .format(name, new_name))
            resolvers[new_name] = col

        ret = self.clone()
        ret.data = self.data.query(expr, resolvers=({}, resolvers), **kwargs)
        ret.data.reset_index(drop=True, inplace=True)

        if len(ret.data) == 0:
            raise util.CytoflowError("No events matched {}".format(expr))

        return ret

    def clone(self):
        """
        Create a copy of this :class:`Experiment`.

        Traits are copied with ``clone_traits()``; :attr:`data` is a shallow
        copy of the data frame and :attr:`history` is a shallow copy of the
        list.
        """
        new_exp = self.clone_traits()
        new_exp.data = self.data.copy(deep=False)

        # shallow copy of the history
        new_exp.history = self.history[:]

        return new_exp

    def _check_new_column(self, name, data):
        """Raise :exc:`.CytoflowError` unless ``data`` can become column ``name``."""
        if name in self.data:
            raise util.CytoflowError("Already a column named {0} in self.data"
                                     .format(name))

        if data is None and len(self) > 0:
            raise util.CytoflowError("If data is None, self.data must be empty!")

        if data is not None and len(self) != len(data):
            raise util.CytoflowError("data must be the same length as self.data")

    def add_condition(self, name, dtype, data=None):
        """
        Add a new column of per-event metadata to this :class:`Experiment`.

        .. note::

            :meth:`add_condition` operates **in place.**

        There are two places to call `add_condition`.

        - As you're setting up a new :class:`Experiment`, call
          :meth:`add_condition` with ``data`` set to ``None`` to specify the
          conditions the new events will have.

        - If you compute some new per-event metadata on an existing
          :class:`Experiment`, call :meth:`add_condition` to add it.

        Parameters
        ----------
        name : String
            The name of the new column in :attr:`data`.  Must be a valid
            Python identifier: must start with ``[A-Za-z_]`` and contain only
            the characters ``[A-Za-z0-9_]``.

        dtype : String
            The type of the new column in :attr:`data`.  Must be a string that
            :class:`pandas.Series` recognizes as a ``dtype``: common types are
            ``category``, ``float``, ``int``, and ``bool``.

        data : pandas.Series (default = None)
            The :class:`pandas.Series` to add to :attr:`data`.  Must be the
            same length as :attr:`data`, and it must be convertible to a
            :class:`pandas.Series` of type ``dtype``.  If ``None``, will add
            an empty column to the :class:`Experiment` ... but the
            :class:`Experiment` must be empty to do so!

        Raises
        ------
        :exc:`.CytoflowError`
            If ``name`` is not a valid identifier or is already a column, if
            the :class:`pandas.Series` passed in ``data`` isn't the same
            length as :attr:`data`, or isn't convertible to type ``dtype``.

        Examples
        --------
        >>> import cytoflow as flow
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")
        """
        if name != util.sanitize_identifier(name):
            raise util.CytoflowError("Name '{}' is not a valid Python identifier"
                                     .format(name))

        self._check_new_column(name, data)

        try:
            if data is not None:
                self.data[name] = data.astype(dtype, copy=True)
            else:
                self.data[name] = pd.Series(dtype=dtype)
        except (ValueError, TypeError) as exc:
            raise util.CytoflowError("Had trouble converting data to type {0}"
                                     .format(dtype)) from exc

        self.metadata[name] = {}
        self.metadata[name]['type'] = "condition"

    def add_channel(self, name, data=None):
        """
        Add a new column of per-event data (as opposed to metadata) to this
        :class:`Experiment`: ie, something that was measured per cell, or
        derived from per-cell measurements.

        .. note::

            :meth:`add_channel` operates *in place*.

        Parameters
        ----------
        name : String
            The name of the new column to be added to :attr:`data`.

        data : pandas.Series
            The :class:`pandas.Series` to add to :attr:`data`.  Must be the
            same length as :attr:`data`, and it must be convertible to a
            dtype of ``float64``.  If ``None``, will add an empty column to
            the :class:`Experiment` ... but the :class:`Experiment` must be
            empty to do so!

        Raises
        ------
        :exc:`.CytoflowError`
            If ``name`` is already a column, if the :class:`pandas.Series`
            passed in ``data`` isn't the same length as :attr:`data`, or isn't
            convertible to a dtype ``float64``.

        Examples
        --------
        >>> ex.add_channel("FSC_over_2", ex.data["FSC-A"] / 2.0)
        """
        self._check_new_column(name, data)

        try:
            if data is not None:
                self.data[name] = data.astype("float64", copy=True)
            else:
                self.data[name] = pd.Series(dtype="float64")
        except (ValueError, TypeError) as exc:
            raise util.CytoflowError("Had trouble converting data to type \"float64\"") from exc

        self.metadata[name] = {}
        self.metadata[name]['type'] = "channel"

    def add_events(self, data, conditions):
        """
        Add new events to this :class:`Experiment`.

        Each new event in ``data`` is appended to :attr:`data`, and its
        per-event metadata columns will be set with the values specified in
        ``conditions``.  Thus, it is particularly useful for adding tubes of
        data to new experiments, before additional per-event metadata is added
        by gates, etc.

        .. note::

            *Every* column in :attr:`data` must be accounted for.  Each
            column of type ``channel`` must appear in ``data``; each column of
            metadata must have a key:value pair in ``conditions``.

        Parameters
        ----------
        data : pandas.DataFrame
            A single tube or well's worth of data.  Must be a DataFrame with
            the same columns as :attr:`channels`

        conditions : Dict(Str, Any)
            A dictionary of the tube's metadata.  The keys must match
            :attr:`conditions`, and the values must be coercible to the
            relevant ``numpy`` dtype.

        Raises
        ------
        :exc:`.CytoflowError`
            :meth:`add_events` pukes if:

            - there are columns in ``data`` that aren't channels in the
              experiment, or vice versa.
            - there are keys in ``conditions`` that aren't conditions in the
              experiment, or vice versa.
            - there is metadata specified in ``conditions`` that can't be
              converted to the corresponding metadata ``dtype``.

        Examples
        --------
        >>> import cytoflow as flow
        >>> import fcsparser
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")
        >>> tube1, _ = fcsparser.parse('CFP_Well_A4.fcs')
        >>> tube2, _ = fcsparser.parse('RFP_Well_A3.fcs')
        >>> ex.add_events(tube1, {"Time" : 1, "Strain" : "BL21"})
        >>> ex.add_events(tube2, {"Time" : 1, "Strain" : "Top10G"})
        """
        # make sure the new tube's channels match the rest of the
        # channels in the Experiment
        if len(self) > 0 and set(data.columns) != set(self.channels):
            raise util.CytoflowError("New events don't have the same channels")

        # check that the conditions for this tube exist in the experiment
        # already.  the `conditions` property is rebuilt on every access, so
        # read it once and reuse it below.
        known_conditions = self.conditions
        if set(conditions) != set(known_conditions):
            raise util.CytoflowError("Metadata for this tube should be {}"
                                     .format(list(known_conditions.keys())))

        # add the conditions to tube's internal data frame.  specify the
        # conditions dtype using self.conditions.  check for errors as we do
        # so.

        # take this chance to up-convert the float32s to float64.
        # this happened automatically in DataFrame.append(), but
        # only in certain cases.... :-/

        # TODO - the FCS standard says you can specify the precision.
        # check with int/float/double files!
        new_data = data.astype("float64", copy=True)

        for meta_name, meta_value in conditions.items():
            meta_type = known_conditions[meta_name].dtype
            is_categorical = isinstance(meta_type, CategoricalDtype)

            if is_categorical:
                # the new tube only knows about its own value; categories are
                # merged with the experiment's below
                meta_type = CategoricalDtype([meta_value])

            try:
                new_data[meta_name] = pd.Series(data=[meta_value] * len(new_data),
                                                index=new_data.index,
                                                dtype=meta_type)
            except (ValueError, TypeError) as exc:
                raise util.CytoflowError("Had trouble converting condition {0} "
                                         "value {1} to type {2}"
                                         .format(meta_name, meta_value, meta_type)) from exc

            # if we're categorical, merge the categories.  build an ordered
            # list (existing categories first) so both columns end up with
            # the same, deterministic category order.
            if is_categorical and meta_name in self.data:
                cats = list(self.data[meta_name].cat.categories)
                cats.extend(c for c in new_data[meta_name].cat.categories
                            if c not in cats)

                self.data[meta_name] = self.data[meta_name].cat.set_categories(cats)
                new_data[meta_name] = new_data[meta_name].cat.set_categories(cats)

        # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
        # equivalent
        self.data = pd.concat([self.data, new_data], ignore_index=True, sort=True)
if __name__ == "__main__":
    # Ad-hoc smoke test: build an experiment from a few FCS files.
    # fcsparser is imported here because only this script entry point uses it.
    import fcsparser

    ex = Experiment()
    # Experiment has no add_conditions()/add_tube(); the methods are
    # add_condition(name, dtype) and add_events(data, conditions).
    ex.add_condition("time", "category")

    tube0, _ = fcsparser.parse('../cytoflow/tests/data/tasbe/BEADS-1_H7_H07_P3.fcs')
    tube1, _ = fcsparser.parse('../cytoflow/tests/data/tasbe/beads.fcs')
    tube2, _ = fcsparser.parse('../cytoflow/tests/data/Plate01/RFP_Well_A3.fcs')

    ex.add_events(tube1, {"time": "one"})
    ex.add_events(tube2, {"time": "two"})