Source code for pymaid.cluster

#    This script is part of pymaid (http://www.github.com/navis-org/pymaid).
#    Copyright (C) 2017 Philipp Schlegel
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

import os
import json
import colorsys
import math

import numpy as np
import pandas as pd
import navis as ns
import scipy.cluster.hierarchy
import scipy.spatial

from concurrent.futures import ThreadPoolExecutor

from . import fetch, core, utils, config

# Set up logging
logger = config.get_logger(__name__)

__all__ = sorted(['cluster_by_connectivity', 'cluster_by_synapse_placement',
                  'cluster_xyz', 'ClustResults'])


def cluster_by_connectivity(x, similarity='vertex_normalized', upstream=True,
                            downstream=True, threshold=1, include_skids=None,
                            exclude_skids=None, min_nodes=2,
                            connectivity_table=None, cluster_kws={},
                            skip_missing=True, remote_instance=None):
    r"""Cluster neurons based on connectivity.

    This function offers a selection of metrics to compare connectivity:

    .. list-table::
       :widths: 15 75
       :header-rows: 1

       * - Metric
         - Explanation
       * - matching_index
         - Number of shared partners divided by total number of partners.
       * - matching_index_synapses
         - Number of shared synapses (i.e. number of connections from/onto
           the same partners) divided by total number of synapses.
           Attention: this metric is tricky when there is a disparity
           between the total number of connections of neuron A and neuron B.
           For example, consider 100/200 and 1/50 shared/total synapses:
           101/250 results in a fairly high matching index of 0.404.
       * - matching_index_weighted_synapses
         - Similar to *matching_index_synapses* but slightly less prone to
           the above error as it uses the percentage of shared synapses:

           .. math::

               S = \frac{\text{NeuronA}_{\text{ shared synapses}}}{\text{NeuronA}_{\text{ total synapses}}} \times \frac{\text{NeuronB}_{\text{ shared synapses}}}{\text{NeuronB}_{\text{ total synapses}}}
       * - vertex
         - Matching index that rewards shared and punishes non-shared
           partners. Based on
           `Jarrell et al., 2012 <http://science.sciencemag.org/content/337/6093/437>`_:

           .. math::

               f(x,y) = \min(x,y) - C1 \times \max(x,y) \times \exp(-C2 \times \min(x,y))

           The final score is the sum of :math:`f(x,y)` over all edges x, y
           between neurons A+B and their partners. C1 determines how
           negatively a case where one edge is much stronger than the other
           is punished. C2 determines the point where the similarity
           switches from negative to positive. C1 and C2 default to 0.5 and
           1, respectively, but can be changed by passing them in a
           dictionary as ``cluster_kws``.
       * - vertex_normalized
         - This is *vertex* similarity normalized by the lowest (total
           dissimilarity) and highest (all edge weights the same)
           achievable score.

    Parameters
    ----------
    x
                        Neurons as single or list of either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object
    similarity :        'matching_index' | 'matching_index_synapses' | 'matching_index_weighted_synapses' | 'vertex' | 'vertex_normalized', optional
                        Metric used to compare connectivity. See the table
                        above for a detailed explanation.
    upstream :          bool, optional
                        If True, upstream partners will be considered.
    downstream :        bool, optional
                        If True, downstream partners will be considered.
    threshold :         int, optional
                        Only partners with at least this many synapses are
                        used for comparison. This threshold applies to the
                        *total* number of connections from/to all neurons
                        in ``x``.
    min_nodes :         int, optional
                        Minimum number of nodes for a partner to be
                        considered.
    include_skids :     see ``x``, optional
                        If provided, only partners whose skeleton IDs are in
                        ``include_skids`` will be considered when
                        calculating the similarity score.
    exclude_skids :     see ``x``, optional
                        Partners to exclude from the calculation of
                        connectivity similarity.
    connectivity_table : pd.DataFrame, optional
                        Connectivity table, e.g. from
                        :func:`~pymaid.get_partners`. If provided, this will
                        be used instead of querying the CATMAID server.
                        Filters still apply!
    skip_missing :      bool, optional
                        If True, neurons without connectivity data (i.e. no
                        up- and/or downstream partners after filtering) will
                        be skipped. If False, they will be kept but will
                        have similarity ``nan``.
    remote_instance :   CatmaidInstance, optional
                        If not passed, will try using globally defined.

    Returns
    -------
    :class:`pymaid.ClustResults`
                        Custom cluster results class holding the distance
                        matrix and wrappers e.g. to plot dendrograms.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> # Cluster a set of neurons by their inputs (ignore small fragments)
    >>> res = pymaid.cluster_by_connectivity('annotation:PBG6 P-EN right',
    ...                                      upstream=True, downstream=False,
    ...                                      threshold=1, min_nodes=500)
    >>> # Get the similarity matrix
    >>> print(res.sim_mat)
    >>> # Plot a dendrogram
    >>> fig = res.plot_dendrogram()
    >>> plt.show()

    """
    remote_instance = utils._eval_remote_instance(remote_instance)

    # Extract skids from CatmaidNeuron, CatmaidNeuronList, DataFrame or Series
    neurons = utils.eval_skids(x, remote_instance=remote_instance)

    # Make sure neurons are strings, not integers
    neurons = [str(n) for n in list(set(neurons))]

    directions = []
    if upstream is True:
        directions.append('upstream')
    if downstream is True:
        directions.append('downstream')

    if isinstance(connectivity_table, type(None)):
        # Retrieve connectivity and apply filters
        connectivity = fetch.get_partners(neurons, directions=directions,
                                          remote_instance=remote_instance,
                                          min_size=min_nodes,
                                          threshold=threshold)
    else:
        connectivity = connectivity_table[(connectivity_table.num_nodes >= min_nodes) &
                                          (connectivity_table.total >= threshold)]

    if not isinstance(include_skids, type(None)) or not isinstance(exclude_skids, type(None)):
        logger.info('Filtering connectivity. '
                    '%i entries before filtering' % (connectivity.shape[0]))

        if not isinstance(include_skids, type(None)):
            connectivity = connectivity[
                connectivity.skeleton_id.isin(utils.eval_skids(include_skids,
                                                               remote_instance=remote_instance))]

        if not isinstance(exclude_skids, type(None)):
            connectivity = connectivity[
                ~connectivity.skeleton_id.isin(utils.eval_skids(exclude_skids,
                                                                remote_instance=remote_instance))]

        logger.info('{} entries after filtering'.format(connectivity.shape[0]))

    # Calculate the number of partners used for the matching score (i.e. the
    # ratio of inputs to outputs). This is AFTER filtering: the total number
    # of partners can be altered!
    n_partners = {n: {r: connectivity[(connectivity[str(n)] > 0) &
                                      (connectivity.relation == r)].shape[0]
                      for r in directions}
                  for n in neurons}

    # Make sure all neurons have connectivity data to calculate similarity
    no_data = [n for n in neurons if sum(n_partners[n].values()) == 0]
    if no_data:
        w = '{} neuron(s) without connectivity data found.'.format(len(no_data))
        if skip_missing:
            w += ' Skipped: {}'.format(', '.join(no_data))
            neurons = list(set(neurons) - set(no_data))
        else:
            w += ' These neurons might have "NaN" similarities.'
        logger.warning(w)

    # Retrieve names
    logger.debug('Retrieving neuron names')
    neuron_names = fetch.get_names(list(set(neurons + connectivity.skeleton_id.tolist())),
                                   remote_instance=remote_instance)

    matching_scores = {}

    if similarity in ['vertex_normalized', 'vertex']:
        vertex_score = True
    else:
        vertex_score = False

    # Calculate connectivity similarity by direction
    for d in directions:
        this_cn = connectivity[connectivity.relation == d]

        # Prepare connectivity subsets
        cn_subsets = {n: this_cn[n] > 0 for n in neurons}

        logger.info('Calculating {} similarity scores'.format(d))
        matching_scores[d] = pd.DataFrame(np.zeros((len(neurons), len(neurons))),
                                          index=neurons, columns=neurons)
        if this_cn.shape[0] == 0:
            logger.warning('No {} partners found: filtered?'.format(d))

        combinations = [(nA, nB, this_cn, vertex_score, cn_subsets[nA],
                         cn_subsets[nB], cluster_kws)
                        for nA in neurons for nB in neurons]

        with ThreadPoolExecutor(max_workers=max(1, os.cpu_count())) as e:
            futures = e.map(_unpack_connectivity_helper, combinations)

            matching_indices = [n for n in config.tqdm(futures,
                                                       total=len(combinations),
                                                       desc=d,
                                                       disable=config.pbar_hide,
                                                       leave=config.pbar_leave)]

        for i, v in enumerate(combinations):
            matching_scores[d].loc[v[0], v[1]] = matching_indices[i][similarity]

    # Attention! Averaging over incoming and outgoing pairing scores will
    # give weird results with - for example - sensory/motor neurons that
    # have predominantly either only up- or downstream partners!
    # To compensate, the ratio of upstream to downstream partners (after
    # applying filters!) is considered.
    # The ratio is applied to neuronA of the A-B comparison -> it will be
    # reversed at the B-A comparison.
    logger.info('Finalizing scores')
    dist_matrix = pd.DataFrame(np.zeros((len(neurons), len(neurons))),
                               index=neurons, columns=neurons)

    for neuronA in neurons:
        for neuronB in neurons:
            if len(directions) == 1:
                dist_matrix[neuronA][neuronB] = matching_scores[directions[0]][neuronA][neuronB]
            else:
                try:
                    r_inputs = n_partners[neuronA]['upstream'] \
                        / (n_partners[neuronA]['upstream'] + n_partners[neuronA]['downstream'])
                    r_outputs = 1 - r_inputs
                except BaseException:
                    logger.warning('Failed to calculate input/output ratio '
                                   'for skeleton ID #{}, assuming 50/50 '
                                   '(probably a division-by-zero error)'.format(neuronA))
                    r_inputs = 0.5
                    r_outputs = 0.5

                dist_matrix[neuronA][neuronB] = matching_scores['upstream'][neuronA][neuronB] * r_inputs \
                    + matching_scores['downstream'][neuronA][neuronB] * r_outputs

    logger.info('All done.')

    # Rename rows and columns
    # dist_matrix.columns = [neuron_names[str(n)] for n in dist_matrix.columns]
    # dist_matrix.index = [neuron_names[str(n)] for n in dist_matrix.index]

    results = ClustResults(dist_matrix,
                           labels=[neuron_names[str(n)] for n in dist_matrix.columns],
                           mat_type='similarity')

    if isinstance(x, core.CatmaidNeuronList):
        results.neurons = x

    return results

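
# The connectivity metrics above reduce to simple arithmetic. A minimal
# sketch with made-up numbers (local variables only, not pymaid API):
# assume neurons A and B have 10 and 6 partners in total, 4 of them shared.
#
# >>> shared, totalA, totalB = 4, 10, 6
# >>> round(shared / (totalA + totalB - shared), 3)  # matching_index
# 0.333
# >>> # Vertex score for one shared partner with edge weights x and y
# >>> # (C1/C2 as in the `cluster_kws` defaults):
# >>> import math
# >>> x, y, C1, C2 = 5, 3, 0.5, 1
# >>> round(min(x, y) - C1 * max(x, y) * math.exp(-C2 * min(x, y)), 3)
# 2.876
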
def _unpack_connectivity_helper(x):
    """Helper function to unpack values from pool."""
    return _calc_connectivity_matching_index(x[0], x[1], x[2],
                                             vertex_score=x[3],
                                             nA_cn=x[4], nB_cn=x[5],
                                             **x[6])


def _calc_connectivity_matching_index(neuronA, neuronB, connectivity,
                                      syn_threshold=1, min_nodes=1,
                                      **kwargs):
    """Calculate various connectivity similarity metrics.

    Parameters
    ----------
    neuronA :       skeleton ID
    neuronB :       skeleton ID
    connectivity :  pandas DataFrame
                    Connectivity data as provided by
                    :func:`pymaid.get_partners`.
    syn_threshold : int, optional
                    Minimum number of synapses for a connection to be
                    considered. Default = 1.
    min_nodes :     int, optional
                    Minimum number of nodes for a partner to be considered.
                    Use this to filter out fragments. Default = 1.
    vertex_score :  bool, optional
                    If False, no vertex score is returned (much faster!).
                    Default = True.
    nA_cn/nB_cn :   list of bools
                    Subsets of the connectivity that connect to either
                    neuronA or neuronB. If not provided, they will be
                    calculated (time consuming!).

    Returns
    -------
    dict
                    Contains all matching indices described below.

    Notes
    -----
    | matching_index = Number of shared partners divided by total
    |                  number of partners.
    | matching_index_synapses = Number of shared synapses divided by total
    |                  number of synapses. Attention! This metric is tricky
    |                  because if neuron A has lots of connections and
    |                  neuron B only few, they will still get a high
    |                  matching index.
    |                  E.g. 100 of 200 / 1 of 50 = 101/250
    |                  -> ``matching index = 0.404``
    | matching_index_weighted_synapses = Similar to matching_index_synapses
    |                  but slightly less prone to the above error: the
    |                  percentage of shared synapses of A multiplied by the
    |                  percentage of shared synapses of B.
    |                  -> value will be between 0 and 1; if neuron B has
    |                  only a small percentage of its connections onto a
    |                  shared partner, the final value will also be small.
    | vertex_normalized = Matching index that rewards shared and punishes
    |                  non-shared partners. Vertex similarity based on
    |                  Jarrell et al., 2012:
    |                  f(x,y) = min(x,y) - C1 * max(x,y) * e^(-C2 * min(x,y))
    |                  x, y = edge weights to compare
    |                  vertex_similarity is the sum of f over all vertices.
    |                  C1 determines how negatively a case where one edge is
    |                  much stronger than another is punished.
    |                  C2 determines the point where the similarity switches
    |                  from negative to positive.

    """
    if min_nodes > 1:
        connectivity = connectivity[connectivity.num_nodes > min_nodes]

    vertex_score = kwargs.get('vertex_score', True)
    nA_cn = kwargs.get('nA_cn', connectivity[neuronA] >= syn_threshold)
    nB_cn = kwargs.get('nB_cn', connectivity[neuronB] >= syn_threshold)

    total = connectivity[nA_cn | nB_cn]
    n_total = total.shape[0]

    shared = connectivity[nA_cn & nB_cn]
    n_shared = shared.shape[0]

    shared_sum = shared.sum()
    n_synapses_sharedA = shared_sum[neuronA]
    n_synapses_sharedB = shared_sum[neuronB]
    n_synapses_shared = n_synapses_sharedA + n_synapses_sharedB

    total_sum = total.sum()
    n_synapses_totalA = total_sum[neuronA]
    n_synapses_totalB = total_sum[neuronB]
    n_synapses_total = n_synapses_totalA + n_synapses_totalB

    # Vertex similarity based on Jarrell et al., 2012:
    # f(x,y) = min(x,y) - C1 * max(x,y) * e^(-C2 * min(x,y))
    # x,y = edge weights to compare
    # vertex_similarity is the sum of f over all vertices
    # C1 determines how negatively a case where one edge is much stronger
    # than another is punished
    # C2 determines the point where the similarity switches from negative
    # to positive
    C1 = kwargs.get('C1', 0.5)
    C2 = kwargs.get('C2', 1)

    vertex_similarity = 0
    max_score = 0

    similarity_indices = {}

    if vertex_score:
        # We only need the columns for neuronA and neuronB
        this_cn = total[[neuronA, neuronB]].astype(float)

        # Get min and max between both neurons
        this_max = np.max(this_cn, axis=1)
        this_min = np.min(this_cn, axis=1)

        # The maximum possible score is when both synapse counts are the
        # same, in which case score = max(x,y) - C1 * max(x,y) * e^(-C2 * max(x,y))
        max_score = this_max - C1 * this_max * np.exp(- C2 * this_max)

        # The smallest possible score is when either synapse count is 0,
        # in which case score = -C1 * max(x,y)
        min_score = -C1 * this_max

        # Implement: f(x,y) = min(x,y) - C1 * max(x,y) * e^(-C2 * min(x,y))
        v_sim = this_min - C1 * this_max * np.exp(- C2 * this_min)

        # Sum over all partners
        vertex_similarity = v_sim.sum()

        similarity_indices['vertex'] = vertex_similarity

        try:
            similarity_indices['vertex_normalized'] = (
                vertex_similarity - min_score.sum()) / (max_score.sum() - min_score.sum())
        except BaseException:
            similarity_indices['vertex_normalized'] = 0

    if n_total != 0:
        similarity_indices['matching_index'] = n_shared / n_total
        similarity_indices['matching_index_synapses'] = n_synapses_shared / n_synapses_total
        if n_synapses_sharedA != 0 and n_synapses_sharedB != 0:
            similarity_indices['matching_index_weighted_synapses'] = (
                n_synapses_sharedA / n_synapses_totalA) * (n_synapses_sharedB / n_synapses_totalB)
        else:
            # If no shared synapses at all:
            similarity_indices['matching_index_weighted_synapses'] = 0
    else:
        similarity_indices['matching_index'] = 0
        similarity_indices['matching_index_synapses'] = 0
        similarity_indices['matching_index_weighted_synapses'] = 0

    return similarity_indices


def _unpack_synapse_helper(x):
    """Helper function to unpack values from pool."""
    return _calc_synapse_similarity(x[0], x[1], x[2], x[3], x[4])


def _calc_synapse_similarity(cnA, cnB, sigma=2000, omega=2000,
                             restrict_cn=None):
    """Calculate synapse similarity score.

    The synapse similarity score is computed by calculating, for each
    synapse of neuron A, (1) the Euclidean distance to the closest synapse
    in neuron B and (2) comparing the synapse density around synapse A and
    B. This is type-sensitive: presynapses will only be matched with
    presynapses, post with post, etc. The formula is described in Schlegel
    et al., eLife (2017).

    Parameters
    ----------
    (cnA, cnB) :    CatmaidNeuron connector tables
    sigma :         int, optional
                    Distance in nanometers that is considered to be "close".
    omega :         int, optional
                    Radius in nanometers over which to calculate synapse
                    density.

    Returns
    -------
    synapse_similarity_score

    """
    all_values = []

    # Get the connector types that we want to compare between neuron A and B
    if isinstance(restrict_cn, type(None)):
        # If no restrictions, get all connector types in neuron A
        cn_to_check = cnA.type.unique()
    else:
        # Intersect restricted connectors and actually available types
        cn_to_check = set(cnA.type.unique()) & set(restrict_cn)

    # Iterate over all types of connectors
    for r in cn_to_check:
        # If neuron B does not have this synapse type at all, all of neuron
        # A's synapses of this type score 0
        if cnB[cnB.type == r].empty:
            all_values += [0] * cnA[cnA.type == r].shape[0]
            continue

        # Get inter-neuron distance matrix
        dist_mat = scipy.spatial.distance.cdist(cnA[cnA.type == r][['x', 'y', 'z']],
                                                cnB[cnB.type == r][['x', 'y', 'z']])

        # Get index of closest synapse in neuron B
        closest_ix = np.argmin(dist_mat, axis=1)

        # Get closest distances
        closest_dist = dist_mat.min(axis=1)

        # Get intra-neuron matrices for synapse density checking
        distA = scipy.spatial.distance.pdist(cnA[cnA.type == r][['x', 'y', 'z']])
        distA = scipy.spatial.distance.squareform(distA)
        distB = scipy.spatial.distance.pdist(cnB[cnB.type == r][['x', 'y', 'z']])
        distB = scipy.spatial.distance.squareform(distB)

        # Calculate the number of synapses closer than OMEGA.
        # Note: this does count the synapse itself!
        closeA = (distA <= omega).sum(axis=1)
        closeB = (distB <= omega).sum(axis=1)

        # Now calculate the scores over all synapses
        for a in range(distA.shape[0]):
            this_synapse_value = math.exp(-1 * math.fabs(closeA[a] - closeB[closest_ix[a]])
                                          / (closeA[a] + closeB[closest_ix[a]])) \
                * math.exp(-1 * (closest_dist[a]**2) / (2 * sigma**2))
            all_values.append(this_synapse_value)

    score = sum(all_values) / len(all_values)

    return score

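
# A minimal sketch of the per-synapse score above, with made-up values
# (local variables only, not pymaid API). For one synapse of A whose
# closest same-type synapse in B is 1000 nm away, with 5 vs 3 synapses
# within omega of s and k, respectively:
#
# >>> import math
# >>> d, sigma = 1000, 2000   # distance (nm) and "close" scale
# >>> nA, nB = 5, 3           # synapse counts within omega
# >>> f = math.exp(-abs(nA - nB) / (nA + nB)) * math.exp(-d**2 / (2 * sigma**2))
# >>> round(f, 3)             # close in space, similar density -> high score
# 0.687
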
def cluster_by_synapse_placement(x, sigma=2000, omega=2000, mu_score=True,
                                 restrict_cn=None, remote_instance=None):
    """Cluster neurons based on their synapse placement.

    The distance score is calculated by computing, for each synapse of
    neuron A, (1) the Euclidean distance to the closest synapse in neuron B
    and (2) comparing the synapse density around synapse A and B. This is
    type-sensitive: presynapses will only be matched with presynapses, post
    with post, etc. The formula is described in
    `Schlegel et al., eLife (2017) <https://elifesciences.org/articles/16799>`_:

    .. math::

        f(i_{s},j_{k}) = \\exp(\\frac{-d^{2}_{sk}}{2\\sigma^{2}}) \\exp(-\\frac{|n(i_{s})-n(j_{k})|}{n(i_{s})+n(j_{k})})

    The synapse similarity score for neurons i and j is the average of
    :math:`f(i_{s},j_{k})` over all synapses s of i. Synapse k is the
    closest synapse of the same sign (pre/post) in neuron j to synapse s.
    :math:`d^{2}_{sk}` is the squared Euclidean distance between synapses
    s and k. Variable :math:`\\sigma` (``sigma``) determines which distance
    between s and k is considered "close". :math:`n(i_{s})` and
    :math:`n(j_{k})` are defined as the number of synapses of neuron i/j
    that are within the given radius :math:`\\omega` (``omega``) of synapse
    s and k, respectively (same sign only). This ensures that in cases of a
    strong disparity between :math:`n(i_{s})` and :math:`n(j_{k})`, the
    synapse similarity will be close to zero even if the distance between
    s and k is very small.

    Parameters
    ----------
    x
                        Neurons as single or list of either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object
    sigma :             int, optional
                        Distance in nanometers between synapses that is
                        considered to be "close".
    omega :             int, optional
                        Radius in nanometers over which to calculate synapse
                        density.
    mu_score :          bool, optional
                        If True, the score is calculated as the mean of the
                        A->B and B->A comparisons.
    restrict_cn :       int | list | None, optional
                        Restrict to given connector types:

                        - 0: presynapses
                        - 1: postsynapses
                        - 2: gap junctions
                        - 3: abutting connectors

                        If None, will use all connectors. Use either a
                        single integer or a list, e.g. ``restrict_cn=[0, 1]``
                        to use only pre- and postsynapses.
    remote_instance :   CatmaidInstance, optional
                        If not passed, will try using globally defined.
                        Needs to be provided if neurons are only skeleton
                        IDs or annotation(s).

    Returns
    -------
    :class:`~pymaid.ClustResults`
                        Object that contains the distance matrix and methods
                        to plot dendrograms.
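
    Examples
    --------
    A minimal, hypothetical example (assumes an annotation
    'glomerulus DA1' exists on your CATMAID server):

    >>> import matplotlib.pyplot as plt
    >>> nl = pymaid.get_neuron('annotation:glomerulus DA1')
    >>> # Compare placement of pre- (0) and postsynapses (1) only
    >>> res = pymaid.cluster_by_synapse_placement(nl, restrict_cn=[0, 1])
    >>> fig = res.plot_dendrogram()
    >>> plt.show()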
""" if not isinstance(x, core.CatmaidNeuronList): remote_instance = utils._eval_remote_instance(remote_instance) neurons = fetch.get_neuron(x, remote_instance=remote_instance) else: neurons = x # If single value, turn into list if not isinstance(restrict_cn, (type(None), list, set, np.ndarray)): restrict_cn = [restrict_cn] sim_matrix = pd.DataFrame(np.zeros((len(neurons), len(neurons))), index=neurons.skeleton_id, columns=neurons.skeleton_id) combinations = [(nA.connectors, nB.connectors, sigma, omega, restrict_cn) for nA in neurons for nB in neurons] comb_skids = [(nA.skeleton_id, nB.skeleton_id) for nA in neurons for nB in neurons] with ThreadPoolExecutor(max_workers=max(1, os.cpu_count())) as e: futures = e.map(_unpack_synapse_helper, combinations) scores = [n for n in config.tqdm(futures, total=len(combinations), desc='Processing', disable=config.pbar_hide, leave=config.pbar_leave)] for i, v in enumerate(combinations): sim_matrix.loc[comb_skids[i][0], comb_skids[i][1]] = scores[i] if mu_score: sim_matrix = (sim_matrix + sim_matrix.T) / 2 res = ClustResults(sim_matrix, mat_type='similarity', labels=[ neurons.skid[str(s)].neuron_name for s in sim_matrix.columns]) res.neurons = neurons return res
def cluster_xyz(x, labels=None):
    """Thin wrapper for ``scipy.spatial.distance``.

    Takes a list of x,y,z coordinates and calculates the Euclidean distance
    matrix.

    Parameters
    ----------
    x :         pandas.DataFrame
                Must contain ``x``, ``y``, ``z`` columns.
    labels :    list of str, optional
                Labels for each leaf of the dendrogram (e.g. connector IDs).

    Returns
    -------
    :class:`pymaid.ClustResults`
                Contains the distance matrix and methods to generate plots.

    Examples
    --------
    This example assumes you understand the basics of using pymaid:

    >>> import matplotlib.pyplot as plt
    >>> n = pymaid.get_neuron(16)
    >>> rs = pymaid.cluster_xyz(n.connectors,
    ...                         labels=n.connectors.connector_id.values)
    >>> rs.plot_matrix()
    >>> plt.show()

    """
    # Generate numpy array containing x, y, z coordinates
    try:
        s = x[['x', 'y', 'z']].values
    except BaseException:
        logger.error('Please provide a DataFrame with x/y/z columns, e.g. '
                     'the connector table of a single neuron')
        return

    # Calculate Euclidean distance matrix
    condensed_dist_mat = scipy.spatial.distance.pdist(s, 'euclidean')
    squared_dist_mat = scipy.spatial.distance.squareform(condensed_dist_mat)

    return ClustResults(squared_dist_mat, labels=labels, mat_type='distance')

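
# Building on the docstring example above, the result can be cut into
# spatial groups. A hypothetical continuation (labels are passed as a
# list here so that ``get_clusters`` can return them; assumes the neuron
# has at least three connectors):
#
# >>> rs = pymaid.cluster_xyz(n.connectors,
# ...                         labels=n.connectors.connector_id.tolist())
# >>> rs.cluster(method='ward')            # ward is also the default
# >>> groups = rs.get_clusters(3, criterion='maxclust')
# >>> len(groups)
# 3
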
class ClustResults:
    """Class to handle, analyze and plot similarity/distance matrices.

    Contains thin wrappers for ``scipy.cluster``.

    Attributes
    ----------
    dist_mat :  Distance matrix (0=similar, 1=dissimilar)
    sim_mat :   Similarity matrix (0=dissimilar, 1=similar)
    linkage :   Hierarchical clustering. Run
                :func:`pymaid.ClustResults.cluster` to generate linkage.
                By default, Ward's algorithm is used.
    leafs :     list of skids

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> # Get a bunch of neurons
    >>> nl = pymaid.get_neuron('annotation:glomerulus DA1')
    >>> # Perform connectivity clustering
    >>> res = pymaid.cluster_by_connectivity(nl)
    >>> # `res` is a ClustResults object with handy methods:
    >>> res.plot_matrix()
    >>> plt.show()
    >>> # Extract 5 clusters
    >>> res.get_clusters(5, criterion='maxclust')

    """

    _PERM_MAT_TYPES = ['similarity', 'distance']
    def __init__(self, mat, labels=None, mat_type='distance'):
        """Initialize class instance.

        Parameters
        ----------
        mat :       numpy.array | pandas.DataFrame
                    Distance or similarity matrix.
        labels :    list, optional
                    Labels for matrix.
        mat_type :  'distance' | 'similarity', default = 'distance'
                    Sets the type of input matrix:

                    - 'similarity' = high values are more similar
                    - 'distance' = low values are more similar

                    The "missing" matrix type will be computed. For
                    clustering, plotting, etc. distance matrices are used.

        """
        if mat_type not in ClustResults._PERM_MAT_TYPES:
            raise ValueError('Matrix type "{0}" unknown.'.format(mat_type))

        if mat_type == 'similarity':
            self.dist_mat = self._invert_mat(mat)
            self.sim_mat = mat
        else:
            self.dist_mat = mat
            self.sim_mat = self._invert_mat(mat)

        self.labels = labels
        self.mat_type = mat_type

        if isinstance(labels, type(None)) and isinstance(mat, pd.DataFrame):
            self.labels = mat.columns.tolist()
    def __getattr__(self, key):
        if key == 'linkage':
            self.cluster()
            return self.linkage
        elif key == 'condensed_dist_mat':
            return scipy.spatial.distance.squareform(self.dist_mat,
                                                     checks=False)
        elif key in ['leafs', 'leaves']:
            return self.get_leafs()
        elif key == 'cophenet':
            return self.calc_cophenet()
        elif key == 'agg_coeff':
            return self.calc_agg_coeff()
        # Without this, unknown attributes would silently return None
        raise AttributeError('Unknown attribute: "{}"'.format(key))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return 'ClustResults of {} items at {}'.format(self.sim_mat.shape[0],
                                                       hex(id(self)))

    def get_leafs(self, use_labels=False):
        """Retrieve leaf labels.

        Parameters
        ----------
        use_labels :    bool, optional
                        If True, ``self.labels`` will be returned. If False,
                        will use either columns (if the matrix is a pandas
                        DataFrame) or indices (if the matrix is a
                        np.ndarray).

        """
        if isinstance(self.dist_mat, pd.DataFrame):
            if use_labels:
                return [self.labels[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
            else:
                return [self.dist_mat.columns.tolist()[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
        else:
            return scipy.cluster.hierarchy.leaves_list(self.linkage)

    def calc_cophenet(self):
        """Return the cophenetic correlation coefficient of the clustering.

        This (very briefly) correlates the actual pairwise distances of all
        samples with those implied by the hierarchical clustering. The
        closer the value is to 1, the better the clustering preserves the
        original distances.

        """
        return scipy.cluster.hierarchy.cophenet(self.linkage,
                                                self.condensed_dist_mat)

    def calc_agg_coeff(self):
        """Return the agglomerative coefficient.

        This measures the clustering structure of the linkage matrix.
        Because it grows with the number of observations, this measure
        should not be used to compare datasets of very different sizes.

        For each observation i, denote by m(i) its dissimilarity to the
        first cluster it is merged with, divided by the dissimilarity of
        the merger in the final step of the algorithm. The agglomerative
        coefficient is the average of all 1 - m(i).

        """
        # Turn into pandas DataFrame for fancy indexing
        Z = pd.DataFrame(self.linkage,
                         columns=['obs1', 'obs2', 'dist', 'n_org'])

        # Get all distances at which an original observation is merged
        all_dist = Z[(Z.obs1.isin(self.leafs)) | (Z.obs2.isin(self.leafs))].dist.values

        # Divide all distances by the last merger
        all_dist /= self.linkage[-1][2]

        # Calculate the final coefficient
        coeff = np.mean(1 - all_dist)

        return coeff

    def _invert_mat(self, sim_mat):
        """Invert matrix."""
        if isinstance(sim_mat, pd.DataFrame):
            return (sim_mat - sim_mat.max().max()) * -1
        else:
            return (sim_mat - sim_mat.max()) * -1

    def cluster(self, method='ward'):
        """Cluster distance matrix.

        This will automatically be called when the ``linkage`` attribute is
        requested for the first time.

        Parameters
        ----------
        method :    str, optional
                    Clustering method (see
                    ``scipy.cluster.hierarchy.linkage`` for reference).

        """
        # Use the condensed distance matrix - otherwise the clustering
        # thinks we are passing observations instead of final scores
        self.linkage = scipy.cluster.hierarchy.linkage(self.condensed_dist_mat,
                                                       method=method)

        # Save method in case we want to look it up later
        self.cluster_method = method

        logger.info('Clustering done using method "{0}"'.format(method))

    def plot_dendrogram(self, color_threshold=None, return_dendrogram=False,
                        labels=None, fig=None, **kwargs):
        """Plot dendrogram using matplotlib.

        Parameters
        ----------
        color_threshold :   int | float, optional
                            Coloring threshold for dendrogram.
        return_dendrogram : bool, optional
                            If True, the dendrogram object is returned
                            instead of the figure.
        labels :            list of str | dict, optional
                            Labels in order of original observations, or a
                            dictionary mapping original labels to new ones.
        fig :               matplotlib figure, optional
                            Reuse an existing figure; if not provided, a new
                            one is created.
        kwargs
                            Passed to ``scipy.cluster.hierarchy.dendrogram()``.

        Returns
        -------
        matplotlib.figure
                            If ``return_dendrogram=False``.
        scipy.cluster.hierarchy.dendrogram
                            If ``return_dendrogram=True``.

        """
        import matplotlib.pyplot as plt

        if isinstance(labels, type(None)):
            labels = self.labels
        elif isinstance(labels, dict):
            labels = [labels[l] for l in self.labels]

        if not fig:
            fig = plt.figure()

        dn_kwargs = {'leaf_rotation': 90, 'above_threshold_color': 'k'}
        dn_kwargs.update(kwargs)

        dn = scipy.cluster.hierarchy.dendrogram(self.linkage,
                                                color_threshold=color_threshold,
                                                labels=labels,
                                                **dn_kwargs)

        logger.info('Use matplotlib.pyplot.show() to render dendrogram.')

        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)

        try:
            plt.tight_layout()
        except BaseException:
            pass

        if return_dendrogram:
            return dn
        else:
            return fig

    def plot_matrix2(self, **kwargs):
        """Plot distance matrix and dendrogram using seaborn.

        Parameters
        ----------
        kwargs :    dict
                    Keyword arguments to be passed to ``seaborn.clustermap``.
                    See http://seaborn.pydata.org/generated/seaborn.clustermap.html

        Returns
        -------
        seaborn.clustermap

        """
        import matplotlib.pyplot as plt

        try:
            import seaborn as sns
        except BaseException:
            raise ImportError('Need seaborn package installed.')

        cg = sns.clustermap(self.dist_mat, row_linkage=self.linkage,
                            col_linkage=self.linkage, **kwargs)

        # Rotate labels
        plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

        # Make labels smaller
        plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), fontsize=4)
        plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), fontsize=4)

        # Increase padding
        cg.fig.subplots_adjust(right=.8, top=.95, bottom=.2)

        logger.info('Use matplotlib.pyplot.show() to render figure.')

        return cg

    def plot_matrix(self):
        """Plot distance matrix and dendrogram using matplotlib.

        Returns
        -------
        matplotlib figure

        """
        import pylab

        # Compute and plot first dendrogram for all nodes.
        fig = pylab.figure(figsize=(8, 8))
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        Z1 = scipy.cluster.hierarchy.dendrogram(self.linkage,
                                                orientation='left',
                                                labels=self.labels)
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Compute and plot second dendrogram.
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Z2 = scipy.cluster.hierarchy.dendrogram(self.linkage,
                                                labels=self.labels)
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Plot distance matrix.
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        D = self.dist_mat.copy()

        if isinstance(D, pd.DataFrame):
            D = D.values

        D = D[idx1, :]
        D = D[:, idx2]
        im = axmatrix.matshow(D, aspect='auto', origin='lower',
                              cmap=pylab.cm.YlGnBu)
        axmatrix.set_xticks([])
        axmatrix.set_yticks([])

        # Plot colorbar.
        axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
        pylab.colorbar(im, cax=axcolor)

        logger.info('Use matplotlib.pyplot.show() to render figure.')

        return fig

    def plot3d(self, k=5, criterion='maxclust', **kwargs):
        """Plot neurons using :func:`pymaid.plot.plot3d`.

        Will only work if the ClustResults instance has neurons attached
        to it.

        Parameters
        ----------
        k :         int | float
        criterion : 'maxclust' | 'distance', optional
                    If ``maxclust``, ``k`` clusters will be formed. If
                    ``distance``, clusters will be created at threshold
                    ``k``.
        **kwargs
                    Will be passed to ``pymaid.plot3d()``. See
                    ``help(plot.plot3d)`` for a list of keywords.

        See Also
        --------
        :func:`pymaid.plot3d`
                    Function called to generate 3d plot.

        """
        if 'neurons' not in self.__dict__:
            logger.error('This works only with cluster results from neurons')
            return None

        cmap = self.get_colormap(k=k, criterion=criterion)

        kwargs.update({'color': cmap})

        return ns.plot3d(self.neurons, **kwargs)

    def to_selection(self, fname='cluster.json', k=5, criterion='maxclust'):
        """Convert clustered neurons into a JSON file.

        This file can be loaded into a CATMAID selection table.

        Parameters
        ----------
        fname :     str, optional
                    Filename to save selection to.
        k :         int | float
        criterion : 'maxclust' | 'distance', optional
                    If ``maxclust``, ``k`` clusters will be formed. If
                    ``distance``, clusters will be created at threshold
                    ``k``.

        See Also
        --------
        :func:`pymaid.CatmaidNeuronList.to_selection`
                    Turn CatmaidNeuronList into CATMAID-readable selection.
        :func:`pymaid.CatmaidNeuronList.from_selection`
                    CatmaidNeuronList from CATMAID selection.

        """
        cmap = self.get_colormap(k=k, criterion=criterion)

        # Convert to 0-255
        cmap = {n: [int(v * 255) for v in cmap[n]] for n in cmap}

        data = [dict(skeleton_id=int(n),
                     color="#{:02x}{:02x}{:02x}".format(cmap[n][0],
                                                        cmap[n][1],
                                                        cmap[n][2]),
                     opacity=1) for n in cmap]

        with open(fname, 'w') as outfile:
            json.dump(data, outfile)

        logger.info('Selection saved as %s in %s' % (fname, os.getcwd()))

        return

    def get_colormap(self, k=5, criterion='maxclust'):
        """Generate colormap based on clustering.

        Parameters
        ----------
        k :         int | float
        criterion : 'maxclust' | 'distance', optional
                    If ``maxclust``, ``k`` clusters will be formed. If
                    ``distance``, clusters will be created at threshold
                    ``k``.

        Returns
        -------
        dict
                    ``{'skeleton_id': (r, g, b), ...}``

        """
        cl = self.get_clusters(k, criterion, return_type='indices')

        cl = [[self.dist_mat.index.tolist()[i] for i in l] for l in cl]

        colors = [colorsys.hsv_to_rgb(1 / len(cl) * i, 1, 1)
                  for i in range(len(cl) + 1)]

        return {n: colors[i] for i in range(len(cl)) for n in cl[i]}

    def get_clusters(self, k, criterion='maxclust', return_type='labels'):
        """Get clusters.

        Thin wrapper for ``scipy.cluster.hierarchy.fcluster``.

        Parameters
        ----------
        k :             int | float
        criterion :     'maxclust' | 'distance', optional
                        If ``maxclust``, ``k`` clusters will be formed. If
                        ``distance``, clusters will be created at threshold
                        ``k``.
        return_type :   'labels' | 'indices' | 'columns' | 'rows'
                        Determines what the clusters are built of. 'labels'
                        only works if labels are provided. 'indices' refers
                        to the index in the distance matrix. 'columns'/'rows'
                        work if the distance matrix is a pandas DataFrame.

        Returns
        -------
        list
                        List of clusters:
                        ``[[leaf1, leaf5], [leaf2, ...], ...]``

        """
        cl = scipy.cluster.hierarchy.fcluster(self.linkage, k,
                                              criterion=criterion)

        if self.labels and return_type.lower() == 'labels':
            return [[self.labels[j] for j in range(len(cl)) if cl[j] == i]
                    for i in range(min(cl), max(cl) + 1)]
        elif return_type.lower() == 'rows':
            return [[self.dist_mat.index.tolist()[j] for j in range(len(cl)) if cl[j] == i]
                    for i in range(min(cl), max(cl) + 1)]
        elif return_type.lower() == 'columns':
            return [[self.dist_mat.columns.tolist()[j] for j in range(len(cl)) if cl[j] == i]
                    for i in range(min(cl), max(cl) + 1)]
        else:
            return [[j for j in range(len(cl)) if cl[j] == i]
                    for i in range(min(cl), max(cl) + 1)]

    def to_tree(self):
        """Turn linkage into an ete3 tree.

        See Also
        --------
        http://etetoolkit.org/
                    ete3 homepage.

        Returns
        -------
        ete3 tree

        """
        try:
            import ete3
        except BaseException:
            raise ImportError('Please install the ete3 package to use this '
                              'function.')

        max_dist = self.linkage[-1][2]
        n_original_obs = self.dist_mat.shape[0]

        list_of_childs = {n_original_obs + i: e[:2]
                          for i, e in enumerate(self.linkage)}
        list_of_parents = {int(e[0]): int(n_original_obs + i)
                           for i, e in enumerate(self.linkage)}
        list_of_parents.update({int(e[1]): int(n_original_obs + i)
                                for i, e in enumerate(self.linkage)})

        total_dist = {n_original_obs + i: e[2]
                      for i, e in enumerate(self.linkage)}

        # Process total distances into distances between nodes (root = 0)
        dist_to_parent = {n: max_dist - total_dist[list_of_parents[n]]
                          for n in list_of_parents}

        names = {i: n for i, n in enumerate(self.dist_mat.columns.tolist())}

        # Create empty tree
        tree = ete3.Tree()

        # Start with the root node
        root = sorted(list(list_of_childs.keys()), reverse=True)[0]
        treenodes = {root: tree.add_child()}

        for k in sorted(list(list_of_childs.keys()), reverse=True):
            e = list_of_childs[k]
            treenodes[e[0]] = treenodes[k].add_child(dist=dist_to_parent[e[0]],
                                                     name=names.get(e[0], None))
            treenodes[e[1]] = treenodes[k].add_child(dist=dist_to_parent[e[1]],
                                                     name=names.get(e[1], None))

        return tree
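

# ClustResults is not tied to CATMAID data: it works with any precomputed
# distance matrix. A minimal sketch on random points (all names below are
# local variables, not pymaid API):
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> import scipy.spatial
# >>> import pymaid
# >>> pts = np.random.rand(10, 3)
# >>> dmat = scipy.spatial.distance.squareform(
# ...     scipy.spatial.distance.pdist(pts))
# >>> res = pymaid.ClustResults(pd.DataFrame(dmat), mat_type='distance')
# >>> res.cluster(method='ward')
# >>> res.get_clusters(3, criterion='maxclust', return_type='indices')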