Source code for pymaid.user_stats

#    This script is part of pymaid (http://www.github.com/navis-org/pymaid).
#    Copyright (C) 2017 Philipp Schlegel
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""This module contains functions to retrieve user statistics.

Examples
--------
>>> import pymaid
>>> myInstance = pymaid.CatmaidInstance('https://www.your.catmaid-server.org',
...                                     api_token='YOURTOKEN',
...                                     http_user='HTTP_USER',  # omit if not required
...                                     http_password='HTTP_PASSWORD')
>>> skeleton_ids = pymaid.get_skids_by_annotation('Hugin')
>>> cont = pymaid.get_user_contributions(skeleton_ids)
>>> cont
             user  nodes  presynapses  postsynapses
0        Schlegel  47221          470          1408
1            Tran   1645            7             4
2           Lacin   1300            1            20
3              Li   1244            5            45
...
>>> # Get the time that each user has invested
>>> time_inv = pymaid.get_time_invested(skeleton_ids,
...                                     remote_instance=myInstance)
>>> time_inv
            user  total  creation  edition  review
0       Schlegel   4649      3224     2151    1204
1           Tran    174       125       59       0
2             Li    150       114       65       0
3          Lacin    133       119       30       0
...
>>> # Plot contributions as pie chart
>>> import plotly
>>> fig = {"data": [{"values": time_inv.total.tolist(),
...        "labels": time_inv.user.tolist(),
...        "type": "pie"}]}
>>> plotly.offline.plot(fig)

"""

# TODOs
# - Github punch card-like figure

import datetime

import pandas as pd
import numpy as np

from . import core, fetch, utils, config

# Set up logging
logger = config.get_logger(__name__)

__all__ = ['get_user_contributions', 'get_time_invested', 'get_user_actions',
           'get_team_contributions', 'get_user_stats']


def get_user_stats(start_date=None, end_date=None, remote_instance=None):
    """Get user stats similar to the pie chart statistics widget in CATMAID.

    Returns cable [nm], nodes created/reviewed and connector links created.

    Parameters
    ----------
    start_date :        tuple | datetime.date, optional
    end_date :          tuple | datetime.date, optional
                        Start and end date of time window to check. If
                        ``None``, will use entire project history.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        Dataframe in which each row represents a user::

                      cable  nodes_created  nodes_reviewed  links_created
            username
            user1       ...
            user2       ...

    Examples
    --------
    Create a pie chart similar to the stats widget in CATMAID:

    >>> import matplotlib.pyplot as plt
    >>> stats = pymaid.get_user_stats()
    >>> stats_to_plot = ['cable', 'nodes_created', 'nodes_reviewed',
    ...                  'links_created']
    >>> fig, axes = plt.subplots(1, len(stats_to_plot), figsize=(12, 4))
    >>> for s, ax in zip(stats_to_plot, axes):
    ...     # Get the top 10 contributors for this stat
    ...     this_stats = stats[s].sort_values(ascending=False).iloc[:10]
    ...     # Calculate "others"
    ...     this_stats.loc['others'] = stats[s].sort_values(ascending=False).iloc[10:].sum()
    ...     # Plot
    ...     this_stats.plot.pie(ax=ax, textprops={'size': 6},
    ...                         explode=[.05] * this_stats.shape[0],
    ...                         rotatelabels=True)
    ...     # Make labels a bit smaller
    ...     ax.set_ylabel(s.replace('_', ' '), fontsize=8)
    >>> plt.show()

    See Also
    --------
    :func:`~pymaid.get_history`
            Returns day-by-day stats.

    """
    remote_instance = utils._eval_remote_instance(remote_instance)

    if isinstance(start_date, type(None)):
        start_date = datetime.date(2010, 1, 1)
    elif not isinstance(start_date, datetime.date):
        start_date = datetime.date(*start_date)

    if isinstance(end_date, type(None)):
        end_date = datetime.date.today()
    elif not isinstance(end_date, datetime.date):
        end_date = datetime.date(*end_date)

    # Get and summarize other stats
    hist = fetch.get_history(remote_instance=remote_instance,
                             start_date=start_date,
                             end_date=end_date)

    stats = pd.concat([hist.cable.sum(axis=1),
                       hist.treenodes.sum(axis=1),
                       hist.reviewed.sum(axis=1),
                       hist.connector_links.sum(axis=1)],
                      axis=1, sort=True).fillna(0).astype(int)

    stats.index.name = 'username'
    stats.columns = ['cable', 'nodes_created', 'nodes_reviewed',
                     'links_created']

    stats.sort_values('nodes_created', ascending=False, inplace=True)

    return stats
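
# Illustrative sketch, not part of pymaid's public API: one way to use
# get_user_stats() with an explicit date window. The server URL, token and
# date range are placeholders.
def _example_user_stats():
    """Sketch: fetch user stats for 2018 and plot the top node creators."""
    import matplotlib.pyplot as plt
    import pymaid

    rm = pymaid.CatmaidInstance('https://your.catmaid-server.org',
                                api_token='YOURTOKEN')  # placeholder
    stats = get_user_stats(start_date=(2018, 1, 1),
                           end_date=(2018, 12, 31),
                           remote_instance=rm)
    # Top 5 node creators as a horizontal bar chart
    stats.nodes_created.head(5).plot.barh()
    plt.tight_layout()
    plt.show()
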
def get_team_contributions(teams, neurons=None, remote_instance=None):
    """Get contributions by teams (nodes, reviews, connectors, time invested).

    Notes
    -----
     1. Time calculation uses defaults from :func:`pymaid.get_time_invested`.
     2. ``total_reviews`` > ``total_nodes`` is possible if nodes have been
        reviewed multiple times by different users. Similarly,
        ``total_reviews`` = ``total_nodes`` does not imply that the neuron
        is fully reviewed!

    Parameters
    ----------
    teams               dict
                        Teams to group contributions for. Users must be
                        logins. Format can be either:

                        1. Simple user assignments. For example::

                            {'teamA': ['user1', 'user2'],
                             'teamB': ['user3'], ...}

                        2. Users with start and end dates. Start and end date
                           must be either ``datetime.date`` or a single
                           ``pandas.date_range`` object. For example::

                            {'team1': {
                                    'user1': (datetime.date(2017, 1, 1),
                                              datetime.date(2018, 1, 1)),
                                    'user2': (datetime.date(2016, 6, 1),
                                              datetime.date(2017, 1, 1))
                                    },
                             'team2': {
                                    'user3': pandas.date_range('2017-1-1',
                                                               '2018-1-1'),
                                    }}

                        Mixing both styles is permissible. For the second
                        style, use e.g. ``'user1': None`` for no date
                        restrictions on that user.
    neurons             skeleton ID(s) | CatmaidNeuron/List, optional
                        Restrict check to given set of neurons. If
                        CatmaidNeuron/List, will use these neurons' nodes/
                        connectors. Use to subset contributions e.g. to a
                        given neuropil by pruning neurons before passing to
                        this function.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        DataFrame in which each row represents a neuron. Example for two
        teams, ``teamA`` and ``teamB``::

            skeleton_id  total_nodes  teamA_nodes  teamB_nodes ...
         0
         1

            total_reviews  teamA_reviews  teamB_reviews ...
         0
         1

            total_connectors  teamA_connectors  teamB_connectors ...
         0
         1

            total_time  teamA_time  teamB_time
         0
         1

    Examples
    --------
    >>> from datetime import date
    >>> import pandas as pd
    >>> teams = {'teamA': ['user1', 'user2'],
    ...          'teamB': {'user3': None,
    ...                    'user4': (date(2017, 1, 1), date(2018, 1, 1))},
    ...          'teamC': {'user5': pd.date_range('2015-1-1', '2018-1-1')}}
    >>> stats = pymaid.get_team_contributions(teams)

    See Also
    --------
    :func:`~pymaid.get_contributor_statistics`
                Gives you more basic info on neurons of interest such as
                total reconstruction/review time.
    :func:`~pymaid.get_time_invested`
                Time invested by individual users. Gives you more control
                over how time is calculated.

    """
    remote_instance = utils._eval_remote_instance(remote_instance)

    # Prepare teams
    if not isinstance(teams, dict):
        raise TypeError('Expected teams of type dict, got '
                        '{}'.format(type(teams)))

    beginning_of_time = datetime.date(1900, 1, 1)
    today = datetime.date.today()
    all_time = pd.date_range(beginning_of_time, today)

    for t in teams:
        if isinstance(teams[t], list):
            teams[t] = {u: all_time for u in teams[t]}
        elif isinstance(teams[t], dict):
            for u in teams[t]:
                if isinstance(teams[t][u], type(None)):
                    teams[t][u] = all_time
                elif isinstance(teams[t][u], (tuple, list)):
                    try:
                        teams[t][u] = pd.date_range(*teams[t][u])
                    except BaseException:
                        raise Exception('Error converting "{}" to pandas.'
                                        'date_range'.format(teams[t][u]))
                elif isinstance(teams[t][u],
                                pd.core.indexes.datetimes.DatetimeIndex):
                    pass
                else:
                    raise TypeError('Expected user dates to be either None, '
                                    'tuple of datetimes or pandas.date_range,'
                                    ' got {}'.format(type(teams[t][u])))
        else:
            raise TypeError('Expected teams to be either lists or dicts of '
                            'users, got {}'.format(type(teams[t])))

    # Get all users
    all_users = [u for t in teams for u in teams[t]]

    # Prepare neurons - download if necessary
    if not isinstance(neurons, type(None)):
        if isinstance(neurons, core.CatmaidNeuron):
            neurons = core.CatmaidNeuronList(neurons)
        elif isinstance(neurons, core.CatmaidNeuronList):
            pass
        else:
            neurons = fetch.get_neurons(neurons,
                                        remote_instance=remote_instance)
    else:
        all_dates = [d.date() for t in teams for u in teams[t]
                     for d in teams[t][u]]
        neurons = fetch.find_neurons(users=all_users,
                                     from_date=min(all_dates),
                                     to_date=max(all_dates),
                                     remote_instance=remote_instance)
        neurons.get_skeletons()

    # Get user list
    user_list = fetch.get_user_list(remote_instance=remote_instance).set_index('login')

    for u in all_users:
        if u not in user_list.index:
            raise ValueError('User "{}" not found in user list'.format(u))

    # Get all node details
    all_node_details = fetch.get_node_details(neurons,
                                              remote_instance=remote_instance)

    # Get connector links
    link_details = fetch.get_connector_links(neurons,
                                             remote_instance=remote_instance)

    # link_details contains all links. We have to subset this to existing
    # connectors in case the input neurons have been pruned
    link_details = link_details[link_details.connector_id.isin(neurons.connectors.connector_id.values)]

    interval = 3
    bin_width = '%iMin' % interval
    minimum_actions = 10 * interval
    stats = []
    for n in config.tqdm(neurons, desc='Processing',
                         disable=config.pbar_hide, leave=config.pbar_leave):
        # Get node details
        tn_ids = n.nodes.node_id.values.astype(str)
        cn_ids = n.connectors.connector_id.values.astype(str)

        current_status = config.pbar_hide
        config.pbar_hide = True
        node_details = all_node_details[all_node_details.node_id.isin(np.append(tn_ids, cn_ids))]
        config.pbar_hide = current_status

        # Extract node creation
        node_creation = node_details.loc[node_details.node_id.isin(tn_ids),
                                         ['creator', 'creation_time']].values
        node_creation = np.c_[node_creation, ['node_creation'] * node_creation.shape[0]]

        # Extract connector creation
        cn_creation = node_details.loc[node_details.node_id.isin(cn_ids),
                                       ['creator', 'creation_time']].values
        cn_creation = np.c_[cn_creation, ['cn_creation'] * cn_creation.shape[0]]

        # Extract edition times (treenodes + connectors)
        node_edits = node_details.loc[:, ['editor', 'edition_time']].values
        node_edits = np.c_[node_edits, ['editor'] * node_edits.shape[0]]

        # Link creation
        link_creation = link_details.loc[link_details.connector_id.isin(cn_ids),
                                         ['creator', 'creation_time']].values
        link_creation = np.c_[link_creation, ['link_creation'] * link_creation.shape[0]]

        # Extract review times
        reviewers = [u for l in node_details.reviewers.values for u in l]
        timestamps = [ts for l in node_details.review_times.values for ts in l]
        node_review = np.c_[reviewers, timestamps, ['review'] * len(reviewers)]

        # Merge all timestamps (ignore edits for now) to get time_invested
        all_ts = pd.DataFrame(np.vstack([node_creation,
                                         node_review,
                                         cn_creation,
                                         link_creation,
                                         node_edits]),
                              columns=['user', 'timestamp', 'type'])

        # Add column with just the date and make it the index
        all_ts['date'] = all_ts.timestamp.values.astype('datetime64[D]')
        all_ts.index = pd.to_datetime(all_ts.date, format="ISO8601")

        # Fill in teams for each timestamp based on user + date
        all_ts['team'] = None
        for t in teams:
            for u in teams[t]:
                # Assign all timestamps by this user in the right time
                # window to this team
                existing_dates = (teams[t][u] & all_ts.index).unique()
                ss = (all_ts.index.isin(existing_dates)) & (all_ts.user.values == user_list.loc[u, 'id'])
                all_ts.loc[ss, 'team'] = t

        # Get total
        total_time = sum(all_ts.timestamp.to_frame().set_index(
            'timestamp', drop=False).groupby(pd.Grouper(freq=bin_width)).count().values >= minimum_actions)[0] * interval

        this_neuron = [n.skeleton_id, n.n_nodes, n.n_connectors,
                       node_review.shape[0], total_time]
        # Go over the teams and collect values
        for t in teams:
            # Subset to team
            this_team = all_ts[all_ts.team == t]
            if this_team.shape[0] > 0:
                # Subset to user ID
                team_time = sum(this_team.timestamp.to_frame().set_index(
                    'timestamp', drop=False).groupby(pd.Grouper(freq=bin_width)).count().values >= minimum_actions)[0] * interval
                team_nodes = this_team[this_team['type'] == 'node_creation'].shape[0]
                team_cn = this_team[this_team['type'] == 'cn_creation'].shape[0]
                team_rev = this_team[this_team['type'] == 'review'].shape[0]
            else:
                team_nodes = team_cn = team_rev = team_time = 0

            this_neuron += [team_nodes, team_cn, team_rev, team_time]

        stats.append(this_neuron)

    cols = ['skeleton_id', 'total_nodes', 'total_connectors',
            'total_reviews', 'total_time']
    for t in teams:
        for s in ['nodes', 'connectors', 'reviews', 'time']:
            cols += ['{}_{}'.format(t, s)]
    stats = pd.DataFrame(stats, columns=cols)

    cols_ordered = ['skeleton_id'] + ['{}_{}'.format(t, v)
                                      for v in ['nodes', 'connectors',
                                                'reviews', 'time']
                                      for t in ['total'] + list(teams)]
    stats = stats[cols_ordered]

    return stats
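
# Illustrative sketch, not part of pymaid's public API: how to assemble a
# ``teams`` mapping for get_team_contributions() mixing both accepted
# formats. All logins, dates and the annotation are made-up placeholders.
def _example_team_contributions():
    """Sketch: per-neuron contributions for hypothetical teams."""
    teams = {
        # Format 1: plain list of logins -> no date restriction
        'tracers': ['user1', 'user2'],
        # Format 2: per-user date windows
        'reviewers': {
            'user3': (datetime.date(2017, 1, 1), datetime.date(2018, 1, 1)),
            'user4': pd.date_range('2016-06-01', '2017-06-01'),
            'user5': None,  # no date restriction for this user
        },
    }
    # Restrict the check to an example set of neurons
    skids = fetch.get_skids_by_annotation('Hugin')
    stats = get_team_contributions(teams, neurons=skids)
    print(stats[['skeleton_id', 'total_nodes',
                 'tracers_nodes', 'reviewers_nodes']])
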
def get_user_contributions(x, teams=None, remote_instance=None):
    """Return number of nodes and synapses contributed by each user.

    This is essentially a wrapper for
    :func:`pymaid.get_contributor_statistics` - if you are also interested
    in e.g. construction time, review time, etc. you may want to consider
    using :func:`~pymaid.get_contributor_statistics` instead.

    Parameters
    ----------
    x                   Which neurons to check. Can be either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, must be exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object

    teams               dict, optional
                        Teams to group contributions for. Users must be
                        logins::

                            {'teamA': ['user1', 'user2'],
                             'teamB': ['user3'], ...}

                        Users not part of any team will be grouped as team
                        ``'others'``.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        DataFrame in which each row represents a user::

            user  nodes  presynapses  postsynapses  nodes_reviewed
         0
         1
         ...

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> # Get contributors for a single neuron
    >>> cont = pymaid.get_user_contributions(2333007)
    >>> # Get top 10 (by node contribution)
    >>> top10 = cont.iloc[:10].set_index('user')
    >>> # Plot as bar chart
    >>> ax = top10.plot(kind='bar')
    >>> plt.show()

    >>> # Plot relative contributions
    >>> cont = pymaid.get_user_contributions(2333007)
    >>> cont = cont.set_index('user')
    >>> # Normalize
    >>> cont_rel = cont / cont.sum(axis=0).values
    >>> # Plot contributors with >5% node contributions
    >>> ax = cont_rel[cont_rel.nodes > .05].plot(kind='bar')
    >>> plt.show()

    See Also
    --------
    :func:`~pymaid.get_contributor_statistics`
                Gives you more basic info on neurons of interest such as
                total reconstruction/review time.

    """
    if not isinstance(teams, type(None)):
        # Prepare teams
        if not isinstance(teams, dict):
            raise TypeError('Expected teams of type dict, got '
                            '{}'.format(type(teams)))
        for t in teams:
            if not isinstance(teams[t], list):
                raise TypeError('Teams need to be lists of user logins, '
                                'got {}'.format(type(teams[t])))

        # Turn teams into a login -> team dict
        teams = {u: t for t in teams for u in teams[t]}

    remote_instance = utils._eval_remote_instance(remote_instance)
    skids = utils.eval_skids(x, remote_instance=remote_instance)

    cont = fetch.get_contributor_statistics(skids,
                                            remote_instance=remote_instance,
                                            separate=False)

    all_users = set(list(cont.node_contributors.keys())
                    + list(cont.pre_contributors.keys())
                    + list(cont.post_contributors.keys()))

    stats = {
        'nodes': {u: 0 for u in all_users},
        'presynapses': {u: 0 for u in all_users},
        'postsynapses': {u: 0 for u in all_users},
        'nodes_reviewed': {u: 0 for u in all_users}
    }

    for u in cont.node_contributors:
        stats['nodes'][u] = cont.node_contributors[u]
    for u in cont.pre_contributors:
        stats['presynapses'][u] = cont.pre_contributors[u]
    for u in cont.post_contributors:
        stats['postsynapses'][u] = cont.post_contributors[u]
    for u in cont.review_contributors:
        stats['nodes_reviewed'][u] = cont.review_contributors[u]

    stats = pd.DataFrame([[u,
                           stats['nodes'][u],
                           stats['presynapses'][u],
                           stats['postsynapses'][u],
                           stats['nodes_reviewed'][u]] for u in all_users],
                         columns=['user', 'nodes', 'presynapses',
                                  'postsynapses', 'nodes_reviewed']
                         ).sort_values('nodes', ascending=False).reset_index(drop=True)

    if isinstance(teams, type(None)):
        return stats

    stats['team'] = [teams.get(u, 'others') for u in stats.user.values]
    return stats.groupby('team').sum()
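
# Illustrative sketch, not part of pymaid's public API: grouping individual
# contributions into teams with get_user_contributions(). Logins and the
# annotation are placeholders.
def _example_user_contributions():
    """Sketch: contributions grouped by team, plotted as stacked bars."""
    import matplotlib.pyplot as plt

    teams = {'teamA': ['user1', 'user2'],
             'teamB': ['user3']}
    # Any contributor not listed above ends up in team 'others'
    cont = get_user_contributions('annotation:Hugin', teams=teams)
    cont[['nodes', 'presynapses', 'postsynapses']].plot.bar(stacked=True)
    plt.show()
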
def get_time_invested(x, mode='SUM', by='USER', minimum_actions=10,
                      max_inactive_time=3, treenodes=True, connectors=True,
                      links=True, start_date=None, end_date=None,
                      remote_instance=None):
    """Calculate the time spent working on a set of neurons.

    Use ``minimum_actions`` and ``max_inactive_time`` to fine tune how time
    invested is calculated: by default, time is binned over 3 minutes in
    which a user has to perform 3x10 actions for that interval to be counted
    towards the time spent tracing.

    Important
    ---------
    Creation/Edition/Review times can overlap! This is why total time spent
    is not just creation + edition + review.

    Please note that this does currently not take placement of
    pre-/postsynaptic nodes into account!

    Be aware of the ``minimum_actions`` parameter: at low values even a
    single action (e.g. connecting a node) will add considerably to time
    invested. To keep total reconstruction time comparable to what CATMAID
    calculates, you should consider about 10 actions/minute (= a click every
    6 seconds) and a ``max_inactive_time`` of 3 mins.

    CATMAID gives reconstruction time across all users. Here, we calculate
    the time spent tracing for individuals. This may lead to a discrepancy
    between the sum of time invested over all users from this function vs.
    CATMAID's reconstruction time.

    Parameters
    ----------
    x                   Which neurons to check. Can be either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, must be exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object

                        If you pass a CatmaidNeuron/List, its nodes/
                        connectors are used to calculate time invested. You
                        can exploit this to get the time spent reconstructing
                        in a given compartment of a neuron, e.g. by pruning
                        it to a volume before passing it to
                        ``get_time_invested``.
    mode :              'SUM' | 'SUM2' | 'OVER_TIME' | 'ACTIONS', optional
                        (1) 'SUM' will return total time invested (in
                            minutes) broken down by creation, edition and
                            review.
                        (2) 'SUM2' will return total time invested (in
                            minutes) broken down by ``treenodes``,
                            ``connectors`` and ``links``.
                        (3) 'OVER_TIME' will return minutes invested/day over
                            time.
                        (4) 'ACTIONS' will return actions (node/connectors
                            placed/edited) per day.
    by :                'USER' | 'NEURON', optional
                        Determines whether the stats are broken down by user
                        or by neuron.
    minimum_actions :   int, optional
                        Minimum number of actions per minute to be counted
                        as active.
    max_inactive_time : int, optional
                        Interval in minutes over which time invested is
                        binned. Essentially determines how much time can pass
                        between bouts of activity.
    treenodes :         bool, optional
                        If False, treenodes will not be taken into account.
    connectors :        bool, optional
                        If False, connectors will not be taken into account.
    links :             bool, optional
                        If False, connector links will not be taken into
                        account.
    start_date :        iterable | datetime.date | numpy.datetime64, optional
                        Restricts time invested to window. Applies to
                        creation but not edition time! If iterable, must be
                        year, month, day, e.g. ``[2018, 1, 1]``.
    end_date :          iterable | datetime.date | numpy.datetime64, optional
                        See ``start_date``.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        If ``mode='SUM'``, values represent minutes invested::

                    total  creation  edition  review
            user1
            user2
            ...

        If ``mode='SUM2'``, values represent minutes invested::

                    total  treenodes  connectors  links
            user1
            user2
            ...

        If ``mode='OVER_TIME'`` or ``mode='ACTIONS'``::

                    date1  date2  date3  ...
            user1
            user2
            ...

        For ``OVER_TIME``, values represent minutes invested on that day.
        For ``ACTIONS``, values represent actions (creation, edition,
        review) on that day.

    Examples
    --------
    Get time invested for a set of neurons:

    >>> da1 = pymaid.get_neurons('annotation:glomerulus DA1')
    >>> time = pymaid.get_time_invested(da1)

    Get time spent tracing in a specific compartment:

    >>> da1_lh = da1.prune_by_volume('LH_R', inplace=False)
    >>> time_lh = pymaid.get_time_invested(da1_lh)

    Get contributions within a given time window:

    >>> time_jan = pymaid.get_time_invested(da1,
    ...                                     start_date=[2018, 1, 1],
    ...                                     end_date=[2018, 1, 31])

    Plot pie chart of contributions per user using Plotly:

    >>> import plotly
    >>> stats = pymaid.get_time_invested(skids,
    ...                                  remote_instance=remote_instance)
    >>> # Use plotly to generate pie chart
    >>> fig = {"data": [{"values": stats.total.tolist(),
    ...                  "labels": stats.user.tolist(),
    ...                  "type": "pie"}]}
    >>> plotly.offline.plot(fig)

    Plot reconstruction efforts over time:

    >>> stats = pymaid.get_time_invested(skids, mode='OVER_TIME')
    >>> # Plot time invested over time
    >>> stats.T.plot()
    >>> # Plot cumulative time invested over time
    >>> stats.T.cumsum(axis=0).plot()
    >>> # Filter for major contributors
    >>> stats[stats.sum(axis=1) > 20].T.cumsum(axis=0).plot()

    """
    def _extract_timestamps(ts, restrict_groups, desc='Calc'):
        if ts.empty:
            return {}
        grouped = ts.set_index('timestamp', drop=False
                               ).groupby(['group',
                                          pd.Grouper(freq=bin_width)]
                                         ).count() >= minimum_actions
        temp_stats = {}
        for g in config.tqdm(set(ts.group.unique()) & set(restrict_groups),
                             desc=desc, disable=config.pbar_hide,
                             leave=False):
            temp_stats[g] = sum(grouped.loc[g].values)[0] * interval
        return temp_stats

    if mode not in ['SUM', 'SUM2', 'OVER_TIME', 'ACTIONS']:
        raise ValueError('Unknown mode "{}"'.format(mode))

    if by not in ['NEURON', 'USER']:
        raise ValueError('Unknown by "{}"'.format(by))

    remote_instance = utils._eval_remote_instance(remote_instance)

    skids = utils.eval_skids(x, remote_instance=remote_instance)

    # Maximal inactive time is simply translated into binning
    # We need this later for pandas.TimeGrouper() anyway
    interval = max_inactive_time
    bin_width = '%iMin' % interval

    # Update minimum_actions to reflect actions/interval instead of
    # actions/minute
    minimum_actions *= interval

    user_list = fetch.get_user_list(remote_instance=remote_instance).set_index('id')
    user_dict = user_list.login.to_dict()

    if not isinstance(x, (core.CatmaidNeuron, core.CatmaidNeuronList)):
        x = fetch.get_neuron(skids, remote_instance=remote_instance)

    if isinstance(x, core.CatmaidNeuron):
        skdata = core.CatmaidNeuronList(x)
    elif isinstance(x, core.CatmaidNeuronList):
        skdata = x

    if not isinstance(end_date, (datetime.date, np.datetime64, type(None))):
        end_date = datetime.date(*end_date)

    if not isinstance(start_date, (datetime.date, np.datetime64, type(None))):
        start_date = datetime.date(*start_date)

    # Extract connector and node IDs
    node_ids = []
    connector_ids = []
    for n in skdata.itertuples():
        if treenodes:
            node_ids += n.nodes.node_id.tolist()
        if connectors:
            connector_ids += n.connectors.connector_id.tolist()

    # Get node details
    node_details = fetch.get_node_details(node_ids + connector_ids,
                                          remote_instance=remote_instance)

    # Get details for links
    if links:
        link_details = fetch.get_connector_links(skdata,
                                                 remote_instance=remote_instance)

        # link_details contains all links. We have to subset this to existing
        # connectors in case the input neurons have been pruned
        link_details = link_details[link_details.connector_id.isin(connector_ids)]
    else:
        link_details = pd.DataFrame([], columns=['creator', 'creation_time'])

    # Remove timestamps outside of date range (if provided)
    if start_date:
        node_details = node_details[node_details.creation_time >= np.datetime64(start_date)]
        link_details = link_details[link_details.creation_time >= np.datetime64(start_date)]
    if end_date:
        node_details = node_details[node_details.creation_time <= np.datetime64(end_date)]
        link_details = link_details[link_details.creation_time <= np.datetime64(end_date)]

    # If we want to group by neuron, we need to add a "skeleton ID" column
    # and check if we need to duplicate rows with connectors
    if by == 'NEURON':
        # Need to add a column with the skeleton ID
        node_details['skeleton_id'] = None
        node_details['node_type'] = 'connector'
        col_name = 'skeleton_id'
        for n in skdata:
            cond = node_details.node_id.isin(n.nodes.node_id.values.astype(str))
            node_details.loc[cond, 'skeleton_id'] = n.skeleton_id
            node_details.loc[cond, 'node_type'] = 'treenode'

        # Connectors can show up in more than one neuron -> we need to
        # duplicate those rows for each of the associated neurons
        cn_details = []
        for n in skdata:
            cond1 = node_details.node_type == 'connector'
            cond2 = node_details.node_id.isin(n.connectors.connector_id.values.astype(str))
            node_details.loc[cond1 & cond2, 'skeleton_id'] = n.skeleton_id
            this_cn = node_details.loc[cond1 & cond2]
            cn_details.append(this_cn)
        cn_details = pd.concat(cn_details, axis=0)

        # Merge the node details again
        cond1 = node_details.node_type == 'treenode'
        node_details = pd.concat([node_details.loc[cond1], cn_details],
                                 axis=0).reset_index(drop=True)

        # Note that link_details already has a "skeleton_id" column
        # but we need to make sure it's strings
        link_details['skeleton_id'] = link_details.skeleton_id.astype(str)

        create_group = edit_group = 'skeleton_id'
    else:
        create_group = 'creator'
        edit_group = 'editor'
        col_name = 'user'

    # Dataframe for creation (i.e. the actual generation of the nodes)
    creation_timestamps = np.append(node_details[[create_group,
                                                  'creation_time']].values,
                                    link_details[[create_group,
                                                  'creation_time']].values,
                                    axis=0)
    creation_timestamps = pd.DataFrame(creation_timestamps,
                                       columns=['group', 'timestamp'])

    # Dataframe for edition times - can't use links as there is no editor
    # Because creation of a node counts as an edit, we are removing
    # timestamps where creation and edition time are less than 200ms apart
    is_edit = (node_details.edition_time - node_details.creation_time) > np.timedelta64(200, 'ms')
    edition_timestamps = node_details.loc[is_edit, [edit_group,
                                                    'edition_time']]
    edition_timestamps.columns = ['group', 'timestamp']

    # Generate dataframe for reviews -> here we have to unpack
    if by == 'USER':
        groups = [u for l in node_details.reviewers.values for u in l]
    else:
        groups = [s for l, s in zip(node_details.review_times.values,
                                    node_details.skeleton_id.values)
                  for ts in l]
    timestamps = [ts for l in node_details.review_times.values for ts in l]
    review_timestamps = pd.DataFrame([groups, timestamps]).T
    review_timestamps.columns = ['group', 'timestamp']

    # Change user ID to login
    if by == 'USER':
        if mode == 'SUM2':
            node_details['creator'] = node_details.creator.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
            node_details['editor'] = node_details.editor.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
            link_details['creator'] = link_details.creator.map(lambda x: user_dict.get(x, f'Anonymous{x}'))

        creation_timestamps['group'] = creation_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
        edition_timestamps['group'] = edition_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
        review_timestamps['group'] = review_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))

    # Merge all timestamps
    all_timestamps = pd.concat([creation_timestamps,
                                edition_timestamps,
                                review_timestamps],
                               axis=0)

    all_timestamps.sort_values('timestamp', inplace=True)

    if by == 'USER':
        # Extract the users that are relevant for us
        relevant_users = all_timestamps.groupby('group').count()
        groups = relevant_users[relevant_users.timestamp >= minimum_actions].index.values
    else:
        groups = skdata.skeleton_id

    if mode == 'SUM':
        # This breaks it down by time spent on creation, edition and review
        stats = {k: {g: 0 for g in groups} for k in ['total', 'creation',
                                                     'edition', 'review']}

        stats['total'].update(_extract_timestamps(all_timestamps,
                                                  groups,
                                                  desc='Calc total'))
        stats['creation'].update(_extract_timestamps(creation_timestamps,
                                                     groups,
                                                     desc='Calc creation'))
        stats['edition'].update(_extract_timestamps(edition_timestamps,
                                                    groups,
                                                    desc='Calc edition'))
        stats['review'].update(_extract_timestamps(review_timestamps,
                                                   groups,
                                                   desc='Calc review'))

        return pd.DataFrame([[g,
                              stats['total'][g],
                              stats['creation'][g],
                              stats['edition'][g],
                              stats['review'][g]] for g in groups],
                            columns=[col_name, 'total', 'creation',
                                     'edition', 'review']
                            ).sort_values('total', ascending=False
                                          ).reset_index(drop=True
                                                        ).set_index(col_name)

    elif mode == 'SUM2':
        # This breaks it down by time spent on nodes, connectors and links
        stats = {k: {g: 0 for g in groups} for k in ['total', 'treenodes',
                                                     'connectors', 'links']}

        stats['total'].update(_extract_timestamps(all_timestamps,
                                                  groups,
                                                  desc='Calc total'))

        # We need to construct separate DataFrames for nodes, connectors and
        # links. Note that we are using only edits that do not stem from the
        # creation
        is_tn = node_details.node_id.astype(int).isin(node_ids)
        conc = np.concatenate([node_details.loc[is_tn,
                                                [create_group,
                                                 'creation_time']].values,
                               node_details.loc[is_edit & is_tn,
                                                [edit_group,
                                                 'edition_time']].values],
                              axis=0)
        treenode_timestamps = pd.DataFrame(conc,
                                           columns=['group', 'timestamp'])
        stats['treenodes'].update(_extract_timestamps(treenode_timestamps,
                                                      groups,
                                                      desc='Calc treenodes'))

        # Now connectors
        # Note that we are using only edits that do not stem from the creation
        is_cn = node_details.node_id.astype(int).isin(connector_ids)
        conc = np.concatenate([node_details.loc[is_cn,
                                                [create_group,
                                                 'creation_time']].values,
                               node_details.loc[is_edit & is_cn,
                                                [edit_group,
                                                 'edition_time']].values],
                              axis=0)
        connector_timestamps = pd.DataFrame(conc,
                                            columns=['group', 'timestamp'])
        stats['connectors'].update(_extract_timestamps(connector_timestamps,
                                                       groups,
                                                       desc='Calc connectors'))

        # Now links
        link_timestamps = pd.DataFrame(link_details[[create_group,
                                                     'creation_time']].values,
                                       columns=['group', 'timestamp'])
        stats['links'].update(_extract_timestamps(link_timestamps,
                                                  groups,
                                                  desc='Calc links'))

        return pd.DataFrame([[g,
                              stats['total'][g],
                              stats['treenodes'][g],
                              stats['connectors'][g],
                              stats['links'][g]] for g in groups],
                            columns=[col_name, 'total', 'treenodes',
                                     'connectors', 'links']
                            ).sort_values('total', ascending=False
                                          ).reset_index(drop=True
                                                        ).set_index(col_name)

    elif mode == 'ACTIONS':
        all_ts = all_timestamps.set_index('timestamp', drop=False
                                          ).timestamp.groupby(pd.Grouper(freq='1d')
                                                              ).count().to_frame()
        all_ts.columns = ['all_groups']
        all_ts = all_ts.T
        # Count actions per day for each group
        for g in config.tqdm(all_timestamps.group.unique(),
                             desc='Calc. total', disable=config.pbar_hide,
                             leave=False):
            this_ts = all_timestamps[all_timestamps.group == g].set_index(
                'timestamp', drop=False).timestamp.groupby(pd.Grouper(freq='1d')).count().to_frame()
            this_ts.columns = [g]

            all_ts = pd.concat([all_ts, this_ts.T])

        return all_ts.fillna(0)

    elif mode == 'OVER_TIME':
        # Go over all groups (users or neurons) and collect time invested
        all_ts = []
        for g in config.tqdm(all_timestamps.group.unique(),
                             desc='Calc. total', disable=config.pbar_hide,
                             leave=False):
            # First count all intervals with minimum number of actions
            minutes_counting = (all_timestamps[all_timestamps.group == g].set_index(
                'timestamp', drop=False).timestamp.groupby(pd.Grouper(freq=bin_width)).count().to_frame() >= minimum_actions)

            # Then remove the intervals that have less than minimum actions
            minutes_counting = minutes_counting[minutes_counting.timestamp]

            # Now group timestamps by day
            this_ts = minutes_counting.groupby(pd.Grouper(freq='1d')).count()

            # Rename column to group (e.g. the user's login)
            this_ts.columns = [g]

            # Append if not empty and move on
            if not this_ts.empty:
                all_ts.append(this_ts.T)

        # Turn into DataFrame
        all_ts = pd.concat(all_ts).sort_index()

        # Replace NaNs with 0
        all_ts.fillna(0, inplace=True)

        # Add all groups column
        all_users = all_ts.sum(axis=0)
        all_users.name = 'all_groups'

        all_ts = pd.concat([all_users, all_ts.T], axis=1).T

        return all_ts
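
# Minimal sketch, not part of pymaid's public API, of the binning heuristic
# that get_time_invested() uses: timestamps are grouped into bins of
# ``max_inactive_time`` minutes, and a bin only counts towards time invested
# if it contains at least ``minimum_actions`` (per minute) x bin width
# events. All timestamps below are synthetic.
def _example_binning_heuristic():
    """Sketch: count active minutes from a synthetic timestamp series."""
    interval = 3                     # bin width in minutes
    minimum_actions = 10 * interval  # events needed per bin to count

    # 100 synthetic events spread over ~10 minutes -> one short, dense bout
    events = pd.DataFrame({'timestamp': pd.Timestamp('2018-01-01 10:00:00')
                           + pd.to_timedelta(np.sort(np.random.rand(100)) * 600,
                                             unit='s')})
    # Count events per 3-minute bin, then keep only sufficiently busy bins
    per_bin = (events.set_index('timestamp', drop=False)
                     .groupby(pd.Grouper(freq='%iMin' % interval))
                     .count())
    minutes = (per_bin.values >= minimum_actions).sum() * interval
    print('Time invested: ~{} minutes'.format(minutes))
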
def get_user_actions(users=None, neurons=None, start_date=None,
                     end_date=None, remote_instance=None):
    """Get timestamps of user actions (creations, editions, reviews, linking).

    Important
    ---------
    This function returns most but not all user actions::

      1. The API endpoint used for finding neurons worked on by a given user
         (:func:`pymaid.find_neurons`) does not return single-node neurons.
         Hence, placing e.g. postsynaptic nodes is not taken into account.
      2. Any creation is also an edit. However, only the last edit is kept
         track of. So each creation counts as an edit for the creator until
         a different user makes an edit.

    Parameters
    ----------
    users :             str | list, optional
                        User login(s) for which to return timestamps.
    neurons :           list of skeleton IDs | CatmaidNeuron/List, optional
                        Neurons for which to return timestamps. If None, will
                        find neurons by user.
    start_date :        tuple | datetime.date, optional
    end_date :          tuple | datetime.date, optional
                        Start and end date of time window to check.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        DataFrame in which each row is a user action::

            user  timestamp  action
         0
         1
         ...

    Examples
    --------
    In the first example we will have a look at how active a user is over
    the course of a day.

    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> # Get all actions for a single user
    >>> actions = pymaid.get_user_actions(users='schlegelp',
    ...                                   start_date=(2017, 11, 1))
    >>> # Group by hour and see what time of the day user is usually active
    >>> actions.set_index(pd.DatetimeIndex(actions.timestamp), inplace=True)
    >>> hours = actions.groupby(actions.index.hour).count()
    >>> ax = hours.action.plot()
    >>> plt.show()

    >>> # Plot day-by-day activity
    >>> ax = plt.subplot()
    >>> ax.scatter(actions.timestamp.dt.date.values,
    ...            actions.timestamp.dt.time.values,
    ...            marker='_')

    """
    if not neurons and not users and not (start_date or end_date):
        raise ValueError('Query must be restricted by at least a single '
                         'parameter!')

    if users and not isinstance(users, (list, np.ndarray)):
        users = [users]

    remote_instance = utils._eval_remote_instance(remote_instance)

    # Get user dictionary (needed later)
    user_list = fetch.get_user_list(remote_instance=remote_instance)
    user_dict = user_list.set_index('id').login.to_dict()

    if isinstance(neurons, type(None)):
        neurons = fetch.find_neurons(users=users,
                                     from_date=start_date,
                                     to_date=end_date,
                                     reviewed_by=users,
                                     remote_instance=remote_instance)
        # Get skeletons
        neurons.get_skeletons()
    elif not isinstance(neurons, (core.CatmaidNeuron, core.CatmaidNeuronList)):
        neurons = fetch.get_neuron(neurons, remote_instance=remote_instance)

    if not isinstance(end_date, (datetime.date, type(None))):
        end_date = datetime.date(*end_date)

    if not isinstance(start_date, (datetime.date, type(None))):
        start_date = datetime.date(*start_date)

    node_ids = neurons.nodes.node_id.tolist()
    connector_ids = neurons.connectors.connector_id.tolist()

    # Get node details
    node_details = fetch.get_node_details(node_ids + connector_ids,
                                          remote_instance=remote_instance)

    # Get details for links
    link_details = fetch.get_connector_links(neurons,
                                             remote_instance=remote_instance)

    # Dataframe for creation (i.e. the actual generation of the nodes)
    creation_timestamps = node_details[['creator', 'creation_time']].copy()
    creation_timestamps['action'] = 'creation'
    creation_timestamps.columns = ['user', 'timestamp', 'action']

    # Dataframe for edition times
    edition_timestamps = node_details[['editor', 'edition_time']].copy()
    edition_timestamps['action'] = 'edition'
    edition_timestamps.columns = ['user', 'timestamp', 'action']

    # DataFrame for linking
    linking_timestamps = link_details[['creator', 'creation_time']].copy()
    linking_timestamps['action'] = 'linking'
    linking_timestamps.columns = ['user', 'timestamp', 'action']

    # Generate dataframe for reviews
    reviewers = [u for l in node_details.reviewers.tolist() for u in l]
    timestamps = [ts for l in node_details.review_times.tolist() for ts in l]
    review_timestamps = pd.DataFrame([[u, ts, 'review'] for u, ts in zip(
        reviewers, timestamps)], columns=['user', 'timestamp', 'action'])

    # Merge all timestamps
    all_timestamps = pd.concat([creation_timestamps,
                                edition_timestamps,
                                review_timestamps,
                                linking_timestamps],
                               axis=0).reset_index(drop=True)

    # Map user ID onto login
    all_timestamps.user = [user_dict[u] for u in all_timestamps.user.values]

    # Remove other users (if a restriction was provided)
    if users:
        all_timestamps = all_timestamps[all_timestamps.user.isin(users)]

    # Remove timestamps outside of date range (if provided)
    if start_date:
        all_timestamps = all_timestamps[all_timestamps.timestamp.values >= np.datetime64(start_date)]
    if end_date:
        all_timestamps = all_timestamps[all_timestamps.timestamp.values <= np.datetime64(end_date)]

    return all_timestamps.sort_values('timestamp').reset_index(drop=True)
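
# Illustrative sketch, not part of pymaid's public API, loosely addressing
# the "punch card" TODO at the top of this module: bin one user's actions by
# weekday and hour. The login and start date are placeholders.
def _example_punch_card():
    """Sketch: GitHub-style punch card of a single user's actions."""
    import matplotlib.pyplot as plt

    actions = get_user_actions(users='user1', start_date=(2017, 11, 1))
    ts = pd.to_datetime(actions.timestamp)
    per_slot = actions.groupby([ts.dt.dayofweek, ts.dt.hour]).size()

    # Marker size scales with the number of actions in that weekday/hour slot
    plt.scatter(per_slot.index.get_level_values(1),   # hour of day
                per_slot.index.get_level_values(0),   # weekday (0 = Monday)
                s=per_slot.values / per_slot.max() * 300)
    plt.xlabel('hour of day')
    plt.ylabel('weekday (0 = Monday)')
    plt.show()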