# This script is part of pymaid (http://www.github.com/navis-org/pymaid).
# Copyright (C) 2017 Philipp Schlegel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""This module contains functions to retrieve user statistics.
Examples
--------
>>> import pymaid
>>> myInstance = pymaid.CatmaidInstance('https://www.your.catmaid-server.org',
... api_token='YOURTOKEN',
... http_user='HTTP_USER', # omit if not required
... http_password='HTTP_PASSWORD')
>>> skeleton_ids = pymaid.get_skids_by_annotation('Hugin')
>>> cont = pymaid.get_user_contributions(skeleton_ids)
>>> cont
user nodes presynapses postsynapses
0 Schlegel 47221 470 1408
1 Tran 1645 7 4
2 Lacin 1300 1 20
3 Li 1244 5 45
...
>>> # Get the time that each user has invested
>>> time_inv = pymaid.get_time_invested(skeleton_ids,
... remote_instance = myInstance)
>>> time_inv
user total creation edition review
0 Schlegel 4649 3224 2151 1204
1 Tran 174 125 59 0
2 Li 150 114 65 0
3 Lacin 133 119 30 0
...
>>> # Plot contributions as pie chart
>>> import plotly
>>> fig = {"data": [{"values": time_inv.total.tolist(),
... "labels": time_inv.index.tolist(),
... "type": "pie"}]}
>>> plotly.offline.plot(fig)
"""
# TODOs
# - Github punch card-like figure
import datetime
import pandas as pd
import numpy as np
from . import core, fetch, utils, config
# Set up logging: module-level logger obtained from the package's config
# helper, keyed by this module's name
logger = config.get_logger(__name__)

# Public names of this module (what a wildcard import picks up)
__all__ = ['get_user_contributions', 'get_time_invested', 'get_user_actions',
           'get_team_contributions', 'get_user_stats']
[docs]
def get_user_stats(start_date=None, end_date=None, remote_instance=None):
    """Summarize per-user statistics like CATMAID's pie chart stats widget.

    Reports cable [nm], nodes created/reviewed and connector links created.

    Parameters
    ----------
    start_date :        tuple | datetime.date, optional
    end_date :          tuple | datetime.date, optional
                        Start and end date of time window to check. Tuples
                        are unpacked into ``datetime.date``. If ``None``,
                        the window starts on 2010-01-01 and ends today.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        Dataframe in which each row represents a user, sorted by
        ``nodes_created`` in descending order::

                      cable  nodes_created  nodes_reviewed  links_created
            username
            user1      ...
            user2      ...

    Examples
    --------
    Create a pie chart similar to the stats widget in CATMAID:

    >>> import matplotlib.pyplot as plt
    >>> stats = pymaid.get_user_stats()
    >>> stats_to_plot = ['cable', 'nodes_created', 'nodes_reviewed',
    ...                  'links_created']
    >>> fig, axes = plt.subplots(1, len(stats_to_plot), figsize=(12, 4))
    >>> for s, ax in zip(stats_to_plot, axes):
    ...     # Get the top 10 contributors for this stat
    ...     this_stats = stats[s].sort_values(ascending=False).iloc[:10]
    ...     # Calculate "others"
    ...     this_stats.loc['others'] = stats[s].sort_values(ascending=False).iloc[10:].sum()
    ...     # Plot
    ...     this_stats.plot.pie(ax=ax, textprops={'size': 6},
    ...                         explode=[.05] * this_stats.shape[0],
    ...                         rotatelabels=True)
    ...     # Make labels a bit smaller
    ...     ax.set_ylabel(s.replace('_', ' '), fontsize=8)
    >>> plt.show()

    See Also
    --------
    :func:`~pymaid.get_history`
            Returns day-by-day stats.

    """
    remote_instance = utils._eval_remote_instance(remote_instance)

    # Resolve the lower bound of the time window
    if start_date is None:
        start_date = datetime.date(2010, 1, 1)
    elif not isinstance(start_date, datetime.date):
        start_date = datetime.date(*start_date)

    # Resolve the upper bound of the time window
    if end_date is None:
        end_date = datetime.date.today()
    elif not isinstance(end_date, datetime.date):
        end_date = datetime.date(*end_date)

    # Day-by-day history which we collapse into one total per user
    history = fetch.get_history(remote_instance=remote_instance,
                                start_date=start_date,
                                end_date=end_date)

    totals = [history.cable.sum(axis=1),
              history.treenodes.sum(axis=1),
              history.reviewed.sum(axis=1),
              history.connector_links.sum(axis=1)]

    summary = pd.concat(totals, axis=1, sort=True)
    summary = summary.fillna(0).astype(int)
    summary.index.name = 'username'
    summary.columns = ['cable', 'nodes_created', 'nodes_reviewed',
                       'links_created']

    return summary.sort_values('nodes_created', ascending=False)
[docs]
def _normalize_teams(teams):
    """Validate ``teams`` and map every user to a ``pandas.DatetimeIndex``.

    Builds a fresh ``{team: {user: DatetimeIndex}}`` dictionary so that the
    dictionary passed in by the caller is not modified.

    Raises
    ------
    TypeError
        If ``teams`` is not a dict, a team is neither list nor dict, or a
        user's dates are not ``None``, tuple/list or ``DatetimeIndex``.
    Exception
        If a tuple/list of dates can not be turned into a date range.

    """
    if not isinstance(teams, dict):
        raise TypeError('Expected teams of type dict, got '
                        '{}'.format(type(teams)))

    # Users without date restriction are active from 1900 until today
    beginning_of_time = datetime.date(1900, 1, 1)
    today = datetime.date.today()
    all_time = pd.date_range(beginning_of_time, today)

    normalized = {}
    for team, members in teams.items():
        if isinstance(members, list):
            normalized[team] = {u: all_time for u in members}
        elif isinstance(members, dict):
            normalized[team] = {}
            for user, dates in members.items():
                if dates is None:
                    dates = all_time
                elif isinstance(dates, (tuple, list)):
                    try:
                        dates = pd.date_range(*dates)
                    except Exception as err:
                        raise Exception('Error converting "{}" to pandas.'
                                        'date_range'.format(dates)) from err
                elif not isinstance(dates, pd.DatetimeIndex):
                    raise TypeError('Expected user dates to be either None, tuple '
                                    'of datetimes or pandas.date_range, '
                                    'got {}'.format(type(dates)))
                normalized[team][user] = dates
        else:
            raise TypeError('Expected teams to be either lists or dicts of '
                            'users, got {}'.format(type(members)))

    return normalized


def get_team_contributions(teams, neurons=None, remote_instance=None):
    """Get contributions by teams (nodes, reviews, connectors, time invested).

    Notes
    -----
     1. Time is binned into 3 minute intervals; an interval counts if it
        holds at least 30 actions.
     2. ``total_reviews`` > ``total_nodes`` is possible if nodes have been
        reviewed multiple times by different users. Similarly,
        ``total_reviews`` = ``total_nodes`` does not imply that the neuron
        is fully reviewed!

    Parameters
    ----------
    teams :             dict
                        Teams to group contributions for. Users must be
                        logins. The dictionary is not modified. Format can
                        be either:

                          1. Simple user assignments. For example::

                               {'teamA': ['user1', 'user2'],
                                'team2': ['user3'], ...}

                          2. Users with start and end dates. Start and end
                             date must be either ``datetime.date`` or a
                             single ``pandas.date_range`` object. For
                             example::

                               {'team1': {
                                        'user1': (datetime.date(2017, 1, 1),
                                                  datetime.date(2018, 1, 1)),
                                        'user2': (datetime.date(2016, 6, 1),
                                                  datetime.date(2017, 1, 1))
                                        },
                                'team2': {
                                        'user3': pandas.date_range('2017-1-1',
                                                                   '2018-1-1'),
                                        }}

                        Mixing both styles is permissible. For second style,
                        use e.g. ``'user1': None`` for no date restrictions
                        on that user.
    neurons :           skeleton ID(s) | CatmaidNeuron/List, optional
                        Restrict check to given set of neurons. If
                        CatmaidNeuron/List, will use this neurons nodes/
                        connectors. Use to subset contributions e.g. to a
                        given neuropil by pruning neurons before passing to
                        this function. If ``None``, neurons are searched by
                        the teams' users and dates.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        DataFrame in which each row represents a neuron. Example for two
        teams, ``teamA`` and ``teamB``::

           skeleton_id  total_nodes  teamA_nodes  teamB_nodes  ...
         0
         1
           total_connectors  teamA_connectors  teamB_connectors ...
         0
         1
           total_reviews teamA_reviews  teamB_reviews  ...
         0
         1
           total_time  teamA_time  teamB_time
         0
         1

    Raises
    ------
    TypeError
        If ``teams`` (or one of its entries) has an unexpected type.
    ValueError
        If a user is not found in the server's user list.

    Examples
    --------
    >>> from datetime import date
    >>> import pandas as pd
    >>> teams = {'teamA': ['user1', 'user2'],
    ...          'teamB': {'user3': None,
    ...                    'user4': (date(2017, 1, 1), date(2018, 1, 1))},
    ...          'teamC': {'user5': pd.date_range('2015-1-1', '2018-1-1')}}
    >>> stats = pymaid.get_team_contributions(teams)

    See Also
    --------
    :func:`~pymaid.get_contributor_statistics`
                           Gives you more basic info on neurons of interest
                           such as total reconstruction/review time.
    :func:`~pymaid.get_time_invested`
                           Time invested by individual users. Gives you more
                           control over how time is calculated.

    """
    # Validate input first: cheap, and works on a copy of the caller's dict
    teams = _normalize_teams(teams)

    remote_instance = utils._eval_remote_instance(remote_instance)

    # Get all users
    all_users = [u for t in teams for u in teams[t]]

    # Prepare neurons - download if necessary
    if neurons is not None:
        if isinstance(neurons, core.CatmaidNeuron):
            neurons = core.CatmaidNeuronList(neurons)
        elif isinstance(neurons, core.CatmaidNeuronList):
            pass
        else:
            neurons = fetch.get_neurons(neurons,
                                        remote_instance=remote_instance)
    else:
        # No neurons given: search by users within the widest date window
        all_dates = [d.date() for t in teams for u in teams[t] for d in teams[t][u]]
        neurons = fetch.find_neurons(users=all_users,
                                     from_date=min(all_dates),
                                     to_date=max(all_dates),
                                     remote_instance=remote_instance)
        neurons.get_skeletons()

    # Get user list (indexed by login) and make sure all users exist
    user_list = fetch.get_user_list(remote_instance=remote_instance).set_index('login')

    for u in all_users:
        if u not in user_list.index:
            raise ValueError('User "{}" not found in user list'.format(u))

    # Get all node details
    all_node_details = fetch.get_node_details(neurons,
                                              remote_instance=remote_instance)

    # Get connector links
    link_details = fetch.get_connector_links(neurons, remote_instance=remote_instance)

    # link_details contains all links. We have to subset this to existing
    # connectors in case the input neurons have been pruned
    link_details = link_details[link_details.connector_id.isin(neurons.connectors.connector_id.values)]

    # Time is binned into `interval` minutes; a bin counts as active if it
    # contains at least `minimum_actions` timestamps
    interval = 3
    bin_width = '%iMin' % interval
    minimum_actions = 10 * interval

    def _active_minutes(ts):
        """Return minutes covered by bins with enough actions in ``ts``."""
        binned = ts.timestamp.to_frame().set_index(
            'timestamp', drop=False).groupby(pd.Grouper(freq=bin_width)).count()
        return sum(binned.values >= minimum_actions)[0] * interval

    stats = []
    for n in config.tqdm(neurons, desc='Processing',
                         disable=config.pbar_hide, leave=config.pbar_leave):
        # Get node details for this neuron's treenodes and connectors
        tn_ids = n.nodes.node_id.values.astype(str)
        cn_ids = n.connectors.connector_id.values.astype(str)

        node_details = all_node_details[all_node_details.node_id.isin(np.append(tn_ids, cn_ids))]

        # Extract node creation
        node_creation = node_details.loc[node_details.node_id.isin(tn_ids),
                                         ['creator', 'creation_time']].values
        node_creation = np.c_[node_creation, ['node_creation'] * node_creation.shape[0]]

        # Extract connector creation
        cn_creation = node_details.loc[node_details.node_id.isin(cn_ids),
                                       ['creator', 'creation_time']].values
        cn_creation = np.c_[cn_creation, ['cn_creation'] * cn_creation.shape[0]]

        # Extract edition times (treenodes + connectors)
        node_edits = node_details.loc[:, ['editor', 'edition_time']].values
        node_edits = np.c_[node_edits, ['editor'] * node_edits.shape[0]]

        # Link creation
        link_creation = link_details.loc[link_details.connector_id.isin(cn_ids),
                                         ['creator', 'creation_time']].values
        link_creation = np.c_[link_creation, ['link_creation'] * link_creation.shape[0]]

        # Extract review times: one row per (reviewer, timestamp) pair
        reviewers = [u for l in node_details.reviewers.values for u in l]
        timestamps = [ts for l in node_details.review_times.values for ts in l]
        node_review = np.c_[reviewers, timestamps, ['review'] * len(reviewers)]

        # Pool all timestamps (creations, reviews, links and edits); this is
        # the basis for the time invested
        all_ts = pd.DataFrame(np.vstack([node_creation,
                                         node_review,
                                         cn_creation,
                                         link_creation,
                                         node_edits]),
                              columns=['user', 'timestamp', 'type'])

        # Add column with just the date and make it the index
        all_ts['date'] = all_ts.timestamp.values.astype('datetime64[D]')
        all_ts.index = pd.to_datetime(all_ts.date, format="ISO8601")

        # Fill in teams for each timestamp based on user + date
        all_ts['team'] = None
        for t in teams:
            for u in teams[t]:
                # Assign all timestamps by this user in the right time to
                # this team. Use an explicit set intersection: `&` on two
                # indexes is not a set operation in pandas >= 2.0
                existing_dates = teams[t][u].intersection(all_ts.index).unique()
                ss = (all_ts.index.isin(existing_dates)) & (all_ts.user.values == user_list.loc[u, 'id'])
                all_ts.loc[ss, 'team'] = t

        # Get total
        total_time = _active_minutes(all_ts)

        this_neuron = [n.skeleton_id, n.n_nodes, n.n_connectors,
                       node_review.shape[0], total_time]

        # Go over the teams and collect values
        for t in teams:
            # Subset to team
            this_team = all_ts[all_ts.team == t]
            if this_team.shape[0] > 0:
                team_time = _active_minutes(this_team)
                team_nodes = this_team[this_team['type'] == 'node_creation'].shape[0]
                team_cn = this_team[this_team['type'] == 'cn_creation'].shape[0]
                team_rev = this_team[this_team['type'] == 'review'].shape[0]
            else:
                team_nodes = team_cn = team_rev = team_time = 0

            this_neuron += [team_nodes, team_cn, team_rev, team_time]

        stats.append(this_neuron)

    # Column names in the order in which the rows were assembled
    cols = ['skeleton_id', 'total_nodes', 'total_connectors',
            'total_reviews', 'total_time']

    for t in teams:
        for s in ['nodes', 'connectors', 'reviews', 'time']:
            cols += ['{}_{}'.format(t, s)]

    stats = pd.DataFrame(stats, columns=cols)

    # Reorder so that each statistic is grouped: total first, then teams
    cols_ordered = ['skeleton_id'] + ['{}_{}'.format(t, v) for v in
                                      ['nodes', 'connectors',
                                       'reviews', 'time'] for t in ['total'] + list(teams)]
    stats = stats[cols_ordered]

    return stats
[docs]
def get_user_contributions(x, teams=None, remote_instance=None):
    """Return number of nodes and synapses contributed by each user.

    This is essentially a wrapper for :func:`pymaid.get_contributor_statistics`
    - if you are also interested in e.g. construction time, review time, etc.
    you may want to consider using :func:`~pymaid.get_contributor_statistics`
    instead.

    Parameters
    ----------
    x
                        Which neurons to check. Can be either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, must be exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object
    teams :             dict, optional
                        Teams to group contributions for. Users must be
                        logins::

                            {'teamA': ['user1', 'user2'], 'team2': ['user3'], ...}

                        Users not part of any team, will be grouped as team
                        ``'others'``.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        DataFrame in which each row represents a user, sorted by ``nodes``
        in descending order. Users who only reviewed nodes are included::

           user  nodes  presynapses  postsynapses  nodes_reviewed
         0
         1
         ...

        If ``teams`` is given, rows represent teams instead (index
        ``team``) and hold the summed counts of their users.

    Raises
    ------
    TypeError
        If ``teams`` is not a dict of lists.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> # Get contributors for a single neuron
    >>> cont = pymaid.get_user_contributions(2333007)
    >>> # Get top 10 (by node contribution)
    >>> top10 = cont.iloc[:10].set_index('user')
    >>> # Plot as bar chart
    >>> ax = top10.plot(kind='bar')
    >>> plt.show()

    >>> # Plot relative contributions
    >>> cont = pymaid.get_user_contributions(2333007)
    >>> cont = cont.set_index('user')
    >>> # Normalize
    >>> cont_rel = cont / cont.sum(axis=0).values
    >>> # Plot contributors with >5% node contributions
    >>> ax = cont_rel[cont_rel.nodes > .05].plot(kind='bar')
    >>> plt.show()

    See Also
    --------
    :func:`~pymaid.get_contributor_statistics`
                           Gives you more basic info on neurons of interest
                           such as total reconstruction/review time.

    """
    if teams is not None:
        # Prepare teams
        if not isinstance(teams, dict):
            raise TypeError('Expected teams of type dict, got '
                            '{}'.format(type(teams)))

        for t in teams:
            if not isinstance(teams[t], list):
                raise TypeError('Teams need to list of user logins, '
                                'got {}'.format(type(teams[t])))

        # Turn teams into a login -> team dict
        teams = {u: t for t in teams for u in teams[t]}

    remote_instance = utils._eval_remote_instance(remote_instance)

    skids = utils.eval_skids(x, remote_instance=remote_instance)

    cont = fetch.get_contributor_statistics(skids,
                                            remote_instance=remote_instance,
                                            separate=False)

    # Output column -> {user: count} mapping it is filled from
    counts = {'nodes': cont.node_contributors,
              'presynapses': cont.pre_contributors,
              'postsynapses': cont.post_contributors,
              'nodes_reviewed': cont.review_contributors}

    # Collect users across ALL mappings so that users who only reviewed
    # are not dropped from the table
    all_users = set()
    for per_user in counts.values():
        all_users.update(per_user)

    rows = [[u] + [per_user.get(u, 0) for per_user in counts.values()]
            for u in all_users]

    stats = pd.DataFrame(rows,
                         columns=['user'] + list(counts)
                         ).sort_values('nodes', ascending=False).reset_index(drop=True)

    if teams is None:
        return stats

    stats['team'] = [teams.get(u, 'others') for u in stats.user.values]

    # Drop the login column before aggregating: summing strings would
    # concatenate the logins of each team
    return stats.drop(columns='user').groupby('team').sum()
[docs]
def get_time_invested(x, mode='SUM', by='USER', minimum_actions=10,
                      max_inactive_time=3, treenodes=True, connectors=True,
                      links=True, start_date=None, end_date=None,
                      remote_instance=None):
    """Calculate the time spent working on a set of neurons.

    Use ``minimum_actions`` and ``max_inactive_time`` to fine tune how time
    invested is calculated: by default, time is binned over 3 minutes in
    which a user has to perform 3x10 actions for that interval to be
    counted towards the time spent tracing.

    Important
    ---------
    Creation/Edition/Review times can overlap! This is why total time spent
    is not just creation + edition + review.

    Please note that this does currently not take placement of
    pre-/postsynaptic nodes into account!

    Be aware of the ``minimum_actions`` parameter: at low values even
    a single action (e.g. connecting a node) will add considerably to time
    invested. To keep total reconstruction time comparable to what Catmaid
    calculates, you should consider about 10 actions/minute (= a click every
    6 seconds) and ``max_inactive_time`` of 3 mins.

    CATMAID gives reconstruction time across all users. Here, we calculate
    the time spent tracing for individuals. This may lead to a discrepancy
    between sum of time invested over of all users from this function vs.
    CATMAID's reconstruction time.

    Parameters
    ----------
    x
                        Which neurons to check. Can be either:

                        1. skeleton IDs (int or str)
                        2. neuron name (str, must be exact match)
                        3. annotation: e.g. 'annotation:PN right'
                        4. CatmaidNeuron or CatmaidNeuronList object

                        If you pass a CatmaidNeuron/List, its node/connectors
                        are used to calculate time invested. You can exploit
                        this to get time spent reconstructing in given
                        compartment of a neurons, e.g. by pruning it to a
                        volume before passing it to ``get_time_invested``.
    mode :              'SUM' | 'SUM2' | 'OVER_TIME' | 'ACTIONS', optional
                        (1) 'SUM' will return total time invested (in minutes)
                            broken down by creation, edition and review.
                        (2) 'SUM2' will return total time invested (in
                            minutes) broken down by `treenodes`, `connectors`
                            and `links`.
                        (3) 'OVER_TIME' will return minutes invested/day over
                            time.
                        (4) 'ACTIONS' will return actions
                            (node/connectors placed/edited) per day.
    by :                'USER' | 'NEURON', optional
                        Determines whether the stats are broken down by user or
                        by neuron.
    minimum_actions :   int, optional
                        Minimum number of actions per minute to be counted as
                        active.
    max_inactive_time : int, optional
                        Interval in minutes over which time invested is
                        binned. Essentially determines how much time can be
                        between bouts of activity.
    treenodes :         bool, optional
                        If False, treenodes will not be taken into account.
    connectors :        bool, optional
                        If False, connectors will not be taken into account.
    links :             bool, optional
                        If False, connector links will not be taken into
                        account.
    start_date :        iterable | datetime.date | numpy.datetime64, optional
                        Restricts time invested to window. Applies to creation
                        but not edition time! If iterable, must be year, month
                        day, e.g. ``[2018, 1, 1]``.
    end_date :          iterable | datetime.date | numpy.datetime64, optional
                        See ``start_date``.
    remote_instance :   CatmaidInstance, optional
                        Either pass explicitly or define globally.

    Returns
    -------
    pandas.DataFrame
        If ``mode='SUM'``, values represent minutes invested::

                   total  creation  edition  review
            user1
            user2
              ..
              .

        If ``mode='SUM2'``, values represent minutes invested::

                   total  treenodes  connectors  links
            user1
            user2
              ..
              .

        If ``mode='OVER_TIME'`` or ``mode='ACTIONS'``::

                   date1  date2  date3  ...
            user1
            user2
              ..
              .

        For `OVER_TIME`, values represent minutes invested on that day. For
        `ACTIONS`, values represent actions (creation, edition, review) on
        that day.

    Raises
    ------
    ValueError
        If ``mode`` or ``by`` is not one of the accepted values.

    Examples
    --------
    Get time invested for a set of neurons:

    >>> da1 = pymaid.get_neurons('annotation:glomerulus DA1')
    >>> time = pymaid.get_time_invested(da1)

    Get time spent tracing in a specific compartment:

    >>> da1_lh = pymaid.prune_by_volume('LH_R', inplace=False)
    >>> time_lh = pymaid.get_time_invested(da1_lh)

    Get contributions within a given time window:

    >>> time_jan = pymaid.get_time_invested(da1,
    ...                                     start_date=[2018, 1, 1],
    ...                                     end_date=[2018, 1, 31])

    Plot pie chart of contributions per user using Plotly:

    >>> import plotly
    >>> stats = pymaid.get_time_invested(skids, remote_instance=remote_instance)
    >>> # Use plotly to generate pie chart
    >>> fig = {"data": [{"values": stats.total.tolist(),
    ...        "labels": stats.index.tolist(), "type" : "pie" }]}
    >>> plotly.offline.plot(fig)

    Plot reconstruction efforts over time:

    >>> stats = pymaid.get_time_invested(skids, mode='OVER_TIME')
    >>> # Plot time invested over time
    >>> stats.T.plot()
    >>> # Plot cumulative time invested over time
    >>> stats.T.cumsum(axis=0).plot()
    >>> # Filter for major contributors
    >>> stats[stats.sum(axis=1) > 20].T.cumsum(axis=0).plot()

    """

    def _extract_timestamps(ts, restrict_groups, desc='Calc'):
        """Return ``{group: minutes}`` for groups in ``restrict_groups``.

        A time bin counts towards a group if it contains at least
        ``minimum_actions`` timestamps; each such bin adds ``interval``
        minutes.
        """
        if ts.empty:
            return {}
        grouped = ts.set_index('timestamp',
                               drop=False).groupby(['group',
                                                    pd.Grouper(freq=bin_width)]).count() >= minimum_actions
        temp_stats = {}
        for g in config.tqdm(set(ts.group.unique()) & set(restrict_groups),
                             desc=desc, disable=config.pbar_hide, leave=False):
            temp_stats[g] = sum(grouped.loc[g].values)[0] * interval
        return temp_stats

    if mode not in ['SUM', 'SUM2', 'OVER_TIME', 'ACTIONS']:
        raise ValueError('Unknown mode "{}"'.format(mode))

    if by not in ['NEURON', 'USER']:
        raise ValueError('Unknown by "{}"'.format(by))

    remote_instance = utils._eval_remote_instance(remote_instance)

    skids = utils.eval_skids(x, remote_instance=remote_instance)

    # Maximal inactive time is simply translated into binning
    # We need this later for pandas.Grouper() anyway
    interval = max_inactive_time
    bin_width = '%iMin' % interval

    # Update minimum_actions to reflect actions/interval instead of
    # actions/minute
    minimum_actions *= interval

    # Mapping of user ID -> login
    user_list = fetch.get_user_list(remote_instance=remote_instance).set_index('id')
    user_dict = user_list.login.to_dict()

    if not isinstance(x, (core.CatmaidNeuron, core.CatmaidNeuronList)):
        x = fetch.get_neuron(skids, remote_instance=remote_instance)

    if isinstance(x, core.CatmaidNeuron):
        skdata = core.CatmaidNeuronList(x)
    elif isinstance(x, core.CatmaidNeuronList):
        skdata = x

    # Iterables such as [2018, 1, 1] are unpacked into a date
    if not isinstance(end_date, (datetime.date, np.datetime64, type(None))):
        end_date = datetime.date(*end_date)

    if not isinstance(start_date, (datetime.date, np.datetime64, type(None))):
        start_date = datetime.date(*start_date)

    # Extract connector and node IDs
    node_ids = []
    connector_ids = []
    for n in skdata.itertuples():
        if treenodes:
            node_ids += n.nodes.node_id.tolist()
        if connectors:
            connector_ids += n.connectors.connector_id.tolist()

    # Get node details
    node_details = fetch.get_node_details(node_ids + connector_ids,
                                          remote_instance=remote_instance)

    # Get details for links
    if links:
        link_details = fetch.get_connector_links(skdata,
                                                 remote_instance=remote_instance)

        # link_details contains all links. We have to subset this to existing
        # connectors in case the input neurons have been pruned
        link_details = link_details[link_details.connector_id.isin(connector_ids)]
    else:
        # Empty placeholder. It needs a "skeleton_id" column too, because
        # grouping by neuron selects that column from the link details
        link_details = pd.DataFrame([], columns=['creator', 'creation_time',
                                                 'skeleton_id'])

    # Remove timestamps outside of date range (if provided)
    if start_date:
        node_details = node_details[node_details.creation_time >= np.datetime64(start_date)]
        link_details = link_details[link_details.creation_time >= np.datetime64(start_date)]

    if end_date:
        node_details = node_details[node_details.creation_time <= np.datetime64(end_date)]
        link_details = link_details[link_details.creation_time <= np.datetime64(end_date)]

    # If we want to group by neuron, we need to add a "skeleton ID" column and
    # check if we need to duplicate rows with connectors
    if by == 'NEURON':
        # Need to add a column with the skeleton ID. Everything starts out as
        # "connector" and is relabelled if found among a neuron's treenodes
        node_details['skeleton_id'] = None
        node_details['node_type'] = 'connector'
        col_name = 'skeleton_id'
        for n in skdata:
            cond = node_details.node_id.isin(n.nodes.node_id.values.astype(str))
            node_details.loc[cond, 'skeleton_id'] = n.skeleton_id
            node_details.loc[cond, 'node_type'] = 'treenode'

        # Connectors can show up in more than one neuron -> we need to duplicate
        # those rows for each of the associated neurons
        cn_details = []
        for n in skdata:
            cond1 = node_details.node_type == 'connector'
            cond2 = node_details.node_id.isin(n.connectors.connector_id.values.astype(str))
            node_details.loc[cond1 & cond2, 'skeleton_id'] = n.skeleton_id
            this_cn = node_details.loc[cond1 & cond2]
            cn_details.append(this_cn)
        cn_details = pd.concat(cn_details, axis=0)

        # Merge the node details again
        cond1 = node_details.node_type == 'treenode'
        node_details = pd.concat([node_details.loc[cond1], cn_details],
                                 axis=0).reset_index(drop=True)

        # Note that link_details already has a "skeleton_id" column
        # but we need to make sure it's strings
        link_details['skeleton_id'] = link_details.skeleton_id.astype(str)

        create_group = edit_group = 'skeleton_id'
    else:
        create_group = 'creator'
        edit_group = 'editor'
        col_name = 'user'

    # Dataframe for creation (i.e. the actual generation of the nodes)
    creation_timestamps = np.append(node_details[[create_group,
                                                  'creation_time']].values,
                                    link_details[[create_group,
                                                  'creation_time']].values,
                                    axis=0)
    creation_timestamps = pd.DataFrame(creation_timestamps,
                                       columns=['group', 'timestamp'])

    # Dataframe for edition times - can't use links as there is no editor
    # Because creation of a node counts as an edit, we only keep timestamps
    # where edition time is more than 200ms after the creation time
    is_edit = (node_details.edition_time - node_details.creation_time) > np.timedelta64(200, 'ms')
    edition_timestamps = node_details.loc[is_edit, [edit_group, 'edition_time']]
    edition_timestamps.columns = ['group', 'timestamp']

    # Generate dataframe for reviews -> here we have to unpack the per-node
    # lists of reviewers/review times into one row per review
    if by == 'USER':
        groups = [u for l in node_details.reviewers.values for u in l]
    else:
        groups = [s for l, s in zip(node_details.review_times.values,
                                    node_details.skeleton_id.values) for ts in l]
    timestamps = [ts for l in node_details.review_times.values for ts in l]
    review_timestamps = pd.DataFrame([groups, timestamps]).T
    review_timestamps.columns = ['group', 'timestamp']

    # Change user ID to login; IDs missing from the user list are labelled
    # "Anonymous<ID>"
    if by == 'USER':
        if mode == 'SUM2':
            # SUM2 re-reads creator/editor from the detail tables further down
            node_details['creator'] = node_details.creator.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
            node_details['editor'] = node_details.editor.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
            link_details['creator'] = link_details.creator.map(lambda x: user_dict.get(x, f'Anonymous{x}'))

        creation_timestamps['group'] = creation_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
        edition_timestamps['group'] = edition_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))
        review_timestamps['group'] = review_timestamps.group.map(lambda x: user_dict.get(x, f'Anonymous{x}'))

    # Merge all timestamps
    all_timestamps = pd.concat([creation_timestamps,
                                edition_timestamps,
                                review_timestamps],
                               axis=0)

    all_timestamps.sort_values('timestamp', inplace=True)

    if by == 'USER':
        # Extract the users that are relevant for us: those with at least
        # `minimum_actions` timestamps overall
        relevant_users = all_timestamps.groupby('group').count()
        groups = relevant_users[relevant_users.timestamp >= minimum_actions].index.values
    else:
        groups = skdata.skeleton_id

    if mode == 'SUM':
        # This breaks it down by time spent on creation, edition and review
        stats = {k: {g: 0 for g in groups} for k in ['total',
                                                     'creation',
                                                     'edition',
                                                     'review']}
        stats['total'].update(_extract_timestamps(all_timestamps,
                                                  groups,
                                                  desc='Calc total'))
        stats['creation'].update(_extract_timestamps(creation_timestamps,
                                                     groups,
                                                     desc='Calc creation'))
        stats['edition'].update(_extract_timestamps(edition_timestamps,
                                                    groups,
                                                    desc='Calc edition'))
        stats['review'].update(_extract_timestamps(review_timestamps,
                                                   groups,
                                                   desc='Calc review'))

        return pd.DataFrame([[g,
                              stats['total'][g],
                              stats['creation'][g],
                              stats['edition'][g],
                              stats['review'][g]] for g in groups],
                            columns=[col_name, 'total',
                                     'creation', 'edition',
                                     'review']
                            ).sort_values('total',
                                          ascending=False
                                          ).reset_index(drop=True).set_index(col_name)

    elif mode == 'SUM2':
        # This breaks it down by time spent on nodes, connectors and links
        stats = {k: {g: 0 for g in groups} for k in ['total',
                                                     'treenodes',
                                                     'connectors',
                                                     'links']}
        stats['total'].update(_extract_timestamps(all_timestamps,
                                                  groups,
                                                  desc='Calc total'))

        # We need to construct separate DataFrames for nodes, connectors + links
        # Note that we are using only edits that do not stem from the creation
        is_tn = node_details.node_id.astype(int).isin(node_ids)
        conc = np.concatenate([node_details.loc[is_tn,
                                                [create_group, 'creation_time']
                                                ].values,
                               node_details.loc[is_edit & is_tn,
                                                [edit_group, 'edition_time']
                                                ].values
                               ],
                              axis=0)
        treenode_timestamps = pd.DataFrame(conc, columns=['group', 'timestamp'])
        stats['treenodes'].update(_extract_timestamps(treenode_timestamps,
                                                      groups,
                                                      desc='Calc treenodes'))

        # Now connectors
        # Note that we are using only edits that do not stem from the creation
        is_cn = node_details.node_id.astype(int).isin(connector_ids)
        conc = np.concatenate([node_details.loc[is_cn,
                                                [create_group, 'creation_time']
                                                ].values,
                               node_details.loc[is_edit & is_cn,
                                                [edit_group, 'edition_time']
                                                ].values
                               ],
                              axis=0)
        connector_timestamps = pd.DataFrame(conc, columns=['group', 'timestamp'])
        stats['connectors'].update(_extract_timestamps(connector_timestamps,
                                                       groups,
                                                       desc='Calc connectors'))

        # Now links
        link_timestamps = pd.DataFrame(link_details[[create_group,
                                                     'creation_time']].values,
                                       columns=['group', 'timestamp'])
        stats['links'].update(_extract_timestamps(link_timestamps,
                                                  groups,
                                                  desc='Calc links'))

        return pd.DataFrame([[g,
                              stats['total'][g],
                              stats['treenodes'][g],
                              stats['connectors'][g],
                              stats['links'][g]] for g in groups],
                            columns=[col_name, 'total',
                                     'treenodes', 'connectors',
                                     'links']
                            ).sort_values('total', ascending=False
                                          ).reset_index(drop=True
                                                        ).set_index(col_name)

    elif mode == 'ACTIONS':
        # Actions per day across all groups make up the first row
        all_ts = all_timestamps.set_index('timestamp', drop=False
                                          ).timestamp.groupby(pd.Grouper(freq='1d')
                                                              ).count().to_frame()
        all_ts.columns = ['all_groups']
        all_ts = all_ts.T
        # Add one row of actions per day for every group
        for g in config.tqdm(all_timestamps.group.unique(), desc='Calc. total',
                             disable=config.pbar_hide, leave=False):
            this_ts = all_timestamps[all_timestamps.group == g].set_index(
                'timestamp', drop=False).timestamp.groupby(pd.Grouper(freq='1d')).count().to_frame()
            this_ts.columns = [g]

            all_ts = pd.concat([all_ts, this_ts.T])

        return all_ts.fillna(0)

    elif mode == 'OVER_TIME':
        # Go over all users and collect time invested
        all_ts = []
        for g in config.tqdm(all_timestamps.group.unique(), desc='Calc. total', disable=config.pbar_hide, leave=False):
            # First flag all bins with the minimum number of actions
            minutes_counting = (all_timestamps[all_timestamps.group == g].set_index(
                'timestamp', drop=False).timestamp.groupby(pd.Grouper(freq=bin_width)).count().to_frame() >= minimum_actions)

            # Then remove the bins that have less than minimum actions
            minutes_counting = minutes_counting[minutes_counting.timestamp]

            # Now group by day.
            # NOTE(review): this counts active bins per day and, unlike the
            # SUM modes, does not multiply by `interval` - confirm intended
            this_ts = minutes_counting.groupby(pd.Grouper(freq='1d')).count()

            # Rename columns to user login
            this_ts.columns = [g]

            # Append if any and move on
            if not this_ts.empty:
                all_ts.append(this_ts.T)

        # Turn into DataFrame
        all_ts = pd.concat(all_ts).sort_index()

        # Replace NaNs with 0
        all_ts.fillna(0, inplace=True)

        # Add all users column
        all_users = all_ts.sum(axis=0)
        all_users.name = 'all_groups'
        all_ts = pd.concat([all_users, all_ts.T], axis=1).T

        return all_ts
[docs]
def _has_values(arg):
    """Return True if ``arg`` is neither None nor empty.

    ``len()`` is used where available so that multi-element numpy arrays
    (whose truth value is ambiguous) can be checked safely. Objects without
    a length fall back to plain truthiness.
    """
    if arg is None:
        return False
    try:
        return len(arg) > 0
    except TypeError:
        return bool(arg)


def get_user_actions(users=None, neurons=None, start_date=None, end_date=None,
                     remote_instance=None):
    """Get timestamps of user actions (creations, editions, reviews, linking).

    Important
    ---------
    This function returns most but not all user actions::

      1. The API endpoint used for finding neurons worked on by a given user
         (:func:`pymaid.find_neurons`) does not return single-node neurons.
         Hence, placing e.g. postsynaptic nodes is not taken into account.
      2. Any creation is also an edit. However, only the last edit is kept
         track of. So each creation counts as an edit for the creator until a
         different user makes an edit.

    Parameters
    ----------
    users :           str | list, optional
                      Users login(s) for which to return timestamps. If not
                      provided, actions are not filtered by user.
    neurons :         list of skeleton IDs | CatmaidNeuron/List, optional
                      Neurons for which to return timestamps. If None, will
                      find neurons by user.
    start_date :      tuple | datetime.date, optional
    end_date :        tuple | datetime.date, optional
                      Start and end date of time window to check.
    remote_instance : CatmaidInstance, optional

    Returns
    -------
    pandas.DataFrame
            DataFrame in which each row is a user action, sorted by
            timestamp::

                user  timestamp  action
             0
             1
             ...

    Raises
    ------
    ValueError
            If none of ``users``, ``neurons``, ``start_date`` or ``end_date``
            is provided.

    Examples
    --------
    In the first example we will have a look at how active a user is over
    the course of a day.

    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> # Get all actions for a single user
    >>> actions = pymaid.get_user_actions(users='schlegelp',
    ...                                   start_date=(2017, 11, 1))
    >>> # Group by hour and see what time of the day user is usually active
    >>> actions.set_index(pd.DatetimeIndex(actions.timestamp), inplace=True)
    >>> hours = actions.groupby(actions.index.hour).count()
    >>> ax = hours.action.plot()
    >>> plt.show()

    >>> # Plot day-by-day activity
    >>> ax = plt.subplot()
    >>> ax.scatter(actions.timestamp.dt.date.values,
    ...            actions.timestamp.dt.time.values,
    ...            marker='_')

    """
    # Use length-based checks: plain truthiness raises for numpy arrays with
    # more than one element
    have_users = _has_values(users)

    if not _has_values(neurons) and not have_users \
            and not (start_date or end_date):
        raise ValueError('Query must be restricted by at least a single '
                         'parameter!')

    if have_users and not isinstance(users, (list, np.ndarray)):
        users = [users]

    # Get user dictionary mapping user ID -> login (needed later)
    user_list = fetch.get_user_list(remote_instance=remote_instance)
    user_dict = user_list.set_index('id').login.to_dict()

    if neurons is None:
        neurons = fetch.find_neurons(users=users,
                                     from_date=start_date, to_date=end_date,
                                     reviewed_by=users,
                                     remote_instance=remote_instance)
        # Get skeletons
        neurons.get_skeletons()
    elif not isinstance(neurons, (core.CatmaidNeuron, core.CatmaidNeuronList)):
        neurons = fetch.get_neuron(neurons, remote_instance=remote_instance)

    # Turn date tuples into datetime.date objects
    if not isinstance(end_date, (datetime.date, type(None))):
        end_date = datetime.date(*end_date)

    if not isinstance(start_date, (datetime.date, type(None))):
        start_date = datetime.date(*start_date)

    node_ids = neurons.nodes.node_id.tolist()
    connector_ids = neurons.connectors.connector_id.tolist()

    # Get node details
    node_details = fetch.get_node_details(node_ids + connector_ids,
                                          remote_instance=remote_instance)

    # Get details for links
    link_details = fetch.get_connector_links(neurons,
                                             remote_instance=remote_instance)

    # Dataframe for creation (i.e. the actual generation of the nodes)
    creation_timestamps = node_details[['creator', 'creation_time']].copy()
    creation_timestamps['action'] = 'creation'
    creation_timestamps.columns = ['user', 'timestamp', 'action']

    # Dataframe for edition times
    edition_timestamps = node_details[['editor', 'edition_time']].copy()
    edition_timestamps['action'] = 'edition'
    edition_timestamps.columns = ['user', 'timestamp', 'action']

    # DataFrame for linking
    linking_timestamps = link_details[['creator', 'creation_time']].copy()
    linking_timestamps['action'] = 'linking'
    linking_timestamps.columns = ['user', 'timestamp', 'action']

    # Generate dataframe for reviews: each node carries a list of reviewers
    # and a matching list of review times, so flatten both in parallel
    reviewers = [u for per_node in node_details.reviewers.tolist()
                 for u in per_node]
    timestamps = [ts for per_node in node_details.review_times.tolist()
                  for ts in per_node]
    review_timestamps = pd.DataFrame([[u, ts, 'review']
                                      for u, ts in zip(reviewers, timestamps)],
                                     columns=['user', 'timestamp', 'action'])

    # Merge all timestamps
    all_timestamps = pd.concat([creation_timestamps,
                                edition_timestamps,
                                review_timestamps,
                                linking_timestamps],
                               axis=0).reset_index(drop=True)

    # Map user ID onto login
    all_timestamps['user'] = [user_dict[u]
                              for u in all_timestamps.user.values]

    # Remove other users - only if a user filter was given: `isin` rejects
    # None, which made queries restricted by neurons/dates alone fail
    if have_users:
        all_timestamps = all_timestamps[all_timestamps.user.isin(users)]

    # Remove timestamps outside of date range (if provided)
    if start_date:
        all_timestamps = all_timestamps[all_timestamps.timestamp.values >= np.datetime64(start_date)]
    if end_date:
        # NOTE(review): for a plain date this compares against the start of
        # `end_date`, so actions later on that day appear to be excluded -
        # confirm this is the intended window
        all_timestamps = all_timestamps[all_timestamps.timestamp.values <= np.datetime64(end_date)]

    return all_timestamps.sort_values('timestamp').reset_index(drop=True)