Source code for dvid.utils

# This code is part of dvid-tools (https://github.com/flyconnectome/dvid_tools)
# and is released under GNU GPL3

from . import fetch

import json
import networkx as nx
import pandas as pd
import numpy as np
import re
import requests
import urllib

from io import StringIO
from itertools import combinations
from scipy.spatial.distance import pdist, cdist

__all__ = ['check_skeleton', 'gen_assignments']


def swc_to_graph(x):
    """Generate NetworkX DiGraph from SWC DataFrame.

    Parameters
    ----------
    x :         pandas DataFrame
                SWC table.

    Returns
    -------
    networkx.DiGraph

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('Expected DataFrame, got "{}"'.format(type(x)))

    edges = x.loc[x.parent_id > 0, ['node_id', 'parent_id']].values.astype(object)

    nlocs = x.set_index('node_id').loc[edges[:, 0], ['x', 'y', 'z']].values
    plocs = x.set_index('node_id').loc[edges[:, 1], ['x', 'y', 'z']].values

    weights = np.sqrt(np.sum((nlocs - plocs)**2, axis=1))

    weighted_edges = np.append(edges, weights.reshape(len(weights), 1), axis=1)

    g = nx.DiGraph()

    # Add edges
    g.add_weighted_edges_from(weighted_edges)

    # Add disconnected nodes as they do not show in edge list
    disc = x[(x.parent_id < 0) & (~x.node_id.isin(x.parent_id))]
    if not disc.empty:
        g.add_nodes_from(disc.node_id.values)

    # Add positions
    nx.set_node_attributes(g, {r.node_id: np.array([r.x, r.y, r.z]) for r in x.itertuples()}, name='location')
    nx.set_node_attributes(g, {r.node_id: r.radius for r in x.itertuples()}, name='radius')

    return g


[docs]def reroot_skeleton(x, new_root, inplace=False):
    """Reroot skeleton to new root.

    For fragmented skeletons, only the root within the new_root's connected
    component will be changed.

    Node order might not conform to SWC standard after healing. Use
    ``dvidtools.refurbish_table`` to fix that.

    Parameters
    ----------
    x :         pandas.DataFrame
                SWC table to reroot.
    new_root :  int
                New root ID.

    Returns
    -------
    SWC:        pandas.DataFrame

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('Expected DataFrame, got "{}"'.format(type(x)))

    if new_root not in x.node_id.values:
        raise ValueError('New root node not found')

    if x.set_index('node_id').loc[new_root, 'parent_id'] < 0:
        return x

    # Turn skeleton into Graph
    g = swc_to_graph(x)

    # Walk from new root to old root and keep track of visited edges
    seen = [new_root]
    p = next(g.successors(new_root), None)
    while p:
        seen.append(p)
        p = next(g.successors(p), None)

    # Invert path and change parents
    swc = x.copy() if not inplace else x
    new_p = {v: u for u, v in zip(seen[:-1], seen[1:])}
    swc.loc[swc.node_id.isin(seen), 'parent_id'] = swc.loc[swc.node_id.isin(seen)].node_id.map(lambda x: new_p.get(x, -1))

    if not inplace:
        return swc


[docs]def heal_skeleton(x, root=None, inplace=False):
    """Merge fragmented skeleton back together.

    Uses a minimum spanning tree to connect disconnected components.

    Important
    ---------
    Node order might not conform to SWC standard after healing. Use
    ``dvidtools.refurbish_table`` to fix that.

    Parameters
    ----------
    x :         pandas.DataFrame
                SWC table to heal.
    root :      int | None, optional
                New root. If None, will use an existing root.

    Returns
    -------
    SWC:        pandas.DataFrame

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('Expected DataFrame, got "{}"'.format(type(x)))

    # If not fragmented, simply return
    if x[x.parent_id < 0].shape[0] <= 1:
        return x

    # Turn skeleton into Graph
    g = swc_to_graph(x)

    # Set existing edges to zero weight to make sure they remain when
    # calculating the minimum spanning tree
    nx.set_edge_attributes(g, 0, 'weight')

    # Calculate distance between all leafs
    leafs = x[~x.node_id.isin(x.parent_id) | (x.parent_id < 0)]
    d = pdist(leafs[['x', 'y', 'z']].values)

    # Generate new edges
    edges = np.array(list(combinations(leafs.node_id.values, 2)))
    edges = np.append(edges.astype(object), d.reshape(d.size, 1), axis=1)

    g.add_weighted_edges_from(edges)

    # Get minimum spanning tree
    st = nx.minimum_spanning_tree(g.to_undirected())

    # Now we have to make sure that all edges are correctly oriented
    if not root:
        root = x.loc[x.parent_id < 0, 'node_id'].values[0]
    tree = nx.bfs_tree(st, source=root)
    tree = nx.reverse(tree, copy=False)

    # Now do some cleaning up
    swc = x.copy() if not inplace else x

    # Map new parents
    lop = {n: next(tree.successors(n)) for n in tree if tree.out_degree(n)}
    swc['parent_id'] = swc.node_id.map(lambda x: lop.get(x, -1))

    # Make sure parent IDs are in ascending order (according to SWC standard)
    swc.sort_values('parent_id', inplace=True)

    # Reorder node IDs
    swc.reset_index(drop=True, inplace=True)
    lon = {k: v for k, v in zip(swc.node_id.values, swc.index.values + 1)}
    swc['node_id'] = swc.index.values + 1
    swc['parent_id'] = swc.parent_id.map(lambda x: lon.get(x, -1))

    if not inplace:
        return swc


def refurbish_table(x, inplace=False):
    """Refurbishes SWC table to keep in conform with official format.

    Important
    ---------
    This operation can change node IDs!

    Parameters
    ----------
    x :         pandas.DataFrame
                SWC table to refurbish.

    Returns
    -------
    SWC:        pandas.DataFrame

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('Expected DataFrame, got "{}"'.format(type(x)))

    swc = x.copy() if not inplace else x

    # Make sure parent IDs are in ascending order
    swc.sort_values('parent_id', inplace=True)

    # Reorder node IDs
    swc.reset_index(drop=True, inplace=True)
    lon = {k: v for k, v in zip(swc.node_id.values, swc.index.values + 1)}
    swc['node_id'] = swc.index.values + 1
    swc['parent_id'] = swc.parent_id.map(lambda x: lon.get(x, -1))

    if not inplace:
        return swc


def verify_payload(data, required, required_only=True):
    """Verify payload.

    Parameters
    ----------
    data :      list
                Data to verify.
    required :  dict
                Required entries. Can be nested. For example::
                    {'Note': str, 'User': str, 'Props': {'name': str}}

    Returns
    -------
    None

    """
    if not isinstance(data, list) or any([not isinstance(d, dict) for d in data]):
        raise TypeError('Data must be list of dicts.)')

    for d in data:
        not_required = [str(e) for e in d if e not in required]
        if any(not_required) and required_only:
            raise ValueError('Unallowed entries in data: {}'.format(','.join(not_required)))

        for e, t in required.items():
            if isinstance(t, dict):
                verify_payload([d[e]], t)
            else:
                if e not in d:
                    raise ValueError('Data must contain entry "{}"'.format(e))
                if isinstance(t, type) and not isinstance(d[e], t):
                    raise TypeError('Entry "{}" must be of type "{}"'.format(e, t))
                elif isinstance(t, list):
                    if not isinstance(d[e], list):
                        raise TypeError('"{}" must be list not "{}"'.format(e, type(d[e])))
                    for l in d[e]:
                        if not isinstance(l, tuple(t)):
                            raise TypeError('"{}" must not contain "{}"'.format(e, type(l)))


[docs]def parse_swc_str(x):
    """Parse SWC string into a pandas DataFrame.

    Parameters
    ----------
    x :     str

    Returns
    -------
    pandas.DataFrame, header

    """
    def split_iter(string):
        # Returns iterator for line split -> memory efficient
        # Attention: this will not fetch the last line!
        return (x.group(0) for x in re.finditer('.*?\n', string))

    if not isinstance(x, str):
        raise TypeError('x must be str, got "{}"'.format(type(x)))

    # Extract header using a generator -> this way we don't have to iterate
    # over all lines
    lines = split_iter(x)
    header = []
    for l in lines:
        if l.startswith('#'):
            header.append(l)
        else:
            break

    # Turn header back into string
    header = ''.join(header)

    # Turn SWC into a DataFrame
    f = StringIO(x)
    df = pd.read_csv(f, delim_whitespace=True, header=None, comment='#')

    df.columns = ['node_id', 'label', 'x', 'y', 'z', 'radius', 'parent_id']

    return df, header


[docs]def save_swc(x, filename, header=''):
    """Save SWC DataFrame to file.

    Parameters
    ----------
    x :         pandas.DataFrame
                SWC table to save.
    filename :  str
    header :    str, optional
                Header to add in front of SWC table. Each line must begin
                with "#"!

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('Expected DataFrame, got "{}"'.format(type(x)))

    if not isinstance(header, str):
        raise TypeError('Header must be str, got "{}"'.format(type(header)))

    # Turn DataFrame back into string
    s = StringIO()
    x.to_csv(s, sep=' ', header=False, index=False)

    if not header.endswith('\n'):
        header += '\n'

    # Replace text
    swc = header + s.getvalue()

    with open(filename, 'w') as f:
        f.write(swc)


[docs]def gen_assignments(x, save_to=None, meta={}):
    """Generate JSON file that can be imported into neutu as assignments.

    Parameters
    ----------
    x :         pandas.DataFrame
                Must contain columns ``x``, ``y``, ``z`` or ``location``.
                Optional columns: ``body ID``, ``text`` and ``comment``.
    save_to :   None | filepath | filebuffer, optional
                If not None, will save json to file.
    meta :      dict, optional
                Metadata will be stored in json string as ``"metadata"``.

    Returns
    -------
    JSON-formated string
                Only if ``save_to=None``.

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('x must be pandas DataFrame, got "{}"'.format(type(x)))

    if 'location' not in x.columns:
        if any([c not in x.columns for c in ['x', 'y', 'z']]):
            raise ValueError('x must have "location" column or "x", "y" '
                             'and "z" columns')
        x['location'] = x[['x', 'y', 'z']].astype(int).apply(list, axis=1)

    for c in ['body ID', 'text', 'comment']:
        if c not in x.columns:
            x[c] = ''

    x = x[['location', 'text', 'body ID', 'comment']]

    j = {'metadata': meta,
         'data': x.to_dict(orient='records')}

    if save_to:
        with open(save_to, 'w') as f:
            json.dump(j, f, indent=2)
    else:
        return j


def parse_bid(x):
    """Force body ID to integer."""
    try:
        return int(x)
    except BaseException:
        raise ValueError('Unable to coerce "{}" into numeric body ID'.format(x))


def _snap_to_skeleton(x, pos):
    """Snap position to closest node.

    Parameters
    ----------
    x :     pandas.DataFrame
            SWC DataFrame.
    pos :   array-like
            x/y/z position.

    Returns
    -------
    node ID :       int

    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError('x must be pandas DataFrame, got "{}"'.format(type(x)))

    if not isinstance(pos, np.ndarray):
        pos = np.array(pos)

    dist = np.sum((x[['x', 'y', 'z']].values - pos)**2, axis=1)

    return x.iloc[np.argmin(dist)].node_id.astype(int)


[docs]def check_skeleton(bodyid, sample=False, node=None, server=None):
    """Test if skeleton is up-to-date.

    This works by getting the distance to the closest mesh voxel and skeleton
    node for each skeleton node and mesh voxel, respectively. The function
    returns the difference of the mean between skeleton->mesh and
    mesh->skeleton distances. A negative difference means the skeleton is
    missing branches, a positive difference indicates the skeleton having too
    many branches (e.g. if the mesh has been split).

    For large neurons, this can be costly. Use the ``sample`` parameter to
    speed up calculations.

    Parameters
    ----------
    bodyid :        int | str
                    ID of body for which to check the skeleton.
    sample :        float | int | None, optional
                    Use to restrict number of voxels/nodes used to
                    compare mesh and skeleton. If float, must be between 0
                    and 1 and will use random fraction. If int and >1, will
                    shuffle and cap number of voxels/nodes.
    server :        str, optional
                    If not provided, will try reading from global.
    node :          str, optional
                    If not provided, will try reading from global.

    Returns
    -------
    float
                    Difference of the mean between skeleton->mesh and
                    mesh->skeleton distances. A negative difference means the
                    skeleton is missing branches, a positive differene
                    indicates the skeleton having too many branches (e.g. if
                    the mesh has been split).
    None
                    If no skeleton available.

    """
    # Get SWC
    swc = fetch.get_skeleton(bodyid, node=node, server=server, verbose=False)

    if isinstance(swc, type(None)):
        return None

    swc_co = swc[['x', 'y', 'z']].values

    # Get mesh
    mesh = fetch.get_sparsevol(bodyid, scale='coarse', ret_type='COORDS',
                               node=node, server=server)

    if sample:
        if sample < 1 and sample > 0:
            mesh = mesh[np.random.choice(mesh.shape[0],
                                         int(mesh.shape[0] * sample),
                                         replace=False)]
            swc_co = swc_co[np.random.choice(swc_co.shape[0],
                                             int(swc_co.shape[0] * sample),
                                             replace=False)]
        elif sample > 1:
            if mesh.shape[0] > sample:
                mesh = mesh[np.random.choice(mesh.shape[0],
                                             int(sample),
                                             replace=False)]
            if swc_co.shape[0] > sample:
                swc_co = swc_co[np.random.choice(swc_co.shape[0],
                                                 int(sample),
                                                 replace=False)]

    # Get distances
    dist = cdist(mesh, swc_co)

    # Mesh -> SWC
    mesh_dist = np.min(dist, axis=1)
    # SWC -> Mesh
    swc_dist = np.min(dist, axis=0)

    # Return difference in mean distances
    return np.mean(swc_dist) - np.mean(mesh_dist)


def make_url(*args, **GET):
    """Generates URL.

    Parameters
    ----------
    *args
                Will be turned into the URL. For example::

                    >>> make_url('http://my-server.com', 'skeleton', 'list')
                    'http://my-server.com/skeleton/list'

    **GET
                Keyword arguments are assumed to be GET request queries
                and will be encoded in the url. For example::

                    >>> make_url('http://my-server.com', 'skeleton', node_gt=100)
                    'http://my-server.com/skeleton?node_gt=100'

    Returns
    -------
    url :       str

    """
    # Generate the URL
    url = args[0]
    for arg in args[1:]:
        arg_str = str(arg)
        joiner = '' if url.endswith('/') else '/'
        relative = arg_str[1:] if arg_str.startswith('/') else arg_str
        url = requests.compat.urljoin(url + joiner, relative)
    if GET:
        url += '?{}'.format(urllib.parse.urlencode(GET))
    return url