Source code for srttools.histograms

#  From https://gist.github.com/neothemachine/e625cb7777376899adca

"""
This module contains a fast replacement for numpy's histogramdd and
histogram2d.
Two changes were made. The first was replacing

np.digitize(a, b)

with

np.searchsorted(b, a, "right")

This performance bug is explained on https://github.com/numpy/numpy/issues/2656
The speedup is around 2x for big number of bins (roughly >100).
It assumes that the bins are monotonic.

The other change is to allow lists of weight arrays. This is advantageous for
resampling as there is just one set of coordinates but several data arrays
(=weights).
Therefore repeated computations are prevented.
"""


import numpy as np
from numpy import (
    arange,
    around,
    array,
    asarray,
    atleast_1d,
    atleast_2d,
    bincount,
    diff,
    empty,
    isscalar,
    linspace,
    log10,
    ones,
    searchsorted,
    sort,
    where,
    zeros,
)

__all__ = ["histogram2d", "histogramdd"]

__doctest_skip__ = ["*"]



[docs]
def histogramdd(sample, bins=10, bin_range=None, normed=False, weights=None):
    """
    Compute the multidimensional histogram of some data.

    Parameters
    ----------
    sample : array_like
        The data to be histogrammed. It must be an (N,D) array or data
        that can be converted to such. The rows of the resulting array
        are the coordinates of points in a D dimensional polytope.
    bins : sequence or int, optional
        The bin specification:

        * A sequence of arrays describing the bin edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    bin_range : sequence, optional
        A sequence of lower and upper bin edges to be used if the edges are
        not given explicitly in ``bins``. Defaults to the minimum and maximum
        values along each dimension.
    normed : bool, optional
        If False, returns the number of samples in each bin. If True,
        returns the bin density ``bin_count / sample_count / bin_volume``.
    weights : array_like (N,), optional
        An array of values ``w_i`` weighing each sample ``(x_i, y_i, z_i, ...)``.
        Weights are normalized to 1 if normed is True. If normed is False,
        the values of the returned histogram are equal to the sum of the
        weights belonging to the samples falling into each bin.
        Weights can also be a list of (weight arrays or None), in which case
        a list of histograms is returned as H.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of sample x. See normed and weights
        for the different possible semantics.
    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D histogram
    histogram2d: 2-D histogram

    Examples
    --------
    >>> r = np.random.randn(100,3)
    >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
    >>> H.shape, edges[0].size, edges[1].size, edges[2].size
    ((5, 8, 4), 6, 9, 5)

    """
    try:
        # Sample is an ND-array.
        N, D = sample.shape
    except (AttributeError, ValueError):
        # Sample is a sequence of 1D arrays.
        sample = atleast_2d(sample).T
        N, D = sample.shape

    if weights is None:
        W = None
    else:
        try:
            # Weights is a 1D-array
            _ = weights.shape
            W = -1
        except (AttributeError, ValueError):
            # Weights is a list of 1D-arrays or None's
            W = len(weights)

    if W == -1 and weights.ndim != 1:
        raise AttributeError("Weights must be a 1D-array, None, or " "a list of both")

    nbin = empty(D, int)
    edges = D * [None]
    dedges = D * [None]
    if weights is not None:
        if W == -1:
            weights = asarray(weights)
            if not weights.shape == (N,):
                raise ValueError("Wrong shape of weights array")
        else:
            for i in arange(W):
                if weights[i] is not None:
                    weights[i] = asarray(weights[i])
                    if not weights[i].shape == (N,):
                        raise ValueError("Wrong shape of weights array")

    try:
        M = len(bins)
        if M != D:
            raise AttributeError(
                "The dimension of bins must be equal to the dimension of the " " sample x."
            )
    except TypeError:
        # bins is an integer
        bins = D * [bins]

    # Select range for each dimension
    # Used only if number of bins is given.
    if bin_range is None:
        # Handle empty input. Range can't be determined in that case, use 0-1.
        if N == 0:
            smin = zeros(D)
            smax = ones(D)
        else:
            smin = atleast_1d(array(sample.min(0), float))
            smax = atleast_1d(array(sample.max(0), float))
    else:
        smin = zeros(D)
        smax = zeros(D)
        for i in arange(D):
            smin[i], smax[i] = bin_range[i]

    # Make sure the bins have a finite width.
    for i in arange(len(smin)):
        if smin[i] == smax[i]:
            smin[i] = smin[i] - 0.5
            smax[i] = smax[i] + 0.5

    # Create edge arrays
    for i in arange(D):
        if isscalar(bins[i]):
            if bins[i] < 1:
                raise ValueError(f"Element at index {i} in `bins` should be a positive integer.")
            nbin[i] = bins[i] + 2  # +2 for outlier bins
            edges[i] = linspace(smin[i], smax[i], int(nbin[i]) - 1)
        else:
            edges[i] = asarray(bins[i], float)
            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
        dedges[i] = diff(edges[i])
        if np.any(np.asarray(dedges[i]) <= 0):
            raise ValueError(
                "Found bin edge of size <= 0. Did you specify `bins` with" "non-monotonic sequence?"
            )

    nbin = asarray(nbin)

    # Handle empty input.
    if N == 0:
        if W is not None and W > 0:
            return [np.zeros(nbin - 2) for _ in arange(W)], edges
        else:
            return np.zeros(nbin - 2), edges

    # Compute the bin number each sample falls into.
    Ncount = {}
    for i in arange(D):
        # searchsorted is faster for many bins
        Ncount[i] = searchsorted(edges[i], sample[:, i], "right")
        # Ncount[i] = digitize(sample[:, i], edges[i])

    # Using digitize, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right
    # edge to be counted in the last bin, and not as an outlier.
    for i in arange(D):
        # Rounding precision
        mindiff = dedges[i].min()
        if not np.isinf(mindiff):
            decimal = int(-log10(mindiff)) + 6
            # Find which points are on the rightmost edge.
            not_smaller_than_edge = sample[:, i] >= edges[i][-1]
            on_edge = around(sample[:, i], decimal) == around(edges[i][-1], decimal)
            # Shift these points one bin to the left.
            Ncount[i][where(on_edge & not_smaller_than_edge)[0]] -= 1

    # Compute the sample indices in the flattened histogram matrix.
    ni = nbin.argsort()
    xy = zeros(N, int)
    for i in arange(0, D - 1):
        xy += Ncount[ni[i]] * nbin[ni[i + 1 :]].prod()
    xy += Ncount[ni[-1]]

    # Compute the number of repetitions in xy and assign it to the
    # flattened histmat.
    if len(xy) == 0:
        if W is not None and W > 0:
            return [np.zeros(nbin - 2) for _ in arange(W)], edges
        else:
            return zeros(nbin - 2, int), edges

    # Flattened histogram matrix (1D)
    # Reshape is used so that overlarge arrays
    # will raise an error.
    Wd = W if (W is not None and W > 0) else 1
    hists = [zeros(nbin, float).reshape(-1) for _ in arange(Wd)]
    for histidx, hist in enumerate(hists):
        weights_ = weights[histidx] if (W is not None and W > 0) else weights
        flatcount = bincount(xy, weights_)
        a = arange(len(flatcount))
        hist[a] = flatcount

        # Shape into a proper matrix
        hist = hist.reshape(sort(nbin))
        ni = nbin.argsort()
        for i in arange(nbin.size):
            j = ni.argsort()[i]
            hist = hist.swapaxes(i, j)
            ni[i], ni[j] = ni[j], ni[i]

        # Remove outliers (indices 0 and -1 for each dimension).
        core = D * [slice(1, -1)]
        hist = hist[tuple(core)]

        # Normalize if normed is True
        if normed:
            s = hist.sum()
            for i in arange(D):
                shape = ones(D, int)
                shape[i] = nbin[i] - 2
                hist = hist / dedges[i].reshape(shape)
            hist /= s

        if (hist.shape != nbin - 2).any():
            raise RuntimeError(
                "Internal Shape Error: hist.shape != nbin-2 -> "
                + str(hist.shape)
                + " != "
                + str(nbin - 2)
            )

        hists[histidx] = hist

    if W in [None, -1]:
        return hists[0], edges
    else:
        return hists, edges




[docs]
def histogram2d(x, y, bins=10, bin_range=None, normed=False, weights=None):
    """
    Compute the bi-dimensional histogram of two data samples.

    Parameters
    ----------
    x : array_like, shape (N,)
        An array containing the x coordinates of the points to be
        histogrammed.
    y : array_like, shape (N,)
        An array containing the y coordinates of the points to be
        histogrammed.
    bins : int or [int, int] or array_like or [array, array], optional
        The bin specification:

          * If int, the number of bins for the two dimensions (nx=ny=bins).
          * If [int, int], the number of bins in each dimension
            (nx, ny = bins).
          * If array_like, the bin edges for the two dimensions
            (x_edges=y_edges=bins).
          * If [array, array], the bin edges in each dimension
            (x_edges, y_edges = bins).

    bin_range : array_like, shape(2,2), optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the ``bins`` parameters):
        ``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range
        will be considered outliers and not tallied in the histogram.
    normed : bool, optional
        If False, returns the number of samples in each bin. If True,
        returns the bin density ``bin_count / sample_count / bin_area``.
    weights : array_like, shape(N,), optional
        An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.
        Weights are normalized to 1 if ``normed`` is True. If ``normed`` is
        False, the values of the returned histogram are equal to the sum of
        the weights belonging to the samples falling into each bin.
        Weights can also be a list of (weight arrays or None), in which case
        a list of histograms is returned as H.

    Returns
    -------
    H : ndarray, shape(nx, ny)
        The bi-dimensional histogram of samples ``x`` and ``y``. Values in
        ``x`` are histogrammed along the first dimension and values in ``y``
        are histogrammed along the second dimension.
    xedges : ndarray, shape(nx,)
        The bin edges along the first dimension.
    yedges : ndarray, shape(ny,)
        The bin edges along the second dimension.

    See Also
    --------
    :func:`histogram` : 1D histogram
    :func:`histogramdd` : Multidimensional histogram

    Notes
    -----
    When ``normed`` is True, then the returned histogram is the sample
    density, defined such that the sum over bins of the product
    ``bin_value * bin_area`` is 1.

    Please note that the histogram does not follow the Cartesian convention
    where ``x`` values are on the abscissa and ``y`` values on the ordinate
    axis.  Rather, ``x`` is histogrammed along the first dimension of the
    array (vertical), and ``y`` along the second dimension of the array
    (horizontal).  This ensures compatibility with ``histogramdd``.

    Examples
    --------
    >>> import matplotlib as mpl
    >>> import matplotlib.pyplot as plt

    Construct a 2D-histogram with variable bin width. First define the bin
    edges:

    >>> xedges = [0, 1, 1.5, 3, 5]
    >>> yedges = [0, 2, 3, 4, 6]

    Next we create a histogram H with random bin content:

    >>> x = np.random.normal(3, 1, 100)
    >>> y = np.random.normal(1, 1, 100)
    >>> H, xedges, yedges = np.histogram2d(y, x, bins=(xedges, yedges))

    Or we fill the histogram H with a determined bin content:

    >>> H = np.ones((4, 4)).cumsum().reshape(4, 4)
    >>> print(H[::-1])  # This shows the bin content in the order as plotted
    [[ 13.  14.  15.  16.]
     [  9.  10.  11.  12.]
     [  5.   6.   7.   8.]
     [  1.   2.   3.   4.]]

    Imshow can only do an equidistant representation of bins:

    >>> fig = plt.figure(figsize=(7, 3))
    >>> ax = fig.add_subplot(131)
    >>> _ = ax.set_title('imshow: equidistant')
    >>> im = plt.imshow(H, interpolation='nearest', origin='low',
    ...                 extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

    pcolormesh can display exact bin edges:

    >>> ax = fig.add_subplot(132)
    >>> _ = ax.set_title('pcolormesh: exact bin edges')
    >>> X, Y = np.meshgrid(xedges, yedges)
    >>> _ = ax.pcolormesh(X, Y, H)
    >>> _ = ax.set_aspect('equal')

    NonUniformImage displays exact bin edges with interpolation:

    >>> ax = fig.add_subplot(133)
    >>> _ = ax.set_title('NonUniformImage: interpolated')
    >>> im = mpl.image.NonUniformImage(ax, interpolation='bilinear')
    >>> xcenters = xedges[:-1] + 0.5 * (xedges[1:] - xedges[:-1])
    >>> ycenters = yedges[:-1] + 0.5 * (yedges[1:] - yedges[:-1])
    >>> _ = im.set_data(xcenters, ycenters, H)
    >>> _ = ax.images.append(im)
    >>> _ = ax.set_xlim(xedges[0], xedges[-1])
    >>> _ = ax.set_ylim(yedges[0], yedges[-1])
    >>> _ = ax.set_aspect('equal')
    >>> _ = plt.close(fig)

    """
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1 and N != 2:
        xedges = yedges = asarray(bins, float)
        bins = [xedges, yedges]
    hist, edges = histogramdd([x, y], bins, bin_range, normed, weights)
    return hist, edges[0], edges[1]
Navigation

Source code for srttools.histograms