Source code for srttools.histograms

#  From https://gist.github.com/neothemachine/e625cb7777376899adca

"""
This module contains a fast replacement for numpy's histogramdd and
histogram2d.
Two changes were made. The first was replacing

np.digitize(a, b)

with

np.searchsorted(b, a, "right")

This performance bug is explained on https://github.com/numpy/numpy/issues/2656
The speedup is around 2x for big number of bins (roughly >100).
It assumes that the bins are monotonic.

The other change is to allow lists of weight arrays. This is advantageous for
resampling as there is just one set of coordinates but several data arrays
(=weights).
Therefore repeated computations are prevented.
"""


import numpy as np
from numpy import (
    atleast_2d,
    asarray,
    zeros,
    ones,
    array,
    atleast_1d,
    arange,
    isscalar,
    linspace,
    diff,
    empty,
    around,
    where,
    bincount,
    sort,
    log10,
    searchsorted,
)

__all__ = ["histogram2d", "histogramdd"]

__doctest_skip__ = ["*"]


[docs]def histogramdd(sample, bins=10, bin_range=None, normed=False, weights=None): """ Compute the multidimensional histogram of some data. Parameters ---------- sample : array_like The data to be histogrammed. It must be an (N,D) array or data that can be converted to such. The rows of the resulting array are the coordinates of points in a D dimensional polytope. bins : sequence or int, optional The bin specification: * A sequence of arrays describing the bin edges along each dimension. * The number of bins for each dimension (nx, ny, ... =bins) * The number of bins for all dimensions (nx=ny=...=bins). bin_range : sequence, optional A sequence of lower and upper bin edges to be used if the edges are not given explicitly in ``bins``. Defaults to the minimum and maximum values along each dimension. normed : bool, optional If False, returns the number of samples in each bin. If True, returns the bin density ``bin_count / sample_count / bin_volume``. weights : array_like (N,), optional An array of values ``w_i`` weighing each sample ``(x_i, y_i, z_i, ...)``. Weights are normalized to 1 if normed is True. If normed is False, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin. Weights can also be a list of (weight arrays or None), in which case a list of histograms is returned as H. Returns ------- H : ndarray The multidimensional histogram of sample x. See normed and weights for the different possible semantics. edges : list A list of D arrays describing the bin edges for each dimension. See Also -------- histogram: 1-D histogram histogram2d: 2-D histogram Examples -------- >>> r = np.random.randn(100,3) >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) >>> H.shape, edges[0].size, edges[1].size, edges[2].size ((5, 8, 4), 6, 9, 5) """ try: # Sample is an ND-array. N, D = sample.shape except (AttributeError, ValueError): # Sample is a sequence of 1D arrays. sample = atleast_2d(sample).T N, D = sample.shape if weights is None: W = None else: try: # Weights is a 1D-array _ = weights.shape W = -1 except (AttributeError, ValueError): # Weights is a list of 1D-arrays or None's W = len(weights) if W == -1 and weights.ndim != 1: raise AttributeError("Weights must be a 1D-array, None, or " "a list of both") nbin = empty(D, int) edges = D * [None] dedges = D * [None] if weights is not None: if W == -1: weights = asarray(weights) if not weights.shape == (N,): raise ValueError("Wrong shape of weights array") else: for i in arange(W): if weights[i] is not None: weights[i] = asarray(weights[i]) if not weights[i].shape == (N,): raise ValueError("Wrong shape of weights array") try: M = len(bins) if M != D: raise AttributeError( "The dimension of bins must be equal to the dimension of the " " sample x." ) except TypeError: # bins is an integer bins = D * [bins] # Select range for each dimension # Used only if number of bins is given. if bin_range is None: # Handle empty input. Range can't be determined in that case, use 0-1. if N == 0: smin = zeros(D) smax = ones(D) else: smin = atleast_1d(array(sample.min(0), float)) smax = atleast_1d(array(sample.max(0), float)) else: smin = zeros(D) smax = zeros(D) for i in arange(D): smin[i], smax[i] = bin_range[i] # Make sure the bins have a finite width. for i in arange(len(smin)): if smin[i] == smax[i]: smin[i] = smin[i] - 0.5 smax[i] = smax[i] + 0.5 # Create edge arrays for i in arange(D): if isscalar(bins[i]): if bins[i] < 1: raise ValueError( "Element at index %s in `bins` should be a positive " "integer." % i ) nbin[i] = bins[i] + 2 # +2 for outlier bins edges[i] = linspace(smin[i], smax[i], int(nbin[i]) - 1) else: edges[i] = asarray(bins[i], float) nbin[i] = len(edges[i]) + 1 # +1 for outlier bins dedges[i] = diff(edges[i]) if np.any(np.asarray(dedges[i]) <= 0): raise ValueError( "Found bin edge of size <= 0. Did you specify `bins` with" "non-monotonic sequence?" ) nbin = asarray(nbin) # Handle empty input. if N == 0: if W is not None and W > 0: return [np.zeros(nbin - 2) for _ in arange(W)], edges else: return np.zeros(nbin - 2), edges # Compute the bin number each sample falls into. Ncount = {} for i in arange(D): # searchsorted is faster for many bins Ncount[i] = searchsorted(edges[i], sample[:, i], "right") # Ncount[i] = digitize(sample[:, i], edges[i]) # Using digitize, values that fall on an edge are put in the right bin. # For the rightmost bin, we want values equal to the right # edge to be counted in the last bin, and not as an outlier. for i in arange(D): # Rounding precision mindiff = dedges[i].min() if not np.isinf(mindiff): decimal = int(-log10(mindiff)) + 6 # Find which points are on the rightmost edge. not_smaller_than_edge = sample[:, i] >= edges[i][-1] on_edge = around(sample[:, i], decimal) == around(edges[i][-1], decimal) # Shift these points one bin to the left. Ncount[i][where(on_edge & not_smaller_than_edge)[0]] -= 1 # Compute the sample indices in the flattened histogram matrix. ni = nbin.argsort() xy = zeros(N, int) for i in arange(0, D - 1): xy += Ncount[ni[i]] * nbin[ni[i + 1 :]].prod() xy += Ncount[ni[-1]] # Compute the number of repetitions in xy and assign it to the # flattened histmat. if len(xy) == 0: if W is not None and W > 0: return [np.zeros(nbin - 2) for _ in arange(W)], edges else: return zeros(nbin - 2, int), edges # Flattened histogram matrix (1D) # Reshape is used so that overlarge arrays # will raise an error. Wd = W if (W is not None and W > 0) else 1 hists = [zeros(nbin, float).reshape(-1) for _ in arange(Wd)] for histidx, hist in enumerate(hists): weights_ = weights[histidx] if (W is not None and W > 0) else weights flatcount = bincount(xy, weights_) a = arange(len(flatcount)) hist[a] = flatcount # Shape into a proper matrix hist = hist.reshape(sort(nbin)) ni = nbin.argsort() for i in arange(nbin.size): j = ni.argsort()[i] hist = hist.swapaxes(i, j) ni[i], ni[j] = ni[j], ni[i] # Remove outliers (indices 0 and -1 for each dimension). core = D * [slice(1, -1)] hist = hist[tuple(core)] # Normalize if normed is True if normed: s = hist.sum() for i in arange(D): shape = ones(D, int) shape[i] = nbin[i] - 2 hist = hist / dedges[i].reshape(shape) hist /= s if (hist.shape != nbin - 2).any(): raise RuntimeError( "Internal Shape Error: hist.shape != nbin-2 -> " + str(hist.shape) + " != " + str(nbin - 2) ) hists[histidx] = hist if W in [None, -1]: return hists[0], edges else: return hists, edges
[docs]def histogram2d(x, y, bins=10, bin_range=None, normed=False, weights=None): """ Compute the bi-dimensional histogram of two data samples. Parameters ---------- x : array_like, shape (N,) An array containing the x coordinates of the points to be histogrammed. y : array_like, shape (N,) An array containing the y coordinates of the points to be histogrammed. bins : int or [int, int] or array_like or [array, array], optional The bin specification: * If int, the number of bins for the two dimensions (nx=ny=bins). * If [int, int], the number of bins in each dimension (nx, ny = bins). * If array_like, the bin edges for the two dimensions (x_edges=y_edges=bins). * If [array, array], the bin edges in each dimension (x_edges, y_edges = bins). bin_range : array_like, shape(2,2), optional The leftmost and rightmost edges of the bins along each dimension (if not specified explicitly in the ``bins`` parameters): ``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range will be considered outliers and not tallied in the histogram. normed : bool, optional If False, returns the number of samples in each bin. If True, returns the bin density ``bin_count / sample_count / bin_area``. weights : array_like, shape(N,), optional An array of values ``w_i`` weighing each sample ``(x_i, y_i)``. Weights are normalized to 1 if ``normed`` is True. If ``normed`` is False, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin. Weights can also be a list of (weight arrays or None), in which case a list of histograms is returned as H. Returns ------- H : ndarray, shape(nx, ny) The bi-dimensional histogram of samples ``x`` and ``y``. Values in ``x`` are histogrammed along the first dimension and values in ``y`` are histogrammed along the second dimension. xedges : ndarray, shape(nx,) The bin edges along the first dimension. yedges : ndarray, shape(ny,) The bin edges along the second dimension. See Also -------- :func:`histogram` : 1D histogram :func:`histogramdd` : Multidimensional histogram Notes ----- When ``normed`` is True, then the returned histogram is the sample density, defined such that the sum over bins of the product ``bin_value * bin_area`` is 1. Please note that the histogram does not follow the Cartesian convention where ``x`` values are on the abscissa and ``y`` values on the ordinate axis. Rather, ``x`` is histogrammed along the first dimension of the array (vertical), and ``y`` along the second dimension of the array (horizontal). This ensures compatibility with ``histogramdd``. Examples -------- >>> import matplotlib as mpl >>> import matplotlib.pyplot as plt Construct a 2D-histogram with variable bin width. First define the bin edges: >>> xedges = [0, 1, 1.5, 3, 5] >>> yedges = [0, 2, 3, 4, 6] Next we create a histogram H with random bin content: >>> x = np.random.normal(3, 1, 100) >>> y = np.random.normal(1, 1, 100) >>> H, xedges, yedges = np.histogram2d(y, x, bins=(xedges, yedges)) Or we fill the histogram H with a determined bin content: >>> H = np.ones((4, 4)).cumsum().reshape(4, 4) >>> print(H[::-1]) # This shows the bin content in the order as plotted [[ 13. 14. 15. 16.] [ 9. 10. 11. 12.] [ 5. 6. 7. 8.] [ 1. 2. 3. 4.]] Imshow can only do an equidistant representation of bins: >>> fig = plt.figure(figsize=(7, 3)) >>> ax = fig.add_subplot(131) >>> _ = ax.set_title('imshow: equidistant') >>> im = plt.imshow(H, interpolation='nearest', origin='low', ... extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]) pcolormesh can display exact bin edges: >>> ax = fig.add_subplot(132) >>> _ = ax.set_title('pcolormesh: exact bin edges') >>> X, Y = np.meshgrid(xedges, yedges) >>> _ = ax.pcolormesh(X, Y, H) >>> _ = ax.set_aspect('equal') NonUniformImage displays exact bin edges with interpolation: >>> ax = fig.add_subplot(133) >>> _ = ax.set_title('NonUniformImage: interpolated') >>> im = mpl.image.NonUniformImage(ax, interpolation='bilinear') >>> xcenters = xedges[:-1] + 0.5 * (xedges[1:] - xedges[:-1]) >>> ycenters = yedges[:-1] + 0.5 * (yedges[1:] - yedges[:-1]) >>> _ = im.set_data(xcenters, ycenters, H) >>> _ = ax.images.append(im) >>> _ = ax.set_xlim(xedges[0], xedges[-1]) >>> _ = ax.set_ylim(yedges[0], yedges[-1]) >>> _ = ax.set_aspect('equal') >>> _ = plt.close(fig) """ try: N = len(bins) except TypeError: N = 1 if N != 1 and N != 2: xedges = yedges = asarray(bins, float) bins = [xedges, yedges] hist, edges = histogramdd([x, y], bins, bin_range, normed, weights) return hist, edges[0], edges[1]