Thursday, January 30, 2014

Removing redundant points from a numpy array

Here is a snippet that removes redundant points from time-series data using numpy. I recently had to do this and had no luck finding any help via Google. Here is my solution:
def remove_redundant_points(points):
    """
    Return the points with redundant interior points removed.

    A point is redundant when its value equals the value of both the
    point before it and the point after it. The first and last points
    are always kept, so every run of equal values is reduced to its
    two endpoints and each value change keeps the point on either
    side of it.

    NaN values compare unequal to each other, so points with a NaN
    value are never treated as redundant.

    :param points: Array-like of (time, value) rows; column 1 holds
        the value that is compared between neighbouring rows.
    :return: New numpy array with the kept rows in their original
        order. Empty input yields an empty array.
    """
    points = numpy.asarray(points)
    if points.size == 0:
        # Nothing to trim; also avoids indexing column 1 of an array
        # that may not even be two-dimensional (e.g. numpy.array([])).
        return points.copy()

    values = points[:, 1]
    # changed[i] is True when the value differs between row i and row i + 1.
    changed = values[1:] != values[:-1]

    # Endpoints are always kept; an interior row is kept when the value
    # changes on at least one side of it.
    keep = numpy.ones(len(points), dtype=bool)
    keep[1:-1] = changed[:-1] | changed[1:]
    return points[keep]
Examples...

>>> values = [1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 5, 6, 5, 5, 5, 5, 5, 5]
>>> pts = list(zip(range(len(values)), values))
>>> pts
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 2), (7, 2), (8, 3), (9, 4),
 (10, 5), (11, 6), (12, 5), (13, 5), (14, 5), (15, 5), (16, 5), (17, 5)]
>>> trimmed_pts = remove_redundant_points(numpy.array(pts))
>>> trimmed_pts.tolist()
[[0, 1], [4, 1], [5, 2], [7, 2], [8, 3], [9, 4], [10, 5], [11, 6], [12, 5], [17,
 5]]

>>> import random, time
>>> t = time.time()  # starting timestamp, in seconds since the epoch
>>> pts = []
>>> v = 0
>>> for i in range(10000):
...    if random.random() > 0.95: v += 1
...    pts.append((t + i, v))
...
>>> pts = numpy.array(pts)
>>> pts
array([[  1.39111413e+09,   0.00000000e+00],
       [  1.39111413e+09,   0.00000000e+00],
       [  1.39111414e+09,   0.00000000e+00],
       ...,
       [  1.39112413e+09,   5.00000000e+02],
       [  1.39112413e+09,   5.00000000e+02],
       [  1.39112413e+09,   5.00000000e+02]])
>>> len(pts)
10000
>>> trimmed_pts = remove_redundant_points(pts)
>>> trimmed_pts
array([[  1.39111413e+09,   0.00000000e+00],
       [  1.39111414e+09,   0.00000000e+00],
       [  1.39111415e+09,   1.00000000e+00],
       ...,
       [  1.39112412e+09,   4.99000000e+02],
       [  1.39112412e+09,   5.00000000e+02],
       [  1.39112413e+09,   5.00000000e+02]])
>>> len(trimmed_pts)
968


No comments:

Post a Comment