Jenk’s Natural Breaks Optimization

A co-worker was interested in segmenting a list of data points, and I went down a rabbit hole on one dimensional segmentation. I found an article on the Jenk’s natural breaks optimization on Wikipedia. This is used to bin data points so that clusters are always binned together. There is an iterative method that takes unordered data, but this implementation just sorts the data before binning.

#!/usr/local/bin/python3
# jenks.py

import scipy
from typing import List


def sdam(x: List[float]) -> float:
    """Calculate the Sum of the Deviations from the Mean.

    Actually, this is the sum of the squared deviations from the mean.

    Args:
        x: (list of floats) list data points.

    """
    mean = scipy.mean(x)
    deviations = [(i-mean)**2 for i in x]
    result = sum(deviations)
    return result


def jenks_two_class(x: List[float]) -> (float, List[float], List[float]):
    """Calculate Jenks's two class break.

    Args:
        x: (list of floats) list of data points.

    Returns:
        gvf: (float) goodness of variance fit
        before_break: (list of floats) first class, before the break
        after_break: (lost of floats) second class, after the break

    """
    s = sorted(x)
    # sum of deviation of the array mean
    SDAM = sdam(x)
    breaks = []
    for i in range(1,len(s)):
        before_break = round(sdam(s[:i]), 2)
        after_break = round(sdam(s[i:]), 2)
        # sum of squared deviations from class means
        scdm = before_break + after_break
        # goodness of variance fit
        gvf = (SDAM - scdm)/float(SDAM)
        breaks.append((gvf, s[:i], s[i:]))
    result = sorted(breaks)[-1]
    print(result)
    return result

I wanted to see how this would work in practice, so I wrote some tests.

from jenks import jenks_two_class
import pytest

data = [
    ([2, 3, 5, 6], [[2, 3], [5, 6]]),
    ([2, 3, 4, 5], [[2, 3], [4, 5]]),
    ([2, 3, 4, 5, 6], [[2, 3, 4], [5, 6]]),
    ([2, 3, 5, 6, 8, 9], [[2, 3, 5], [6, 8, 9]]),
    ([2, 3, 5, 6, 9, 10], [[2, 3, 5, 6], [9, 10]]),
]

@pytest.mark.parametrize("inp, outp", data)
def test_jenks(inp, outp):
    gvf, b1, b2 = jenks_two_class(inp)
    assert [b1, b2] == outp

Leave a Reply

Your email address will not be published. Required fields are marked *