Source code for python_hll.hllutil

# -*- coding: utf-8 -*-
from math import log
from python_hll.hll import HLL
from python_hll.util import NumberUtil
from python_hll.util import BitUtil


[docs]class HLLUtil:
    """
    Static functions for computing constants and parameters used in the HLL
    algorithm.
    """

    # The number of bits used to represent a long value in two's complement binary form
    LONG_BIT_LENGTH = 64

    # Precomputed ``pw_max_mask`` values indexed by ``register_size_in_bits``.
    # Calculated with this formula::
    #
    #     int max_register_value = (1 << register_size_in_bits) - 1;
    #     // Mask with all bits set except for (max_register_value - 1) least significant bits (see add_raw())
    #     return ~((1L << (max_register_value - 1)) - 1);
    #
    # See ``pw_max_mask()``

    PW_MASK = [
        -9223372036854775808,  # ~((1 << (((1 << 0) - 1) - 1)) - 1)
        -1,                    # ~((1 << (((1 << 1) - 1) - 1)) - 1)
        -4,                    # ~((1 << (((1 << 2) - 1) - 1)) - 1)
        -64,                   # ~((1 << (((1 << 3) - 1) - 1)) - 1)
        -16384,                # ~((1 << (((1 << 4) - 1) - 1)) - 1)
        -1073741824,           # ~((1 << (((1 << 5) - 1) - 1)) - 1)
        -4611686018427387904,  # ~((1 << (((1 << 6) - 1) - 1)) - 1)
        -4611686018427387904,  # ~((1 << (((1 << 7) - 1) - 1)) - 1)
        -4611686018427387904,  # ~((1 << (((1 << 8) - 1) - 1)) - 1)
    ]

    # Spacing constant used to compute offsets into ``TWO_TO_L``.
    REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1

[docs]    @classmethod
    def register_bit_size(cls, expected_unique_elements):
        """
        Computes the bit-width of HLL registers necessary to estimate a set of
        the specified cardinality.

        :param long expected_unique_elements: an upper bound on the number of unique
               elements that are expected.  This must be greater than zero.
        :returns: a register size in bits (i.e. ``log2(log2(n))``)
        :rtype: int
        """
        return max(
            HLL.MINIMUM_REGWIDTH_PARAM,
            NumberUtil.log2(NumberUtil.log2(expected_unique_elements))
        )

[docs]    @classmethod
    def alpha_m_squared(cls, m):
        """
        Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.

        :param int m: this must be a power of two, cannot be less than
               16 (2:sup:`4`), and cannot be greater than 65536 (2:sup:`16`).
        :returns: gamma times ``registerCount`` squared where gamma is
                  based on the value of ``registerCount``.
        :rtype: float
        """

        if m < 16:
            raise Exception("'m' cannot be less than 16 ({m} < 16).".format(m=m))

        elif m == 16:
            return 0.673 * m * m

        elif m == 32:
            return 0.697 * m * m

        elif m == 64:
            return 0.709 * m * m

        else:
            return (0.7213 / (1.0 + 1.079 / m)) * m * m

[docs]    @classmethod
    def pw_max_mask(cls, register_size_in_bits):
        """
        Computes a mask that prevents overflow of HyperLogLog registers.

        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :returns: mask a ``long`` mask to prevent overflow of the registers
        :rtype: long
        """
        return cls.PW_MASK[register_size_in_bits]

[docs]    @classmethod
    def small_estimator_cutoff(cls, m):
        """
        The cutoff for using the "small range correction" formula, in the
        HyperLogLog algorithm.

        :param int m: the number of registers in the HLL. <em>m<em> in the paper.
        :returns: the cutoff for the small range correction.
        :rtype: float
        """
        return (float(m) * 5) / 2

[docs]    @classmethod
    def small_estimator(cls, m, number_of_zeroes):
        """
        The "small range correction" formula from the HyperLogLog algorithm. Only
        appropriate if both the estimator is smaller than ``(5/2) * m`` and
        there are still registers that have the zero value.

        :param int m: the number of registers in the HLL. <em>m<em> in the paper.
        :param int number_of_zeroes: the number of registers with value zero. ``V``
               in the paper.
        :returns: a corrected cardinality estimate.
        :rtype: float
        """
        return m * log(float(m) / number_of_zeroes)

[docs]    @classmethod
    def large_estimator_cutoff(cls, log2m, register_size_in_bits):
        """
        The cutoff for using the "large range correction" formula, from the
        HyperLogLog algorithm, adapted for 64 bit hashes.

        See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.

        :param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :returns: the cutoff for the large range correction.
        :rtype: float
        """
        return TWO_TO_L[
            (cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m
        ] / 30.0

[docs]    @classmethod
    def large_estimator(cls, log2m, register_size_in_bits, estimator):
        """
        The "large range correction" formula from the HyperLogLog algorithm, adapted
        for 64 bit hashes. Only appropriate for estimators whose value exceeds
        the return of ``largeEstimatorCutoff()``.

        See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.

        :param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :param float estimator: the original estimator ("E" in the paper).
        :returns: a corrected cardinality estimate.
        :rtype: float
        """
        two_to_l = TWO_TO_L[(cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m]
        try:
            return -1 * two_to_l * log(1.0 - (estimator/two_to_l))
        except ValueError:
            return 0


# Precomputed ``twoToL`` values indexed by a linear combination of
# ``regwidth`` and ``log2m``.
#
# The array is one-dimensional and can be accessed by using index
# ``(REG_WIDTH_INDEX_MULTIPLIER * regwidth) + log2m``
# for ``regwidth`` and ``log2m`` between the specified
# ``HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM`` constants.
#
# See ``large_estimator()``.
# See ``large_estimator_cutoff()``.
# See `Blog post with section on 2^L
# <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_
TWO_TO_L = [0.0] * (HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)
for reg_width in range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM+1):
    for log2m in range(HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM+1):
        max_register_value = BitUtil.left_shift_int(1, reg_width) - 1

        # Since 1 is added to p(w) in the insertion algorithm, only
        # (maxRegisterValue - 1) bits are inspected hence the hash
        # space is one power of two smaller.
        pw_bits = max_register_value - 1
        total_bits = pw_bits + log2m
        two_to_l = 2**total_bits
        TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * reg_width) + log2m] = two_to_l
Source code for python_hll.hllutil

python-hll

Navigation

Related Topics