Source code for python_hll.hllutil
# -*- coding: utf-8 -*-
from math import log
from python_hll.hll import HLL
from python_hll.util import NumberUtil
from python_hll.util import BitUtil
[docs]class HLLUtil:
"""
Static functions for computing constants and parameters used in the HLL
algorithm.
"""
# The number of bits used to represent a long value in two's complement binary form
LONG_BIT_LENGTH = 64
# Precomputed ``pw_max_mask`` values indexed by ``register_size_in_bits``.
# Calculated with this formula::
#
# int max_register_value = (1 << register_size_in_bits) - 1;
# // Mask with all bits set except for (max_register_value - 1) least significant bits (see add_raw())
# return ~((1L << (max_register_value - 1)) - 1);
#
# See ``pw_max_mask()``
PW_MASK = [
-9223372036854775808, # ~((1 << (((1 << 0) - 1) - 1)) - 1)
-1, # ~((1 << (((1 << 1) - 1) - 1)) - 1)
-4, # ~((1 << (((1 << 2) - 1) - 1)) - 1)
-64, # ~((1 << (((1 << 3) - 1) - 1)) - 1)
-16384, # ~((1 << (((1 << 4) - 1) - 1)) - 1)
-1073741824, # ~((1 << (((1 << 5) - 1) - 1)) - 1)
-4611686018427387904, # ~((1 << (((1 << 6) - 1) - 1)) - 1)
-4611686018427387904, # ~((1 << (((1 << 7) - 1) - 1)) - 1)
-4611686018427387904, # ~((1 << (((1 << 8) - 1) - 1)) - 1)
]
# Spacing constant used to compute offsets into ``TWO_TO_L``.
REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1
[docs] @classmethod
def register_bit_size(cls, expected_unique_elements):
"""
Computes the bit-width of HLL registers necessary to estimate a set of
the specified cardinality.
:param long expected_unique_elements: an upper bound on the number of unique
elements that are expected. This must be greater than zero.
:returns: a register size in bits (i.e. ``log2(log2(n))``)
:rtype: int
"""
return max(
HLL.MINIMUM_REGWIDTH_PARAM,
NumberUtil.log2(NumberUtil.log2(expected_unique_elements))
)
[docs] @classmethod
def alpha_m_squared(cls, m):
"""
Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
:param int m: this must be a power of two, cannot be less than
16 (2:sup:`4`), and cannot be greater than 65536 (2:sup:`16`).
:returns: gamma times ``registerCount`` squared where gamma is
based on the value of ``registerCount``.
:rtype: float
"""
if m < 16:
raise Exception("'m' cannot be less than 16 ({m} < 16).".format(m=m))
elif m == 16:
return 0.673 * m * m
elif m == 32:
return 0.697 * m * m
elif m == 64:
return 0.709 * m * m
else:
return (0.7213 / (1.0 + 1.079 / m)) * m * m
[docs] @classmethod
def pw_max_mask(cls, register_size_in_bits):
"""
Computes a mask that prevents overflow of HyperLogLog registers.
:param int register_size_in_bits: the size of the HLL registers, in bits.
:returns: mask a ``long`` mask to prevent overflow of the registers
:rtype: long
"""
return cls.PW_MASK[register_size_in_bits]
[docs] @classmethod
def small_estimator_cutoff(cls, m):
"""
The cutoff for using the "small range correction" formula, in the
HyperLogLog algorithm.
:param int m: the number of registers in the HLL. <em>m<em> in the paper.
:returns: the cutoff for the small range correction.
:rtype: float
"""
return (float(m) * 5) / 2
[docs] @classmethod
def small_estimator(cls, m, number_of_zeroes):
"""
The "small range correction" formula from the HyperLogLog algorithm. Only
appropriate if both the estimator is smaller than ``(5/2) * m`` and
there are still registers that have the zero value.
:param int m: the number of registers in the HLL. <em>m<em> in the paper.
:param int number_of_zeroes: the number of registers with value zero. ``V``
in the paper.
:returns: a corrected cardinality estimate.
:rtype: float
"""
return m * log(float(m) / number_of_zeroes)
[docs] @classmethod
def large_estimator_cutoff(cls, log2m, register_size_in_bits):
"""
The cutoff for using the "large range correction" formula, from the
HyperLogLog algorithm, adapted for 64 bit hashes.
See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.
:param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
:param int register_size_in_bits: the size of the HLL registers, in bits.
:returns: the cutoff for the large range correction.
:rtype: float
"""
return TWO_TO_L[
(cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m
] / 30.0
[docs] @classmethod
def large_estimator(cls, log2m, register_size_in_bits, estimator):
"""
The "large range correction" formula from the HyperLogLog algorithm, adapted
for 64 bit hashes. Only appropriate for estimators whose value exceeds
the return of ``largeEstimatorCutoff()``.
See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.
:param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
:param int register_size_in_bits: the size of the HLL registers, in bits.
:param float estimator: the original estimator ("E" in the paper).
:returns: a corrected cardinality estimate.
:rtype: float
"""
two_to_l = TWO_TO_L[(cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m]
try:
return -1 * two_to_l * log(1.0 - (estimator/two_to_l))
except ValueError:
return 0
# Precomputed ``twoToL`` values indexed by a linear combination of
# ``regwidth`` and ``log2m``.
#
# The array is one-dimensional and can be accessed by using index
# ``(REG_WIDTH_INDEX_MULTIPLIER * regwidth) + log2m``
# for ``regwidth`` and ``log2m`` between the specified
# ``HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM`` constants.
#
# See ``large_estimator()``.
# See ``large_estimator_cutoff()``.
# See `Blog post with section on 2^L
# <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_
TWO_TO_L = [0.0] * (HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)
for reg_width in range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM+1):
for log2m in range(HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM+1):
max_register_value = BitUtil.left_shift_int(1, reg_width) - 1
# Since 1 is added to p(w) in the insertion algorithm, only
# (maxRegisterValue - 1) bits are inspected hence the hash
# space is one power of two smaller.
pw_bits = max_register_value - 1
total_bits = pw_bits + log2m
two_to_l = 2**total_bits
TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * reg_width) + log2m] = two_to_l