diff options
Diffstat (limited to 'bench')
-rw-r--r-- | bench/statistics.py | 595 | ||||
-rw-r--r-- | bench/stats.py | 19 |
2 files changed, 0 insertions, 614 deletions
diff --git a/bench/statistics.py b/bench/statistics.py deleted file mode 100644 index 25a26d4..0000000 --- a/bench/statistics.py +++ /dev/null @@ -1,595 +0,0 @@ -## Module statistics.py -## -## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. - - -""" -Basic statistics module. - -This module provides functions for calculating statistics of data, including -averages, variance, and standard deviation. - -Calculating averages --------------------- - -================== ============================================= -Function Description -================== ============================================= -mean Arithmetic mean (average) of data. -median Median (middle value) of data. -median_low Low median of data. -median_high High median of data. -median_grouped Median, or 50th percentile, of grouped data. -mode Mode (most common value) of data. -================== ============================================= - -Calculate the arithmetic mean ("the average") of data: - ->>> mean([-1.0, 2.5, 3.25, 5.75]) -2.625 - - -Calculate the standard median of discrete data: - ->>> median([2, 3, 4, 5]) -3.5 - - -Calculate the median, or 50th percentile, of data grouped into class intervals -centred on the data values provided. E.g. if your data points are rounded to -the nearest whole number: - ->>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS -2.8333333333... - -This should be interpreted in this way: you have two data points in the class -interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in -the class interval 3.5-4.5. The median of these data points is 2.8333... - - -Calculating variability or spread ---------------------------------- - -================== ============================================= -Function Description -================== ============================================= -pvariance Population variance of data. -variance Sample variance of data. -pstdev Population standard deviation of data. -stdev Sample standard deviation of data. -================== ============================================= - -Calculate the standard deviation of sample data: - ->>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS -4.38961843444... - -If you have previously calculated the mean, you can pass it as the optional -second argument to the four "spread" functions to avoid recalculating it: - ->>> data = [1, 2, 2, 4, 4, 4, 5, 6] ->>> mu = mean(data) ->>> pvariance(data, mu) -2.5 - - -Exceptions ----------- - -A single exception is defined: StatisticsError is a subclass of ValueError. - -""" - -__all__ = [ 'StatisticsError', - 'pstdev', 'pvariance', 'stdev', 'variance', - 'median', 'median_low', 'median_high', 'median_grouped', - 'mean', 'mode', - ] - - -import collections -import math - -from fractions import Fraction -from decimal import Decimal - - -# === Exceptions === - -class StatisticsError(ValueError): - pass - - -# === Private utilities === - -def _sum(data, start=0): - """_sum(data [, start]) -> value - - Return a high-precision sum of the given numeric data. If optional - argument ``start`` is given, it is added to the total. If ``data`` is - empty, ``start`` (defaulting to 0) is returned. - - - Examples - -------- - - >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) - 11.0 - - Some sources of round-off error will be avoided: - - >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. - 1000.0 - - Fractions and Decimals are also supported: - - >>> from fractions import Fraction as F - >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) - Fraction(63, 20) - - >>> from decimal import Decimal as D - >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] - >>> _sum(data) - Decimal('0.6963') - - Mixed types are currently treated as an error, except that int is - allowed. - """ - # We fail as soon as we reach a value that is not an int or the type of - # the first value which is not an int. E.g. _sum([int, int, float, int]) - # is okay, but sum([int, int, float, Fraction]) is not. - allowed_types = set([int, type(start)]) - n, d = _exact_ratio(start) - partials = {d: n} # map {denominator: sum of numerators} - # Micro-optimizations. - exact_ratio = _exact_ratio - partials_get = partials.get - # Add numerators for each denominator. - for x in data: - _check_type(type(x), allowed_types) - n, d = exact_ratio(x) - partials[d] = partials_get(d, 0) + n - # Find the expected result type. If allowed_types has only one item, it - # will be int; if it has two, use the one which isn't int. - assert len(allowed_types) in (1, 2) - if len(allowed_types) == 1: - assert allowed_types.pop() is int - T = int - else: - T = (allowed_types - set([int])).pop() - if None in partials: - assert issubclass(T, (float, Decimal)) - assert not math.isfinite(partials[None]) - return T(partials[None]) - total = Fraction() - for d, n in sorted(partials.items()): - total += Fraction(n, d) - if issubclass(T, int): - assert total.denominator == 1 - return T(total.numerator) - if issubclass(T, Decimal): - return T(total.numerator)/total.denominator - return T(total) - - -def _check_type(T, allowed): - if T not in allowed: - if len(allowed) == 1: - allowed.add(T) - else: - types = ', '.join([t.__name__ for t in allowed] + [T.__name__]) - raise TypeError("unsupported mixed types: %s" % types) - - -def _exact_ratio(x): - """Convert Real number x exactly to (numerator, denominator) pair. - - >>> _exact_ratio(0.25) - (1, 4) - - x is expected to be an int, Fraction, Decimal or float. - """ - try: - try: - # int, Fraction - return (x.numerator, x.denominator) - except AttributeError: - # float - try: - return x.as_integer_ratio() - except AttributeError: - # Decimal - try: - return _decimal_to_ratio(x) - except AttributeError: - msg = "can't convert type '{}' to numerator/denominator" - raise TypeError(msg.format(type(x).__name__)) from None - except (OverflowError, ValueError): - # INF or NAN - if __debug__: - # Decimal signalling NANs cannot be converted to float :-( - if isinstance(x, Decimal): - assert not x.is_finite() - else: - assert not math.isfinite(x) - return (x, None) - - -# FIXME This is faster than Fraction.from_decimal, but still too slow. -def _decimal_to_ratio(d): - """Convert Decimal d to exact integer ratio (numerator, denominator). - - >>> from decimal import Decimal - >>> _decimal_to_ratio(Decimal("2.6")) - (26, 10) - - """ - sign, digits, exp = d.as_tuple() - if exp in ('F', 'n', 'N'): # INF, NAN, sNAN - assert not d.is_finite() - raise ValueError - num = 0 - for digit in digits: - num = num*10 + digit - if exp < 0: - den = 10**-exp - else: - num *= 10**exp - den = 1 - if sign: - num = -num - return (num, den) - - -def _counts(data): - # Generate a table of sorted (value, frequency) pairs. - table = collections.Counter(iter(data)).most_common() - if not table: - return table - # Extract the values with the highest frequency. - maxfreq = table[0][1] - for i in range(1, len(table)): - if table[i][1] != maxfreq: - table = table[:i] - break - return table - - -# === Measures of central tendency (averages) === - -def mean(data): - """Return the sample arithmetic mean of data. - - >>> mean([1, 2, 3, 4, 4]) - 2.8 - - >>> from fractions import Fraction as F - >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)]) - Fraction(13, 21) - - >>> from decimal import Decimal as D - >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")]) - Decimal('0.5625') - - If ``data`` is empty, StatisticsError will be raised. - """ - if iter(data) is data: - data = list(data) - n = len(data) - if n < 1: - raise StatisticsError('mean requires at least one data point') - return _sum(data)/n - - -# FIXME: investigate ways to calculate medians without sorting? Quickselect? -def median(data): - """Return the median (middle value) of numeric data. - - When the number of data points is odd, return the middle data point. - When the number of data points is even, the median is interpolated by - taking the average of the two middle values: - - >>> median([1, 3, 5]) - 3 - >>> median([1, 3, 5, 7]) - 4.0 - - """ - data = sorted(data) - n = len(data) - if n == 0: - raise StatisticsError("no median for empty data") - if n%2 == 1: - return data[n//2] - else: - i = n//2 - return (data[i - 1] + data[i])/2 - - -def median_low(data): - """Return the low median of numeric data. - - When the number of data points is odd, the middle value is returned. - When it is even, the smaller of the two middle values is returned. - - >>> median_low([1, 3, 5]) - 3 - >>> median_low([1, 3, 5, 7]) - 3 - - """ - data = sorted(data) - n = len(data) - if n == 0: - raise StatisticsError("no median for empty data") - if n%2 == 1: - return data[n//2] - else: - return data[n//2 - 1] - - -def median_high(data): - """Return the high median of data. - - When the number of data points is odd, the middle value is returned. - When it is even, the larger of the two middle values is returned. - - >>> median_high([1, 3, 5]) - 3 - >>> median_high([1, 3, 5, 7]) - 5 - - """ - data = sorted(data) - n = len(data) - if n == 0: - raise StatisticsError("no median for empty data") - return data[n//2] - - -def median_grouped(data, interval=1): - """"Return the 50th percentile (median) of grouped continuous data. - - >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]) - 3.7 - >>> median_grouped([52, 52, 53, 54]) - 52.5 - - This calculates the median as the 50th percentile, and should be - used when your data is continuous and grouped. In the above example, - the values 1, 2, 3, etc. actually represent the midpoint of classes - 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in - class 3.5-4.5, and interpolation is used to estimate it. - - Optional argument ``interval`` represents the class interval, and - defaults to 1. Changing the class interval naturally will change the - interpolated 50th percentile value: - - >>> median_grouped([1, 3, 3, 5, 7], interval=1) - 3.25 - >>> median_grouped([1, 3, 3, 5, 7], interval=2) - 3.5 - - This function does not check whether the data points are at least - ``interval`` apart. - """ - data = sorted(data) - n = len(data) - if n == 0: - raise StatisticsError("no median for empty data") - elif n == 1: - return data[0] - # Find the value at the midpoint. Remember this corresponds to the - # centre of the class interval. - x = data[n//2] - for obj in (x, interval): - if isinstance(obj, (str, bytes)): - raise TypeError('expected number but got %r' % obj) - try: - L = x - interval/2 # The lower limit of the median interval. - except TypeError: - # Mixed type. For now we just coerce to float. - L = float(x) - float(interval)/2 - cf = data.index(x) # Number of values below the median interval. - # FIXME The following line could be more efficient for big lists. - f = data.count(x) # Number of data points in the median interval. - return L + interval*(n/2 - cf)/f - - -def mode(data): - """Return the most common data point from discrete or nominal data. - - ``mode`` assumes discrete data, and returns a single value. This is the - standard treatment of the mode as commonly taught in schools: - - >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) - 3 - - This also works with nominal (non-numeric) data: - - >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) - 'red' - - If there is not exactly one most common value, ``mode`` will raise - StatisticsError. - """ - # Generate a table of sorted (value, frequency) pairs. - table = _counts(data) - if len(table) == 1: - return table[0][0] - elif table: - raise StatisticsError( - 'no unique mode; found %d equally common values' % len(table) - ) - else: - raise StatisticsError('no mode for empty data') - - -# === Measures of spread === - -# See http://mathworld.wolfram.com/Variance.html -# http://mathworld.wolfram.com/SampleVariance.html -# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance -# -# Under no circumstances use the so-called "computational formula for -# variance", as that is only suitable for hand calculations with a small -# amount of low-precision data. It has terrible numeric properties. -# -# See a comparison of three computational methods here: -# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ - -def _ss(data, c=None): - """Return sum of square deviations of sequence data. - - If ``c`` is None, the mean is calculated in one pass, and the deviations - from the mean are calculated in a second pass. Otherwise, deviations are - calculated from ``c`` as given. Use the second case with care, as it can - lead to garbage results. - """ - if c is None: - c = mean(data) - ss = _sum((x-c)**2 for x in data) - # The following sum should mathematically equal zero, but due to rounding - # error may not. - ss -= _sum((x-c) for x in data)**2/len(data) - assert not ss < 0, 'negative sum of square deviations: %f' % ss - return ss - - -def variance(data, xbar=None): - """Return the sample variance of data. - - data should be an iterable of Real-valued numbers, with at least two - values. The optional argument xbar, if given, should be the mean of - the data. If it is missing or None, the mean is automatically calculated. - - Use this function when your data is a sample from a population. To - calculate the variance from the entire population, see ``pvariance``. - - Examples: - - >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] - >>> variance(data) - 1.3720238095238095 - - If you have already calculated the mean of your data, you can pass it as - the optional second argument ``xbar`` to avoid recalculating it: - - >>> m = mean(data) - >>> variance(data, m) - 1.3720238095238095 - - This function does not check that ``xbar`` is actually the mean of - ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or - impossible results. - - Decimals and Fractions are supported: - - >>> from decimal import Decimal as D - >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) - Decimal('31.01875') - - >>> from fractions import Fraction as F - >>> variance([F(1, 6), F(1, 2), F(5, 3)]) - Fraction(67, 108) - - """ - if iter(data) is data: - data = list(data) - n = len(data) - if n < 2: - raise StatisticsError('variance requires at least two data points') - ss = _ss(data, xbar) - return ss/(n-1) - - -def pvariance(data, mu=None): - """Return the population variance of ``data``. - - data should be an iterable of Real-valued numbers, with at least one - value. The optional argument mu, if given, should be the mean of - the data. If it is missing or None, the mean is automatically calculated. - - Use this function to calculate the variance from the entire population. - To estimate the variance from a sample, the ``variance`` function is - usually a better choice. - - Examples: - - >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25] - >>> pvariance(data) - 1.25 - - If you have already calculated the mean of the data, you can pass it as - the optional second argument to avoid recalculating it: - - >>> mu = mean(data) - >>> pvariance(data, mu) - 1.25 - - This function does not check that ``mu`` is actually the mean of ``data``. - Giving arbitrary values for ``mu`` may lead to invalid or impossible - results. - - Decimals and Fractions are supported: - - >>> from decimal import Decimal as D - >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) - Decimal('24.815') - - >>> from fractions import Fraction as F - >>> pvariance([F(1, 4), F(5, 4), F(1, 2)]) - Fraction(13, 72) - - """ - if iter(data) is data: - data = list(data) - n = len(data) - if n < 1: - raise StatisticsError('pvariance requires at least one data point') - ss = _ss(data, mu) - return ss/n - - -def stdev(data, xbar=None): - """Return the square root of the sample variance. - - See ``variance`` for arguments and other details. - - >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) - 1.0810874155219827 - - """ - var = variance(data, xbar) - try: - return var.sqrt() - except AttributeError: - return math.sqrt(var) - - -def pstdev(data, mu=None): - """Return the square root of the population variance. - - See ``pvariance`` for arguments and other details. - - >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) - 0.986893273527251 - - """ - var = pvariance(data, mu) - try: - return var.sqrt() - except AttributeError: - return math.sqrt(var) diff --git a/bench/stats.py b/bench/stats.py deleted file mode 100644 index c244b41..0000000 --- a/bench/stats.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import statistics - -def pairs(l, n): - return zip(*[l[i::n] for i in range(n)]) - -# data comes in pairs: -# n - time for running the program with no input -# m - time for running it with the benchmark input -# we measure (m - n) - -values = [ float(y) - float(x) for (x,y) in pairs(sys.stdin.readlines(),2)] - -print("mean = %.4f, median = %.4f, stdev = %.4f" % - (statistics.mean(values), statistics.median(values), - statistics.stdev(values))) - |