Skip to content

Commit 457e4d1

Browse files
authored
GH-102670: Use sumprod() to simplify, speed up, and improve accuracy of statistics functions (GH-102649)
1 parent 61479d4 commit 457e4d1

File tree

3 files changed

+27
-13
lines changed

3 files changed

+27
-13
lines changed

Lib/statistics.py

+14 -12
Original file line number | Diff line number | Diff line change
@@ -1036,7 +1036,7 @@ def covariance(x, y, /):
10361036
raise StatisticsError('covariance requires at least two data points')
10371037
xbar = fsum(x) / n
10381038
ybar = fsum(y) / n
1039-
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
1039+
sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y))
10401040
return sxy / (n - 1)
10411041

10421042

@@ -1074,11 +1074,14 @@ def correlation(x, y, /, *, method='linear'):
10741074
start = (n - 1) / -2 # Center rankings around zero
10751075
x = _rank(x, start=start)
10761076
y = _rank(y, start=start)
1077-
xbar = fsum(x) / n
1078-
ybar = fsum(y) / n
1079-
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
1080-
sxx = fsum((d := xi - xbar) * d for xi in x)
1081-
syy = fsum((d := yi - ybar) * d for yi in y)
1077+
else:
1078+
xbar = fsum(x) / n
1079+
ybar = fsum(y) / n
1080+
x = [xi - xbar for xi in x]
1081+
y = [yi - ybar for yi in y]
1082+
sxy = sumprod(x, y)
1083+
sxx = sumprod(x, x)
1084+
syy = sumprod(y, y)
10821085
try:
10831086
return sxy / sqrt(sxx * syy)
10841087
except ZeroDivisionError:
@@ -1131,14 +1134,13 @@ def linear_regression(x, y, /, *, proportional=False):
11311134
raise StatisticsError('linear regression requires that both inputs have same number of data points')
11321135
if n < 2:
11331136
raise StatisticsError('linear regression requires at least two data points')
1134-
if proportional:
1135-
sxy = fsum(xi * yi for xi, yi in zip(x, y))
1136-
sxx = fsum(xi * xi for xi in x)
1137-
else:
1137+
if not proportional:
11381138
xbar = fsum(x) / n
11391139
ybar = fsum(y) / n
1140-
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
1141-
sxx = fsum((d := xi - xbar) * d for xi in x)
1140+
x = [xi - xbar for xi in x] # List because used three times below
1141+
y = (yi - ybar for yi in y) # Generator because only used once below
1142+
sxy = sumprod(x, y) + 0.0 # Add zero to coerce result to a float
1143+
sxx = sumprod(x, x)
11421144
try:
11431145
slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x)
11441146
except ZeroDivisionError:

Lib/test/test_statistics.py

+11 -1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
"""Test suite for statistics module, including helper NumericTestCase and
1+
x = """Test suite for statistics module, including helper NumericTestCase and
22
approx_equal function.
33
44
"""
@@ -2610,6 +2610,16 @@ def test_proportional(self):
26102610
self.assertAlmostEqual(slope, 20 + 1/150)
26112611
self.assertEqual(intercept, 0.0)
26122612

2613+
def test_float_output(self):
2614+
x = [Fraction(2, 3), Fraction(3, 4)]
2615+
y = [Fraction(4, 5), Fraction(5, 6)]
2616+
slope, intercept = statistics.linear_regression(x, y)
2617+
self.assertTrue(isinstance(slope, float))
2618+
self.assertTrue(isinstance(intercept, float))
2619+
slope, intercept = statistics.linear_regression(x, y, proportional=True)
2620+
self.assertTrue(isinstance(slope, float))
2621+
self.assertTrue(isinstance(intercept, float))
2622+
26132623
class TestNormalDist:
26142624

26152625
# General note on precision: The pdf(), cdf(), and overlap() methods
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Optimized fmean(), correlation(), covariance(), and linear_regression()
2+
using the new math.sumprod() function.

0 commit comments

Comments
 (0)