Skip to content

Commit 1bb2539

Browse files
BUG: Fix polluted window in rolling var (#63048)
Co-authored-by: suzyahyah <suzyahyah@users.noreply.github.com>
1 parent 446d809 commit 1bb2539

File tree

3 files changed

+116
-53
lines changed

3 files changed

+116
-53
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,7 @@ Groupby/resample/rolling
12601260
- Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`)
12611261
- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
12621262
- Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)
1263+
- Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`)
12631264

12641265
Reshaping
12651266
^^^^^^^^^

pandas/_libs/window/aggregations.pyx

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -332,19 +332,13 @@ cdef float64_t calc_var(
332332
int ddof,
333333
float64_t nobs,
334334
float64_t ssqdm_x,
335-
int64_t num_consecutive_same_value
336335
) noexcept nogil:
337336
cdef:
338337
float64_t result
339338

340339
# Variance is unchanged if no observation is added or removed
341340
if (nobs >= minp) and (nobs > ddof):
342-
343-
# pathological case & repeatedly same values case
344-
if nobs == 1 or num_consecutive_same_value >= nobs:
345-
result = 0
346-
else:
347-
result = ssqdm_x / (nobs - <float64_t>ddof)
341+
result = ssqdm_x / (nobs - <float64_t>ddof)
348342
else:
349343
result = NaN
350344

@@ -357,27 +351,19 @@ cdef void add_var(
357351
float64_t *mean_x,
358352
float64_t *ssqdm_x,
359353
float64_t *compensation,
360-
int64_t *num_consecutive_same_value,
361-
float64_t *prev_value,
354+
bint *numerically_unstable,
362355
) noexcept nogil:
363356
""" add a value from the var calc """
364357
cdef:
365358
float64_t delta, prev_mean, y, t
359+
float64_t prev_m2 = ssqdm_x[0]
366360

367361
# GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
368362
if val != val:
369363
return
370364

371365
nobs[0] = nobs[0] + 1
372366

373-
# GH#42064, record num of same values to remove floating point artifacts
374-
if val == prev_value[0]:
375-
num_consecutive_same_value[0] += 1
376-
else:
377-
# reset to 1 (include current value itself)
378-
num_consecutive_same_value[0] = 1
379-
prev_value[0] = val
380-
381367
# Welford's method for the online variance-calculation
382368
# using Kahan summation
383369
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -392,17 +378,23 @@ cdef void add_var(
392378
mean_x[0] = 0
393379
ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])
394380

381+
if prev_m2 * InvCondTol > ssqdm_x[0]:
382+
# possible catastrophic cancellation
383+
numerically_unstable[0] = True
384+
395385

396386
cdef void remove_var(
397387
float64_t val,
398388
float64_t *nobs,
399389
float64_t *mean_x,
400390
float64_t *ssqdm_x,
401-
float64_t *compensation
391+
float64_t *compensation,
392+
bint *numerically_unstable,
402393
) noexcept nogil:
403394
""" remove a value from the var calc """
404395
cdef:
405396
float64_t delta, prev_mean, y, t
397+
float64_t prev_m2 = ssqdm_x[0]
406398
if val == val:
407399
nobs[0] = nobs[0] - 1
408400
if nobs[0]:
@@ -416,9 +408,14 @@ cdef void remove_var(
416408
delta = t
417409
mean_x[0] = mean_x[0] - delta / nobs[0]
418410
ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
411+
412+
if prev_m2 * InvCondTol > ssqdm_x[0]:
413+
# possible catastrophic cancellation
414+
numerically_unstable[0] = True
419415
else:
420416
mean_x[0] = 0
421417
ssqdm_x[0] = 0
418+
numerically_unstable[0] = False
422419

423420

424421
def roll_var(const float64_t[:] values, ndarray[int64_t] start,
@@ -428,11 +425,12 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
428425
"""
429426
cdef:
430427
float64_t mean_x, ssqdm_x, nobs, compensation_add,
431-
float64_t compensation_remove, prev_value
432-
int64_t s, e, num_consecutive_same_value
428+
float64_t compensation_remove
429+
int64_t s, e
433430
Py_ssize_t i, j, N = len(start)
434431
ndarray[float64_t] output
435432
bint is_monotonic_increasing_bounds
433+
bint requires_recompute, numerically_unstable
436434

437435
minp = max(minp, 1)
438436
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
@@ -449,32 +447,35 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
449447

450448
# Over the first window, observations can only be added
451449
# never removed
452-
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
453-
454-
prev_value = values[s]
455-
num_consecutive_same_value = 0
456-
457-
mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
458-
for j in range(s, e):
459-
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
460-
&num_consecutive_same_value, &prev_value)
461-
462-
else:
450+
requires_recompute = (
451+
i == 0
452+
or not is_monotonic_increasing_bounds
453+
or s >= end[i - 1]
454+
)
463455

456+
if not requires_recompute:
464457
# After the first window, observations can both be added
465458
# and removed
466459

467460
# calculate deletes
468461
for j in range(start[i - 1], s):
469462
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
470-
&compensation_remove)
463+
&compensation_remove, &numerically_unstable)
471464

472465
# calculate adds
473466
for j in range(end[i - 1], e):
474467
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
475-
&num_consecutive_same_value, &prev_value)
468+
&numerically_unstable)
469+
470+
if requires_recompute or numerically_unstable:
471+
472+
mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
473+
for j in range(s, e):
474+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
475+
&numerically_unstable)
476+
numerically_unstable = False
476477

477-
output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
478+
output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
478479

479480
if not is_monotonic_increasing_bounds:
480481
nobs = 0.0

pandas/tests/window/test_rolling.py

Lines changed: 80 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@
88

99
from pandas.compat import (
1010
IS64,
11-
is_platform_arm,
12-
is_platform_power,
13-
is_platform_riscv64,
1411
)
1512
from pandas.errors import Pandas4Warning
1613

@@ -1085,27 +1082,91 @@ def test_rolling_sem(frame_or_series):
10851082
tm.assert_series_equal(result, expected)
10861083

10871084

1088-
@pytest.mark.xfail(
1089-
is_platform_arm() or is_platform_power() or is_platform_riscv64(),
1090-
reason="GH 38921",
1091-
)
10921085
@pytest.mark.parametrize(
1093-
("func", "third_value", "values"),
1086+
("func", "values", "window", "ddof", "expected_values"),
10941087
[
1095-
("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]),
1096-
("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]),
1097-
("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]),
1098-
("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]),
1088+
("var", [99999999999999999, 1, 1, 2, 3, 1, 1], 2, 1, [5e33, 0, 0.5, 0.5, 2, 0]),
1089+
(
1090+
"std",
1091+
[99999999999999999, 1, 1, 2, 3, 1, 1],
1092+
2,
1093+
1,
1094+
[7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0],
1095+
),
1096+
("var", [99999999999999999, 1, 2, 2, 3, 1, 1], 2, 1, [5e33, 0.5, 0, 0.5, 2, 0]),
1097+
(
1098+
"std",
1099+
[99999999999999999, 1, 2, 2, 3, 1, 1],
1100+
2,
1101+
1,
1102+
[7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0],
1103+
),
1104+
(
1105+
"std",
1106+
[1.2e03, 1.3e17, 1.5e17, 1.995e03, 1.990e03],
1107+
2,
1108+
1,
1109+
[9.192388e16, 1.414214e16, 1.060660e17, 3.535534e00],
1110+
),
1111+
(
1112+
"var",
1113+
[
1114+
0.00000000e00,
1115+
0.00000000e00,
1116+
3.16188252e-18,
1117+
2.95781651e-16,
1118+
2.23153542e-51,
1119+
0.00000000e00,
1120+
0.00000000e00,
1121+
5.39943432e-48,
1122+
1.38206260e-73,
1123+
0.00000000e00,
1124+
],
1125+
3,
1126+
1,
1127+
[
1128+
3.33250036e-036,
1129+
2.88538519e-032,
1130+
2.88538519e-032,
1131+
2.91622617e-032,
1132+
1.65991678e-102,
1133+
9.71796366e-096,
1134+
9.71796366e-096,
1135+
9.71796366e-096,
1136+
],
1137+
),
1138+
(
1139+
"std",
1140+
[1, -1, 0, 1, 3, 2, -2, 10000000000, 1, 2, 0, -2, 1, 3, 0, 1],
1141+
6,
1142+
1,
1143+
[
1144+
1.41421356e00,
1145+
1.87082869e00,
1146+
4.08248290e09,
1147+
4.08248290e09,
1148+
4.08248290e09,
1149+
4.08248290e09,
1150+
4.08248290e09,
1151+
4.08248290e09,
1152+
1.72240142e00,
1153+
1.75119007e00,
1154+
1.64316767e00,
1155+
],
1156+
),
10991157
],
11001158
)
1101-
def test_rolling_var_numerical_issues(func, third_value, values):
1102-
# GH: 37051
1103-
ds = Series([99999999999999999, 1, third_value, 2, 3, 1, 1])
1104-
result = getattr(ds.rolling(2), func)()
1105-
expected = Series([np.nan] + values)
1106-
tm.assert_series_equal(result, expected)
1159+
def test_rolling_var_correctness(func, values, window, ddof, expected_values):
1160+
# GH: 37051, 42064, 54518, 52407, 47721
1161+
ts = Series(values)
1162+
result = getattr(ts.rolling(window=window), func)(ddof=ddof)
1163+
if result.last_valid_index():
1164+
result = result[
1165+
result.first_valid_index() : result.last_valid_index() + 1
1166+
].reset_index(drop=True)
1167+
expected = Series(expected_values)
1168+
tm.assert_series_equal(result, expected, atol=1e-55)
11071169
# GH 42064
1108-
# new `roll_var` will output 0.0 correctly
11091170
tm.assert_series_equal(result == 0, expected == 0)
11101171

11111172

0 commit comments

Comments
 (0)