Skip to content

Commit 5bc5967

Browse files
Codexclaude
andcommitted
[fix] Reconcile stuck "modified" config status on checksum request #1330
When a device applies a new configuration but fails to report its status back (e.g. due to a transient HTTP 502 from the controller), the config remains in "modified" state on the server forever. The device's agent will compare its local checksum with the remote checksum on the next polling cycle and find them identical, so it will not re-download or re-report. Without manual intervention the status stays "modified" indefinitely. Detect this condition on the DeviceChecksumView: if the config has been in "modified" state longer than a 5 minute grace period and the device is actively polling (proven by the very checksum request we are handling), set the status to "applied". Implementation notes: - Use the cached device object's config status as a fast path: if the cached status is not "modified" we return immediately with zero extra database queries, preserving the existing zero-query guarantee of the cached checksum path. - Only when the cached status says "modified" do we re-query Config fresh from the database. This covers the edge case where the cache has been populated with a "modified" status that was already reconciled a moment ago by a concurrent request. - The re-query uses .only() on the fields we need to keep it cheap. - A 5 minute grace period avoids fighting an in-flight apply that has not had time to report yet. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0d17acd commit 5bc5967

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed

openwisp_controller/config/controller/views.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ class DeviceChecksumView(UpdateLastIpMixin, GetDeviceView):
144144
returns device's configuration checksum
145145
"""
146146

147+
# Grace period before reconciling a "modified" status.
148+
# If the config has been in "modified" state for longer than this,
149+
# and the device is actively requesting its checksum (proving it's
150+
# online and polling), we assume a previous report_status call was
151+
# lost due to a transient error and reconcile the status to "applied".
152+
_STATUS_RECONCILE_GRACE_SECONDS = 300 # 5 minutes
153+
147154
def get(self, request, pk):
148155
device = self.get_device()
149156
bad_request = forbid_unallowed(request, "GET", "key", device.key)
@@ -156,10 +163,89 @@ def get(self, request, pk):
156163
checksum_requested.send(
157164
sender=device.__class__, instance=device, request=request
158165
)
166+
self._reconcile_modified_status(device)
159167
return ControllerResponse(
160168
device.config.get_cached_checksum(), content_type="text/plain"
161169
)
162170

171+
@staticmethod
172+
def _reconcile_modified_status(device):
173+
"""
174+
Reconciles config status for devices stuck in "modified" state.
175+
176+
When a device applies a new configuration but fails to report its
177+
status back (e.g., due to a transient HTTP error like 502), the
178+
config remains in "modified" state on the controller even though
179+
the device already has the current configuration.
180+
181+
The device's agent will compare its local checksum with the
182+
remote checksum on the next polling cycle and find they match,
183+
so it won't re-download or re-report. The status stays "modified"
184+
indefinitely.
185+
186+
This method detects this condition: if the config has been in
187+
"modified" state for longer than the grace period and the device
188+
is actively polling (proven by this very checksum request), we
189+
set the status to "applied".
190+
191+
Fast path: the cached device object (from cache_memoize) is used
192+
first to check the status. Only when the cached status indicates
193+
"modified" do we re-query the database for the fresh state. This
194+
keeps the checksum endpoint zero-query for the common case where
195+
the status is already "applied".
196+
"""
197+
from django.utils import timezone
198+
199+
# Best-effort: never let reconciliation fail the checksum endpoint.
200+
try:
201+
# Fast path 1: if the cached device's config is not "modified",
202+
# there is nothing to reconcile.
203+
try:
204+
cached_config = device.config
205+
except Config.DoesNotExist:
206+
return
207+
if cached_config is None or cached_config.status != "modified":
208+
return
209+
210+
# Fast path 2: even if cached status is "modified", the cached
211+
# `modified` timestamp gives a lower bound on the real elapsed
212+
# time. If that lower bound is below the grace period, the real
213+
# value cannot be above it either, so skip the DB query entirely.
214+
grace = DeviceChecksumView._STATUS_RECONCILE_GRACE_SECONDS
215+
cached_elapsed = (
216+
timezone.now() - cached_config.modified
217+
).total_seconds()
218+
if cached_elapsed < grace:
219+
return
220+
221+
# Slow path: re-read config fresh from the database because
222+
# cache_memoize has a 30-day TTL and may be serving a stale
223+
# "modified" status that was already reconciled earlier.
224+
try:
225+
config = Config.objects.only("id", "status", "modified").get(
226+
device=device
227+
)
228+
except Config.DoesNotExist:
229+
return
230+
if config.status != "modified":
231+
return
232+
elapsed = (timezone.now() - config.modified).total_seconds()
233+
if elapsed < grace:
234+
return
235+
config.set_status_applied()
236+
logger.info(
237+
"Reconciled config status for device %s: was 'modified' "
238+
"for %d seconds with device actively polling, "
239+
"setting to 'applied'.",
240+
device,
241+
int(elapsed),
242+
)
243+
except Exception:
244+
logger.exception(
245+
"Failed to reconcile config status for device %s",
246+
device,
247+
)
248+
163249
@cache_memoize(
164250
timeout=Config._CHECKSUM_CACHE_TIMEOUT, args_rewrite=get_device_args_rewrite
165251
)

openwisp_controller/config/tests/test_controller.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,63 @@ def test_device_checksum_requested_signal_is_emitted(self):
270270
request=response.wsgi_request,
271271
)
272272

273+
def test_device_checksum_reconciles_modified_status(self):
274+
"""
275+
When a device with status "modified" requests its checksum,
276+
and enough time has passed (grace period), the status should
277+
be automatically reconciled to "applied".
278+
"""
279+
from datetime import timedelta
280+
281+
from django.utils import timezone
282+
283+
d = self._create_device_config()
284+
c = d.config
285+
c.set_status_modified()
286+
self.assertEqual(c.status, "modified")
287+
url = reverse("controller:device_checksum", args=[d.pk])
288+
289+
# First request within grace period: status should stay "modified"
290+
response = self.client.get(url, {"key": d.key})
291+
self.assertEqual(response.status_code, 200)
292+
c.refresh_from_db()
293+
self.assertEqual(c.status, "modified")
294+
295+
# Simulate that grace period has elapsed by backdating the
296+
# modified timestamp.
297+
Config.objects.filter(pk=c.pk).update(
298+
modified=timezone.now() - timedelta(seconds=600)
299+
)
300+
301+
# Second request after grace period: status should be reconciled
302+
# to "applied"
303+
response = self.client.get(url, {"key": d.key})
304+
self.assertEqual(response.status_code, 200)
305+
c.refresh_from_db()
306+
self.assertEqual(c.status, "applied")
307+
308+
def test_device_checksum_no_reconcile_for_applied(self):
309+
"""Status "applied" should not be changed."""
310+
d = self._create_device_config()
311+
c = d.config
312+
c.set_status_applied()
313+
url = reverse("controller:device_checksum", args=[d.pk])
314+
response = self.client.get(url, {"key": d.key})
315+
self.assertEqual(response.status_code, 200)
316+
c.refresh_from_db()
317+
self.assertEqual(c.status, "applied")
318+
319+
def test_device_checksum_no_reconcile_within_grace_period(self):
320+
"""Status should not be reconciled if within the grace period."""
321+
d = self._create_device_config()
322+
c = d.config
323+
c.set_status_modified()
324+
url = reverse("controller:device_checksum", args=[d.pk])
325+
response = self.client.get(url, {"key": d.key})
326+
self.assertEqual(response.status_code, 200)
327+
c.refresh_from_db()
328+
self.assertEqual(c.status, "modified")
329+
273330
def test_device_checksum_bad_uuid(self):
274331
d = self._create_device_config()
275332
pk = "{}-wrong".format(d.pk)

0 commit comments

Comments
 (0)