File: backend/remote.py
INITIAL_BACKOFF = float(os.environ.get("REMOTE_INITIAL_BACKOFF", "2"))
MAX_BACKOFF = float(os.environ.get("REMOTE_MAX_BACKOFF", "60"))
+# Poll loop is much more patient — the remote server is doing real work,
+# so we should tolerate long WiFi drops (default: 30 failures × 60s max
+# backoff ≈ up to ~30 minutes of network outage).
+POLL_MAX_FAILURES = int(os.environ.get("REMOTE_POLL_MAX_FAILURES", "30"))
+
def is_configured() -> bool:
"""Return True if remote processing is configured."""
def _log_retry(attempt, wait, exc):
)
# Step 4: Poll for completion (with retry on transient errors)
+ # Use POLL_MAX_FAILURES which is much higher than MAX_RETRIES —
+ # the server is doing real work and WiFi can drop for minutes.
consecutive_failures = 0
poll_backoff = INITIAL_BACKOFF
def _log_retry(attempt, wait, exc):
max_retries=2, # Light retry per poll cycle
)
status = json.loads(resp_body)
+ if consecutive_failures > 0:
+ logger.info(
+ "Remote job %s: reconnected after %d poll failure(s)",
+ job_id, consecutive_failures,
+ )
consecutive_failures = 0
poll_backoff = INITIAL_BACKOFF
except Exception as exc:
consecutive_failures += 1
- if consecutive_failures > MAX_RETRIES:
+ if consecutive_failures > POLL_MAX_FAILURES:
logger.error(
"Remote job %s: lost contact after %d consecutive poll failures",
job_id, consecutive_failures,
def _log_retry(attempt, wait, exc):
wait = min(poll_backoff, MAX_BACKOFF)
logger.warning(
"Remote job %s: poll failed (%d/%d): %s — retrying in %.0fs",
- job_id, consecutive_failures, MAX_RETRIES, exc, wait,
+ job_id, consecutive_failures, POLL_MAX_FAILURES, exc, wait,
)
if on_progress:
- on_progress("reconnecting", consecutive_failures, MAX_RETRIES)
+ on_progress("reconnecting", consecutive_failures, POLL_MAX_FAILURES)
time.sleep(wait)
poll_backoff *= 2
continue
def _log_retry(attempt, wait, exc):
if status["status"] == "completed":
break
elif status["status"] in ("failed", "cancelled"):
- raise RuntimeError(
- f"Remote job failed: {status.get('error', 'unknown')}"
- )
+ raise RuntimeError(_format_remote_error(status))
# Step 5: Download result (with retry)
if on_progress:
def _log_retry(attempt, wait, exc):
on_retry=_log_retry,
)
- # Poll for completion
+ # Poll for completion — use POLL_MAX_FAILURES for long WiFi outage tolerance
consecutive_failures = 0
poll_backoff = INITIAL_BACKOFF
def _log_retry(attempt, wait, exc):
max_retries=2,
)
status = json.loads(resp_body)
+ if consecutive_failures > 0:
+ logger.info(
+ "Remote job %s: reconnected after %d poll failure(s)",
+ job_id, consecutive_failures,
+ )
consecutive_failures = 0
poll_backoff = INITIAL_BACKOFF
except Exception as exc:
consecutive_failures += 1
- if consecutive_failures > MAX_RETRIES:
+ if consecutive_failures > POLL_MAX_FAILURES:
raise RuntimeError(
f"Lost connection to remote server after {consecutive_failures} "
f"poll failures: {exc}"
def _log_retry(attempt, wait, exc):
wait = min(poll_backoff, MAX_BACKOFF)
logger.warning(
"Remote job %s: poll failed (%d/%d): %s — retrying in %.0fs",
- job_id, consecutive_failures, MAX_RETRIES, exc, wait,
+ job_id, consecutive_failures, POLL_MAX_FAILURES, exc, wait,
)
if on_progress:
- on_progress("reconnecting", consecutive_failures, MAX_RETRIES)
+ on_progress("reconnecting", consecutive_failures, POLL_MAX_FAILURES)
time.sleep(wait)
poll_backoff *= 2
continue
def _log_retry(attempt, wait, exc):
if status["status"] == "completed":
break
elif status["status"] in ("failed", "cancelled"):
- raise RuntimeError(
- f"Remote job failed: {status.get('error', 'unknown')}"
- )
+ raise RuntimeError(_format_remote_error(status))
# Download result
if on_progress:
def _log_retry(attempt, wait, exc):
return output_path
+def _format_remote_error(status: dict) -> str:
+ """Format a remote job error into a user-friendly message.
+
+ Detects common failures (corrupt images, OOM, etc.) and adds context
+ so the user understands what happened and what to do about it.
+ """
+ error = status.get("error", "unknown")
+ job_status = status.get("status", "failed")
+
+ if job_status == "cancelled":
+ return f"Remote job was cancelled: {error}"
+
+ # Corrupt/unreadable image on remote server
+ if "cannot identify image file" in error or "truncated" in error.lower():
+ # Extract filename if present
+ import re
+ match = re.search(r"['\"]([^'\"]+\.\w{2,4})['\"]", error)
+ filename = match.group(1) if match else "unknown"
+ return (
+ f"Remote server encountered a corrupt/unreadable image ({filename}). "
+ f"This usually means an image was partially uploaded due to a network "
+ f"drop. The remote server needs the latest stacking.py/timelapse.py "
+ f"which skip corrupt images instead of crashing. Original error: {error}"
+ )
+
+ # Out of memory
+ if "MemoryError" in error or "OOM" in error or "out of memory" in error.lower():
+ return (
+ f"Remote server ran out of memory during processing. "
+ f"Try reducing the number of images or resolution. Original error: {error}"
+ )
+
+ return f"Remote job failed: {error}"
+
+
def _upload_batch_with_retry(job_id: str, paths: list[Path], on_retry=None):
"""Upload a batch of images with retry on transient network errors.
File: backend/tests/test_backend.py
def test_debug_messages_emitted_when_log_level_debug(monkeypatch, caplog):
cam.logger.debug("test debug message from camera")
assert any("test debug message from camera" in r.message for r in caplog.records)
+
+
+# ---------------------------------------------------------------------------
+# Remote module tests
+# ---------------------------------------------------------------------------
+
+
class TestRemoteModule:
    """Tests for remote.py resilience improvements."""

    def test_format_remote_error_corrupt_image(self):
        """A 'cannot identify image file' error is reported as corrupt and names the file."""
        import remote

        status = {
            "status": "failed",
            "error": "cannot identify image file '/tmp/jobs/abc/images/000171_photo.jpg'",
        }
        msg = remote._format_remote_error(status)
        assert "corrupt/unreadable image" in msg
        assert "000171_photo.jpg" in msg
        assert "skip corrupt images" in msg

    def test_format_remote_error_truncated(self):
        """'Truncated' is matched regardless of letter case."""
        import remote

        status = {
            "status": "failed",
            "error": "image file is Truncated",
        }
        msg = remote._format_remote_error(status)
        assert "corrupt/unreadable image" in msg

    def test_format_remote_error_oom(self):
        """A MemoryError is reported as the server running out of memory."""
        import remote

        status = {
            "status": "failed",
            "error": "MemoryError: unable to allocate array",
        }
        msg = remote._format_remote_error(status)
        assert "ran out of memory" in msg.lower()

    def test_format_remote_error_cancelled(self):
        """Cancelled jobs get a cancellation message rather than a failure one."""
        import remote

        status = {
            "status": "cancelled",
            "error": "user cancelled",
        }
        msg = remote._format_remote_error(status)
        assert "cancelled" in msg

    def test_format_remote_error_generic(self):
        """Unrecognised errors fall through to the generic message, unmodified."""
        import remote

        status = {
            "status": "failed",
            "error": "something unexpected happened",
        }
        msg = remote._format_remote_error(status)
        assert "Remote job failed" in msg
        assert "something unexpected happened" in msg

    def test_poll_max_failures_default(self):
        """The poll-failure budget is large enough to ride out long outages."""
        import remote

        assert remote.POLL_MAX_FAILURES >= 20, (
            "POLL_MAX_FAILURES should be high enough to survive long WiFi outages"
        )

    def test_poll_max_failures_env_override(self, monkeypatch):
        """REMOTE_POLL_MAX_FAILURES is honoured when the module is imported."""
        import sys

        # monkeypatch restores the environment on teardown, so no manual
        # delenv is needed afterwards.
        monkeypatch.setenv("REMOTE_POLL_MAX_FAILURES", "50")

        # Force a fresh import so the module-level constant is re-evaluated,
        # but remember the module object other tests may already reference.
        previous = sys.modules.pop("remote", None)
        try:
            import remote

            assert remote.POLL_MAX_FAILURES == 50
        finally:
            # Runs even when the assertion fails: never leave a module that
            # was built with the overridden value in sys.modules.
            sys.modules.pop("remote", None)
            if previous is not None:
                sys.modules["remote"] = previous