Merge PR #403: Fix context overrun crash with local LLM backends

Authored by ch3ronsa. Fixes #348. Adds 'context size' (LM Studio) and 'context window' (Ollama) to context-length error detection phrases so local backend 400 errors trigger compression instead of aborting. Also removes 'error code: 400' from the non-retryable error list as defense in depth.
2026-04-28 06:51:16 +08:00 · 2026-03-04 17:48:44 -08:00
parent ff3a479156 e9ab711b66
commit 3220bb8aaa
1 changed files with 11 additions and 11 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -3333,23 +3333,24 @@ class AIAgent:
                            }

                    # Check for context-length errors BEFORE generic 4xx handler.
-                    # OpenRouter returns 400 (not 413) for "maximum context length"
-                    # errors — if we let the generic 4xx handler catch those first,
-                    # it aborts immediately instead of attempting compression+retry.
+                    # Local backends (LM Studio, Ollama, llama.cpp) often return
+                    # HTTP 400 with messages like "Context size has been exceeded"
+                    # which must trigger compression, not an immediate abort.
                    is_context_length_error = any(phrase in error_msg for phrase in [
-                        'context length', 'maximum context', 'token limit',
-                        'too many tokens', 'reduce the length', 'exceeds the limit',
+                        'context length', 'context size', 'maximum context',
+                        'token limit', 'too many tokens', 'reduce the length',
+                        'exceeds the limit', 'context window',
                        'request entity too large',  # OpenRouter/Nous 413 safety net
                    ])
                    
                    if is_context_length_error:
                        print(f"{self.log_prefix}⚠️  Context length exceeded - attempting compression...")
-                        
+
                        original_len = len(messages)
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message, approx_tokens=approx_tokens
                        )
-                        
+
                        if len(messages) < original_len:
                            print(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
                            continue  # Retry with compressed messages
@@ -3370,11 +3371,10 @@ class AIAgent:
                    # Check for non-retryable client errors (4xx HTTP status codes).
                    # These indicate a problem with the request itself (bad model ID,
                    # invalid API key, forbidden, etc.) and will never succeed on retry.
-                    # Note: 413 and context-length errors are excluded — handled above
-                    # via compression.
+                    # Note: 413 and context-length errors are excluded — handled above.
                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
                    is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [
-                        'error code: 400', 'error code: 401', 'error code: 403',
+                        'error code: 401', 'error code: 403',
                        'error code: 404', 'error code: 422',
                        'is not a valid model', 'invalid model', 'model not found',
                        'invalid api key', 'invalid_api_key', 'authentication',
@@ -3397,7 +3397,7 @@ class AIAgent:
                            "failed": True,
                            "error": str(api_error),
                        }
-                    
+
                    if retry_count >= max_retries:
                        print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.")
                        logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")