2026-04-18 16:44:40 -06:00
|
|
|
"""Tests for agent-settings copy in the interactive setup wizard."""
|
|
|
|
|
|
|
|
|
|
from hermes_cli.setup import setup_agent_settings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_setup_agent_settings_uses_displayed_max_iterations_value(tmp_path, monkeypatch, capsys):
|
fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log) (#18761)
* fix(gateway): config.yaml wins over .env for agent/display/timezone settings
Regression from the silent config→env bridge. The bridge at module import
time is correct for max_turns (unconditional overwrite), but every other
agent.*, display.*, timezone, and security bridge key was guarded by
'if X not in os.environ' — so a stale .env entry from an old 'hermes setup'
run would shadow the user's current config.yaml indefinitely.
Symptom: agent.max_turns: 500 in config.yaml, HERMES_MAX_ITERATIONS=60
in .env from an old setup, and the gateway silently capped at 60
iterations per turn. Gateway logs confirmed api_calls never exceeded 60.
Three changes:
1. gateway/run.py: drop the 'not in os.environ' guards for all agent.*,
display.*, timezone, and security.* bridge keys. config.yaml is now
authoritative for these settings — same semantics already in place
for max_turns, terminal.*, and auxiliary.*. Also surface the bridge
failure (previously 'except Exception: pass') to stderr so operators
see bridge errors instead of silently falling back to .env.
2. gateway/run.py: INFO-log the resolved max_iterations at gateway
start so operators can verify the config→env bridge did the right
thing instead of chasing a phantom budget ceiling.
3. hermes_cli/setup.py: stop writing HERMES_MAX_ITERATIONS to .env in
the setup wizard. config.yaml is the single source of truth. Also
clean up any stale .env entry left behind by pre-fix setups.
Regression tests in tests/gateway/test_config_env_bridge_authority.py
guard each config→env key against the 'stale .env shadows config' bug.
* fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log)
Three issues observed in production gateway.log during a rapid restart
chain on 2026-05-02, all fixed here.
1. _send_restart_notification logged unconditional success
adapter.send() catches provider errors (e.g. Telegram 'Chat not found')
and returns SendResult(success=False); it never raises. The caller
ignored the return value and always logged 'Sent restart notification
to <chat>' at INFO, producing a misleading success line directly
below the 'Failed to send Telegram message' traceback on every boot.
Now inspects result.success and logs WARNING with the error otherwise.
2. WhatsApp bridge SIGTERM on shutdown classified as fatal error
_check_managed_bridge_exit() saw the bridge's returncode -15 (our own
SIGTERM from disconnect()) and fired the full fatal-error path,
producing 'ERROR ... WhatsApp bridge process exited unexpectedly' plus
'Fatal whatsapp adapter error (whatsapp_bridge_exited)' on every
planned shutdown, immediately before the normal '✓ whatsapp
disconnected'. Adds a _shutting_down flag that disconnect() sets
before the terminate, and _check_managed_bridge_exit() returns None
for returncode in {0, -2, -15} while shutting down. OOM-kill (137)
and other non-signal exits still hit the fatal path.
3. restart_drain_timeout default 60s → 180s
On 2026-05-02 01:43:27 a user /restart fired while three agents were
mid-API-call (82s, 112s, 154s into their turns). The 60s drain budget
expired and all three were force-interrupted. 180s covers realistic
in-flight agent turns; users on very-long-reasoning models can still
raise it further via agent.restart_drain_timeout in config.yaml.
Existing explicit user values are preserved by deep-merge.
Tests
- tests/gateway/test_restart_notification.py: two new tests assert INFO
is only logged on SendResult(success=True) and WARNING with the error
string is logged on SendResult(success=False).
- tests/gateway/test_whatsapp_connect.py: parametrized test for
returncode in {0, -2, -15} proves shutdown-time exits are suppressed;
separate test proves returncode 137 (SIGKILL/OOM) still surfaces as
fatal even when _shutting_down is set.
- _check_managed_bridge_exit() reads _shutting_down via getattr-with-
default so existing _make_adapter() test helpers that bypass __init__
(pitfall #17 in AGENTS.md) keep working unmodified.
2026-05-02 02:08:06 -07:00
|
|
|
"""The helper text should match the value shown in the prompt.
|
|
|
|
|
|
|
|
|
|
After PR#18413 max_turns is read exclusively from config.yaml — the
|
|
|
|
|
.env `HERMES_MAX_ITERATIONS` fallback was removed because it was
|
|
|
|
|
shadowing the user's current config (see the 60-vs-500 incident).
|
|
|
|
|
"""
|
2026-04-18 16:44:40 -06:00
|
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
|
|
|
|
|
|
config = {
|
fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log) (#18761)
* fix(gateway): config.yaml wins over .env for agent/display/timezone settings
Regression from the silent config→env bridge. The bridge at module import
time is correct for max_turns (unconditional overwrite), but every other
agent.*, display.*, timezone, and security bridge key was guarded by
'if X not in os.environ' — so a stale .env entry from an old 'hermes setup'
run would shadow the user's current config.yaml indefinitely.
Symptom: agent.max_turns: 500 in config.yaml, HERMES_MAX_ITERATIONS=60
in .env from an old setup, and the gateway silently capped at 60
iterations per turn. Gateway logs confirmed api_calls never exceeded 60.
Three changes:
1. gateway/run.py: drop the 'not in os.environ' guards for all agent.*,
display.*, timezone, and security.* bridge keys. config.yaml is now
authoritative for these settings — same semantics already in place
for max_turns, terminal.*, and auxiliary.*. Also surface the bridge
failure (previously 'except Exception: pass') to stderr so operators
see bridge errors instead of silently falling back to .env.
2. gateway/run.py: INFO-log the resolved max_iterations at gateway
start so operators can verify the config→env bridge did the right
thing instead of chasing a phantom budget ceiling.
3. hermes_cli/setup.py: stop writing HERMES_MAX_ITERATIONS to .env in
the setup wizard. config.yaml is the single source of truth. Also
clean up any stale .env entry left behind by pre-fix setups.
Regression tests in tests/gateway/test_config_env_bridge_authority.py
guard each config→env key against the 'stale .env shadows config' bug.
* fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log)
Three issues observed in production gateway.log during a rapid restart
chain on 2026-05-02, all fixed here.
1. _send_restart_notification logged unconditional success
adapter.send() catches provider errors (e.g. Telegram 'Chat not found')
and returns SendResult(success=False); it never raises. The caller
ignored the return value and always logged 'Sent restart notification
to <chat>' at INFO, producing a misleading success line directly
below the 'Failed to send Telegram message' traceback on every boot.
Now inspects result.success and logs WARNING with the error otherwise.
2. WhatsApp bridge SIGTERM on shutdown classified as fatal error
_check_managed_bridge_exit() saw the bridge's returncode -15 (our own
SIGTERM from disconnect()) and fired the full fatal-error path,
producing 'ERROR ... WhatsApp bridge process exited unexpectedly' plus
'Fatal whatsapp adapter error (whatsapp_bridge_exited)' on every
planned shutdown, immediately before the normal '✓ whatsapp
disconnected'. Adds a _shutting_down flag that disconnect() sets
before the terminate, and _check_managed_bridge_exit() returns None
for returncode in {0, -2, -15} while shutting down. OOM-kill (137)
and other non-signal exits still hit the fatal path.
3. restart_drain_timeout default 60s → 180s
On 2026-05-02 01:43:27 a user /restart fired while three agents were
mid-API-call (82s, 112s, 154s into their turns). The 60s drain budget
expired and all three were force-interrupted. 180s covers realistic
in-flight agent turns; users on very-long-reasoning models can still
raise it further via agent.restart_drain_timeout in config.yaml.
Existing explicit user values are preserved by deep-merge.
Tests
- tests/gateway/test_restart_notification.py: two new tests assert INFO
is only logged on SendResult(success=True) and WARNING with the error
string is logged on SendResult(success=False).
- tests/gateway/test_whatsapp_connect.py: parametrized test for
returncode in {0, -2, -15} proves shutdown-time exits are suppressed;
separate test proves returncode 137 (SIGKILL/OOM) still surfaces as
fatal even when _shutting_down is set.
- _check_managed_bridge_exit() reads _shutting_down via getattr-with-
default so existing _make_adapter() test helpers that bypass __init__
(pitfall #17 in AGENTS.md) keep working unmodified.
2026-05-02 02:08:06 -07:00
|
|
|
"agent": {"max_turns": 60},
|
2026-04-18 16:44:40 -06:00
|
|
|
"display": {"tool_progress": "all"},
|
|
|
|
|
"compression": {"threshold": 0.50},
|
|
|
|
|
"session_reset": {"mode": "both", "idle_minutes": 1440, "at_hour": 4},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prompt_answers = iter(["60", "all", "0.5"])
|
|
|
|
|
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: next(prompt_answers))
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: 4)
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.save_env_value", lambda *args, **kwargs: None)
|
fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log) (#18761)
* fix(gateway): config.yaml wins over .env for agent/display/timezone settings
Regression from the silent config→env bridge. The bridge at module import
time is correct for max_turns (unconditional overwrite), but every other
agent.*, display.*, timezone, and security bridge key was guarded by
'if X not in os.environ' — so a stale .env entry from an old 'hermes setup'
run would shadow the user's current config.yaml indefinitely.
Symptom: agent.max_turns: 500 in config.yaml, HERMES_MAX_ITERATIONS=60
in .env from an old setup, and the gateway silently capped at 60
iterations per turn. Gateway logs confirmed api_calls never exceeded 60.
Three changes:
1. gateway/run.py: drop the 'not in os.environ' guards for all agent.*,
display.*, timezone, and security.* bridge keys. config.yaml is now
authoritative for these settings — same semantics already in place
for max_turns, terminal.*, and auxiliary.*. Also surface the bridge
failure (previously 'except Exception: pass') to stderr so operators
see bridge errors instead of silently falling back to .env.
2. gateway/run.py: INFO-log the resolved max_iterations at gateway
start so operators can verify the config→env bridge did the right
thing instead of chasing a phantom budget ceiling.
3. hermes_cli/setup.py: stop writing HERMES_MAX_ITERATIONS to .env in
the setup wizard. config.yaml is the single source of truth. Also
clean up any stale .env entry left behind by pre-fix setups.
Regression tests in tests/gateway/test_config_env_bridge_authority.py
guard each config→env key against the 'stale .env shadows config' bug.
* fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log)
Three issues observed in production gateway.log during a rapid restart
chain on 2026-05-02, all fixed here.
1. _send_restart_notification logged unconditional success
adapter.send() catches provider errors (e.g. Telegram 'Chat not found')
and returns SendResult(success=False); it never raises. The caller
ignored the return value and always logged 'Sent restart notification
to <chat>' at INFO, producing a misleading success line directly
below the 'Failed to send Telegram message' traceback on every boot.
Now inspects result.success and logs WARNING with the error otherwise.
2. WhatsApp bridge SIGTERM on shutdown classified as fatal error
_check_managed_bridge_exit() saw the bridge's returncode -15 (our own
SIGTERM from disconnect()) and fired the full fatal-error path,
producing 'ERROR ... WhatsApp bridge process exited unexpectedly' plus
'Fatal whatsapp adapter error (whatsapp_bridge_exited)' on every
planned shutdown, immediately before the normal '✓ whatsapp
disconnected'. Adds a _shutting_down flag that disconnect() sets
before the terminate, and _check_managed_bridge_exit() returns None
for returncode in {0, -2, -15} while shutting down. OOM-kill (137)
and other non-signal exits still hit the fatal path.
3. restart_drain_timeout default 60s → 180s
On 2026-05-02 01:43:27 a user /restart fired while three agents were
mid-API-call (82s, 112s, 154s into their turns). The 60s drain budget
expired and all three were force-interrupted. 180s covers realistic
in-flight agent turns; users on very-long-reasoning models can still
raise it further via agent.restart_drain_timeout in config.yaml.
Existing explicit user values are preserved by deep-merge.
Tests
- tests/gateway/test_restart_notification.py: two new tests assert INFO
is only logged on SendResult(success=True) and WARNING with the error
string is logged on SendResult(success=False).
- tests/gateway/test_whatsapp_connect.py: parametrized test for
returncode in {0, -2, -15} proves shutdown-time exits are suppressed;
separate test proves returncode 137 (SIGKILL/OOM) still surfaces as
fatal even when _shutting_down is set.
- _check_managed_bridge_exit() reads _shutting_down via getattr-with-
default so existing _make_adapter() test helpers that bypass __init__
(pitfall #17 in AGENTS.md) keep working unmodified.
2026-05-02 02:08:06 -07:00
|
|
|
monkeypatch.setattr("hermes_cli.setup.remove_env_value", lambda *args, **kwargs: None)
|
2026-04-18 16:44:40 -06:00
|
|
|
monkeypatch.setattr("hermes_cli.setup.save_config", lambda *args, **kwargs: None)
|
|
|
|
|
|
|
|
|
|
setup_agent_settings(config)
|
|
|
|
|
|
|
|
|
|
out = capsys.readouterr().out
|
|
|
|
|
assert "Press Enter to keep 60." in out
|
|
|
|
|
assert "Default is 90" not in out
|
fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log) (#18761)
* fix(gateway): config.yaml wins over .env for agent/display/timezone settings
Regression from the silent config→env bridge. The bridge at module import
time is correct for max_turns (unconditional overwrite), but every other
agent.*, display.*, timezone, and security bridge key was guarded by
'if X not in os.environ' — so a stale .env entry from an old 'hermes setup'
run would shadow the user's current config.yaml indefinitely.
Symptom: agent.max_turns: 500 in config.yaml, HERMES_MAX_ITERATIONS=60
in .env from an old setup, and the gateway silently capped at 60
iterations per turn. Gateway logs confirmed api_calls never exceeded 60.
Three changes:
1. gateway/run.py: drop the 'not in os.environ' guards for all agent.*,
display.*, timezone, and security.* bridge keys. config.yaml is now
authoritative for these settings — same semantics already in place
for max_turns, terminal.*, and auxiliary.*. Also surface the bridge
failure (previously 'except Exception: pass') to stderr so operators
see bridge errors instead of silently falling back to .env.
2. gateway/run.py: INFO-log the resolved max_iterations at gateway
start so operators can verify the config→env bridge did the right
thing instead of chasing a phantom budget ceiling.
3. hermes_cli/setup.py: stop writing HERMES_MAX_ITERATIONS to .env in
the setup wizard. config.yaml is the single source of truth. Also
clean up any stale .env entry left behind by pre-fix setups.
Regression tests in tests/gateway/test_config_env_bridge_authority.py
guard each config→env key against the 'stale .env shadows config' bug.
* fix(gateway): shutdown + restart hygiene (drain timeout, false-fatal, success log)
Three issues observed in production gateway.log during a rapid restart
chain on 2026-05-02, all fixed here.
1. _send_restart_notification logged unconditional success
adapter.send() catches provider errors (e.g. Telegram 'Chat not found')
and returns SendResult(success=False); it never raises. The caller
ignored the return value and always logged 'Sent restart notification
to <chat>' at INFO, producing a misleading success line directly
below the 'Failed to send Telegram message' traceback on every boot.
Now inspects result.success and logs WARNING with the error otherwise.
2. WhatsApp bridge SIGTERM on shutdown classified as fatal error
_check_managed_bridge_exit() saw the bridge's returncode -15 (our own
SIGTERM from disconnect()) and fired the full fatal-error path,
producing 'ERROR ... WhatsApp bridge process exited unexpectedly' plus
'Fatal whatsapp adapter error (whatsapp_bridge_exited)' on every
planned shutdown, immediately before the normal '✓ whatsapp
disconnected'. Adds a _shutting_down flag that disconnect() sets
before the terminate, and _check_managed_bridge_exit() returns None
for returncode in {0, -2, -15} while shutting down. OOM-kill (137)
and other non-signal exits still hit the fatal path.
3. restart_drain_timeout default 60s → 180s
On 2026-05-02 01:43:27 a user /restart fired while three agents were
mid-API-call (82s, 112s, 154s into their turns). The 60s drain budget
expired and all three were force-interrupted. 180s covers realistic
in-flight agent turns; users on very-long-reasoning models can still
raise it further via agent.restart_drain_timeout in config.yaml.
Existing explicit user values are preserved by deep-merge.
Tests
- tests/gateway/test_restart_notification.py: two new tests assert INFO
is only logged on SendResult(success=True) and WARNING with the error
string is logged on SendResult(success=False).
- tests/gateway/test_whatsapp_connect.py: parametrized test for
returncode in {0, -2, -15} proves shutdown-time exits are suppressed;
separate test proves returncode 137 (SIGKILL/OOM) still surfaces as
fatal even when _shutting_down is set.
- _check_managed_bridge_exit() reads _shutting_down via getattr-with-
default so existing _make_adapter() test helpers that bypass __init__
(pitfall #17 in AGENTS.md) keep working unmodified.
2026-05-02 02:08:06 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_setup_agent_settings_prefers_config_over_stale_env(tmp_path, monkeypatch, capsys):
|
|
|
|
|
"""Config.yaml wins even when a stale .env value disagrees.
|
|
|
|
|
|
|
|
|
|
Regression guard for the bug where `.env HERMES_MAX_ITERATIONS=60`
|
|
|
|
|
from an old `hermes setup` run shadowed `agent.max_turns: 500` in
|
|
|
|
|
config.yaml. The wizard must now display the config value.
|
|
|
|
|
"""
|
|
|
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
|
|
|
|
|
|
config = {
|
|
|
|
|
"agent": {"max_turns": 500}, # user bumped this in config.yaml
|
|
|
|
|
"display": {"tool_progress": "all"},
|
|
|
|
|
"compression": {"threshold": 0.50},
|
|
|
|
|
"session_reset": {"mode": "both", "idle_minutes": 1440, "at_hour": 4},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prompt_answers = iter(["500", "all", "0.5"])
|
|
|
|
|
|
|
|
|
|
# Simulate stale .env value — the wizard must ignore this.
|
|
|
|
|
monkeypatch.setattr(
|
|
|
|
|
"hermes_cli.setup.get_env_value",
|
|
|
|
|
lambda key: "60" if key == "HERMES_MAX_ITERATIONS" else "",
|
|
|
|
|
)
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: next(prompt_answers))
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: 4)
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.save_env_value", lambda *args, **kwargs: None)
|
|
|
|
|
|
|
|
|
|
removed_keys: list[str] = []
|
|
|
|
|
monkeypatch.setattr(
|
|
|
|
|
"hermes_cli.setup.remove_env_value",
|
|
|
|
|
lambda key: (removed_keys.append(key), True)[1],
|
|
|
|
|
)
|
|
|
|
|
monkeypatch.setattr("hermes_cli.setup.save_config", lambda *args, **kwargs: None)
|
|
|
|
|
|
|
|
|
|
setup_agent_settings(config)
|
|
|
|
|
|
|
|
|
|
out = capsys.readouterr().out
|
|
|
|
|
# Config value wins
|
|
|
|
|
assert "Press Enter to keep 500." in out
|
|
|
|
|
assert "Press Enter to keep 60." not in out
|
|
|
|
|
# And the stale .env entry gets cleaned up
|
|
|
|
|
assert "HERMES_MAX_ITERATIONS" in removed_keys
|