Compare commits

...

2 Commits

Author SHA1 Message Date
Jai Suphavadeeprasit
4abe3428da Add Harbor Terminal-Bench launcher
Provide a generic launcher for running Hermes Agent through Harbor against an OpenAI-compatible autoscaler gateway using the patched Harbor fork.
2026-07-02 13:40:38 -04:00
Jai Suphavadeeprasit
e1aa0e6764 init 2026-06-30 01:55:04 -04:00
8 changed files with 479 additions and 3 deletions

View File

@@ -55,10 +55,32 @@ def hermes_client_tag() -> str:
return f"client=hermes-client-v{_hermes_version()}"
def nous_portal_tags() -> List[str]:
def conversation_tag(session_id: str) -> str:
    """Build the ``conversation=...`` tag for one Hermes session/conversation.

    The tag has the form ``conversation=<session_id>``, where ``session_id``
    is the canonical Hermes conversation identifier (``AIAgent.session_id``)
    — the same value used for ``~/.hermes/sessions/`` storage, session logs,
    and lineage.

    This tag is high-cardinality (one distinct value per conversation), unlike
    the product/client tags, so it belongs only on requests that actually have
    a session id and never in the always-on base tag set.
    """
    return "conversation={}".format(session_id)
def nous_portal_tags(session_id: str | None = None) -> List[str]:
    """Return the canonical list of Nous Portal product tags.

    Always returns a fresh list so callers can mutate it freely
    (e.g. ``merged_extra.setdefault("tags", []).extend(nous_portal_tags())``).

    Args:
        session_id: Optional Hermes session id. When truthy, a
            ``conversation=<session_id>`` tag is appended so Portal usage can
            be attributed to a specific Hermes conversation. ``None`` and the
            empty string are both treated as "no session".

    Returns:
        ``["product=hermes-agent", <client tag>]`` for callers without a
        session id (e.g. the auxiliary client's always-on base tags), plus a
        trailing conversation tag when ``session_id`` is provided.
    """
    # Build the base set first; the stale unconditional ``return`` that used to
    # sit here made the session handling below unreachable.
    tags = ["product=hermes-agent", hermes_client_tag()]
    if session_id:
        tags.append(conversation_tag(session_id))
    return tags

View File

@@ -0,0 +1,93 @@
# Harbor Terminal-Bench Runner
`scripts/run_harbor_terminal_bench.sh` launches Harbor Terminal-Bench runs with
Hermes Agent against an OpenAI-compatible autoscaler gateway.
The script is intentionally generic: it does not hardcode personal usernames,
hostnames, private key paths, or internal gateway IPs. Provide those values via
environment variables when running it.
If `HARBOR_DIR` does not exist, the script clones the patched Harbor fork/ref
from `HARBOR_REPO_URL` and `HARBOR_REF`. The bundled
`scripts/patches/harbor-hermes-custom-endpoint.patch` is kept as a fallback for
unpatched upstream Harbor checkouts and is only applied when
`APPLY_HARBOR_PATCH=1`.
## Required Environment
```bash
export AUTOSCALER_SSH_TARGET="user@example-host"
export AUTOSCALER_SSH_KEY="$HOME/.ssh/id_ed25519"
export AUTOSCALER_REMOTE_GATEWAY="gateway-host-or-ip:30090"
```
You can copy the example environment file:
```bash
cp scripts/harbor-terminal-bench.env.example .env.harbor-terminal-bench
set -a
source .env.harbor-terminal-bench
set +a
```
## Optional Environment
```bash
export HARBOR_DIR="../harbor" # Harbor checkout path
export HARBOR_REPO_URL="git@github.com:NousResearch/harbor-fork.git"
export HARBOR_REF="hermes-custom-endpoint"
# export APPLY_HARBOR_PATCH="1" # Only for unpatched upstream Harbor
export HERMES_MODEL="hermes-large" # Autoscaler model id
export LOCAL_PORT="30090" # Local SSH tunnel port
export N_TASKS="10" # Unset for a full Terminal-Bench run
export N_CONCURRENT="1" # Harbor concurrency
export EXCLUDE_TASK_NAME="gpt2-codegolf" # Excluded by default for smoke runs
```
## Run A Smoke Task
```bash
INCLUDE_TASK_NAME=cancel-async-tasks \
./scripts/run_harbor_terminal_bench.sh
```
## Run A 10-Task Batch
```bash
N_TASKS=10 ./scripts/run_harbor_terminal_bench.sh
```
## Run A Larger Batch
```bash
N_TASKS=50 N_CONCURRENT=4 ./scripts/run_harbor_terminal_bench.sh
```
## Run The Full Dataset
Leave `N_TASKS` unset so Harbor does not receive `--n-tasks`:
```bash
unset N_TASKS
N_CONCURRENT=1 ./scripts/run_harbor_terminal_bench.sh
```
## Head Node Guard
The script refuses to run on hostnames that look like head nodes: any hostname
with an `hn`, `hn<digits>`, or `head` component delimited by `-` or `.` (for
example `cluster-hn1` or `head.example.com`), unless explicitly overridden:
```bash
ALLOW_HEAD_NODE_RUN=1 ./scripts/run_harbor_terminal_bench.sh
```
Only use the override when you are sure the host is an appropriate place to run
Docker/Harbor workloads.
## Notes
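- Harbor task containers use `http://host.docker.internal:<LOCAL_PORT>/v1` by
default because Hermes runs inside Docker. Set `DOCKER_BASE_URL` to override it,
for example on Linux hosts where `host.docker.internal` is not reachable.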
- `OPENAI_API_KEY` defaults to `dummy`; it is only used to populate Hermes'
custom provider config for the autoscaler endpoint.
- Set `NO_TUNNEL=1` if a local tunnel or gateway is already running.

View File

@@ -13,7 +13,7 @@ class NousProfile(ProviderProfile):
def build_extra_body(
    self, *, session_id: str | None = None, **context
) -> dict[str, Any]:
    """Return the extra request-body fields for the Nous provider.

    Args:
        session_id: Optional Hermes session id, forwarded to
            ``nous_portal_tags`` so the tag list can carry a conversation tag.
        **context: Accepted for interface compatibility; not used here.

    Returns:
        A dict with a single ``"tags"`` key holding the Portal tag list.
    """
    # Forward the session id; the earlier bare ``nous_portal_tags()`` return
    # dropped it and left this line unreachable.
    return {"tags": nous_portal_tags(session_id=session_id)}
def build_api_kwargs_extras(
self,

View File

@@ -0,0 +1,18 @@
# Required for starting an SSH tunnel to an OpenAI-compatible autoscaler gateway.
AUTOSCALER_SSH_TARGET=user@example-host
AUTOSCALER_SSH_KEY=$HOME/.ssh/id_ed25519
AUTOSCALER_REMOTE_GATEWAY=gateway-host-or-ip:30090
# Optional.
HARBOR_DIR=../harbor
HARBOR_REPO_URL=git@github.com:NousResearch/harbor-fork.git
HARBOR_REF=hermes-custom-endpoint
# Set APPLY_HARBOR_PATCH=1 only when using an unpatched upstream Harbor checkout.
# APPLY_HARBOR_PATCH=1
HERMES_MODEL=hermes-large
LOCAL_PORT=30090
N_CONCURRENT=1
EXCLUDE_TASK_NAME=gpt2-codegolf
# Leave N_TASKS unset for a full Terminal-Bench run.
# N_TASKS=10

View File

@@ -0,0 +1,119 @@
diff --git a/src/harbor/agents/installed/hermes.py b/src/harbor/agents/installed/hermes.py
index 5f16cbd5..bb1a5b80 100644
--- a/src/harbor/agents/installed/hermes.py
+++ b/src/harbor/agents/installed/hermes.py
@@ -86,10 +86,26 @@ class Hermes(BaseInstalledAgent):
# ------------------------------------------------------------------
@staticmethod
- def _build_config_yaml(model: str) -> str:
+ def _build_config_yaml(
+ model: str,
+ *,
+ custom_base_url: str | None = None,
+ custom_api_key: str | None = None,
+ ) -> str:
"""Generate a hermes config.yaml with full capabilities enabled."""
+ model_config: str | dict[str, str]
+ if custom_base_url:
+ model_config = {
+ "default": model,
+ "provider": "custom",
+ "base_url": custom_base_url,
+ "api_key": custom_api_key or "",
+ }
+ else:
+ model_config = model
+
config: dict[str, Any] = {
- "model": model,
+ "model": model_config,
"provider": "auto",
"toolsets": ["hermes-cli"],
"agent": {"max_turns": 90},
@@ -351,6 +367,8 @@ class Hermes(BaseInstalledAgent):
# Try native provider key first, fall back to OpenRouter.
hermes_provider_flag: str | None = None
+ custom_base_url: str | None = None
+ custom_api_key: str | None = None
use_native = False
if provider in _NATIVE_PROVIDERS:
@@ -359,7 +377,13 @@ class Hermes(BaseInstalledAgent):
key_val = os.environ.get(key_name)
if key_val:
env[key_name] = key_val
- hermes_provider_flag = native_flag
+ # Hermes Agent v0.18 treats OpenAI-compatible non-OpenAI
+ # endpoints as custom providers configured in config.yaml.
+ if provider == "openai" and os.environ.get("OPENAI_BASE_URL"):
+ custom_base_url = os.environ["OPENAI_BASE_URL"]
+ custom_api_key = key_val
+ else:
+ hermes_provider_flag = native_flag
use_native = True
break
# Forward OPENAI_BASE_URL when using native OpenAI key
@@ -380,10 +404,14 @@ class Hermes(BaseInstalledAgent):
raise ValueError("No API key found. Set OPENROUTER_API_KEY.")
env["OPENROUTER_API_KEY"] = openrouter_key
- # Native providers with --provider flag use just the model name;
- # everything else (OpenRouter, openai direct) uses provider/model.
- cli_model = model if hermes_provider_flag else self.model_name
- config_yaml = self._build_config_yaml(cli_model)
+ # Native providers with --provider flag and custom endpoints use just
+ # the model name; OpenRouter/direct OpenAI keep provider/model.
+ cli_model = model if hermes_provider_flag or custom_base_url else self.model_name
+ config_yaml = self._build_config_yaml(
+ cli_model,
+ custom_base_url=custom_base_url,
+ custom_api_key=custom_api_key,
+ )
# Pass instruction via env var (safe from shell escaping issues)
env["HARBOR_INSTRUCTION"] = instruction
diff --git a/tests/unit/agents/installed/test_hermes_cli.py b/tests/unit/agents/installed/test_hermes_cli.py
index 0b78678b..ab73f4f7 100644
--- a/tests/unit/agents/installed/test_hermes_cli.py
+++ b/tests/unit/agents/installed/test_hermes_cli.py
@@ -52,6 +52,7 @@ class TestHermesRunCommands:
async def test_openai_native_provider(self, temp_dir, monkeypatch):
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.setenv("OPENAI_API_KEY", "openai-key")
+ monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
agent = Hermes(logs_dir=temp_dir, model_name="openai/gpt-4o")
mock_env = AsyncMock()
mock_env.exec.return_value = AsyncMock(return_code=0, stdout="", stderr="")
@@ -61,6 +62,30 @@ class TestHermesRunCommands:
assert "--provider" not in run_call.kwargs["command"]
assert run_call.kwargs["env"]["OPENAI_API_KEY"] == "openai-key"
+ @pytest.mark.asyncio
+ async def test_openai_base_url_uses_custom_provider(self, temp_dir, monkeypatch):
+ monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+ monkeypatch.setenv("OPENAI_API_KEY", "dummy")
+ monkeypatch.setenv("OPENAI_BASE_URL", "http://host.docker.internal:30090/v1")
+ agent = Hermes(logs_dir=temp_dir, model_name="openai/hermes-large")
+ mock_env = AsyncMock()
+ mock_env.exec.return_value = AsyncMock(return_code=0, stdout="", stderr="")
+ await agent.run("do something", mock_env, AsyncMock())
+ config_call = mock_env.exec.call_args_list[0]
+ config_yaml = config_call.kwargs["command"].split("<< 'EOF'\n", 1)[1].rsplit(
+ "\nEOF", 1
+ )[0]
+ config = yaml.safe_load(config_yaml)
+ assert config["model"] == {
+ "default": "hermes-large",
+ "provider": "custom",
+ "base_url": "http://host.docker.internal:30090/v1",
+ "api_key": "dummy",
+ }
+ run_call = self._get_run_call(mock_env.exec.call_args_list)
+ assert "--model hermes-large" in run_call.kwargs["command"]
+ assert "--provider" not in run_call.kwargs["command"]
+
@pytest.mark.asyncio
async def test_openrouter_fallback(self, temp_dir, monkeypatch):
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the launcher's help text (environment variables and examples) to stdout.
# The quoted 'USAGE' delimiter keeps the text literal, so nothing in it is
# expanded by the shell. Callers redirect to stderr when reporting an error.
usage() {
  cat <<'USAGE'
Run Harbor Terminal-Bench with Hermes Agent through an OpenAI-compatible autoscaler gateway.
Required environment:
AUTOSCALER_SSH_TARGET SSH target for the gateway host, for example user@host
AUTOSCALER_SSH_KEY SSH private key path for the gateway host
AUTOSCALER_REMOTE_GATEWAY Remote gateway host:port reachable from AUTOSCALER_SSH_TARGET
Optional environment:
HARBOR_DIR Harbor checkout path (default: ../harbor)
HARBOR_REPO_URL Harbor repo URL to clone if HARBOR_DIR is missing
HARBOR_REF Harbor branch/tag/SHA to clone
APPLY_HARBOR_PATCH=1 Apply bundled patch for unpatched upstream Harbor
HARBOR_PATCH Patch file used with APPLY_HARBOR_PATCH=1 (default: bundled patch)
HERMES_MODEL Autoscaler model id (default: hermes-large)
LOCAL_PORT Local forwarded port (default: 30090)
DOCKER_BASE_URL Gateway URL used inside containers (default: http://host.docker.internal:<LOCAL_PORT>/v1)
N_TASKS Number of tasks to run (unset means full dataset)
N_CONCURRENT Harbor trial concurrency (default: 1)
JOB_NAME Harbor job name (default: <HERMES_MODEL>-tb-<timestamp>)
EXCLUDE_TASK_NAME Task glob to exclude (default: gpt2-codegolf)
INCLUDE_TASK_NAME Task glob to include instead of N_TASKS
OPENAI_API_KEY Dummy/custom provider key (default: dummy)
ALLOW_HEAD_NODE_RUN=1 Override the head-node safety guard
NO_TUNNEL=1 Do not start an SSH tunnel; use an existing local gateway
Examples:
AUTOSCALER_SSH_TARGET=user@example-host \
AUTOSCALER_SSH_KEY=~/.ssh/id_ed25519 \
AUTOSCALER_REMOTE_GATEWAY=10.0.0.10:30090 \
./scripts/run_harbor_terminal_bench.sh
INCLUDE_TASK_NAME=cancel-async-tasks ./scripts/run_harbor_terminal_bench.sh
N_TASKS=50 N_CONCURRENT=4 ./scripts/run_harbor_terminal_bench.sh
unset N_TASKS; N_CONCURRENT=1 ./scripts/run_harbor_terminal_bench.sh
USAGE
}
# Show the help text and exit successfully when the first argument is -h/--help.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
usage
exit 0
fi
# Repository root: the parent of the directory that contains this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Each setting below reads its environment variable and falls back to the
# default after ':-' when that variable is unset or empty.
harbor_dir="${HARBOR_DIR:-${repo_root}/../harbor}"
harbor_repo_url="${HARBOR_REPO_URL:-git@github.com:NousResearch/harbor-fork.git}"
harbor_ref="${HARBOR_REF:-hermes-custom-endpoint}"
apply_harbor_patch="${APPLY_HARBOR_PATCH:-0}"
harbor_patch="${HARBOR_PATCH:-${repo_root}/scripts/patches/harbor-hermes-custom-endpoint.patch}"
hermes_model="${HERMES_MODEL:-hermes-large}"
local_port="${LOCAL_PORT:-30090}"
# Empty n_tasks means --n-tasks is not passed to Harbor later on.
n_tasks="${N_TASKS:-}"
n_concurrent="${N_CONCURRENT:-1}"
# The default job name embeds the model id and a launch timestamp.
job_name="${JOB_NAME:-${hermes_model}-tb-$(date +%Y%m%d-%H%M%S)}"
exclude_task_name="${EXCLUDE_TASK_NAME:-gpt2-codegolf}"
include_task_name="${INCLUDE_TASK_NAME:-}"
api_key="${OPENAI_API_KEY:-dummy}"
# local_base_url is what this script probes from the host; docker_base_url is
# what gets probed from a container and exported as OPENAI_BASE_URL for Harbor.
local_base_url="http://127.0.0.1:${local_port}/v1"
docker_base_url="${DOCKER_BASE_URL:-http://host.docker.internal:${local_port}/v1}"
# Prefer the fully qualified hostname; fall back to plain `hostname` if -f fails.
hostname_value="$(hostname -f 2>/dev/null || hostname)"
# Head-node guard: refuse to run when a hostname component (bounded by the
# start/end of the name, '-' or '.') is "hn", "hn<digits>" or "head", unless
# ALLOW_HEAD_NODE_RUN=1 is set. Exits with status 64.
if [[ "${ALLOW_HEAD_NODE_RUN:-0}" != "1" ]] \
&& [[ "${hostname_value}" =~ (^|[-.])(hn[0-9]*|head)([-.]|$) ]]; then
cat >&2 <<EOF
Refusing to launch Harbor eval on possible head node: ${hostname_value}
Run from a workstation or compute allocation. If you are certain this host is
safe, set ALLOW_HEAD_NODE_RUN=1.
EOF
exit 64
fi
# Abort unless the environment variable named by $1 is set and non-empty.
# On failure: report the missing name and the usage text on stderr, exit 2.
require_env() {
  local required_var="$1"

  # ${!required_var:-} is an indirect lookup of the variable whose name was
  # passed in; the ':-' keeps `set -u` from tripping on an unset variable.
  [[ -n "${!required_var:-}" ]] && return 0

  echo "Missing required environment variable: ${required_var}" >&2
  usage >&2
  exit 2
}
# Make sure a usable Harbor checkout exists at ${harbor_dir}, cloning it when it
# is missing, and optionally apply the Hermes custom-endpoint patch.
#
# Globals read: harbor_dir, harbor_repo_url, harbor_ref, apply_harbor_patch,
#               harbor_patch.
# Exits 1 when harbor_dir exists without a pyproject.toml, when the requested
# ref cannot be fetched, when the patch file is missing, or when the patch
# neither applies cleanly nor is already applied.
ensure_harbor_checkout() {
  if [[ ! -f "${harbor_dir}/pyproject.toml" ]]; then
    if [[ -e "${harbor_dir}" ]]; then
      echo "HARBOR_DIR exists but is not a Harbor checkout: ${harbor_dir}" >&2
      exit 1
    fi

    echo "Cloning Harbor into ${harbor_dir}"
    if [[ -z "${harbor_ref}" ]]; then
      git clone --depth 1 "${harbor_repo_url}" "${harbor_dir}"
    elif ! git clone --branch "${harbor_ref}" --depth 1 "${harbor_repo_url}" "${harbor_dir}"; then
      # `git clone --branch` resolves only branch and tag names, but HARBOR_REF
      # is documented as branch/tag/SHA. Fall back to fetching the ref directly,
      # which also works for commit SHAs on servers that allow it.
      echo "git clone --branch failed; fetching '${harbor_ref}' directly (e.g. a commit SHA)." >&2
      if ! {
        git init --quiet "${harbor_dir}" \
          && git -C "${harbor_dir}" remote add origin "${harbor_repo_url}" \
          && git -C "${harbor_dir}" fetch --depth 1 origin "${harbor_ref}" \
          && git -C "${harbor_dir}" checkout --quiet --detach FETCH_HEAD
      }; then
        echo "Could not fetch Harbor ref '${harbor_ref}' from ${harbor_repo_url}." >&2
        echo "Remove ${harbor_dir} before retrying." >&2
        exit 1
      fi
    fi
  fi

  if [[ "${apply_harbor_patch}" == "1" ]]; then
    if [[ ! -f "${harbor_patch}" ]]; then
      echo "Missing Harbor patch: ${harbor_patch}" >&2
      exit 1
    fi

    # Resolve the patch to an absolute path before changing directory, so a
    # relative HARBOR_PATCH keeps pointing at the same file inside the subshell.
    local patch_path
    patch_path="$(cd "$(dirname "${harbor_patch}")" && pwd)/$(basename "${harbor_patch}")"

    (
      cd "${harbor_dir}"
      if git apply --check "${patch_path}" >/dev/null 2>&1; then
        git apply "${patch_path}"
        echo "Applied Harbor Hermes custom endpoint patch."
      elif git apply --reverse --check "${patch_path}" >/dev/null 2>&1; then
        # The reverse patch applies cleanly, so the forward patch is already in.
        echo "Harbor Hermes custom endpoint patch is already applied."
      else
        cat >&2 <<EOF
Could not apply Harbor patch cleanly.
This usually means Harbor changed upstream or already has a different version
of the Hermes custom endpoint fix. Inspect:
${harbor_patch}
${harbor_dir}/src/harbor/agents/installed/hermes.py
EOF
        exit 1
      fi
    )
  fi
}
# Probe the gateway's /models endpoint through the local port.
# Returns curl's status: success only for a successful HTTP response received
# within 8 seconds. The response body is discarded; errors go to stderr.
curl_models() {
  local models_url="${local_base_url}/models"
  curl --fail --silent --show-error --max-time 8 "${models_url}" >/dev/null
}
tunnel_pid=""
# EXIT-trap handler: stop the background SSH tunnel if this script started one.
# Does nothing when tunnel_pid is empty; a failing kill is ignored.
cleanup() {
  [[ -n "${tunnel_pid}" ]] || return 0
  kill "${tunnel_pid}" >/dev/null 2>&1 || true
}
trap cleanup EXIT
# Start an SSH tunnel to the gateway unless NO_TUNNEL=1 is set or the gateway
# already answers on the local port. The first probe is expected to fail when
# no tunnel exists yet, so its curl error output is suppressed.
if [[ "${NO_TUNNEL:-0}" != "1" ]] && ! curl_models 2>/dev/null; then
  require_env AUTOSCALER_SSH_TARGET
  require_env AUTOSCALER_SSH_KEY
  require_env AUTOSCALER_REMOTE_GATEWAY

  ssh -i "${AUTOSCALER_SSH_KEY}" \
    -o ExitOnForwardFailure=yes \
    -o ServerAliveInterval=30 \
    -N \
    -L "${local_port}:${AUTOSCALER_REMOTE_GATEWAY}" \
    "${AUTOSCALER_SSH_TARGET}" &
  tunnel_pid="$!"

  # Poll for readiness instead of relying on one fixed sleep: a slow SSH
  # handshake would otherwise fail the run even though the tunnel comes up.
  tunnel_ready=0
  for attempt in 1 2 3 4 5 6 7 8 9 10; do
    # Stop early if ssh already exited (bad key, forward failure, ...).
    if ! kill -0 "${tunnel_pid}" 2>/dev/null; then
      echo "SSH tunnel to ${AUTOSCALER_SSH_TARGET} exited before the gateway became reachable." >&2
      tunnel_pid=""
      exit 1
    fi
    if curl_models 2>/dev/null; then
      tunnel_ready=1
      break
    fi
    sleep 1
  done

  # One last probe with curl's error output visible, so the cause is reported.
  if [[ "${tunnel_ready}" != "1" ]] && ! curl_models; then
    echo "Gateway did not become reachable at ${local_base_url} through the SSH tunnel." >&2
    exit 1
  fi
fi
# Probe the gateway from inside a throwaway curl container using docker_base_url,
# the same URL later exported as OPENAI_BASE_URL for the Harbor run, so container
# networking problems surface before the evaluation starts. Exits 1 on failure.
if ! docker run --rm curlimages/curl:latest -fsS --max-time 10 "${docker_base_url}/models" >/dev/null; then
cat >&2 <<EOF
Docker could not reach the autoscaler at ${docker_base_url}.
On macOS/Windows, host.docker.internal should work. On Linux you may need to
run Harbor with host networking or set DOCKER_BASE_URL to a container-reachable
gateway URL.
EOF
exit 1
fi
# Clone Harbor if needed (and optionally patch it) before assembling the run.
ensure_harbor_checkout
# Base arguments passed to `harbor run`; the model id is given the "openai/" prefix.
args=(
--dataset terminal-bench@2.0
--agent hermes
--model "openai/${hermes_model}"
--n-concurrent "${n_concurrent}"
--job-name "${job_name}"
-y
)
# An include glob takes precedence: when it is set, neither --n-tasks nor
# --exclude-task-name is passed. Otherwise each is added only when non-empty.
if [[ -n "${include_task_name}" ]]; then
args+=(--include-task-name "${include_task_name}")
else
if [[ -n "${n_tasks}" ]]; then
args+=(--n-tasks "${n_tasks}")
fi
if [[ -n "${exclude_task_name}" ]]; then
args+=(--exclude-task-name "${exclude_task_name}")
fi
fi
# Run Harbor from inside its checkout. The API key and base URL are set only in
# the environment of this one command.
cd "${harbor_dir}"
OPENAI_API_KEY="${api_key}" \
OPENAI_BASE_URL="${docker_base_url}" \
uv run --no-dev harbor run "${args[@]}"

View File

@@ -42,6 +42,33 @@ def test_nous_portal_tags_returns_fresh_list():
assert "client=test-mutation" not in b
def test_conversation_tag_format():
    """``conversation_tag`` places the session id, unchanged, after the prefix."""
    from agent.portal_tags import conversation_tag

    expected = "conversation=abc-123"
    assert conversation_tag("abc-123") == expected
def test_nous_portal_tags_appends_conversation_when_session_id_given():
    """Passing a session id yields three tags, one of them the conversation tag."""
    from agent.portal_tags import conversation_tag, nous_portal_tags

    result = nous_portal_tags(session_id="sess-42")

    assert "product=hermes-agent" in result
    assert conversation_tag("sess-42") in result
    assert len(result) == 3
def test_nous_portal_tags_omits_conversation_without_session_id():
    """With ``None`` or an empty session id, only the two base tags are returned."""
    from agent.portal_tags import nous_portal_tags

    for missing in (None, ""):
        result = nous_portal_tags(session_id=missing)
        assert len(result) == 2
        assert all(not tag.startswith("conversation=") for tag in result)
def test_auxiliary_client_nous_extra_body_uses_helper():
"""auxiliary_client.NOUS_EXTRA_BODY must match the canonical helper output."""
from agent.auxiliary_client import NOUS_EXTRA_BODY

View File

@@ -414,6 +414,12 @@ class TestNousProfile:
body = p.build_extra_body()
assert body["tags"] == nous_portal_tags()
def test_tags_include_conversation_when_session_id(self):
    """The Nous profile's extra body carries the conversation tag for a session."""
    from agent.portal_tags import conversation_tag

    profile = get_provider_profile("nous")
    extra_body = profile.build_extra_body(session_id="sess-99")
    assert conversation_tag("sess-99") in extra_body["tags"]
def test_auth_type(self):
p = get_provider_profile("nous")
assert p.auth_type == "oauth_device_code"