mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 09:17:09 +08:00
Compare commits
1013 Commits
modal-inte
...
feat/paral
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0d96f1991c | ||
|
|
172a38c344 | ||
|
|
8bc0d4f77d | ||
|
|
8eabdefa8a | ||
|
|
f658af45c2 | ||
|
|
5212644861 | ||
|
|
1151f84351 | ||
|
|
9abd6bf342 | ||
|
|
d2c7ef6b41 | ||
|
|
a34102049b | ||
|
|
ef5d811aba | ||
|
|
2d44ed1c5b | ||
|
|
fa2e72ae9c | ||
|
|
5bfc4ed53b | ||
|
|
520aec20e0 | ||
|
|
64bec1d060 | ||
|
|
ac58309dbd | ||
|
|
a5a5d82a21 | ||
|
|
34e8d088c2 | ||
|
|
c754135965 | ||
|
|
c6b75baad0 | ||
|
|
a7ad6f6d28 | ||
|
|
1a2141d04d | ||
|
|
ff3f3169b2 | ||
|
|
f4580b6010 | ||
|
|
7b63a787b3 | ||
|
|
069570d103 | ||
|
|
0dafdcab86 | ||
|
|
654e16187e | ||
|
|
732c66b0f3 | ||
|
|
1f0944de21 | ||
|
|
f1a1b58319 | ||
|
|
c21d77ca08 | ||
|
|
d6c710706f | ||
|
|
a6d3becd6a | ||
|
|
3b67606c42 | ||
|
|
a2d0d07109 | ||
|
|
aedb773f0d | ||
|
|
aaf8f2d2d2 | ||
|
|
12f4800631 | ||
|
|
57b48a81ca | ||
|
|
7af33accf1 | ||
|
|
3214c05e82 | ||
|
|
4608a7fe4e | ||
|
|
af67ea8800 | ||
|
|
37c3dcf551 | ||
|
|
6a49fbb7da | ||
|
|
eb0b01de7b | ||
|
|
5b1528519c | ||
|
|
52f92eb689 | ||
|
|
7f9dd60c15 | ||
|
|
77da3bbc95 | ||
|
|
bb489a3903 | ||
|
|
167eb824cb | ||
|
|
efb64aee5a | ||
|
|
3045e29232 | ||
|
|
5d7d76025a | ||
|
|
e6c829384e | ||
|
|
5c658a416c | ||
|
|
a130aa8165 | ||
|
|
35d57ed752 | ||
|
|
5785bd3272 | ||
|
|
cf9482984e | ||
|
|
67275641f8 | ||
|
|
3ffaac00dd | ||
|
|
816a3ef6f1 | ||
|
|
a8bf414f4a | ||
|
|
3b312d45c5 | ||
|
|
fcd899f888 | ||
|
|
315f3ea429 | ||
|
|
b7d6eae64c | ||
|
|
b3765c28d0 | ||
|
|
4cfb66bac2 | ||
|
|
0c4cff352a | ||
|
|
503269b85a | ||
|
|
161436cfdd | ||
|
|
24f549a692 | ||
|
|
7a8778ac73 | ||
|
|
763c6d104d | ||
|
|
4d7d9d9715 | ||
|
|
a9c35f9175 | ||
|
|
37752ff1ac | ||
|
|
31b84213e4 | ||
|
|
2036c22f88 | ||
|
|
7185a66b96 | ||
|
|
2394e18729 | ||
|
|
99f7582175 | ||
|
|
93c5997290 | ||
|
|
2d1a1c1c47 | ||
|
|
71e81728ac | ||
|
|
ebe60646db | ||
|
|
f996d7950b | ||
|
|
ae4a674c84 | ||
|
|
169615abc8 | ||
|
|
7c30ac2141 | ||
|
|
192501528f | ||
|
|
5ae0b731d0 | ||
|
|
d9f373654b | ||
|
|
0efbb137e8 | ||
|
|
cf63b2471f | ||
|
|
f88343a6da | ||
|
|
491605cfea | ||
|
|
3aded1d4e5 | ||
|
|
4f0402ed3a | ||
|
|
ecac6321c4 | ||
|
|
20c6573e0a | ||
|
|
97b1c76b14 | ||
|
|
24a37032fa | ||
|
|
c0520223fd | ||
|
|
1f1caa836a | ||
|
|
b3ea7714f5 | ||
|
|
a7f9721785 | ||
|
|
a5461e07bf | ||
|
|
2e73a9e893 | ||
|
|
26bb56b775 | ||
|
|
95b1130485 | ||
|
|
3fb8938cd3 | ||
|
|
c5e8166c8b | ||
|
|
2b88568653 | ||
|
|
34b4fe495e | ||
|
|
4fdd6c0dac | ||
|
|
60b6abefd9 | ||
|
|
4d53b7ccaa | ||
|
|
081079da62 | ||
|
|
333e4abe30 | ||
|
|
cd77c7100c | ||
|
|
cf810c2950 | ||
|
|
a23bcb81ce | ||
|
|
d07d867718 | ||
|
|
666f2dd486 | ||
|
|
34792dd907 | ||
|
|
7ad6fc8a40 | ||
|
|
f824c10429 | ||
|
|
132e5ec179 | ||
|
|
66d3e6a0c2 | ||
|
|
4a09ae2985 | ||
|
|
8c734f2f27 | ||
|
|
245d174359 | ||
|
|
77f47768dd | ||
|
|
90fa9e54ca | ||
|
|
9d3a44e0e8 | ||
|
|
932d596466 | ||
|
|
d518f40e8b | ||
|
|
f016cfca46 | ||
|
|
b8120df860 | ||
|
|
0df7df52f3 | ||
|
|
bfa27d0a68 | ||
|
|
5a20c486e3 | ||
|
|
78e19ebc95 | ||
|
|
b383cafc44 | ||
|
|
b10ff83566 | ||
|
|
daa1f542f9 | ||
|
|
d507f593d0 | ||
|
|
f210510276 | ||
|
|
19b6f81ee7 | ||
|
|
76545ab365 | ||
|
|
b8c3bc7841 | ||
|
|
a680367568 | ||
|
|
dfd37a4b31 | ||
|
|
5ee9b67d9b | ||
|
|
542faf225f | ||
|
|
5684c68121 | ||
|
|
4be783446a | ||
|
|
8d719b180a | ||
|
|
bf048c8aec | ||
|
|
c5a9d1ef9d | ||
|
|
c7b6f423c7 | ||
|
|
6d34207167 | ||
|
|
fcde9be10d | ||
|
|
3830bbda41 | ||
|
|
4447e7d71a | ||
|
|
7bccd904c7 | ||
|
|
313d522b61 | ||
|
|
9ee4fe41fe | ||
|
|
39ee3512cb | ||
|
|
42673556af | ||
|
|
faab73ad58 | ||
|
|
7e36468511 | ||
|
|
9ba5d399e5 | ||
|
|
306d92a9d7 | ||
|
|
5baae0df88 | ||
|
|
24f6a193e7 | ||
|
|
8c0f8baf32 | ||
|
|
d80c30cc92 | ||
|
|
e64d646bad | ||
|
|
b84f9e410c | ||
|
|
ee5daba061 | ||
|
|
23e84de830 | ||
|
|
48e0dc8791 | ||
|
|
fb0f579b16 | ||
|
|
5a711f32b1 | ||
|
|
4d34427cc7 | ||
|
|
41877183bc | ||
|
|
451a007fb1 | ||
|
|
0a82396718 | ||
|
|
5da55ea1e3 | ||
|
|
064c009deb | ||
|
|
caab1cf453 | ||
|
|
55c70f3508 | ||
|
|
d29249b8fa | ||
|
|
f668e9fc75 | ||
|
|
74fe1e2254 | ||
|
|
348936752a | ||
|
|
69a36a3361 | ||
|
|
8712dd6d1c | ||
|
|
55a21fe37b | ||
|
|
f55f625277 | ||
|
|
9dac85b069 | ||
|
|
99bd69baa8 | ||
|
|
a62a137a4f | ||
|
|
82b18e8ac2 | ||
|
|
0111c9848d | ||
|
|
ab9cadfeee | ||
|
|
8bf28e1441 | ||
|
|
ce28f847ce | ||
|
|
5609117882 | ||
|
|
b4fbb6fe10 | ||
|
|
82d7e9429e | ||
|
|
e2821effb5 | ||
|
|
9742f11fda | ||
|
|
388dd4789c | ||
|
|
fdebca4573 | ||
|
|
479dfc096a | ||
|
|
3c6c11b7c9 | ||
|
|
bc091eb7ef | ||
|
|
f75b1d21b4 | ||
|
|
94053d75a6 | ||
|
|
2a68099675 | ||
|
|
6cd3bc6640 | ||
|
|
211b55815e | ||
|
|
8ae4a6f824 | ||
|
|
b98301677a | ||
|
|
f2fdde5ba4 | ||
|
|
4f56e31dc7 | ||
|
|
6d3804770c | ||
|
|
ab0f4126cf | ||
|
|
585f8528b2 | ||
|
|
75f523f5c0 | ||
|
|
68fbae5692 | ||
|
|
80f1dd8d37 | ||
|
|
b52b37ae64 | ||
|
|
d63b363cde | ||
|
|
c05c60665e | ||
|
|
b4873a5de7 | ||
|
|
913f8ce0a5 | ||
|
|
4a63737227 | ||
|
|
3e93db16bd | ||
|
|
f863a42351 | ||
|
|
dc55f493be | ||
|
|
936fda3f9e | ||
|
|
ecb8148a9f | ||
|
|
2dbbedc05a | ||
|
|
c30967806c | ||
|
|
145f719d30 | ||
|
|
b89eb29174 | ||
|
|
3670089a42 | ||
|
|
3982fcf095 | ||
|
|
8481fdcf08 | ||
|
|
39299e2de4 | ||
|
|
efec4fcaab | ||
|
|
5ce2c47d60 | ||
|
|
f6f3d1de9b | ||
|
|
ec0fe3242a | ||
|
|
f2e24faaca | ||
|
|
8c80b96318 | ||
|
|
2387465dcc | ||
|
|
32636ecf8a | ||
|
|
6055adbe1b | ||
|
|
ffd2f8dc50 | ||
|
|
e93b4d1dcd | ||
|
|
014a5b712d | ||
|
|
2317d115cd | ||
|
|
8253b54be9 | ||
|
|
5c867fd79f | ||
|
|
a44e041acf | ||
|
|
e9f05b3524 | ||
|
|
e2a834578d | ||
|
|
ffc752a79e | ||
|
|
399562a7d1 | ||
|
|
fec8a0da72 | ||
|
|
9f4542b3db | ||
|
|
363633e2ba | ||
|
|
a41ba57a7a | ||
|
|
884c8ea70a | ||
|
|
c886333d32 | ||
|
|
55b173dd03 | ||
|
|
9079a27814 | ||
|
|
d7d10b14cd | ||
|
|
a6499b6107 | ||
|
|
74a36b0729 | ||
|
|
efc7a7b957 | ||
|
|
4f1464b3af | ||
|
|
3a41079fac | ||
|
|
5279540bb4 | ||
|
|
577da79a47 | ||
|
|
1faa9648d3 | ||
|
|
ad57bf1e4b | ||
|
|
d5efb82c7c | ||
|
|
36214d14db | ||
|
|
ea2f7ef2f6 | ||
|
|
435530018b | ||
|
|
df61054a84 | ||
|
|
690b8bb563 | ||
|
|
c43451a50b | ||
|
|
1e312c6582 | ||
|
|
e36c8cd49a | ||
|
|
16cb6d1a6e | ||
|
|
21d61bdd71 | ||
|
|
ad9c26afb8 | ||
|
|
83f99d8203 | ||
|
|
6b37d38dee | ||
|
|
938499ddfb | ||
|
|
d92266d7c0 | ||
|
|
a352b5c193 | ||
|
|
82f7483999 | ||
|
|
56dc9277d7 | ||
|
|
d50e9bcef7 | ||
|
|
c4e520fd6e | ||
|
|
30ff395924 | ||
|
|
f55025952d | ||
|
|
1bc45ee8fe | ||
|
|
19016497ef | ||
|
|
d578d06f59 | ||
|
|
e25ad79d5d | ||
|
|
f2624a1426 | ||
|
|
15561ec425 | ||
|
|
93d93fdea4 | ||
|
|
87f4e4cb9b | ||
|
|
82cb1752d9 | ||
|
|
ada3713e77 | ||
|
|
7d79ce92ac | ||
|
|
1708dcd2b2 | ||
|
|
5702eba93b | ||
|
|
a1767fd69c | ||
|
|
b4b426c69d | ||
|
|
2465674fda | ||
|
|
2eca0d4af1 | ||
|
|
11a7c6b112 | ||
|
|
50ea8adf46 | ||
|
|
ca33372595 | ||
|
|
7d47e3b776 | ||
|
|
fe15a2c65c | ||
|
|
d400fb8b23 | ||
|
|
3221818b6e | ||
|
|
2af2f148ab | ||
|
|
d19109742e | ||
|
|
078e2e4b19 | ||
|
|
9aa2999388 | ||
|
|
d0d9897e81 | ||
|
|
9306a1e06a | ||
|
|
141b12bd39 | ||
|
|
ae3deff8d4 | ||
|
|
41adca4e77 | ||
|
|
8e901b31c1 | ||
|
|
11a5a64729 | ||
|
|
0dba3027c1 | ||
|
|
405c7e08be | ||
|
|
cb36930f1d | ||
|
|
90e6fa2612 | ||
|
|
fd22ae5fcb | ||
|
|
e1baab90f7 | ||
|
|
4fcfa329ba | ||
|
|
b336980229 | ||
|
|
7128f95621 | ||
|
|
ffc6d767ec | ||
|
|
44a2d0c01f | ||
|
|
3e2ed18ad0 | ||
|
|
db58cfb13d | ||
|
|
3220bb8aaa | ||
|
|
ff3a479156 | ||
|
|
6f4941616d | ||
|
|
bd3025d669 | ||
|
|
4c72329412 | ||
|
|
8311e8984b | ||
|
|
093acd72dd | ||
|
|
e9ab711b66 | ||
|
|
b2a9f6beaa | ||
|
|
d3504f84af | ||
|
|
34badeb19c | ||
|
|
f93b48226c | ||
|
|
4805be0119 | ||
|
|
a3ca71fe26 | ||
|
|
70a0a5ff4a | ||
|
|
021f62cb0c | ||
|
|
ba214e43c8 | ||
|
|
520a26c48f | ||
|
|
a787a0d60b | ||
|
|
8d2d8cc728 | ||
|
|
4ae61b0886 | ||
|
|
79871c2083 | ||
|
|
7796ac1411 | ||
|
|
c45aeb45b1 | ||
|
|
ee7fde6531 | ||
|
|
0ea6c34325 | ||
|
|
3db3d60368 | ||
|
|
bfd08d5648 | ||
|
|
7f9777a0b0 | ||
|
|
87a16ad2e5 | ||
|
|
f90a627f9a | ||
|
|
152e0800e6 | ||
|
|
d8f10fa515 | ||
|
|
e86f391cac | ||
|
|
e39de2e752 | ||
|
|
1538be45de | ||
|
|
95e3f4b001 | ||
|
|
b7821b6dc1 | ||
|
|
556a132f2d | ||
|
|
fafb9c23bf | ||
|
|
1754bdf1e8 | ||
|
|
fa3d7b3d03 | ||
|
|
73f2998d48 | ||
|
|
6a51fd23df | ||
|
|
ffec21236d | ||
|
|
db0521ce0e | ||
|
|
a1c25046a9 | ||
|
|
de0af4df66 | ||
|
|
0e1723ef74 | ||
|
|
aefc330b8f | ||
|
|
f967471758 | ||
|
|
4f5ffb8909 | ||
|
|
54909b0282 | ||
|
|
f084538cb9 | ||
|
|
535b46f813 | ||
|
|
4766b3cdb9 | ||
|
|
354af6ccee | ||
|
|
c9afbbac0b | ||
|
|
83fa442c1b | ||
|
|
1900e5238b | ||
|
|
ddae1aa2e9 | ||
|
|
16274d5a82 | ||
|
|
5749f5809c | ||
|
|
d10108f8ca | ||
|
|
8b520f9848 | ||
|
|
4cc431afab | ||
|
|
a718aed1be | ||
|
|
5f29e7b63c | ||
|
|
245c766512 | ||
|
|
f08ad94d4d | ||
|
|
cdf5375b9a | ||
|
|
bdf4758510 | ||
|
|
84e45b5c40 | ||
|
|
daedec6957 | ||
|
|
de59d91add | ||
|
|
68cc81a74d | ||
|
|
3ead3401e0 | ||
|
|
eec31b0089 | ||
|
|
7df14227a9 | ||
|
|
60effcfc44 | ||
|
|
63f5e14c69 | ||
|
|
64ff8f065b | ||
|
|
468b7fdbad | ||
|
|
14b0ad95c6 | ||
|
|
221e4228ec | ||
|
|
dd9d3f89b9 | ||
|
|
b0cce17da6 | ||
|
|
c6b3b8c847 | ||
|
|
2ba87a10b0 | ||
|
|
5fa3e24b76 | ||
|
|
ac6d747fa6 | ||
|
|
ee541c84f1 | ||
|
|
6053236158 | ||
|
|
11615014a4 | ||
|
|
3588396263 | ||
|
|
11a2ecb936 | ||
|
|
151e8d896c | ||
|
|
593c549bc4 | ||
|
|
aa2ecaef29 | ||
|
|
0eb0bec74c | ||
|
|
3c252ae44b | ||
|
|
6789084ec0 | ||
|
|
b603b6e1c9 | ||
|
|
3c13feed4c | ||
|
|
7652afb8de | ||
|
|
7862e7010c | ||
|
|
4faf2a6cf4 | ||
|
|
8c48bb080f | ||
|
|
6d2481ee5c | ||
|
|
ca5525bcd7 | ||
|
|
56b53bff6e | ||
|
|
fd335a4e26 | ||
|
|
c4ea996612 | ||
|
|
39bfd226b8 | ||
|
|
234b67f5fd | ||
|
|
e27e3a4f8a | ||
|
|
7a11ff95a9 | ||
|
|
33ab5cec82 | ||
|
|
1cb2311bad | ||
|
|
25c65bc99e | ||
|
|
afb680b50d | ||
|
|
c574a4d086 | ||
|
|
bd8b20b933 | ||
|
|
866fd9476b | ||
|
|
d2ec5aaacf | ||
|
|
e265006fd6 | ||
|
|
b1bf11b0fe | ||
|
|
6bf3aad62e | ||
|
|
3a840a130c | ||
|
|
14396e3fe7 | ||
|
|
1ad930cbd0 | ||
|
|
7a0b37712f | ||
|
|
e2b8740fcf | ||
|
|
45d132d098 | ||
|
|
719f2eef32 | ||
|
|
698b35933e | ||
|
|
0512ada793 | ||
|
|
47289ba6f1 | ||
|
|
5e5e0efc60 | ||
|
|
7b38afc179 | ||
|
|
e5893075f9 | ||
|
|
5e598a588f | ||
|
|
c2d8d17285 | ||
|
|
8bc2de4ab6 | ||
|
|
75a92a3f82 | ||
|
|
72963e9ccb | ||
|
|
92da8e7e62 | ||
|
|
c84d5ce738 | ||
|
|
dda9f3e734 | ||
|
|
834e25a662 | ||
|
|
196a13f3dc | ||
|
|
440d33eec4 | ||
|
|
11f5c1ecf0 | ||
|
|
3b745633e4 | ||
|
|
900d48714a | ||
|
|
3fdf03390e | ||
|
|
25fb9aafcb | ||
|
|
54147474d3 | ||
|
|
4d6f380bd1 | ||
|
|
93f5fd80b8 | ||
|
|
177be32b7f | ||
|
|
30efc263ff | ||
|
|
ed0e860abb | ||
|
|
41d8a80226 | ||
|
|
4ec386cc72 | ||
|
|
dd69f16c3e | ||
|
|
1db5598294 | ||
|
|
23d0b7af6a | ||
|
|
a7c2b9e280 | ||
|
|
70dfec9638 | ||
|
|
95b0610f36 | ||
|
|
500f0eab4a | ||
|
|
86b1db0598 | ||
|
|
5a79e423fe | ||
|
|
7f7643cf63 | ||
|
|
bf52468a91 | ||
|
|
b4688f10d4 | ||
|
|
31a5cd185a | ||
|
|
7166647ca1 | ||
|
|
f7300a858e | ||
|
|
e87859e82c | ||
|
|
de101a8202 | ||
|
|
7f1f4c2248 | ||
|
|
c33f8d381b | ||
|
|
3f58e47c63 | ||
|
|
b7f8a17c24 | ||
|
|
6cbb8f3a0c | ||
|
|
ec97f9ad1a | ||
|
|
10085041cf | ||
|
|
7b23dbfe68 | ||
|
|
8e0c48e6d2 | ||
|
|
b759602483 | ||
|
|
2205b22409 | ||
|
|
1ddf8c26f5 | ||
|
|
9769e07cd5 | ||
|
|
08250a53a1 | ||
|
|
ff6d62802d | ||
|
|
46506769f1 | ||
|
|
4ea29978fc | ||
|
|
dfd50ceccd | ||
|
|
6366177118 | ||
|
|
2390728cc3 | ||
|
|
b32c642af3 | ||
|
|
c36b256de5 | ||
|
|
0afe1b707d | ||
|
|
f213620c8b | ||
|
|
35655298e6 | ||
|
|
1e463a8e39 | ||
|
|
de5a88bd97 | ||
|
|
0862fa96fd | ||
|
|
924570c5be | ||
|
|
4d8689c10c | ||
|
|
1d7ce5e063 | ||
|
|
72d3425eef | ||
|
|
b7f099beed | ||
|
|
912ef50165 | ||
|
|
4a9086b848 | ||
|
|
50cb4d5fc7 | ||
|
|
2bc9508b7c | ||
|
|
337cd574c8 | ||
|
|
9fb027915e | ||
|
|
2b821c3a14 | ||
|
|
0d113fab1a | ||
|
|
19f28a633a | ||
|
|
2c817ce4a5 | ||
|
|
66a5bc64db | ||
|
|
7f423508e4 | ||
|
|
306c6706a6 | ||
|
|
64be67e062 | ||
|
|
0c0a2eb0a2 | ||
|
|
de0829cec3 | ||
|
|
20177660bb | ||
|
|
609fc6d080 | ||
|
|
518826e70c | ||
|
|
13992a58da | ||
|
|
0d2ac1c07f | ||
|
|
fb7df099e0 | ||
|
|
f14ff3e041 | ||
|
|
07fcb94bc0 | ||
|
|
66d9983d46 | ||
|
|
4f3cb98e5e | ||
|
|
8c1f5efcab | ||
|
|
c92bdd8785 | ||
|
|
e09ef6b8bc | ||
|
|
f7677ed275 | ||
|
|
e5f719a33b | ||
|
|
79bd65034c | ||
|
|
fbb1923fad | ||
|
|
bf75c450b7 | ||
|
|
b2172c4b2e | ||
|
|
69ccd76679 | ||
|
|
8b54bb4d89 | ||
|
|
2595d81733 | ||
|
|
f9e05218ca | ||
|
|
2ddda5da89 | ||
|
|
dc80f0b222 | ||
|
|
5007a122b2 | ||
|
|
43f2321225 | ||
|
|
1362f92f2e | ||
|
|
445d2646a9 | ||
|
|
ae8d25faca | ||
|
|
9061c03b6d | ||
|
|
8174f5a988 | ||
|
|
03f7b551be | ||
|
|
80ad6572a3 | ||
|
|
c77f3da0ce | ||
|
|
c104647450 | ||
|
|
547ba73b82 | ||
|
|
3526fa27fd | ||
|
|
9eabdb64ff | ||
|
|
6f543eac9f | ||
|
|
64eca85876 | ||
|
|
152271851f | ||
|
|
0909be3aa8 | ||
|
|
274e623b50 | ||
|
|
2972f982e4 | ||
|
|
df8a62d018 | ||
|
|
fec5d59fb3 | ||
|
|
7285e44064 | ||
|
|
2ff54ae6b3 | ||
|
|
f74ac0fc3a | ||
|
|
26a6da27fa | ||
|
|
19abbfff96 | ||
|
|
8aa531c7fa | ||
|
|
21cf339a85 | ||
|
|
588cdacd49 | ||
|
|
0cce536fb2 | ||
|
|
b281ecd50a | ||
|
|
b267e34092 | ||
|
|
58fce0a37b | ||
|
|
f0458ebdb8 | ||
|
|
0a231c0783 | ||
|
|
7c1f90045e | ||
|
|
a5ea272936 | ||
|
|
715825eac3 | ||
|
|
1a97e82000 | ||
|
|
70d1abf81b | ||
|
|
1fd0fcddb2 | ||
|
|
ab4bbf2fb2 | ||
|
|
669e4d0297 | ||
|
|
f92875bc3e | ||
|
|
7f36259f88 | ||
|
|
2c28d9f560 | ||
|
|
c21b071e77 | ||
|
|
de197bd7cb | ||
|
|
bf9dd83c10 | ||
|
|
760fb2ca0e | ||
|
|
a8ccaca8ea | ||
|
|
32070e6bc0 | ||
|
|
f02f647237 | ||
|
|
96043a8f7e | ||
|
|
0bb8d8faf5 | ||
|
|
f5c09a3aba | ||
|
|
3227cc65d1 | ||
|
|
90ca2ae16b | ||
|
|
25e260bb3a | ||
|
|
feea8332d6 | ||
|
|
ffbdd7fcce | ||
|
|
b699cf8c48 | ||
|
|
2efd9bbac4 | ||
|
|
0ac3af8776 | ||
|
|
fed9f06c4e | ||
|
|
240f33a06f | ||
|
|
254aafb265 | ||
|
|
8bd82119be | ||
|
|
9a148bb9a3 | ||
|
|
7a4241e406 | ||
|
|
cb92fbe749 | ||
|
|
1d04074464 | ||
|
|
c4096b4731 | ||
|
|
178658bf9f | ||
|
|
d372eb1f0e | ||
|
|
ebe25fefd6 | ||
|
|
688ccf05cb | ||
|
|
9dc5615b9d | ||
|
|
696e2316a8 | ||
|
|
f2891b70d0 | ||
|
|
dcf370cb6e | ||
|
|
1b8eb85eeb | ||
|
|
cf3236ed27 | ||
|
|
6c86c7c4a9 | ||
|
|
9cc2cf3241 | ||
|
|
9eb4a4a481 | ||
|
|
8463b7ea59 | ||
|
|
faa185e37c | ||
|
|
53b3177ca5 | ||
|
|
76badfed63 | ||
|
|
3c1e31de3e | ||
|
|
d2c932d3ac | ||
|
|
5a569eb1b6 | ||
|
|
e5bd25c73f | ||
|
|
eb88474dd8 | ||
|
|
9fc0ca0a72 | ||
|
|
95b6bd5df6 | ||
|
|
f1311ad3de | ||
|
|
0310170869 | ||
|
|
b6d7e222c1 | ||
|
|
e71d9a89d2 | ||
|
|
74c662b63a | ||
|
|
91bdb9eb2d | ||
|
|
47f16505d2 | ||
|
|
e63986b534 | ||
|
|
cbde8548f4 | ||
|
|
7a3656aea2 | ||
|
|
3ba8b15f13 | ||
|
|
7727a792f2 | ||
|
|
ce175d7372 | ||
|
|
609b19b630 | ||
|
|
e3cb957a10 | ||
|
|
55a0178490 | ||
|
|
9a858b8d67 | ||
|
|
cbff32585d | ||
|
|
8fc28c34ce | ||
|
|
d72b9eadec | ||
|
|
3c5bf5b9d8 | ||
|
|
5a07e26405 | ||
|
|
cd66546e24 | ||
|
|
21a59a4a7c | ||
|
|
b5dbf8e43d | ||
|
|
54e50b8a6e | ||
|
|
b35dbb0420 | ||
|
|
63f6afd75b | ||
|
|
3e311a0092 | ||
|
|
69d3d3c15a | ||
|
|
33bc1a3b58 | ||
|
|
740dd928f7 | ||
|
|
757d012ab5 | ||
|
|
f64a87209d | ||
|
|
41df8ee4f5 | ||
|
|
6877d5f3b5 | ||
|
|
6d74d424d3 | ||
|
|
9ec4f7504b | ||
|
|
9166d56f17 | ||
|
|
80b90dd0d9 | ||
|
|
91907789af | ||
|
|
6845852e82 | ||
|
|
99af12af3f | ||
|
|
fd76ff60ac | ||
|
|
cc6bea8b90 | ||
|
|
c1d9e9a285 | ||
|
|
681141a526 | ||
|
|
c100541f07 | ||
|
|
d64f62c2ef | ||
|
|
e049441d93 | ||
|
|
a30b2f34eb | ||
|
|
2bf96ad244 | ||
|
|
a183827128 | ||
|
|
54dd1b3038 | ||
|
|
75d251b81a | ||
|
|
7a6d4666a2 | ||
|
|
d802db4de0 | ||
|
|
b103bb4c8b | ||
|
|
a9d16c40c7 | ||
|
|
98e3a26b2a | ||
|
|
f209a92b7e | ||
|
|
0edfc7fa49 | ||
|
|
cefe038a87 | ||
|
|
0858ee2f27 | ||
|
|
4d1f2ea522 | ||
|
|
6447a6020c | ||
|
|
b3bf21db56 | ||
|
|
674a6f96d3 | ||
|
|
79f8831738 | ||
|
|
224c900532 | ||
|
|
4f9f5f70e3 | ||
|
|
38db6e9366 | ||
|
|
d18c753b3c | ||
|
|
8fedbf87d9 | ||
|
|
d8a369e194 | ||
|
|
90af34bc83 | ||
|
|
c7857dc1d4 | ||
|
|
08e4dc2563 | ||
|
|
92447141d9 | ||
|
|
e0ed44388f | ||
|
|
16d0aa7b4d | ||
|
|
6037b6a5ab | ||
|
|
e1604b2b4a | ||
|
|
db23f51bc6 | ||
|
|
3c6750f37b | ||
|
|
df2ec585f1 | ||
|
|
250b2ca01a | ||
|
|
c2d5f7bf26 | ||
|
|
e223b4ac09 | ||
|
|
f072801f38 | ||
|
|
ededaaa874 | ||
|
|
b1f55e3ee5 | ||
|
|
51b95236f9 | ||
|
|
9123cfb5dd | ||
|
|
9018e9dd70 | ||
|
|
08ff1c1aa8 | ||
|
|
6134939882 | ||
|
|
7cb6427dea | ||
|
|
79b62497d1 | ||
|
|
0729ef7353 | ||
|
|
8f6788474b | ||
|
|
5c2926102b | ||
|
|
bff37075f6 | ||
|
|
c98ee98525 | ||
|
|
ecb430effe | ||
|
|
7ee7221af1 | ||
|
|
748fd3db88 | ||
|
|
cbff1b818c | ||
|
|
a885d2f240 | ||
|
|
b6247b71b5 | ||
|
|
3555c6173d | ||
|
|
3976962621 | ||
|
|
a54a27595b | ||
|
|
7283b9f6cf | ||
|
|
3dfc0a9679 | ||
|
|
6903c4605c | ||
|
|
5b3f708fcb | ||
|
|
b33ed9176f | ||
|
|
c48817f69b | ||
|
|
70dd3a16dc | ||
|
|
9a19fe1f50 | ||
|
|
3961f8e7a4 | ||
|
|
fc37b17b1f | ||
|
|
630bd3d789 | ||
|
|
5c4c0c0cba | ||
|
|
24c241d29b | ||
|
|
a3d760ff12 | ||
|
|
77a3dda59d | ||
|
|
f6daceb449 | ||
|
|
cfef34f7a6 | ||
|
|
c007b9e5bd | ||
|
|
b9f3518b33 | ||
|
|
ba07d9d5e3 | ||
|
|
90e5211128 | ||
|
|
c0d412a736 | ||
|
|
f9eb5edb96 | ||
|
|
ba8b80a163 | ||
|
|
3b90fa5c9b | ||
|
|
273b367f05 | ||
|
|
783acd712d | ||
|
|
748f0b2b5f | ||
|
|
9350e26e68 | ||
|
|
997f793af1 | ||
|
|
4d5f29c74c | ||
|
|
d070b8698d | ||
|
|
057d3e1810 | ||
|
|
d49af633f0 | ||
|
|
3191a9ba11 | ||
|
|
53e13fe1f1 | ||
|
|
59cb0cecb2 | ||
|
|
1c6846c4c2 | ||
|
|
b88e441a07 | ||
|
|
4f57d7116d | ||
|
|
422607df7c | ||
|
|
3f4b494c61 | ||
|
|
109dffb242 | ||
|
|
0e8ee051c6 | ||
|
|
5c545e67f3 | ||
|
|
2daf5e4296 | ||
|
|
d0c8dd78c2 | ||
|
|
21c3e9973a | ||
|
|
8e4d013154 | ||
|
|
37fb01b17d | ||
|
|
ac0a70b369 | ||
|
|
a4bc6f73d7 | ||
|
|
56ee8a5cc6 | ||
|
|
440c244cac | ||
|
|
655303f2f1 | ||
|
|
14e59706b7 | ||
|
|
d59e93d5e9 | ||
|
|
9e85408c7b | ||
|
|
225ae32e7a | ||
|
|
50ef18644b | ||
|
|
41608beb35 | ||
|
|
d9a8e421a4 | ||
|
|
d7cef744ec | ||
|
|
54cbf30c14 | ||
|
|
dfa3c6265c | ||
|
|
a7f52911e1 | ||
|
|
1e31614572 | ||
|
|
3b615b0f7a | ||
|
|
e184f5ab3a | ||
|
|
d0f82e6dcc | ||
|
|
49e1f9ea89 | ||
|
|
6731230d73 | ||
|
|
ec59d71e60 | ||
|
|
bdac541d1e | ||
|
|
061fa70907 | ||
|
|
48b5cfd085 | ||
|
|
a7609c97be | ||
|
|
c33feb6dc9 | ||
|
|
2c7deb41f6 | ||
|
|
8117d0adab | ||
|
|
01a3a6ab0d | ||
|
|
45a8098d3a | ||
|
|
60812ae041 | ||
|
|
635bec06cb | ||
|
|
0f58dfdea4 | ||
|
|
dd5fe334f3 | ||
|
|
e0c9d495ef | ||
|
|
2f34e6fd30 | ||
|
|
69aa35a51c | ||
|
|
5404a8fcd8 | ||
|
|
eb49936a60 | ||
|
|
ff9ea6c4b1 | ||
|
|
586b0a7047 | ||
|
|
84718d183a | ||
|
|
3099a2f53c | ||
|
|
ed010752dd | ||
|
|
f5be6177b2 | ||
|
|
89c6f24d48 | ||
|
|
f23856df8e | ||
|
|
1b7bc299f3 | ||
|
|
a291cc99cf | ||
|
|
389ac5e017 | ||
|
|
fc792a4be9 | ||
|
|
07501bef14 | ||
|
|
137ce05324 | ||
|
|
ada0b4f131 | ||
|
|
abe925e212 | ||
|
|
8fb44608bf | ||
|
|
153cd5bb44 | ||
|
|
669545f551 | ||
|
|
cfe2f3fe15 | ||
|
|
140d609e0c | ||
|
|
a32ad1a656 | ||
|
|
62ba69a29d | ||
|
|
9b0f2a16ca | ||
|
|
85e629e915 | ||
|
|
999a28062d | ||
|
|
ba3fea24f1 | ||
|
|
6b4a8d0b17 | ||
|
|
5ec75e38b9 | ||
|
|
ad042fdd68 | ||
|
|
35ad3146a8 | ||
|
|
e8343f2d87 | ||
|
|
1b1307d0d1 | ||
|
|
7a11be9f3f | ||
|
|
192ce958c3 | ||
|
|
c441681dc2 | ||
|
|
dd70d57b9b | ||
|
|
f12ea1bc02 | ||
|
|
fa76a331b0 | ||
|
|
d999d9876d | ||
|
|
578a5fb6a9 | ||
|
|
a8809bbd3e | ||
|
|
a478e44585 | ||
|
|
c0494b3558 | ||
|
|
7f1cd014f2 | ||
|
|
07b615e96e | ||
|
|
ab387a6120 | ||
|
|
ac79725923 | ||
|
|
8dd38318fc | ||
|
|
533c064269 | ||
|
|
5c3105b437 | ||
|
|
3c0d0dba49 | ||
|
|
12bbca95ec | ||
|
|
f6574978de | ||
|
|
8380895ae3 | ||
|
|
f018999da9 | ||
|
|
51a6b7d2b5 | ||
|
|
9bfe185a2e | ||
|
|
beeb7896e0 | ||
|
|
212460289b | ||
|
|
221fb17c5e | ||
|
|
488deb04a4 | ||
|
|
9d9eea9ac9 | ||
|
|
e7f0ffbf5d | ||
|
|
a09b018bd5 | ||
|
|
7eac4ee9fe | ||
|
|
17a5efb416 | ||
|
|
3e634aa7e4 | ||
|
|
5d3398aa8a | ||
|
|
76d929e177 | ||
|
|
be91af7551 | ||
|
|
c9011fc7e1 | ||
|
|
ff776b57bf | ||
|
|
3ee788dacc | ||
|
|
fef504f038 | ||
|
|
bbb5776763 | ||
|
|
e87bee9ccd | ||
|
|
69a338610a | ||
|
|
aa6394e94f | ||
|
|
ef409c6a24 | ||
|
|
da4167560f | ||
|
|
3488576bd8 | ||
|
|
619c72e566 | ||
|
|
a3ba41fce2 | ||
|
|
c935a604f8 | ||
|
|
e114f09f70 | ||
|
|
9b4d9452ba |
201
.cursorrules
201
.cursorrules
@@ -1,201 +0,0 @@
|
|||||||
Hermes-Agent is an agent harness for LLMs with an interactive CLI.
|
|
||||||
|
|
||||||
## Development Environment
|
|
||||||
|
|
||||||
**IMPORTANT**: Always use the virtual environment if it exists:
|
|
||||||
```bash
|
|
||||||
source venv/bin/activate # Before running any Python commands
|
|
||||||
```
|
|
||||||
|
|
||||||
## Project Structure
|
|
||||||
|
|
||||||
- `hermes` - CLI launcher script (run with `./hermes`)
|
|
||||||
- `cli.py` - Interactive CLI with Rich UI, prompt_toolkit, animated spinners
|
|
||||||
- `cli-config.yaml` - CLI configuration (model, terminal, toolsets, personalities)
|
|
||||||
- `tools/` - Individual tool implementations (web, terminal, browser, vision, etc.)
|
|
||||||
- `tools/__init__.py` - Exports all tools for importing
|
|
||||||
- `model_tools.py` - Consolidates tool schemas and handlers for the agent
|
|
||||||
- `toolsets.py` - Groups tools into logical toolsets (web, terminal, browser, etc.)
|
|
||||||
- `toolset_distributions.py` - Probability-based tool selection for data generation
|
|
||||||
- `run_agent.py` - Primary agent runner with AIAgent class and KawaiiSpinner
|
|
||||||
- `batch_runner.py` - Parallel batch processing with checkpointing
|
|
||||||
- `tests/` - Test scripts
|
|
||||||
|
|
||||||
## File Dependency Chain
|
|
||||||
|
|
||||||
```
|
|
||||||
tools/*.py → tools/__init__.py → model_tools.py → toolsets.py → toolset_distributions.py
|
|
||||||
↑
|
|
||||||
run_agent.py ──────────────────────────┘
|
|
||||||
cli.py → run_agent.py (uses AIAgent with quiet_mode=True)
|
|
||||||
batch_runner.py → run_agent.py + toolset_distributions.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Always ensure consistency between tools, model_tools.py, and toolsets.py when changing any of them.
|
|
||||||
|
|
||||||
## CLI Architecture (cli.py)
|
|
||||||
|
|
||||||
The interactive CLI uses:
|
|
||||||
- **Rich** - For the welcome banner and styled panels
|
|
||||||
- **prompt_toolkit** - For fixed input area with history and `patch_stdout`
|
|
||||||
- **KawaiiSpinner** (in run_agent.py) - Animated feedback during API calls and tool execution
|
|
||||||
|
|
||||||
Key components:
|
|
||||||
- `HermesCLI` class - Main CLI controller with commands and conversation loop
|
|
||||||
- `load_cli_config()` - Loads `cli-config.yaml`, sets environment variables for terminal
|
|
||||||
- `build_welcome_banner()` - Displays ASCII art logo, tools, and skills summary
|
|
||||||
- `/commands` - Process user commands like `/help`, `/clear`, `/personality`, etc.
|
|
||||||
|
|
||||||
CLI uses `quiet_mode=True` when creating AIAgent to suppress verbose logging and enable kawaii-style feedback instead.
|
|
||||||
|
|
||||||
### Adding CLI Commands
|
|
||||||
|
|
||||||
1. Add to `COMMANDS` dict with description
|
|
||||||
2. Add handler in `process_command()` method
|
|
||||||
3. For persistent settings, use `save_config_value()` to update `cli-config.yaml`
|
|
||||||
|
|
||||||
## Adding a New Tool
|
|
||||||
|
|
||||||
Follow this strict order to maintain consistency:
|
|
||||||
|
|
||||||
1. Create `tools/your_tool.py` with:
|
|
||||||
- Handler function (sync or async) returning a JSON string via `json.dumps()`
|
|
||||||
- `check_*_requirements()` function to verify dependencies (e.g., API keys)
|
|
||||||
- Schema definition following OpenAI function-calling format
|
|
||||||
|
|
||||||
2. Export in `tools/__init__.py`:
|
|
||||||
- Import the handler and check function
|
|
||||||
- Add to `__all__` list
|
|
||||||
|
|
||||||
3. Register in `model_tools.py`:
|
|
||||||
- Create `get_*_tool_definitions()` function or add to existing
|
|
||||||
- Add routing in `handle_function_call()` dispatcher
|
|
||||||
- Update `get_all_tool_names()` with the tool name
|
|
||||||
- Update `get_toolset_for_tool()` mapping
|
|
||||||
- Update `get_available_toolsets()` and `check_toolset_requirements()`
|
|
||||||
|
|
||||||
4. Add to toolset in `toolsets.py`:
|
|
||||||
- Add to existing toolset or create new one in TOOLSETS dict
|
|
||||||
|
|
||||||
5. Optionally add to `toolset_distributions.py` for batch processing
|
|
||||||
|
|
||||||
## Tool Implementation Pattern
|
|
||||||
|
|
||||||
```python
|
|
||||||
# tools/example_tool.py
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
def check_example_requirements() -> bool:
|
|
||||||
"""Check if required API keys/dependencies are available."""
|
|
||||||
return bool(os.getenv("EXAMPLE_API_KEY"))
|
|
||||||
|
|
||||||
def example_tool(param: str, task_id: str = None) -> str:
|
|
||||||
"""Execute the tool and return JSON string result."""
|
|
||||||
try:
|
|
||||||
result = {"success": True, "data": "..."}
|
|
||||||
return json.dumps(result, ensure_ascii=False)
|
|
||||||
except Exception as e:
|
|
||||||
return json.dumps({"error": str(e)}, ensure_ascii=False)
|
|
||||||
```
|
|
||||||
|
|
||||||
All tool handlers MUST return a JSON string. Never return raw dicts.
|
|
||||||
|
|
||||||
## Stateful Tools
|
|
||||||
|
|
||||||
Tools that maintain state (terminal, browser) require:
|
|
||||||
- `task_id` parameter for session isolation between concurrent tasks
|
|
||||||
- `cleanup_*()` function to release resources
|
|
||||||
- Cleanup is called automatically in run_agent.py after conversation completes
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
API keys are loaded from `.env` file in repo root:
|
|
||||||
- `OPENROUTER_API_KEY` - Main LLM API access (primary provider)
|
|
||||||
- `FIRECRAWL_API_KEY` - Web search/extract tools
|
|
||||||
- `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` - Browser automation
|
|
||||||
- `FAL_KEY` - Image generation (FLUX model)
|
|
||||||
- `NOUS_API_KEY` - Vision and Mixture-of-Agents tools
|
|
||||||
|
|
||||||
Terminal tool configuration (can also be set in `cli-config.yaml`):
|
|
||||||
- `TERMINAL_ENV` - Backend: local, docker, singularity, modal, or ssh
|
|
||||||
- `TERMINAL_CWD` - Working directory
|
|
||||||
- `TERMINAL_SSH_HOST`, `TERMINAL_SSH_USER`, `TERMINAL_SSH_KEY` - For SSH backend
|
|
||||||
|
|
||||||
## Agent Loop (run_agent.py)
|
|
||||||
|
|
||||||
The AIAgent class handles:
|
|
||||||
- Processing enabled toolsets to provide to the model
|
|
||||||
- Piping prompts to the agent
|
|
||||||
- Looping LLM calls when tools are invoked, until natural language response
|
|
||||||
- Returning the final response
|
|
||||||
|
|
||||||
Uses OpenAI-compatible API (primarily OpenRouter) with the OpenAI Python SDK.
|
|
||||||
|
|
||||||
## Reasoning Model Support
|
|
||||||
|
|
||||||
For models that support chain-of-thought reasoning:
|
|
||||||
- Extract `reasoning_content` from API responses
|
|
||||||
- Store in `assistant_msg["reasoning"]` for trajectory export
|
|
||||||
- Pass back via `reasoning_content` field on subsequent turns
|
|
||||||
|
|
||||||
## Trajectory Format
|
|
||||||
|
|
||||||
Conversations are saved in ShareGPT format for training:
|
|
||||||
```json
|
|
||||||
{"from": "system", "value": "System prompt with <tools>...</tools>"}
|
|
||||||
{"from": "human", "value": "User message"}
|
|
||||||
{"from": "gpt", "value": "<think>reasoning</think>\n<tool_call>{...}</tool_call>"}
|
|
||||||
{"from": "tool", "value": "<tool_response>{...}</tool_response>"}
|
|
||||||
{"from": "gpt", "value": "Final response"}
|
|
||||||
```
|
|
||||||
|
|
||||||
Tool calls use `<tool_call>` XML tags, responses use `<tool_response>` tags, reasoning uses `<think>` tags.
|
|
||||||
|
|
||||||
## Batch Processing (batch_runner.py)
|
|
||||||
|
|
||||||
For processing multiple prompts:
|
|
||||||
- Parallel execution with multiprocessing
|
|
||||||
- Content-based resume for fault tolerance (matches on prompt text, not indices)
|
|
||||||
- Toolset distributions control probabilistic tool availability per prompt
|
|
||||||
- Output: `data/<run_name>/trajectories.jsonl` (combined) + individual batch files
|
|
||||||
|
|
||||||
## Logging
|
|
||||||
|
|
||||||
Trajectories restructure tools as a system prompt for storage in a format suitable for later training use.
|
|
||||||
|
|
||||||
## Skills System
|
|
||||||
|
|
||||||
Skills are on-demand knowledge documents the agent can load. Located in `skills/` directory:
|
|
||||||
|
|
||||||
```
|
|
||||||
skills/
|
|
||||||
├── mlops/ # Category folder
|
|
||||||
│ ├── axolotl/ # Skill folder
|
|
||||||
│ │ ├── SKILL.md # Main instructions (required)
|
|
||||||
│ │ ├── references/ # Additional docs, API specs
|
|
||||||
│ │ └── templates/ # Output formats, configs
|
|
||||||
│ └── vllm/
|
|
||||||
│ └── SKILL.md
|
|
||||||
└── example-skill/
|
|
||||||
└── SKILL.md
|
|
||||||
```
|
|
||||||
|
|
||||||
**Progressive disclosure** (token-efficient):
|
|
||||||
1. `skills_categories()` - List category names (~50 tokens)
|
|
||||||
2. `skills_list(category)` - Name + description per skill (~3k tokens)
|
|
||||||
3. `skill_view(name)` - Full content + tags + linked files
|
|
||||||
|
|
||||||
SKILL.md files use YAML frontmatter:
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: skill-name
|
|
||||||
description: Brief description for listing
|
|
||||||
tags: [tag1, tag2]
|
|
||||||
related_skills: [other-skill]
|
|
||||||
version: 1.0.0
|
|
||||||
---
|
|
||||||
# Skill Content...
|
|
||||||
```
|
|
||||||
|
|
||||||
Tool files: `tools/skills_tool.py` → `model_tools.py` → `toolsets.py`
|
|
||||||
157
.env.example
157
.env.example
@@ -10,8 +10,40 @@
|
|||||||
OPENROUTER_API_KEY=
|
OPENROUTER_API_KEY=
|
||||||
|
|
||||||
# Default model to use (OpenRouter format: provider/model)
|
# Default model to use (OpenRouter format: provider/model)
|
||||||
# Examples: anthropic/claude-sonnet-4, openai/gpt-4o, google/gemini-2.0-flash, zhipuai/glm-4-plus
|
# Examples: anthropic/claude-opus-4.6, openai/gpt-4o, google/gemini-3-flash-preview, zhipuai/glm-4-plus
|
||||||
LLM_MODEL=anthropic/claude-sonnet-4
|
LLM_MODEL=anthropic/claude-opus-4.6
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# LLM PROVIDER (z.ai / GLM)
|
||||||
|
# =============================================================================
|
||||||
|
# z.ai provides access to ZhipuAI GLM models (GLM-4-Plus, etc.)
|
||||||
|
# Get your key at: https://z.ai or https://open.bigmodel.cn
|
||||||
|
GLM_API_KEY=
|
||||||
|
# GLM_BASE_URL=https://api.z.ai/api/paas/v4 # Override default base URL
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# LLM PROVIDER (Kimi / Moonshot)
|
||||||
|
# =============================================================================
|
||||||
|
# Kimi Code provides access to Moonshot AI coding models (kimi-k2.5, etc.)
|
||||||
|
# Get your key at: https://platform.kimi.ai (Kimi Code console)
|
||||||
|
# Keys prefixed sk-kimi- use the Kimi Code API (api.kimi.com) by default.
|
||||||
|
# Legacy keys from platform.moonshot.ai need KIMI_BASE_URL override below.
|
||||||
|
KIMI_API_KEY=
|
||||||
|
# KIMI_BASE_URL=https://api.kimi.com/coding/v1 # Default for sk-kimi- keys
|
||||||
|
# KIMI_BASE_URL=https://api.moonshot.ai/v1 # For legacy Moonshot keys
|
||||||
|
# KIMI_BASE_URL=https://api.moonshot.cn/v1 # For Moonshot China keys
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# LLM PROVIDER (MiniMax)
|
||||||
|
# =============================================================================
|
||||||
|
# MiniMax provides access to MiniMax models (global endpoint)
|
||||||
|
# Get your key at: https://www.minimax.io
|
||||||
|
MINIMAX_API_KEY=
|
||||||
|
# MINIMAX_BASE_URL=https://api.minimax.io/v1 # Override default base URL
|
||||||
|
|
||||||
|
# MiniMax China endpoint (for users in mainland China)
|
||||||
|
MINIMAX_CN_API_KEY=
|
||||||
|
# MINIMAX_CN_BASE_URL=https://api.minimaxi.com/v1 # Override default base URL
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# TOOL API KEYS
|
# TOOL API KEYS
|
||||||
@@ -21,32 +53,40 @@ LLM_MODEL=anthropic/claude-sonnet-4
|
|||||||
# Get at: https://firecrawl.dev/
|
# Get at: https://firecrawl.dev/
|
||||||
FIRECRAWL_API_KEY=
|
FIRECRAWL_API_KEY=
|
||||||
|
|
||||||
# Nous Research API Key - Vision analysis and multi-model reasoning
|
|
||||||
# Get at: https://inference-api.nousresearch.com/
|
|
||||||
NOUS_API_KEY=
|
|
||||||
|
|
||||||
# FAL.ai API Key - Image generation
|
# FAL.ai API Key - Image generation
|
||||||
# Get at: https://fal.ai/
|
# Get at: https://fal.ai/
|
||||||
FAL_KEY=
|
FAL_KEY=
|
||||||
|
|
||||||
|
# Honcho - Cross-session AI-native user modeling (optional)
|
||||||
|
# Builds a persistent understanding of the user across sessions and tools.
|
||||||
|
# Get at: https://app.honcho.dev
|
||||||
|
# Also requires ~/.honcho/config.json with enabled=true (see README).
|
||||||
|
HONCHO_API_KEY=
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# TERMINAL TOOL CONFIGURATION (mini-swe-agent backend)
|
# TERMINAL TOOL CONFIGURATION (mini-swe-agent backend)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Backend type: "local", "singularity", "docker", "modal", or "ssh"
|
# Backend type: "local", "singularity", "docker", "modal", or "ssh"
|
||||||
# - local: Runs directly on your machine (fastest, no isolation)
|
# Terminal backend is configured in ~/.hermes/config.yaml (terminal.backend).
|
||||||
# - ssh: Runs on remote server via SSH (great for sandboxing - agent can't touch its own code)
|
# Use 'hermes setup' or 'hermes config set terminal.backend docker' to change.
|
||||||
# - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed)
|
# Supported: local, docker, singularity, modal, ssh
|
||||||
# - docker: Runs in Docker containers (isolated, requires Docker + docker group)
|
#
|
||||||
# - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account)
|
# Only override here if you need to force a backend without touching config.yaml:
|
||||||
TERMINAL_ENV=local
|
# TERMINAL_ENV=local
|
||||||
|
|
||||||
# Container images (for singularity/docker/modal backends)
|
# Container images (for singularity/docker/modal backends)
|
||||||
TERMINAL_DOCKER_IMAGE=python:3.11
|
# TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
||||||
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
|
# TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
|
||||||
TERMINAL_MODAL_IMAGE=python:3.11
|
TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
||||||
|
|
||||||
# Working directory inside the container
|
|
||||||
TERMINAL_CWD=/tmp
|
# Working directory for terminal commands
|
||||||
|
# For local backend: "." means current directory (resolved automatically)
|
||||||
|
# For remote backends (ssh/docker/modal/singularity): use an absolute path
|
||||||
|
# INSIDE the target environment, or leave unset for the backend's default
|
||||||
|
# (/root for modal, / for docker, ~ for ssh). Do NOT use a host-local path.
|
||||||
|
# Usually managed by config.yaml (terminal.cwd) — uncomment to override
|
||||||
|
# TERMINAL_CWD=.
|
||||||
|
|
||||||
# Default command timeout in seconds
|
# Default command timeout in seconds
|
||||||
TERMINAL_TIMEOUT=60
|
TERMINAL_TIMEOUT=60
|
||||||
@@ -136,16 +176,43 @@ BROWSER_INACTIVITY_TIMEOUT=120
|
|||||||
# Contains full conversation history in trajectory format for debugging/replay
|
# Contains full conversation history in trajectory format for debugging/replay
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# LEGACY/OPTIONAL API KEYS
|
# VOICE TRANSCRIPTION & OPENAI TTS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
# Required for voice message transcription (Whisper) and OpenAI TTS voices.
|
||||||
|
# Uses OpenAI's API directly (not via OpenRouter).
|
||||||
|
# Named VOICE_TOOLS_OPENAI_KEY to avoid interference with OpenRouter.
|
||||||
|
# Get at: https://platform.openai.com/api-keys
|
||||||
|
VOICE_TOOLS_OPENAI_KEY=
|
||||||
|
|
||||||
# Morph API Key - For legacy Hecate terminal backend (terminal-hecate tool)
|
# =============================================================================
|
||||||
# Get at: https://morph.so/
|
# SLACK INTEGRATION
|
||||||
MORPH_API_KEY=
|
# =============================================================================
|
||||||
|
# Slack Bot Token - From Slack App settings (OAuth & Permissions)
|
||||||
|
# Get at: https://api.slack.com/apps
|
||||||
|
# SLACK_BOT_TOKEN=xoxb-...
|
||||||
|
|
||||||
# Hecate VM Settings (only if using terminal-hecate tool)
|
# Slack App Token - For Socket Mode (App-Level Tokens in Slack App settings)
|
||||||
HECATE_VM_LIFETIME_SECONDS=300
|
# SLACK_APP_TOKEN=xapp-...
|
||||||
HECATE_DEFAULT_SNAPSHOT_ID=snapshot_p5294qxt
|
|
||||||
|
# Slack allowed users (comma-separated Slack user IDs)
|
||||||
|
# SLACK_ALLOWED_USERS=
|
||||||
|
|
||||||
|
# WhatsApp (built-in Baileys bridge — run `hermes whatsapp` to pair)
|
||||||
|
# WHATSAPP_ENABLED=false
|
||||||
|
# WHATSAPP_ALLOWED_USERS=15551234567
|
||||||
|
|
||||||
|
# Gateway-wide: allow ALL users without an allowlist (default: false = deny)
|
||||||
|
# Only set to true if you intentionally want open access.
|
||||||
|
# GATEWAY_ALLOW_ALL_USERS=false
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# RESPONSE PACING
|
||||||
|
# =============================================================================
|
||||||
|
# Human-like delays between message chunks on messaging platforms.
|
||||||
|
# Makes the bot feel less robotic.
|
||||||
|
# HERMES_HUMAN_DELAY_MODE=off # off | natural | custom
|
||||||
|
# HERMES_HUMAN_DELAY_MIN_MS=800 # Min delay in ms (custom mode)
|
||||||
|
# HERMES_HUMAN_DELAY_MAX_MS=2500 # Max delay in ms (custom mode)
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# DEBUG OPTIONS
|
# DEBUG OPTIONS
|
||||||
@@ -154,3 +221,45 @@ WEB_TOOLS_DEBUG=false
|
|||||||
VISION_TOOLS_DEBUG=false
|
VISION_TOOLS_DEBUG=false
|
||||||
MOA_TOOLS_DEBUG=false
|
MOA_TOOLS_DEBUG=false
|
||||||
IMAGE_TOOLS_DEBUG=false
|
IMAGE_TOOLS_DEBUG=false
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# CONTEXT COMPRESSION (Auto-shrinks long conversations)
|
||||||
|
# =============================================================================
|
||||||
|
# When a conversation approaches the model's context limit, middle turns are
|
||||||
|
# automatically summarized to free up space.
|
||||||
|
#
|
||||||
|
# Context compression is configured in ~/.hermes/config.yaml under compression:
|
||||||
|
# CONTEXT_COMPRESSION_ENABLED=true # Enable auto-compression (default: true)
|
||||||
|
# CONTEXT_COMPRESSION_THRESHOLD=0.85 # Compress at 85% of context limit
|
||||||
|
# Model is set via compression.summary_model in config.yaml (default: google/gemini-3-flash-preview)
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# RL TRAINING (Tinker + Atropos)
|
||||||
|
# =============================================================================
|
||||||
|
# Run reinforcement learning training on language models using the Tinker API.
|
||||||
|
# Requires the rl-server to be running (from tinker-atropos package).
|
||||||
|
|
||||||
|
# Tinker API Key - RL training service
|
||||||
|
# Get at: https://tinker-console.thinkingmachines.ai/keys
|
||||||
|
TINKER_API_KEY=
|
||||||
|
|
||||||
|
# Weights & Biases API Key - Experiment tracking and metrics
|
||||||
|
# Get at: https://wandb.ai/authorize
|
||||||
|
WANDB_API_KEY=
|
||||||
|
|
||||||
|
# RL API Server URL (default: http://localhost:8080)
|
||||||
|
# Change if running the rl-server on a different host/port
|
||||||
|
# RL_API_URL=http://localhost:8080
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# SKILLS HUB (GitHub integration for skill search/install/publish)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# GitHub Personal Access Token — for higher API rate limits on skill search/install
|
||||||
|
# Get at: https://github.com/settings/tokens (Fine-grained recommended)
|
||||||
|
# GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
||||||
|
|
||||||
|
# GitHub App credentials (optional — for bot identity on PRs)
|
||||||
|
# GITHUB_APP_ID=
|
||||||
|
# GITHUB_APP_PRIVATE_KEY_PATH=
|
||||||
|
# GITHUB_APP_INSTALLATION_ID=
|
||||||
|
|||||||
144
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
144
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
name: "🐛 Bug Report"
|
||||||
|
description: Report a bug — something that's broken, crashes, or behaves incorrectly.
|
||||||
|
title: "[Bug]: "
|
||||||
|
labels: ["bug"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for reporting a bug! Please fill out the sections below so we can reproduce and fix it quickly.
|
||||||
|
|
||||||
|
**Before submitting**, please:
|
||||||
|
- [ ] Search [existing issues](https://github.com/NousResearch/hermes-agent/issues) to avoid duplicates
|
||||||
|
- [ ] Update to the latest version (`hermes update`) and confirm the bug still exists
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: description
|
||||||
|
attributes:
|
||||||
|
label: Bug Description
|
||||||
|
description: A clear description of what's broken. Include error messages, tracebacks, or screenshots if relevant.
|
||||||
|
placeholder: |
|
||||||
|
What happened? What did you expect to happen instead?
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: reproduction
|
||||||
|
attributes:
|
||||||
|
label: Steps to Reproduce
|
||||||
|
description: Minimal steps to trigger the bug. The more specific, the faster we can fix it.
|
||||||
|
placeholder: |
|
||||||
|
1. Run `hermes chat`
|
||||||
|
2. Send the message "..."
|
||||||
|
3. Agent calls tool X
|
||||||
|
4. Error appears: ...
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: expected
|
||||||
|
attributes:
|
||||||
|
label: Expected Behavior
|
||||||
|
description: What should have happened instead?
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: actual
|
||||||
|
attributes:
|
||||||
|
label: Actual Behavior
|
||||||
|
description: What actually happened? Include full error output if available.
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: dropdown
|
||||||
|
id: component
|
||||||
|
attributes:
|
||||||
|
label: Affected Component
|
||||||
|
description: Which part of Hermes is affected?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- CLI (interactive chat)
|
||||||
|
- Gateway (Telegram/Discord/Slack/WhatsApp)
|
||||||
|
- Setup / Installation
|
||||||
|
- Tools (terminal, file ops, web, code execution, etc.)
|
||||||
|
- Skills (skill loading, skill hub, skill guard)
|
||||||
|
- Agent Core (conversation loop, context compression, memory)
|
||||||
|
- Configuration (config.yaml, .env, hermes setup)
|
||||||
|
- Other
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: dropdown
|
||||||
|
id: platform
|
||||||
|
attributes:
|
||||||
|
label: Messaging Platform (if gateway-related)
|
||||||
|
description: Which platform adapter is affected?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- N/A (CLI only)
|
||||||
|
- Telegram
|
||||||
|
- Discord
|
||||||
|
- Slack
|
||||||
|
- WhatsApp
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: os
|
||||||
|
attributes:
|
||||||
|
label: Operating System
|
||||||
|
description: e.g. Ubuntu 24.04, macOS 15.2, Windows 11
|
||||||
|
placeholder: Ubuntu 24.04
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: python-version
|
||||||
|
attributes:
|
||||||
|
label: Python Version
|
||||||
|
description: Output of `python --version`
|
||||||
|
placeholder: "3.11.9"
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: hermes-version
|
||||||
|
attributes:
|
||||||
|
label: Hermes Version
|
||||||
|
description: Output of `hermes version`
|
||||||
|
placeholder: "2.1.0"
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: logs
|
||||||
|
attributes:
|
||||||
|
label: Relevant Logs / Traceback
|
||||||
|
description: Paste any error output, traceback, or log messages. This will be auto-formatted as code.
|
||||||
|
render: shell
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: root-cause
|
||||||
|
attributes:
|
||||||
|
label: Root Cause Analysis (optional)
|
||||||
|
description: |
|
||||||
|
If you've dug into the code and identified the root cause, share it here.
|
||||||
|
Include file paths, line numbers, and code snippets if possible. This massively speeds up fixes.
|
||||||
|
placeholder: |
|
||||||
|
The bug is in `gateway/run.py` line 949. `len(history)` counts session_meta entries
|
||||||
|
but `agent_messages` was built from filtered history...
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: proposed-fix
|
||||||
|
attributes:
|
||||||
|
label: Proposed Fix (optional)
|
||||||
|
description: If you have a fix in mind (or a PR ready), describe it here.
|
||||||
|
placeholder: |
|
||||||
|
Replace `.get()` with `.pop()` on line 289 of `gateway/platforms/base.py`
|
||||||
|
to actually clear the pending message after retrieval.
|
||||||
|
|
||||||
|
- type: checkboxes
|
||||||
|
id: pr-ready
|
||||||
|
attributes:
|
||||||
|
label: Are you willing to submit a PR for this?
|
||||||
|
options:
|
||||||
|
- label: I'd like to fix this myself and submit a PR
|
||||||
11
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
11
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
blank_issues_enabled: true
|
||||||
|
contact_links:
|
||||||
|
- name: 💬 Nous Research Discord
|
||||||
|
url: https://discord.gg/NousResearch
|
||||||
|
about: For quick questions, showcasing projects, sharing skills, and community chat.
|
||||||
|
- name: 📖 Documentation
|
||||||
|
url: https://github.com/NousResearch/hermes-agent/blob/main/README.md
|
||||||
|
about: Check the README and docs before opening an issue.
|
||||||
|
- name: 🤝 Contributing Guide
|
||||||
|
url: https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md
|
||||||
|
about: Read this before submitting a PR.
|
||||||
73
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
73
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
name: "✨ Feature Request"
|
||||||
|
description: Suggest a new feature or improvement.
|
||||||
|
title: "[Feature]: "
|
||||||
|
labels: ["enhancement"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for the suggestion! Before submitting, please consider:
|
||||||
|
|
||||||
|
- **Is this a new skill?** Most capabilities should be [skills, not tools](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#should-it-be-a-skill-or-a-tool). If it's a specialized integration (crypto, NFT, niche SaaS), it belongs on the Skills Hub, not bundled.
|
||||||
|
- **Search [existing issues](https://github.com/NousResearch/hermes-agent/issues)** — someone may have already proposed this.
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: problem
|
||||||
|
attributes:
|
||||||
|
label: Problem or Use Case
|
||||||
|
description: What problem does this solve? What are you trying to do that you can't today?
|
||||||
|
placeholder: |
|
||||||
|
I'm trying to use Hermes with [provider/platform/workflow] but currently
|
||||||
|
there's no way to...
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: solution
|
||||||
|
attributes:
|
||||||
|
label: Proposed Solution
|
||||||
|
description: How do you think this should work? Be as specific as you can — CLI flags, config options, UI behavior.
|
||||||
|
placeholder: |
|
||||||
|
Add a `--foo` flag to `hermes chat` that enables...
|
||||||
|
Or: Add a config key `bar.baz` that controls...
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: alternatives
|
||||||
|
attributes:
|
||||||
|
label: Alternatives Considered
|
||||||
|
description: What other approaches did you consider? Why is the proposed solution better?
|
||||||
|
|
||||||
|
- type: dropdown
|
||||||
|
id: type
|
||||||
|
attributes:
|
||||||
|
label: Feature Type
|
||||||
|
options:
|
||||||
|
- New tool
|
||||||
|
- New bundled skill
|
||||||
|
- CLI improvement
|
||||||
|
- Gateway / messaging improvement
|
||||||
|
- Configuration option
|
||||||
|
- Performance / reliability
|
||||||
|
- Developer experience (tests, docs, CI)
|
||||||
|
- Other
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: dropdown
|
||||||
|
id: scope
|
||||||
|
attributes:
|
||||||
|
label: Scope
|
||||||
|
description: How big is this change?
|
||||||
|
options:
|
||||||
|
- Small (single file, < 50 lines)
|
||||||
|
- Medium (few files, < 300 lines)
|
||||||
|
- Large (new module or significant refactor)
|
||||||
|
|
||||||
|
- type: checkboxes
|
||||||
|
id: pr-ready
|
||||||
|
attributes:
|
||||||
|
label: Contribution
|
||||||
|
options:
|
||||||
|
- label: I'd like to implement this myself and submit a PR
|
||||||
100
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
Normal file
100
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
name: "🔧 Setup / Installation Help"
|
||||||
|
description: Having trouble installing or configuring Hermes? Ask here.
|
||||||
|
title: "[Setup]: "
|
||||||
|
labels: ["setup"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Sorry you're having trouble! Please fill out the details below so we can help.
|
||||||
|
|
||||||
|
**Quick checks first:**
|
||||||
|
- Run `hermes doctor` and include the output below
|
||||||
|
- Try `hermes update` to get the latest version
|
||||||
|
- Check the [README troubleshooting section](https://github.com/NousResearch/hermes-agent#troubleshooting)
|
||||||
|
- For general questions, consider the [Nous Research Discord](https://discord.gg/NousResearch) for faster help
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: description
|
||||||
|
attributes:
|
||||||
|
label: What's Going Wrong?
|
||||||
|
description: Describe what you're trying to do and where it fails.
|
||||||
|
placeholder: |
|
||||||
|
I ran `hermes setup` and selected Nous Portal, but when I try to
|
||||||
|
start the gateway I get...
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: steps
|
||||||
|
attributes:
|
||||||
|
label: Steps Taken
|
||||||
|
description: What did you do? Include the exact commands you ran.
|
||||||
|
placeholder: |
|
||||||
|
1. Ran the install script: `curl -fsSL ... | bash`
|
||||||
|
2. Ran `hermes setup` and chose "Quick setup"
|
||||||
|
3. Selected OpenRouter, entered API key
|
||||||
|
4. Ran `hermes chat` and got error...
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: dropdown
|
||||||
|
id: install-method
|
||||||
|
attributes:
|
||||||
|
label: Installation Method
|
||||||
|
options:
|
||||||
|
- Install script (curl | bash)
|
||||||
|
- Manual clone + pip/uv install
|
||||||
|
- PowerShell installer (Windows)
|
||||||
|
- Docker
|
||||||
|
- Other
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: os
|
||||||
|
attributes:
|
||||||
|
label: Operating System
|
||||||
|
placeholder: Ubuntu 24.04 / macOS 15.2 / Windows 11
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: python-version
|
||||||
|
attributes:
|
||||||
|
label: Python Version
|
||||||
|
description: Output of `python --version` (or `python3 --version`)
|
||||||
|
placeholder: "3.11.9"
|
||||||
|
|
||||||
|
- type: input
|
||||||
|
id: hermes-version
|
||||||
|
attributes:
|
||||||
|
label: Hermes Version
|
||||||
|
description: Output of `hermes version` (if install got that far)
|
||||||
|
placeholder: "2.1.0"
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: doctor-output
|
||||||
|
attributes:
|
||||||
|
label: Output of `hermes doctor`
|
||||||
|
description: Run `hermes doctor` and paste the full output. This will be auto-formatted.
|
||||||
|
render: shell
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: error-output
|
||||||
|
attributes:
|
||||||
|
label: Full Error Output
|
||||||
|
description: Paste the complete error message or traceback. This will be auto-formatted.
|
||||||
|
render: shell
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: tried
|
||||||
|
attributes:
|
||||||
|
label: What I've Already Tried
|
||||||
|
description: List any fixes or workarounds you've already attempted.
|
||||||
|
placeholder: |
|
||||||
|
- Ran `hermes update`
|
||||||
|
- Tried reinstalling with `pip install -e ".[all]"`
|
||||||
|
- Checked that OPENROUTER_API_KEY is set in ~/.hermes/.env
|
||||||
75
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
75
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
## What does this PR do?
|
||||||
|
|
||||||
|
<!-- Describe the change clearly. What problem does it solve? Why is this approach the right one? -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Related Issue
|
||||||
|
|
||||||
|
<!-- Link the issue this PR addresses. If no issue exists, consider creating one first. -->
|
||||||
|
|
||||||
|
Fixes #
|
||||||
|
|
||||||
|
## Type of Change
|
||||||
|
|
||||||
|
<!-- Check the one that applies. -->
|
||||||
|
|
||||||
|
- [ ] 🐛 Bug fix (non-breaking change that fixes an issue)
|
||||||
|
- [ ] ✨ New feature (non-breaking change that adds functionality)
|
||||||
|
- [ ] 🔒 Security fix
|
||||||
|
- [ ] 📝 Documentation update
|
||||||
|
- [ ] ✅ Tests (adding or improving test coverage)
|
||||||
|
- [ ] ♻️ Refactor (no behavior change)
|
||||||
|
- [ ] 🎯 New skill (bundled or hub)
|
||||||
|
|
||||||
|
## Changes Made
|
||||||
|
|
||||||
|
<!-- List the specific changes. Include file paths for code changes. -->
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
## How to Test
|
||||||
|
|
||||||
|
<!-- Steps to verify this change works. For bugs: reproduction steps + proof that the fix works. -->
|
||||||
|
|
||||||
|
1.
|
||||||
|
2.
|
||||||
|
3.
|
||||||
|
|
||||||
|
## Checklist
|
||||||
|
|
||||||
|
<!-- Complete these before requesting review. -->
|
||||||
|
|
||||||
|
### Code
|
||||||
|
|
||||||
|
- [ ] I've read the [Contributing Guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md)
|
||||||
|
- [ ] My commit messages follow [Conventional Commits](https://www.conventionalcommits.org/) (`fix(scope):`, `feat(scope):`, etc.)
|
||||||
|
- [ ] I searched for [existing PRs](https://github.com/NousResearch/hermes-agent/pulls) to make sure this isn't a duplicate
|
||||||
|
- [ ] My PR contains **only** changes related to this fix/feature (no unrelated commits)
|
||||||
|
- [ ] I've run `pytest tests/ -q` and all tests pass
|
||||||
|
- [ ] I've added tests for my changes (required for bug fixes, strongly encouraged for features)
|
||||||
|
- [ ] I've tested on my platform: <!-- e.g. Ubuntu 24.04, macOS 15.2, Windows 11 -->
|
||||||
|
|
||||||
|
### Documentation & Housekeeping
|
||||||
|
|
||||||
|
<!-- Check all that apply. It's OK to check "N/A" if a category doesn't apply to your change. -->
|
||||||
|
|
||||||
|
- [ ] I've updated relevant documentation (README, `docs/`, docstrings) — or N/A
|
||||||
|
- [ ] I've updated `cli-config.yaml.example` if I added/changed config keys — or N/A
|
||||||
|
- [ ] I've updated `CONTRIBUTING.md` or `AGENTS.md` if I changed architecture or workflows — or N/A
|
||||||
|
- [ ] I've considered cross-platform impact (Windows, macOS) per the [compatibility guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#cross-platform-compatibility) — or N/A
|
||||||
|
- [ ] I've updated tool descriptions/schemas if I changed tool behavior — or N/A
|
||||||
|
|
||||||
|
## For New Skills
|
||||||
|
|
||||||
|
<!-- Only fill this out if you're adding a skill. Delete this section otherwise. -->
|
||||||
|
|
||||||
|
- [ ] This skill is **broadly useful** to most users (if bundled) — see [Contributing Guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#should-the-skill-be-bundled)
|
||||||
|
- [ ] SKILL.md follows the [standard format](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#skillmd-format) (frontmatter, trigger conditions, steps, pitfalls)
|
||||||
|
- [ ] No external dependencies that aren't already available (prefer stdlib, curl, existing Hermes tools)
|
||||||
|
- [ ] I've tested the skill end-to-end: `hermes --toolsets skills -q "Use the X skill to do Y"`
|
||||||
|
|
||||||
|
## Screenshots / Logs
|
||||||
|
|
||||||
|
<!-- If applicable, add screenshots or log output showing the fix/feature in action. -->
|
||||||
|
|
||||||
60
.github/workflows/deploy-site.yml
vendored
Normal file
60
.github/workflows/deploy-site.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
name: Deploy Site
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
paths:
|
||||||
|
- 'website/**'
|
||||||
|
- 'landingpage/**'
|
||||||
|
- '.github/workflows/deploy-site.yml'
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
pages: write
|
||||||
|
id-token: write
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: pages
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-and-deploy:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
environment:
|
||||||
|
name: github-pages
|
||||||
|
url: ${{ steps.deploy.outputs.page_url }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: 20
|
||||||
|
cache: npm
|
||||||
|
cache-dependency-path: website/package-lock.json
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: npm ci
|
||||||
|
working-directory: website
|
||||||
|
|
||||||
|
- name: Build Docusaurus
|
||||||
|
run: npm run build
|
||||||
|
working-directory: website
|
||||||
|
|
||||||
|
- name: Stage deployment
|
||||||
|
run: |
|
||||||
|
mkdir -p _site/docs
|
||||||
|
# Landing page at root
|
||||||
|
cp -r landingpage/* _site/
|
||||||
|
# Docusaurus at /docs/
|
||||||
|
cp -r website/build/* _site/docs/
|
||||||
|
# CNAME so GitHub Pages keeps the custom domain between deploys
|
||||||
|
echo "hermes-agent.nousresearch.com" > _site/CNAME
|
||||||
|
|
||||||
|
- name: Upload artifact
|
||||||
|
uses: actions/upload-pages-artifact@v3
|
||||||
|
with:
|
||||||
|
path: _site
|
||||||
|
|
||||||
|
- name: Deploy to GitHub Pages
|
||||||
|
id: deploy
|
||||||
|
uses: actions/deploy-pages@v4
|
||||||
42
.github/workflows/tests.yml
vendored
Normal file
42
.github/workflows/tests.yml
vendored
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
name: Tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
pull_request:
|
||||||
|
branches: [main]
|
||||||
|
|
||||||
|
# Cancel in-progress runs for the same PR/branch
|
||||||
|
concurrency:
|
||||||
|
group: tests-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 10
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install uv
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
|
||||||
|
- name: Set up Python 3.11
|
||||||
|
run: uv python install 3.11
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
uv venv .venv --python 3.11
|
||||||
|
source .venv/bin/activate
|
||||||
|
uv pip install -e ".[all,dev]"
|
||||||
|
|
||||||
|
- name: Run tests
|
||||||
|
run: |
|
||||||
|
source .venv/bin/activate
|
||||||
|
python -m pytest tests/ -q --ignore=tests/integration --tb=short -n auto
|
||||||
|
env:
|
||||||
|
# Ensure tests don't accidentally call real APIs
|
||||||
|
OPENROUTER_API_KEY: ""
|
||||||
|
OPENAI_API_KEY: ""
|
||||||
|
NOUS_API_KEY: ""
|
||||||
11
.gitignore
vendored
11
.gitignore
vendored
@@ -1,7 +1,5 @@
|
|||||||
/venv/
|
/venv/
|
||||||
/_pycache/
|
/_pycache/
|
||||||
hecate/
|
|
||||||
hecate-lib/
|
|
||||||
*.pyc*
|
*.pyc*
|
||||||
__pycache__/
|
__pycache__/
|
||||||
.venv/
|
.venv/
|
||||||
@@ -39,6 +37,15 @@ agent-browser/
|
|||||||
*.pem
|
*.pem
|
||||||
privvy*
|
privvy*
|
||||||
images/
|
images/
|
||||||
|
__pycache__/
|
||||||
|
hermes_agent.egg-info/
|
||||||
|
wandb/
|
||||||
|
testlogs
|
||||||
|
|
||||||
# CLI config (may contain sensitive SSH paths)
|
# CLI config (may contain sensitive SSH paths)
|
||||||
cli-config.yaml
|
cli-config.yaml
|
||||||
|
|
||||||
|
# Skills Hub state (lives in ~/.hermes/skills/.hub/ at runtime; ignored here just in case)
|
||||||
|
skills/.hub/
|
||||||
|
ignored/
|
||||||
|
.worktrees/
|
||||||
|
|||||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +1,6 @@
|
|||||||
[submodule "mini-swe-agent"]
|
[submodule "mini-swe-agent"]
|
||||||
path = mini-swe-agent
|
path = mini-swe-agent
|
||||||
url = https://github.com/SWE-agent/mini-swe-agent
|
url = https://github.com/SWE-agent/mini-swe-agent
|
||||||
|
[submodule "tinker-atropos"]
|
||||||
|
path = tinker-atropos
|
||||||
|
url = https://github.com/nousresearch/tinker-atropos
|
||||||
|
|||||||
242
AGENTS.md
Normal file
242
AGENTS.md
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
# Hermes Agent - Development Guide
|
||||||
|
|
||||||
|
Instructions for AI coding assistants and developers working on the hermes-agent codebase.
|
||||||
|
|
||||||
|
## Development Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate # ALWAYS activate before running Python
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
hermes-agent/
|
||||||
|
├── run_agent.py # AIAgent class — core conversation loop
|
||||||
|
├── model_tools.py # Tool orchestration, _discover_tools(), handle_function_call()
|
||||||
|
├── toolsets.py # Toolset definitions, _HERMES_CORE_TOOLS list
|
||||||
|
├── cli.py # HermesCLI class — interactive CLI orchestrator
|
||||||
|
├── hermes_state.py # SessionDB — SQLite session store (FTS5 search)
|
||||||
|
├── agent/ # Agent internals
|
||||||
|
│ ├── prompt_builder.py # System prompt assembly
|
||||||
|
│ ├── context_compressor.py # Auto context compression
|
||||||
|
│ ├── prompt_caching.py # Anthropic prompt caching
|
||||||
|
│ ├── auxiliary_client.py # Auxiliary LLM client (vision, summarization)
|
||||||
|
│ ├── model_metadata.py # Model context lengths, token estimation
|
||||||
|
│ ├── display.py # KawaiiSpinner, tool preview formatting
|
||||||
|
│ ├── skill_commands.py # Skill slash commands (shared CLI/gateway)
|
||||||
|
│ └── trajectory.py # Trajectory saving helpers
|
||||||
|
├── hermes_cli/ # CLI subcommands and setup
|
||||||
|
│ ├── main.py # Entry point — all `hermes` subcommands
|
||||||
|
│ ├── config.py # DEFAULT_CONFIG, OPTIONAL_ENV_VARS, migration
|
||||||
|
│ ├── commands.py # Slash command definitions + SlashCommandCompleter
|
||||||
|
│ ├── callbacks.py # Terminal callbacks (clarify, sudo, approval)
|
||||||
|
│ └── setup.py # Interactive setup wizard
|
||||||
|
├── tools/ # Tool implementations (one file per tool)
|
||||||
|
│ ├── registry.py # Central tool registry (schemas, handlers, dispatch)
|
||||||
|
│ ├── approval.py # Dangerous command detection
|
||||||
|
│ ├── terminal_tool.py # Terminal orchestration
|
||||||
|
│ ├── process_registry.py # Background process management
|
||||||
|
│ ├── file_tools.py # File read/write/search/patch
|
||||||
|
│ ├── web_tools.py # Firecrawl search/extract
|
||||||
|
│ ├── browser_tool.py # Browserbase browser automation
|
||||||
|
│ ├── code_execution_tool.py # execute_code sandbox
|
||||||
|
│ ├── delegate_tool.py # Subagent delegation
|
||||||
|
│ ├── mcp_tool.py # MCP client (~1050 lines)
|
||||||
|
│ └── environments/ # Terminal backends (local, docker, ssh, modal, daytona, singularity)
|
||||||
|
├── gateway/ # Messaging platform gateway
|
||||||
|
│ ├── run.py # Main loop, slash commands, message dispatch
|
||||||
|
│ ├── session.py # SessionStore — conversation persistence
|
||||||
|
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal
|
||||||
|
├── cron/ # Scheduler (jobs.py, scheduler.py)
|
||||||
|
├── environments/ # RL training environments (Atropos)
|
||||||
|
├── tests/ # Pytest suite (~2500+ tests)
|
||||||
|
└── batch_runner.py # Parallel batch processing
|
||||||
|
```
|
||||||
|
|
||||||
|
**User config:** `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys)
|
||||||
|
|
||||||
|
## File Dependency Chain
|
||||||
|
|
||||||
|
```
|
||||||
|
tools/registry.py (no deps — imported by all tool files)
|
||||||
|
↑
|
||||||
|
tools/*.py (each calls registry.register() at import time)
|
||||||
|
↑
|
||||||
|
model_tools.py (imports tools/registry + triggers tool discovery)
|
||||||
|
↑
|
||||||
|
run_agent.py, cli.py, batch_runner.py, environments/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## AIAgent Class (run_agent.py)
|
||||||
|
|
||||||
|
```python
|
||||||
|
class AIAgent:
|
||||||
|
def __init__(self,
|
||||||
|
model: str = "anthropic/claude-opus-4.6",
|
||||||
|
max_iterations: int = 90,
|
||||||
|
enabled_toolsets: list = None,
|
||||||
|
disabled_toolsets: list = None,
|
||||||
|
quiet_mode: bool = False,
|
||||||
|
save_trajectories: bool = False,
|
||||||
|
platform: str = None, # "cli", "telegram", etc.
|
||||||
|
session_id: str = None,
|
||||||
|
skip_context_files: bool = False,
|
||||||
|
skip_memory: bool = False,
|
||||||
|
# ... plus provider, api_mode, callbacks, routing params
|
||||||
|
): ...
|
||||||
|
|
||||||
|
def chat(self, message: str) -> str:
|
||||||
|
"""Simple interface — returns final response string."""
|
||||||
|
|
||||||
|
def run_conversation(self, user_message: str, system_message: str = None,
|
||||||
|
conversation_history: list = None, task_id: str = None) -> dict:
|
||||||
|
"""Full interface — returns dict with final_response + messages."""
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent Loop
|
||||||
|
|
||||||
|
The core loop is inside `run_conversation()` — entirely synchronous:
|
||||||
|
|
||||||
|
```python
|
||||||
|
while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
|
||||||
|
response = client.chat.completions.create(model=model, messages=messages, tools=tool_schemas)
|
||||||
|
if response.tool_calls:
|
||||||
|
for tool_call in response.tool_calls:
|
||||||
|
result = handle_function_call(tool_call.name, tool_call.args, task_id)
|
||||||
|
messages.append(tool_result_message(result))
|
||||||
|
api_call_count += 1
|
||||||
|
else:
|
||||||
|
return response.content
|
||||||
|
```
|
||||||
|
|
||||||
|
Messages follow OpenAI format: `{"role": "system/user/assistant/tool", ...}`. Reasoning content is stored in `assistant_msg["reasoning"]`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CLI Architecture (cli.py)
|
||||||
|
|
||||||
|
- **Rich** for banner/panels, **prompt_toolkit** for input with autocomplete
|
||||||
|
- **KawaiiSpinner** (`agent/display.py`) — animated faces during API calls, `┊` activity feed for tool results
|
||||||
|
- `load_cli_config()` in cli.py merges hardcoded defaults + user config YAML
|
||||||
|
- `process_command()` is a method on `HermesCLI` (not in commands.py)
|
||||||
|
- Skill slash commands: `agent/skill_commands.py` scans `~/.hermes/skills/`, injects as **user message** (not system prompt) to preserve prompt caching
|
||||||
|
|
||||||
|
### Adding CLI Commands
|
||||||
|
|
||||||
|
1. Add to `COMMANDS` dict in `hermes_cli/commands.py`
|
||||||
|
2. Add handler in `HermesCLI.process_command()` in `cli.py`
|
||||||
|
3. For persistent settings, use `save_config_value()` in `cli.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adding New Tools
|
||||||
|
|
||||||
|
Requires changes in **3 files**:
|
||||||
|
|
||||||
|
**1. Create `tools/your_tool.py`:**
|
||||||
|
```python
|
||||||
|
import json, os
|
||||||
|
from tools.registry import registry
|
||||||
|
|
||||||
|
def check_requirements() -> bool:
|
||||||
|
return bool(os.getenv("EXAMPLE_API_KEY"))
|
||||||
|
|
||||||
|
def example_tool(param: str, task_id: str = None) -> str:
|
||||||
|
return json.dumps({"success": True, "data": "..."})
|
||||||
|
|
||||||
|
registry.register(
|
||||||
|
name="example_tool",
|
||||||
|
toolset="example",
|
||||||
|
schema={"name": "example_tool", "description": "...", "parameters": {...}},
|
||||||
|
handler=lambda args, **kw: example_tool(param=args.get("param", ""), task_id=kw.get("task_id")),
|
||||||
|
check_fn=check_requirements,
|
||||||
|
requires_env=["EXAMPLE_API_KEY"],
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. Add import** in `model_tools.py` `_discover_tools()` list.
|
||||||
|
|
||||||
|
**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
|
||||||
|
|
||||||
|
The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.
|
||||||
|
|
||||||
|
**Agent-level tools** (todo, memory): intercepted by `run_agent.py` before `handle_function_call()`. See `todo_tool.py` for the pattern.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adding Configuration
|
||||||
|
|
||||||
|
### config.yaml options:
|
||||||
|
1. Add to `DEFAULT_CONFIG` in `hermes_cli/config.py`
|
||||||
|
2. Bump `_config_version` (currently 5) to trigger migration for existing users
|
||||||
|
|
||||||
|
### .env variables:
|
||||||
|
1. Add to `OPTIONAL_ENV_VARS` in `hermes_cli/config.py` with metadata:
|
||||||
|
```python
|
||||||
|
"NEW_API_KEY": {
|
||||||
|
"description": "What it's for",
|
||||||
|
"prompt": "Display name",
|
||||||
|
"url": "https://...",
|
||||||
|
"password": True,
|
||||||
|
"category": "tool", # provider, tool, messaging, setting
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
|
### Config loaders (three separate code paths):
|
||||||
|
|
||||||
|
| Loader | Used by | Location |
|
||||||
|
|--------|---------|----------|
|
||||||
|
| `load_cli_config()` | CLI mode | `cli.py` |
|
||||||
|
| `load_config()` | `hermes tools`, `hermes setup` | `hermes_cli/config.py` |
|
||||||
|
| Direct YAML load | Gateway | `gateway/run.py` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Important Policies
|
||||||
|
|
||||||
|
### Prompt Caching Must Not Break
|
||||||
|
|
||||||
|
Hermes-Agent ensures caching remains valid throughout a conversation. **Do NOT implement changes that would:**
|
||||||
|
- Alter past context mid-conversation
|
||||||
|
- Change toolsets mid-conversation
|
||||||
|
- Reload memories or rebuild system prompts mid-conversation
|
||||||
|
|
||||||
|
Cache-breaking forces dramatically higher costs. The ONLY time we alter context is during context compression.
|
||||||
|
|
||||||
|
### Working Directory Behavior
|
||||||
|
- **CLI**: Uses current directory (`.` → `os.getcwd()`)
|
||||||
|
- **Messaging**: Uses `MESSAGING_CWD` env var (default: home directory)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Pitfalls
|
||||||
|
|
||||||
|
### DO NOT use `simple_term_menu` for interactive menus
|
||||||
|
Rendering bugs in tmux/iTerm2 — ghosting on scroll. Use `curses` (stdlib) instead. See `hermes_cli/tools_config.py` for the pattern.
|
||||||
|
|
||||||
|
### DO NOT use `\033[K` (ANSI erase-to-EOL) in spinner/display code
|
||||||
|
Leaks as literal `?[K` text under `prompt_toolkit`'s `patch_stdout`. Use space-padding: `f"\r{line}{' ' * pad}"`.
|
||||||
|
|
||||||
|
### `_last_resolved_tool_names` is a process-global in `model_tools.py`
|
||||||
|
When subagents overwrite this global, `execute_code` calls after delegation may fail with missing tool imports. Known bug.
|
||||||
|
|
||||||
|
### Tests must not write to `~/.hermes/`
|
||||||
|
The `_isolate_hermes_home` autouse fixture in `tests/conftest.py` redirects `HERMES_HOME` to a temp dir. Never hardcode `~/.hermes/` paths in tests.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
python -m pytest tests/ -q # Full suite (~2500 tests, ~2 min)
|
||||||
|
python -m pytest tests/test_model_tools.py -q # Toolset resolution
|
||||||
|
python -m pytest tests/test_cli_init.py -q # CLI config loading
|
||||||
|
python -m pytest tests/gateway/ -q # Gateway tests
|
||||||
|
python -m pytest tests/tools/ -q # Tool-level tests
|
||||||
|
```
|
||||||
|
|
||||||
|
Always run the full suite before pushing changes.
|
||||||
522
CONTRIBUTING.md
Normal file
522
CONTRIBUTING.md
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
# Contributing to Hermes Agent
|
||||||
|
|
||||||
|
Thank you for contributing to Hermes Agent! This guide covers everything you need: setting up your dev environment, understanding the architecture, deciding what to build, and getting your PR merged.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contribution Priorities
|
||||||
|
|
||||||
|
We value contributions in this order:
|
||||||
|
|
||||||
|
1. **Bug fixes** — crashes, incorrect behavior, data loss. Always top priority.
|
||||||
|
2. **Cross-platform compatibility** — Windows, macOS, different Linux distros, different terminal emulators. We want Hermes to work everywhere.
|
||||||
|
3. **Security hardening** — shell injection, prompt injection, path traversal, privilege escalation. See [Security](#security-considerations).
|
||||||
|
4. **Performance and robustness** — retry logic, error handling, graceful degradation.
|
||||||
|
5. **New skills** — but only broadly useful ones. See [Should it be a Skill or a Tool?](#should-it-be-a-skill-or-a-tool)
|
||||||
|
6. **New tools** — rarely needed. Most capabilities should be skills. See below.
|
||||||
|
7. **Documentation** — fixes, clarifications, new examples.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Should it be a Skill or a Tool?
|
||||||
|
|
||||||
|
This is the most common question for new contributors. The answer is almost always **skill**.
|
||||||
|
|
||||||
|
### Make it a Skill when:
|
||||||
|
|
||||||
|
- The capability can be expressed as instructions + shell commands + existing tools
|
||||||
|
- It wraps an external CLI or API that the agent can call via `terminal` or `web_extract`
|
||||||
|
- It doesn't need custom Python integration or API key management baked into the agent
|
||||||
|
- Examples: arXiv search, git workflows, Docker management, PDF processing, email via CLI tools
|
||||||
|
|
||||||
|
### Make it a Tool when:
|
||||||
|
|
||||||
|
- It requires end-to-end integration with API keys, auth flows, or multi-component configuration managed by the agent harness
|
||||||
|
- It needs custom processing logic that must execute precisely every time (not "best effort" from LLM interpretation)
|
||||||
|
- It handles binary data, streaming, or real-time events that can't go through the terminal
|
||||||
|
- Examples: browser automation (Browserbase session management), TTS (audio encoding + platform delivery), vision analysis (base64 image handling)
|
||||||
|
|
||||||
|
### Should the Skill be bundled?
|
||||||
|
|
||||||
|
Bundled skills (in `skills/`) ship with every Hermes install. They should be **broadly useful to most users**:
|
||||||
|
|
||||||
|
- Document handling, web research, common dev workflows, system administration
|
||||||
|
- Used regularly by a wide range of people
|
||||||
|
|
||||||
|
If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, builtin trust).
|
||||||
|
|
||||||
|
If your skill is specialized, community-contributed, or niche, it's better suited for a **Skills Hub** — upload it to a skills registry and share it in the [Nous Research Discord](https://discord.gg/NousResearch). Users can install it with `hermes skills install`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Development Setup
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
| Requirement | Notes |
|
||||||
|
|-------------|-------|
|
||||||
|
| **Git** | With `--recurse-submodules` support |
|
||||||
|
| **Python 3.11+** | uv will install it if missing |
|
||||||
|
| **uv** | Fast Python package manager ([install](https://docs.astral.sh/uv/)) |
|
||||||
|
| **Node.js 18+** | Optional — needed for browser tools and WhatsApp bridge |
|
||||||
|
|
||||||
|
### Clone and install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone --recurse-submodules https://github.com/NousResearch/hermes-agent.git
|
||||||
|
cd hermes-agent
|
||||||
|
|
||||||
|
# Create venv with Python 3.11
|
||||||
|
uv venv venv --python 3.11
|
||||||
|
export VIRTUAL_ENV="$(pwd)/venv"
|
||||||
|
|
||||||
|
# Install with all extras (messaging, cron, CLI menus, dev tools)
|
||||||
|
uv pip install -e ".[all,dev]"
|
||||||
|
uv pip install -e "./mini-swe-agent"
|
||||||
|
uv pip install -e "./tinker-atropos"
|
||||||
|
|
||||||
|
# Optional: browser tools
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configure for development
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p ~/.hermes/{cron,sessions,logs,memories,skills}
|
||||||
|
cp cli-config.yaml.example ~/.hermes/config.yaml
|
||||||
|
touch ~/.hermes/.env
|
||||||
|
|
||||||
|
# Add at minimum an LLM provider key:
|
||||||
|
echo 'OPENROUTER_API_KEY=sk-or-v1-your-key' >> ~/.hermes/.env
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Symlink for global access
|
||||||
|
mkdir -p ~/.local/bin
|
||||||
|
ln -sf "$(pwd)/venv/bin/hermes" ~/.local/bin/hermes
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
hermes doctor
|
||||||
|
hermes chat -q "Hello"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pytest tests/ -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
hermes-agent/
|
||||||
|
├── run_agent.py # AIAgent class — core conversation loop, tool dispatch, session persistence
|
||||||
|
├── cli.py # HermesCLI class — interactive TUI, prompt_toolkit integration
|
||||||
|
├── model_tools.py # Tool orchestration (thin layer over tools/registry.py)
|
||||||
|
├── toolsets.py # Tool groupings and presets (hermes-cli, hermes-telegram, etc.)
|
||||||
|
├── hermes_state.py # SQLite session database with FTS5 full-text search, session titles
|
||||||
|
├── batch_runner.py # Parallel batch processing for trajectory generation
|
||||||
|
│
|
||||||
|
├── agent/ # Agent internals (extracted modules)
|
||||||
|
│ ├── prompt_builder.py # System prompt assembly (identity, skills, context files, memory)
|
||||||
|
│ ├── context_compressor.py # Auto-summarization when approaching context limits
|
||||||
|
│ ├── auxiliary_client.py # Resolves auxiliary OpenAI clients (summarization, vision)
|
||||||
|
│ ├── display.py # KawaiiSpinner, tool progress formatting
|
||||||
|
│ ├── model_metadata.py # Model context lengths, token estimation
|
||||||
|
│ └── trajectory.py # Trajectory saving helpers
|
||||||
|
│
|
||||||
|
├── hermes_cli/ # CLI command implementations
|
||||||
|
│ ├── main.py # Entry point, argument parsing, command dispatch
|
||||||
|
│ ├── config.py # Config management, migration, env var definitions
|
||||||
|
│ ├── setup.py # Interactive setup wizard
|
||||||
|
│ ├── auth.py # Provider resolution, OAuth, Nous Portal
|
||||||
|
│ ├── models.py # OpenRouter model selection lists
|
||||||
|
│ ├── banner.py # Welcome banner, ASCII art
|
||||||
|
│ ├── commands.py # Slash command definitions + autocomplete
|
||||||
|
│ ├── callbacks.py # Interactive callbacks (clarify, sudo, approval)
|
||||||
|
│ ├── doctor.py # Diagnostics
|
||||||
|
│ └── skills_hub.py # Skills Hub CLI + /skills slash command
|
||||||
|
│
|
||||||
|
├── tools/ # Tool implementations (self-registering)
|
||||||
|
│ ├── registry.py # Central tool registry (schemas, handlers, dispatch)
|
||||||
|
│ ├── approval.py # Dangerous command detection + per-session approval
|
||||||
|
│ ├── terminal_tool.py # Terminal orchestration (sudo, env lifecycle, backends)
|
||||||
|
│ ├── file_operations.py # read_file, write_file, search, patch, etc.
|
||||||
|
│ ├── web_tools.py # web_search, web_extract (Firecrawl + Gemini summarization)
|
||||||
|
│ ├── vision_tools.py # Image analysis via multimodal models
|
||||||
|
│ ├── delegate_tool.py # Subagent spawning and parallel task execution
|
||||||
|
│ ├── code_execution_tool.py # Sandboxed Python with RPC tool access
|
||||||
|
│ ├── session_search_tool.py # Search past conversations with FTS5 + summarization
|
||||||
|
│ ├── cronjob_tools.py # Scheduled task management
|
||||||
|
│ ├── skill_tools.py # Skill search, load, manage
|
||||||
|
│ └── environments/ # Terminal execution backends
|
||||||
|
│ ├── base.py # BaseEnvironment ABC
|
||||||
|
│ ├── local.py, docker.py, ssh.py, singularity.py, modal.py, daytona.py
|
||||||
|
│
|
||||||
|
├── gateway/ # Messaging gateway
|
||||||
|
│ ├── run.py # GatewayRunner — platform lifecycle, message routing, cron
|
||||||
|
│ ├── config.py # Platform configuration resolution
|
||||||
|
│ ├── session.py # Session store, context prompts, reset policies
|
||||||
|
│ └── platforms/ # Platform adapters
|
||||||
|
│ ├── telegram.py, discord_adapter.py, slack.py, whatsapp.py
|
||||||
|
│
|
||||||
|
├── scripts/ # Installer and bridge scripts
|
||||||
|
│ ├── install.sh # Linux/macOS installer
|
||||||
|
│ ├── install.ps1 # Windows PowerShell installer
|
||||||
|
│ └── whatsapp-bridge/ # Node.js WhatsApp bridge (Baileys)
|
||||||
|
│
|
||||||
|
├── skills/ # Bundled skills (copied to ~/.hermes/skills/ on install)
|
||||||
|
├── optional-skills/ # Official optional skills (discoverable via hub, not activated by default)
|
||||||
|
├── environments/ # RL training environments (Atropos integration)
|
||||||
|
├── tests/ # Test suite
|
||||||
|
├── website/ # Documentation site (hermes-agent.nousresearch.com)
|
||||||
|
│
|
||||||
|
├── cli-config.yaml.example # Example configuration (copied to ~/.hermes/config.yaml)
|
||||||
|
└── AGENTS.md # Development guide for AI coding assistants
|
||||||
|
```
|
||||||
|
|
||||||
|
### User configuration (stored in `~/.hermes/`)
|
||||||
|
|
||||||
|
| Path | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `~/.hermes/config.yaml` | Settings (model, terminal, toolsets, compression, etc.) |
|
||||||
|
| `~/.hermes/.env` | API keys and secrets |
|
||||||
|
| `~/.hermes/auth.json` | OAuth credentials (Nous Portal) |
|
||||||
|
| `~/.hermes/skills/` | All active skills (bundled + hub-installed + agent-created) |
|
||||||
|
| `~/.hermes/memories/` | Persistent memory (MEMORY.md, USER.md) |
|
||||||
|
| `~/.hermes/state.db` | SQLite session database |
|
||||||
|
| `~/.hermes/sessions/` | JSON session logs |
|
||||||
|
| `~/.hermes/cron/` | Scheduled job data |
|
||||||
|
| `~/.hermes/whatsapp/session/` | WhatsApp bridge credentials |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Overview
|
||||||
|
|
||||||
|
### Core Loop
|
||||||
|
|
||||||
|
```
|
||||||
|
User message → AIAgent._run_agent_loop()
|
||||||
|
├── Build system prompt (prompt_builder.py)
|
||||||
|
├── Build API kwargs (model, messages, tools, reasoning config)
|
||||||
|
├── Call LLM (OpenAI-compatible API)
|
||||||
|
├── If tool_calls in response:
|
||||||
|
│ ├── Execute each tool via registry dispatch
|
||||||
|
│ ├── Add tool results to conversation
|
||||||
|
│ └── Loop back to LLM call
|
||||||
|
├── If text response:
|
||||||
|
│ ├── Persist session to DB
|
||||||
|
│ └── Return final_response
|
||||||
|
└── Context compression if approaching token limit
|
||||||
|
```
|
||||||
|
|
||||||
|
### Key Design Patterns
|
||||||
|
|
||||||
|
- **Self-registering tools**: Each tool file calls `registry.register()` at import time. `model_tools.py` triggers discovery by importing all tool modules.
|
||||||
|
- **Toolset grouping**: Tools are grouped into toolsets (`web`, `terminal`, `file`, `browser`, etc.) that can be enabled/disabled per platform.
|
||||||
|
- **Session persistence**: All conversations are stored in SQLite (`hermes_state.py`) with full-text search and unique session titles. JSON logs go to `~/.hermes/sessions/`.
|
||||||
|
- **Ephemeral injection**: System prompts and prefill messages are injected at API call time, never persisted to the database or logs.
|
||||||
|
- **Provider abstraction**: The agent works with any OpenAI-compatible API. Provider resolution happens at init time (Nous Portal OAuth, OpenRouter API key, or custom endpoint).
|
||||||
|
- **Provider routing**: When using OpenRouter, `provider_routing` in config.yaml controls provider selection (sort by throughput/latency/price, allow/ignore specific providers, data retention policies). These are injected as `extra_body.provider` in API requests.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Code Style
|
||||||
|
|
||||||
|
- **PEP 8** with practical exceptions (we don't enforce strict line length)
|
||||||
|
- **Comments**: Only when explaining non-obvious intent, trade-offs, or API quirks. Don't narrate what the code does — `# increment counter` adds nothing
|
||||||
|
- **Error handling**: Catch specific exceptions. Log with `logger.warning()`/`logger.error()` — use `exc_info=True` for unexpected errors so stack traces appear in logs
|
||||||
|
- **Cross-platform**: Never assume Unix. See [Cross-Platform Compatibility](#cross-platform-compatibility)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adding a New Tool
|
||||||
|
|
||||||
|
Before writing a tool, ask: [should this be a skill instead?](#should-it-be-a-skill-or-a-tool)
|
||||||
|
|
||||||
|
Tools self-register with the central registry. Each tool file co-locates its schema, handler, and registration:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""my_tool — Brief description of what this tool does."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from tools.registry import registry
|
||||||
|
|
||||||
|
|
||||||
|
def my_tool(param1: str, param2: int = 10, **kwargs) -> str:
|
||||||
|
"""Handler. Returns a string result (often JSON)."""
|
||||||
|
result = do_work(param1, param2)
|
||||||
|
return json.dumps(result)
|
||||||
|
|
||||||
|
|
||||||
|
MY_TOOL_SCHEMA = {
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "my_tool",
|
||||||
|
"description": "What this tool does and when the agent should use it.",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"param1": {"type": "string", "description": "What param1 is"},
|
||||||
|
"param2": {"type": "integer", "description": "What param2 is", "default": 10},
|
||||||
|
},
|
||||||
|
"required": ["param1"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _check_requirements() -> bool:
|
||||||
|
"""Return True if this tool's dependencies are available."""
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
registry.register(
|
||||||
|
name="my_tool",
|
||||||
|
toolset="my_toolset",
|
||||||
|
schema=MY_TOOL_SCHEMA,
|
||||||
|
handler=lambda args, **kw: my_tool(**args, **kw),
|
||||||
|
check_fn=_check_requirements,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Then add the import to `model_tools.py` in the `_modules` list:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_modules = [
|
||||||
|
# ... existing modules ...
|
||||||
|
"tools.my_tool",
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
If it's a new toolset, add it to `toolsets.py` and to the relevant platform presets.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adding a Skill
|
||||||
|
|
||||||
|
Bundled skills live in `skills/` organized by category. Official optional skills use the same structure in `optional-skills/`:
|
||||||
|
|
||||||
|
```
|
||||||
|
skills/
|
||||||
|
├── research/
|
||||||
|
│ └── arxiv/
|
||||||
|
│ ├── SKILL.md # Required: main instructions
|
||||||
|
│ └── scripts/ # Optional: helper scripts
|
||||||
|
│ └── search_arxiv.py
|
||||||
|
├── productivity/
|
||||||
|
│ └── ocr-and-documents/
|
||||||
|
│ ├── SKILL.md
|
||||||
|
│ ├── scripts/
|
||||||
|
│ └── references/
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### SKILL.md format
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
name: my-skill
|
||||||
|
description: Brief description (shown in skill search results)
|
||||||
|
version: 1.0.0
|
||||||
|
author: Your Name
|
||||||
|
license: MIT
|
||||||
|
platforms: [macos, linux] # Optional — restrict to specific OS platforms
|
||||||
|
# Valid: macos, linux, windows
|
||||||
|
# Omit to load on all platforms (default)
|
||||||
|
metadata:
|
||||||
|
hermes:
|
||||||
|
tags: [Category, Subcategory, Keywords]
|
||||||
|
related_skills: [other-skill-name]
|
||||||
|
---
|
||||||
|
|
||||||
|
# Skill Title
|
||||||
|
|
||||||
|
Brief intro.
|
||||||
|
|
||||||
|
## When to Use
|
||||||
|
Trigger conditions — when should the agent load this skill?
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
Table of common commands or API calls.
|
||||||
|
|
||||||
|
## Procedure
|
||||||
|
Step-by-step instructions the agent follows.
|
||||||
|
|
||||||
|
## Pitfalls
|
||||||
|
Known failure modes and how to handle them.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
How the agent confirms it worked.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Platform-specific skills
|
||||||
|
|
||||||
|
Skills can declare which OS platforms they support via the `platforms` frontmatter field. Skills with this field are automatically hidden from the system prompt, `skills_list()`, and slash commands on incompatible platforms.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
platforms: [macos] # macOS only (e.g., iMessage, Apple Reminders)
|
||||||
|
platforms: [macos, linux] # macOS and Linux
|
||||||
|
platforms: [windows] # Windows only
|
||||||
|
```
|
||||||
|
|
||||||
|
If the field is omitted or empty, the skill loads on all platforms (backward compatible). See `skills/apple/` for examples of macOS-only skills.
|
||||||
|
|
||||||
|
### Skill guidelines
|
||||||
|
|
||||||
|
- **No external dependencies unless absolutely necessary.** Prefer stdlib Python, curl, and existing Hermes tools (`web_extract`, `terminal`, `read_file`).
|
||||||
|
- **Progressive disclosure.** Put the most common workflow first. Edge cases and advanced usage go at the bottom.
|
||||||
|
- **Include helper scripts** for XML/JSON parsing or complex logic — don't expect the LLM to write parsers inline every time.
|
||||||
|
- **Test it.** Run `hermes --toolsets skills -q "Use the X skill to do Y"` and verify the agent follows the instructions correctly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Cross-Platform Compatibility
|
||||||
|
|
||||||
|
Hermes runs on Linux, macOS, and Windows. When writing code that touches the OS:
|
||||||
|
|
||||||
|
### Critical rules
|
||||||
|
|
||||||
|
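1. **`termios` and `fcntl` are Unix-only.** Libraries built on them (such as `simple_term_menu` in the example below) can fail on Windows either at import time or when first used, so always catch both `ImportError` and `NotImplementedError`: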
|
||||||
|
```python
|
||||||
|
try:
|
||||||
|
from simple_term_menu import TerminalMenu
|
||||||
|
menu = TerminalMenu(options)
|
||||||
|
idx = menu.show()
|
||||||
|
except (ImportError, NotImplementedError):
|
||||||
|
# Fallback: numbered menu for Windows
|
||||||
|
for i, opt in enumerate(options):
|
||||||
|
print(f" {i+1}. {opt}")
|
||||||
|
idx = int(input("Choice: ")) - 1
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **File encoding.** Windows may save `.env` files in `cp1252`. Always handle encoding errors:
|
||||||
|
```python
|
||||||
|
try:
|
||||||
|
load_dotenv(env_path)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
load_dotenv(env_path, encoding="latin-1")
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Process management.** `os.setsid()`, `os.killpg()`, and signal handling differ on Windows. Use platform checks:
|
||||||
|
```python
|
||||||
|
import platform
|
||||||
|
if platform.system() != "Windows":
|
||||||
|
kwargs["preexec_fn"] = os.setsid
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Path separators.** Build paths with `pathlib.Path` (e.g. `Path(base) / name`) instead of concatenating strings with a hard-coded `/` separator.
|
||||||
|
|
||||||
|
5. **Shell commands in installers.** If you change `scripts/install.sh`, check if the equivalent change is needed in `scripts/install.ps1`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
Hermes has terminal access. Security matters.
|
||||||
|
|
||||||
|
### Existing protections
|
||||||
|
|
||||||
|
| Layer | Implementation |
|
||||||
|
|-------|---------------|
|
||||||
|
| **Sudo password piping** | Uses `shlex.quote()` to prevent shell injection |
|
||||||
|
| **Dangerous command detection** | Regex patterns in `tools/approval.py` with user approval flow |
|
||||||
|
| **Cron prompt injection** | Scanner in `tools/cronjob_tools.py` blocks instruction-override patterns |
|
||||||
|
| **Write deny list** | Protected paths (`~/.ssh/authorized_keys`, `/etc/shadow`) resolved via `os.path.realpath()` to prevent symlink bypass |
|
||||||
|
| **Skills guard** | Security scanner for hub-installed skills (`tools/skills_guard.py`) |
|
||||||
|
| **Code execution sandbox** | `execute_code` child process runs with API keys stripped from environment |
|
||||||
|
| **Container hardening** | Docker: all capabilities dropped, no privilege escalation, PID limits, size-limited tmpfs |
|
||||||
|
|
||||||
|
### When contributing security-sensitive code
|
||||||
|
|
||||||
|
- **Always use `shlex.quote()`** when interpolating user input into shell commands
|
||||||
|
- **Resolve symlinks** with `os.path.realpath()` before path-based access control checks
|
||||||
|
- **Don't log secrets.** API keys, tokens, and passwords should never appear in log output
|
||||||
|
- **Catch broad exceptions** around tool execution so a single failure doesn't crash the agent loop
|
||||||
|
- **Test on all platforms** if your change touches file paths, process management, or shell commands
|
||||||
|
|
||||||
|
If your PR affects security, note it explicitly in the description.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Pull Request Process
|
||||||
|
|
||||||
|
### Branch naming
|
||||||
|
|
||||||
|
```
|
||||||
|
fix/description # Bug fixes
|
||||||
|
feat/description # New features
|
||||||
|
docs/description # Documentation
|
||||||
|
test/description # Tests
|
||||||
|
refactor/description # Code restructuring
|
||||||
|
```
|
||||||
|
|
||||||
|
### Before submitting
|
||||||
|
|
||||||
|
1. **Run tests**: `pytest tests/ -v`
|
||||||
|
2. **Test manually**: Run `hermes` and exercise the code path you changed
|
||||||
|
3. **Check cross-platform impact**: If you touch file I/O, process management, or terminal handling, consider Windows and macOS
|
||||||
|
4. **Keep PRs focused**: One logical change per PR. Don't combine a bug fix, a refactor, and a new feature in the same PR.
|
||||||
|
|
||||||
|
### PR description
|
||||||
|
|
||||||
|
Include:
|
||||||
|
- **What** changed and **why**
|
||||||
|
- **How to test** it (reproduction steps for bugs, usage examples for features)
|
||||||
|
- **What platforms** you tested on
|
||||||
|
- Reference any related issues
|
||||||
|
|
||||||
|
### Commit messages
|
||||||
|
|
||||||
|
We use [Conventional Commits](https://www.conventionalcommits.org/):
|
||||||
|
|
||||||
|
```
|
||||||
|
<type>(<scope>): <description>
|
||||||
|
```
|
||||||
|
|
||||||
|
| Type | Use for |
|
||||||
|
|------|---------|
|
||||||
|
| `fix` | Bug fixes |
|
||||||
|
| `feat` | New features |
|
||||||
|
| `docs` | Documentation |
|
||||||
|
| `test` | Tests |
|
||||||
|
| `refactor` | Code restructuring (no behavior change) |
|
||||||
|
| `chore` | Build, CI, dependency updates |
|
||||||
|
|
||||||
|
Scopes: `cli`, `gateway`, `tools`, `skills`, `agent`, `install`, `whatsapp`, `security`, etc.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
```
|
||||||
|
fix(cli): prevent crash in save_config_value when model is a string
|
||||||
|
feat(gateway): add WhatsApp multi-user session isolation
|
||||||
|
fix(security): prevent shell injection in sudo password piping
|
||||||
|
test(tools): add unit tests for file_operations
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reporting Issues
|
||||||
|
|
||||||
|
- Use [GitHub Issues](https://github.com/NousResearch/hermes-agent/issues)
|
||||||
|
- Include: OS, Python version, Hermes version (`hermes version`), full error traceback
|
||||||
|
- Include steps to reproduce
|
||||||
|
- Check existing issues before creating duplicates
|
||||||
|
- For security vulnerabilities, please report privately instead of opening a public issue
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Community
|
||||||
|
|
||||||
|
- **Discord**: [discord.gg/NousResearch](https://discord.gg/NousResearch) — for questions, showcasing projects, and sharing skills
|
||||||
|
- **GitHub Discussions**: For design proposals and architecture discussions
|
||||||
|
- **Skills Hub**: Upload specialized skills to a registry and share them with the community
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
By contributing, you agree that your contributions will be licensed under the [MIT License](LICENSE).
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 Nous Research
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
671
README.md
671
README.md
@@ -1,612 +1,121 @@
|
|||||||
# Hermes Agent
|
<p align="center">
|
||||||
|
<img src="assets/banner.png" alt="Hermes Agent" width="100%">
|
||||||
|
</p>
|
||||||
|
|
||||||
An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools.
|
# Hermes Agent ⚕
|
||||||
|
|
||||||
## Features
|
<p align="center">
|
||||||
|
<a href="https://hermes-agent.nousresearch.com/docs/"><img src="https://img.shields.io/badge/Docs-hermes--agent.nousresearch.com-FFD700?style=for-the-badge" alt="Documentation"></a>
|
||||||
|
<a href="https://discord.gg/NousResearch"><img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord"></a>
|
||||||
|
<a href="https://github.com/NousResearch/hermes-agent/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-MIT-green?style=for-the-badge" alt="License: MIT"></a>
|
||||||
|
<a href="https://nousresearch.com"><img src="https://img.shields.io/badge/Built%20by-Nous%20Research-blueviolet?style=for-the-badge" alt="Built by Nous Research"></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
- **Interactive CLI**: Beautiful terminal interface with animated feedback, personalities, and session management
|
**The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.
|
||||||
- **Web Tools**: Search, extract content, and crawl websites
|
|
||||||
- **Terminal Tools**: Execute commands via local, Docker, Singularity, Modal, or SSH backends
|
|
||||||
- **Browser Tools**: Automate web browsers to navigate, click, type, and extract content
|
|
||||||
- **Vision Tools**: Analyze images from URLs
|
|
||||||
- **Reasoning Tools**: Advanced multi-model reasoning (Mixture of Agents)
|
|
||||||
- **Creative Tools**: Generate images from text prompts
|
|
||||||
- **Skills Tools**: On-demand knowledge documents with progressive disclosure
|
|
||||||
- **Toolsets System**: Organize tools into logical groups for different scenarios
|
|
||||||
- **Batch Processing**: Process datasets in parallel with checkpointing and statistics tracking
|
|
||||||
- **Ephemeral System Prompts**: Guide model behavior without polluting training datasets
|
|
||||||
|
|
||||||
## Quick Start (CLI)
|
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
|
||||||
|
|
||||||
```bash
|
<table>
|
||||||
# After setup (see below), just run:
|
<tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
|
||||||
./hermes
|
<tr><td><b>Lives where you do</b></td><td>Telegram, Discord, Slack, WhatsApp, Signal, and CLI — all from a single gateway process. Voice memo transcription, cross-platform conversation continuity.</td></tr>
|
||||||
|
<tr><td><b>A closed learning loop</b></td><td>Agent-curated memory with periodic nudges. Autonomous skill creation after complex tasks. Skills self-improve during use. FTS5 session search with LLM summarization for cross-session recall. <a href="https://github.com/plastic-labs/honcho">Honcho</a> dialectic user modeling. Compatible with the <a href="https://agentskills.io">agentskills.io</a> open standard.</td></tr>
|
||||||
|
<tr><td><b>Scheduled automations</b></td><td>Built-in cron scheduler with delivery to any platform. Daily reports, nightly backups, weekly audits — all in natural language, running unattended.</td></tr>
|
||||||
|
<tr><td><b>Delegates and parallelizes</b></td><td>Spawn isolated subagents for parallel workstreams. Write Python scripts that call tools via RPC, collapsing multi-step pipelines into zero-context-cost turns.</td></tr>
|
||||||
|
<tr><td><b>Runs anywhere, not just your laptop</b></td><td>Six terminal backends — local, Docker, SSH, Daytona, Singularity, and Modal. Daytona and Modal offer serverless persistence — your agent's environment hibernates when idle and wakes on demand, costing nearly nothing between sessions. Run it on a $5 VPS or a GPU cluster.</td></tr>
|
||||||
|
<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, Atropos RL environments, trajectory compression for training the next generation of tool-calling models.</td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
# Or with options:
|
|
||||||
./hermes --model "anthropic/claude-sonnet-4" --toolsets "web,terminal"
|
|
||||||
```
|
|
||||||
|
|
||||||
The CLI provides:
|
|
||||||
- Animated spinners during thinking and tool execution
|
|
||||||
- Kawaii-style feedback messages
|
|
||||||
- `/commands` for configuration, history, and session management
|
|
||||||
- Customizable personalities (`/personality kawaii`, `/personality pirate`, etc.)
|
|
||||||
- Persistent configuration via `cli-config.yaml`
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
### 1. Clone the Repository
|
|
||||||
```bash
|
|
||||||
# Clone with submodules (recommended)
|
|
||||||
git clone --recurse-submodules https://github.com/NousResearch/Hermes-Agent.git
|
|
||||||
cd Hermes-Agent
|
|
||||||
|
|
||||||
# Or if already cloned without submodules:
|
|
||||||
git submodule update --init --recursive
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Install Dependencies
|
|
||||||
```bash
|
|
||||||
# Create and activate virtual environment (recommended)
|
|
||||||
python3 -m venv venv
|
|
||||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
||||||
|
|
||||||
# Install Python packages
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# Install mini-swe-agent for terminal tools
|
|
||||||
pip install -e ./mini-swe-agent
|
|
||||||
|
|
||||||
# Install Node.js dependencies for browser tools (requires Node.js)
|
|
||||||
npm install
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Configure Environment Variables
|
|
||||||
```bash
|
|
||||||
# Copy the example environment file
|
|
||||||
cp .env.example .env
|
|
||||||
|
|
||||||
# Edit .env and add your API keys
|
|
||||||
nano .env # or use your preferred editor
|
|
||||||
```
|
|
||||||
|
|
||||||
**Required API Keys:**
|
|
||||||
- `OPENROUTER_API_KEY` - LLM access via OpenRouter (get at: https://openrouter.ai/keys)
|
|
||||||
- `FIRECRAWL_API_KEY` - Web tools (get at: https://firecrawl.dev/)
|
|
||||||
- `NOUS_API_KEY` - Vision & reasoning tools (get at: https://inference-api.nousresearch.com/)
|
|
||||||
- `FAL_KEY` - Image generation (get at: https://fal.ai/)
|
|
||||||
|
|
||||||
**Optional API Keys (for specific features):**
|
|
||||||
- `BROWSERBASE_API_KEY` - Browser automation (get at: https://browserbase.com/)
|
|
||||||
- `BROWSERBASE_PROJECT_ID` - From Browserbase dashboard
|
|
||||||
- `MORPH_API_KEY` - For legacy Hecate terminal backend (get at: https://morph.so/)
|
|
||||||
|
|
||||||
### 4. Configure Terminal Backend
|
|
||||||
|
|
||||||
The terminal tool uses **mini-swe-agent** environments. Configure in `.env` or `cli-config.yaml`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Backend: "local", "docker", "singularity", "modal", or "ssh"
|
|
||||||
TERMINAL_ENV=local # Default: runs on host machine (no isolation)
|
|
||||||
TERMINAL_ENV=ssh # Remote execution via SSH (agent code stays local)
|
|
||||||
TERMINAL_ENV=singularity # Recommended for HPC: Apptainer/Singularity containers
|
|
||||||
TERMINAL_ENV=docker # Isolated Docker containers
|
|
||||||
TERMINAL_ENV=modal # Cloud execution via Modal
|
|
||||||
|
|
||||||
# Container image (for docker/singularity/modal backends)
|
|
||||||
TERMINAL_DOCKER_IMAGE=python:3.11-slim
|
|
||||||
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim
|
|
||||||
TERMINAL_TIMEOUT=60
|
|
||||||
|
|
||||||
# SSH backend (for ssh)
|
|
||||||
TERMINAL_SSH_HOST=my-server.example.com
|
|
||||||
TERMINAL_SSH_USER=myuser
|
|
||||||
TERMINAL_SSH_KEY=~/.ssh/id_rsa # Optional, uses ssh-agent if not set
|
|
||||||
```
|
|
||||||
|
|
||||||
**Backend Requirements:**
|
|
||||||
- **local**: No extra setup (runs directly on your machine, no isolation)
|
|
||||||
- **ssh**: SSH access to remote machine (great for sandboxing - agent can't touch its own code)
|
|
||||||
- **singularity**: Requires Apptainer or Singularity installed (common on HPC clusters, no root needed)
|
|
||||||
- **docker**: Requires Docker installed and user in `docker` group
|
|
||||||
- **modal**: Requires Modal account (see setup below)
|
|
||||||
|
|
||||||
### Singularity/Apptainer Setup (Recommended for HPC)
|
|
||||||
|
|
||||||
Singularity/Apptainer provides rootless container execution, ideal for HPC clusters:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Verify Apptainer is installed
|
|
||||||
apptainer --version # or: singularity --version
|
|
||||||
|
|
||||||
# 2. Set up cache directories (important for parallel workers)
|
|
||||||
# Use /scratch if available (HPC), otherwise /tmp
|
|
||||||
export APPTAINER_CACHEDIR=/scratch/$USER/.apptainer
|
|
||||||
export APPTAINER_TMPDIR=/scratch/$USER/.apptainer/tmp
|
|
||||||
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
|
|
||||||
|
|
||||||
# 3. Pre-build SIF image (recommended for parallel batch processing)
|
|
||||||
# This avoids race conditions when multiple workers start simultaneously
|
|
||||||
apptainer build $APPTAINER_CACHEDIR/python-nodejs.sif docker://nikolaik/python-nodejs:python3.11-nodejs20
|
|
||||||
|
|
||||||
# 4. Configure .env to use the local SIF
|
|
||||||
TERMINAL_ENV=singularity
|
|
||||||
TERMINAL_SINGULARITY_IMAGE=/scratch/$USER/.apptainer/python-nodejs.sif
|
|
||||||
```
|
|
||||||
|
|
||||||
**Tip:** The batch scripts in `configs/` automatically handle SIF pre-building if `/scratch` is available.
|
|
||||||
|
|
||||||
### Modal Cloud Backend Setup
|
|
||||||
|
|
||||||
[Modal](https://modal.com) provides serverless cloud compute for running sandboxed environments at scale.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Install Modal and dependencies
|
|
||||||
pip install modal boto3
|
|
||||||
|
|
||||||
# 2. Authenticate with Modal (opens browser)
|
|
||||||
modal setup
|
|
||||||
|
|
||||||
# 3. Set terminal backend to modal in .env
|
|
||||||
TERMINAL_ENV=modal
|
|
||||||
```
|
|
||||||
|
|
||||||
Modal uses CLI-based authentication (stored in `~/.modal/`), so no API key is needed in `.env`. After running `modal setup`, commands will automatically execute in Modal's cloud sandboxes.
|
|
||||||
|
|
||||||
### Browser Tools Setup
|
|
||||||
|
|
||||||
Browser tools enable the agent to navigate websites, fill forms, click buttons, and extract content. They use [agent-browser](https://github.com/vercel-labs/agent-browser) CLI with [Browserbase](https://browserbase.com) cloud execution.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Install Node.js (if not already installed)
|
|
||||||
# Use nvm (recommended) or your package manager
|
|
||||||
|
|
||||||
# 2. Install agent-browser CLI (choose one option):
|
|
||||||
npm install -g agent-browser # Option A: Global install (recommended)
|
|
||||||
npm install # Option B: Local install (uses npx fallback)
|
|
||||||
|
|
||||||
# 3. Get Browserbase credentials
|
|
||||||
# Sign up at https://browserbase.com/ and get your:
|
|
||||||
# - API Key (from Settings → API Keys)
|
|
||||||
# - Project ID (from your project dashboard)
|
|
||||||
|
|
||||||
# 4. Add to your .env file:
|
|
||||||
BROWSERBASE_API_KEY=your_api_key_here
|
|
||||||
BROWSERBASE_PROJECT_ID=your_project_id_here
|
|
||||||
```
|
|
||||||
|
|
||||||
**Available Browser Tools:**
|
|
||||||
|
|
||||||
| Tool | Description |
|
|
||||||
|------|-------------|
|
|
||||||
| `browser_navigate` | Navigate to a URL |
|
|
||||||
| `browser_snapshot` | Get text-based page snapshot with element refs |
|
|
||||||
| `browser_click` | Click an element by ref (e.g., `@e5`) |
|
|
||||||
| `browser_type` | Type text into an input field |
|
|
||||||
| `browser_scroll` | Scroll up or down |
|
|
||||||
| `browser_back` | Go back in browser history |
|
|
||||||
| `browser_press` | Press a keyboard key (Enter, Tab, etc.) |
|
|
||||||
| `browser_close` | Close the browser session |
|
|
||||||
| `browser_get_images` | Get list of images on the page |
|
|
||||||
|
|
||||||
**Example Usage:**
|
|
||||||
```bash
|
|
||||||
# Use browser tools with web search and vision
|
|
||||||
python run_agent.py \
|
|
||||||
--query "Go to amazon.com and find the price of the latest Kindle" \
|
|
||||||
--enabled_toolsets=browser,web,vision
|
|
||||||
|
|
||||||
# Use browser-focused distribution
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file=browser_tasks.jsonl \
|
|
||||||
--distribution=browser_use \
|
|
||||||
--run_name=browser_run
|
|
||||||
```
|
|
||||||
|
|
||||||
See `.env.example` for all available configuration options including debug settings.
|
|
||||||
|
|
||||||
### Skills Tools
|
|
||||||
|
|
||||||
Skills are on-demand knowledge documents the agent can load when needed. They follow a **progressive disclosure** pattern to minimize token usage:
|
|
||||||
|
|
||||||
```
|
|
||||||
skills/
|
|
||||||
├── mlops/ # Category folder
|
|
||||||
│ ├── axolotl/ # Skill folder
|
|
||||||
│ │ ├── SKILL.md # Main instructions (required)
|
|
||||||
│ │ ├── references/ # Additional docs, API specs
|
|
||||||
│ │ └── templates/ # Output formats, configs
|
|
||||||
│ └── vllm/
|
|
||||||
│ └── SKILL.md
|
|
||||||
```
|
|
||||||
|
|
||||||
**Available Skills Tools:**
|
|
||||||
|
|
||||||
| Tool | Description |
|
|
||||||
|------|-------------|
|
|
||||||
| `skills_categories` | List available skill categories (~50 tokens) |
|
|
||||||
| `skills_list` | List skills with name + description (~3k tokens for 40 skills) |
|
|
||||||
| `skill_view` | Load full skill content, tags, and linked files |
|
|
||||||
|
|
||||||
**Example Usage:**
|
|
||||||
```bash
|
|
||||||
# Use skills tools
|
|
||||||
python run_agent.py \
|
|
||||||
--query "What skills do you have for fine-tuning? Show me the axolotl skill." \
|
|
||||||
--enabled_toolsets=skills
|
|
||||||
```
|
|
||||||
|
|
||||||
**Creating Skills:**
|
|
||||||
|
|
||||||
Skills use YAML frontmatter for metadata:
|
|
||||||
```yaml
|
|
||||||
---
|
---
|
||||||
name: my-skill
|
|
||||||
description: Brief description shown in skills_list
|
## Quick Install
|
||||||
tags: [tag1, tag2]
|
|
||||||
related_skills: [other-skill]
|
```bash
|
||||||
version: 1.0.0
|
curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
|
||||||
|
```
|
||||||
|
|
||||||
|
Works on Linux, macOS, and WSL2. The installer handles everything — Python, Node.js, dependencies, and the `hermes` command. No prerequisites except git.
|
||||||
|
|
||||||
|
> **Windows:** Native Windows is not supported. Please install [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) and run the command above.
|
||||||
|
|
||||||
|
After installation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source ~/.bashrc # reload shell (or: source ~/.zshrc)
|
||||||
|
hermes setup # configure your LLM provider
|
||||||
|
hermes # start chatting!
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
# Skill Content
|
|
||||||
|
|
||||||
Instructions, examples, and guidelines here...
|
## Getting Started
|
||||||
```
|
|
||||||
|
|
||||||
Skills can include:
|
|
||||||
- `references/` - Additional documentation, API specs, examples
|
|
||||||
- `templates/` - Output formats, config files, boilerplate code
|
|
||||||
- `scripts/` - Executable helpers (Python, shell scripts)
|
|
||||||
|
|
||||||
## Session Logging
|
|
||||||
|
|
||||||
Every conversation is automatically logged to `logs/` for debugging and inspection:
|
|
||||||
|
|
||||||
```
|
|
||||||
logs/
|
|
||||||
├── session_20260201_143052_a1b2c3.json
|
|
||||||
├── session_20260201_150217_d4e5f6.json
|
|
||||||
└── ...
|
|
||||||
```
|
|
||||||
|
|
||||||
**Log Format:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"session_id": "20260201_143052_a1b2c3",
|
|
||||||
"model": "anthropic/claude-sonnet-4",
|
|
||||||
"session_start": "2026-02-01T14:30:52.123456",
|
|
||||||
"last_updated": "2026-02-01T14:35:12.789012",
|
|
||||||
"message_count": 8,
|
|
||||||
"conversations": [
|
|
||||||
{"from": "system", "value": "..."},
|
|
||||||
{"from": "human", "value": "..."},
|
|
||||||
{"from": "gpt", "value": "..."},
|
|
||||||
{"from": "tool", "value": "..."}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- **Automatic**: Logs are created and updated automatically after each conversation turn
|
|
||||||
- **Session ID in Banner**: The CLI displays the session ID in the welcome banner
|
|
||||||
- **Trajectory Format**: Uses the same format as batch processing for consistency
|
|
||||||
- **Git Ignored**: `logs/` is in `.gitignore` so logs aren't committed
|
|
||||||
|
|
||||||
## Interactive CLI
|
|
||||||
|
|
||||||
The CLI provides a rich interactive experience for working with the agent.
|
|
||||||
|
|
||||||
### Running the CLI
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Basic usage
|
hermes # Interactive CLI — start a conversation
|
||||||
./hermes
|
hermes model # Switch provider or model
|
||||||
|
hermes setup # Re-run the setup wizard
|
||||||
# With specific model
|
hermes gateway # Start the messaging gateway (Telegram, Discord, etc.)
|
||||||
./hermes --model "anthropic/claude-sonnet-4"
|
hermes update # Update to the latest version
|
||||||
|
hermes doctor # Diagnose any issues
|
||||||
# With specific toolsets
|
|
||||||
./hermes --toolsets "web,terminal,skills"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### CLI Commands
|
📖 **[Full documentation →](https://hermes-agent.nousresearch.com/docs/)**
|
||||||
|
|
||||||
| Command | Description |
|
---
|
||||||
|---------|-------------|
|
|
||||||
| `/help` | Show available commands |
|
|
||||||
| `/tools` | List available tools by toolset |
|
|
||||||
| `/toolsets` | List available toolsets |
|
|
||||||
| `/model [name]` | Show or change the current model |
|
|
||||||
| `/prompt [text]` | View/set custom system prompt |
|
|
||||||
| `/personality [name]` | Set a predefined personality |
|
|
||||||
| `/clear` | Clear screen and reset conversation |
|
|
||||||
| `/reset` | Reset conversation only |
|
|
||||||
| `/history` | Show conversation history |
|
|
||||||
| `/save` | Save current conversation to file |
|
|
||||||
| `/config` | Show current configuration |
|
|
||||||
| `/quit` | Exit the CLI |
|
|
||||||
|
|
||||||
### Configuration
|
## Documentation
|
||||||
|
|
||||||
Copy `cli-config.yaml.example` to `cli-config.yaml` and customize:
|
All documentation lives at **[hermes-agent.nousresearch.com/docs](https://hermes-agent.nousresearch.com/docs/)**:
|
||||||
|
|
||||||
```yaml
|
| Section | What's Covered |
|
||||||
# Model settings
|
|---------|---------------|
|
||||||
model:
|
| [Quickstart](https://hermes-agent.nousresearch.com/docs/getting-started/quickstart) | Install → setup → first conversation in 2 minutes |
|
||||||
default: "anthropic/claude-sonnet-4"
|
| [CLI Usage](https://hermes-agent.nousresearch.com/docs/user-guide/cli) | Commands, keybindings, personalities, sessions |
|
||||||
|
| [Configuration](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) | Config file, providers, models, all options |
|
||||||
|
| [Messaging Gateway](https://hermes-agent.nousresearch.com/docs/user-guide/messaging) | Telegram, Discord, Slack, WhatsApp, Signal, Home Assistant |
|
||||||
|
| [Security](https://hermes-agent.nousresearch.com/docs/user-guide/security) | Command approval, DM pairing, container isolation |
|
||||||
|
| [Tools & Toolsets](https://hermes-agent.nousresearch.com/docs/user-guide/features/tools) | 40+ tools, toolset system, terminal backends |
|
||||||
|
| [Skills System](https://hermes-agent.nousresearch.com/docs/user-guide/features/skills) | Procedural memory, Skills Hub, creating skills |
|
||||||
|
| [Memory](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) | Persistent memory, user profiles, best practices |
|
||||||
|
| [MCP Integration](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) | Connect any MCP server for extended capabilities |
|
||||||
|
| [Cron Scheduling](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) | Scheduled tasks with platform delivery |
|
||||||
|
| [Context Files](https://hermes-agent.nousresearch.com/docs/user-guide/features/context-files) | Project context that shapes every conversation |
|
||||||
|
| [Architecture](https://hermes-agent.nousresearch.com/docs/developer-guide/architecture) | Project structure, agent loop, key classes |
|
||||||
|
| [Contributing](https://hermes-agent.nousresearch.com/docs/developer-guide/contributing) | Development setup, PR process, code style |
|
||||||
|
| [CLI Reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) | All commands and flags |
|
||||||
|
| [Environment Variables](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) | Complete env var reference |
|
||||||
|
|
||||||
# Terminal backend (local, docker, singularity, modal, or ssh)
|
---
|
||||||
terminal:
|
|
||||||
env_type: "local"
|
|
||||||
cwd: "." # Use current directory
|
|
||||||
|
|
||||||
# Or use SSH for remote execution (keeps agent code isolated)
|
## Contributing
|
||||||
# terminal:
|
|
||||||
# env_type: "ssh"
|
|
||||||
# ssh_host: "my-server.example.com"
|
|
||||||
# ssh_user: "myuser"
|
|
||||||
# ssh_key: "~/.ssh/id_rsa"
|
|
||||||
# cwd: "/home/myuser/project"
|
|
||||||
|
|
||||||
# Enable specific toolsets
|
We welcome contributions! See the [Contributing Guide](https://hermes-agent.nousresearch.com/docs/developer-guide/contributing) for development setup, code style, and PR process.
|
||||||
toolsets:
|
|
||||||
- all # or: web, terminal, browser, vision, etc.
|
|
||||||
|
|
||||||
# Custom personalities (use with /personality command)
|
Quick start for contributors:
|
||||||
agent:
|
|
||||||
personalities:
|
|
||||||
helpful: "You are a helpful assistant."
|
|
||||||
kawaii: "You are a kawaii assistant! Use cute expressions..."
|
|
||||||
```
|
|
||||||
|
|
||||||
### Personalities
|
|
||||||
|
|
||||||
Built-in personalities available via `/personality`:
|
|
||||||
- `helpful`, `concise`, `technical`, `creative`, `teacher`
|
|
||||||
- `kawaii`, `catgirl`, `pirate`, `shakespeare`, `surfer`
|
|
||||||
- `noir`, `uwu`, `philosopher`, `hype`
|
|
||||||
|
|
||||||
## Toolsets System
|
|
||||||
|
|
||||||
The agent uses a toolsets system for organizing and managing tools. All tools must be part of a toolset to be accessible - individual tool selection is not supported. This ensures consistent and logical grouping of capabilities.
|
|
||||||
|
|
||||||
### Key Concepts
|
|
||||||
|
|
||||||
- **Toolsets**: Logical groups of tools for specific use cases (e.g., "research", "development", "debugging")
|
|
||||||
- **Composition**: Toolsets can include other toolsets for powerful combinations
|
|
||||||
- **Custom Toolsets**: Create your own toolsets at runtime or by editing `toolsets.py`
|
|
||||||
- **Toolset-Only Access**: Tools are only accessible through toolsets, not individually
|
|
||||||
|
|
||||||
### Available Toolsets
|
|
||||||
|
|
||||||
See `toolsets.py` for the complete list of predefined toolsets including:
|
|
||||||
- Basic toolsets (web, terminal, vision, creative, reasoning)
|
|
||||||
- Composite toolsets (research, development, analysis, etc.)
|
|
||||||
- Scenario-specific toolsets (debugging, documentation, API testing, etc.)
|
|
||||||
- Special toolsets (safe mode without terminal, minimal, offline)
|
|
||||||
|
|
||||||
### Using Toolsets
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Use a predefined toolset
|
git clone --recurse-submodules https://github.com/NousResearch/hermes-agent.git
|
||||||
python run_agent.py --enabled_toolsets=research --query "Find latest AI papers"
|
cd hermes-agent
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
# Combine multiple toolsets
|
uv venv .venv --python 3.11
|
||||||
python run_agent.py --enabled_toolsets=web,vision --query "Analyze this website"
|
source .venv/bin/activate
|
||||||
|
uv pip install -e ".[all,dev]"
|
||||||
# Enable all toolsets explicitly (same as omitting the flag)
|
uv pip install -e "./mini-swe-agent"
|
||||||
python run_agent.py --enabled_toolsets=all --query "Do web research and run commands if helpful"
|
python -m pytest tests/ -q
|
||||||
|
|
||||||
# Safe mode (no terminal access)
|
|
||||||
python run_agent.py --enabled_toolsets=safe --query "Help without running commands"
|
|
||||||
|
|
||||||
# List all available toolsets and tools
|
|
||||||
python run_agent.py --list_tools
|
|
||||||
```
|
```
|
||||||
|
|
||||||
See `toolsets.py` for the complete list of available toolsets and how to create custom ones.
|
---
|
||||||
|
|
||||||
## Basic Usage
|
## Community
|
||||||
|
|
||||||
### Default (all tools enabled)
|
- 💬 [Discord](https://discord.gg/NousResearch)
|
||||||
```bash
|
- 📚 [Skills Hub](https://agentskills.io)
|
||||||
# Uses OpenRouter by default - just set OPENROUTER_API_KEY in .env
|
- 🐛 [Issues](https://github.com/NousResearch/hermes-agent/issues)
|
||||||
python run_agent.py \
|
- 💡 [Discussions](https://github.com/NousResearch/hermes-agent/discussions)
|
||||||
--query "search up the latest docs on jit in python 3.13 and write me basic example that's not in their docs. profile its perf" \
|
|
||||||
--max_turns 20 \
|
|
||||||
--model anthropic/claude-sonnet-4-20250514
|
|
||||||
```
|
|
||||||
|
|
||||||
### With specific toolset
|
---
|
||||||
```bash
|
|
||||||
python run_agent.py \
|
|
||||||
--query "Debug this Python error" \
|
|
||||||
--enabled_toolsets=debugging \
|
|
||||||
--model anthropic/claude-sonnet-4-20250514
|
|
||||||
```
|
|
||||||
|
|
||||||
### Python API
|
## License
|
||||||
```python
|
|
||||||
from run_agent import AIAgent
|
|
||||||
|
|
||||||
# Uses OpenRouter by default (reads OPENROUTER_API_KEY from .env)
|
MIT — see [LICENSE](LICENSE).
|
||||||
agent = AIAgent(
|
|
||||||
model="anthropic/claude-sonnet-4-20250514",
|
|
||||||
enabled_toolsets=["research"]
|
|
||||||
)
|
|
||||||
response = agent.chat("Find information about quantum computing")
|
|
||||||
|
|
||||||
# Create custom toolset at runtime
|
Built by [Nous Research](https://nousresearch.com).
|
||||||
from toolsets import create_custom_toolset
|
|
||||||
|
|
||||||
create_custom_toolset(
|
|
||||||
name="my_tools",
|
|
||||||
description="My custom toolkit",
|
|
||||||
tools=["web_search"],
|
|
||||||
includes=["terminal", "vision"]
|
|
||||||
)
|
|
||||||
|
|
||||||
agent = AIAgent(enabled_toolsets=["my_tools"])
|
|
||||||
```
|
|
||||||
|
|
||||||
## Batch Processing
|
|
||||||
|
|
||||||
Process multiple prompts from a dataset in parallel with automatic checkpointing and statistics tracking:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Basic batch processing
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file=prompts.jsonl \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name=my_run
|
|
||||||
|
|
||||||
# With specific distribution
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file=prompts.jsonl \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name=image_run \
|
|
||||||
--distribution=image_gen \
|
|
||||||
--num_workers=4
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Features:**
|
|
||||||
- Parallel processing with configurable workers
|
|
||||||
- Toolset distributions for varied data generation
|
|
||||||
- Automatic checkpointing and resume capability
|
|
||||||
- Combined output in `data/<run_name>/trajectories.jsonl`
|
|
||||||
- Tool usage statistics and success rates
|
|
||||||
|
|
||||||
Use `--list_distributions` to see available toolset distributions for varied data generation.
|
|
||||||
|
|
||||||
### Trajectory Compression
|
|
||||||
|
|
||||||
Post-process trajectories to fit within token budgets for training:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Compress a directory of JSONL files
|
|
||||||
python trajectory_compressor.py --input=data/my_run
|
|
||||||
|
|
||||||
# Compress a single JSONL file
|
|
||||||
python trajectory_compressor.py --input=data/trajectories.jsonl
|
|
||||||
|
|
||||||
# Compress a 15% sample (useful for creating smaller training sets)
|
|
||||||
python trajectory_compressor.py --input=data/trajectories.jsonl --sample_percent=15
|
|
||||||
|
|
||||||
# Custom output and token target
|
|
||||||
python trajectory_compressor.py \
|
|
||||||
--input=data/trajectories.jsonl \
|
|
||||||
--output=data/compressed.jsonl \
|
|
||||||
--target_max_tokens=16000
|
|
||||||
```
|
|
||||||
|
|
||||||
**Features:**
|
|
||||||
- Protects first turns (system, human, first GPT response, first tool call)
|
|
||||||
- Protects last N turns (configurable)
|
|
||||||
- Summarizes middle turns using LLM to fit target token budget
|
|
||||||
- Supports both directory and single file input
|
|
||||||
- Optional random sampling with `--sample_percent`
|
|
||||||
- Configurable via `configs/trajectory_compression.yaml`
|
|
||||||
|
|
||||||
### Ephemeral System Prompts
|
|
||||||
|
|
||||||
The ephemeral system prompt feature allows you to guide the model's behavior during batch processing **without** saving that prompt to the training dataset trajectories. This is useful for:
|
|
||||||
|
|
||||||
- Guiding model behavior during data collection
|
|
||||||
- Adding task-specific instructions
|
|
||||||
- Keeping saved trajectories clean and focused on tool-calling format
|
|
||||||
|
|
||||||
**Example:**
|
|
||||||
```bash
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file=prompts.jsonl \
|
|
||||||
--batch_size=10 \
|
|
||||||
--run_name=my_run \
|
|
||||||
--ephemeral_system_prompt="You are a helpful assistant focused on image generation."
|
|
||||||
```
|
|
||||||
|
|
||||||
The ephemeral prompt will influence the model's behavior during execution, but **only the standard tool-calling system prompt** will be saved in the trajectory files.
|
|
||||||
|
|
||||||
The ephemeral prompt influences model behavior during execution, but **only the standard tool-calling system prompt** is saved in trajectory files.
|
|
||||||
|
|
||||||
## Command Line Arguments
|
|
||||||
|
|
||||||
**Single Agent (`run_agent.py`):**
|
|
||||||
- `--query`: The question or task for the agent
|
|
||||||
- `--model`: Model to use (default: claude-opus-4-20250514)
|
|
||||||
- `--api_key`: API key for authentication
|
|
||||||
- `--base_url`: API endpoint URL
|
|
||||||
- `--max_turns`: Maximum number of tool-calling iterations
|
|
||||||
- `--enabled_toolsets`: Comma-separated list of toolsets to enable. Use `all` (or `*`) to enable everything. If omitted, all toolsets are enabled by default.
|
|
||||||
- `--disabled_toolsets`: Comma-separated list of toolsets to disable
|
|
||||||
- `--list_tools`: List all available toolsets and tools
|
|
||||||
- `--save_trajectories`: Save conversation trajectories to JSONL files
|
|
||||||
|
|
||||||
**Batch Processing (`batch_runner.py`):**
|
|
||||||
- `--dataset_file`: Path to JSONL file with prompts
|
|
||||||
- `--batch_size`: Number of prompts per batch
|
|
||||||
- `--run_name`: Name for this run (for output/checkpointing)
|
|
||||||
- `--distribution`: Toolset distribution to use (default: "default")
|
|
||||||
- `--num_workers`: Number of parallel workers (default: 4)
|
|
||||||
- `--resume`: Resume from checkpoint if interrupted
|
|
||||||
- `--ephemeral_system_prompt`: System prompt used during execution but NOT saved to trajectories
|
|
||||||
- `--list_distributions`: List available toolset distributions
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
All environment variables can be configured in the `.env` file (copy from `.env.example`).
|
|
||||||
|
|
||||||
**LLM Provider (OpenRouter):**
|
|
||||||
- `OPENROUTER_API_KEY`: Primary LLM access via OpenRouter (supports Claude, GPT-4, Gemini, etc.)
|
|
||||||
- `LLM_MODEL`: Default model (e.g., `anthropic/claude-sonnet-4`, `openai/gpt-4o`)
|
|
||||||
|
|
||||||
**Tool API Keys:**
|
|
||||||
- `FIRECRAWL_API_KEY`: Web tools (search, extract, crawl)
|
|
||||||
- `NOUS_API_KEY`: Vision and reasoning tools
|
|
||||||
- `FAL_KEY`: Image generation tools
|
|
||||||
|
|
||||||
**Terminal Tool Configuration (mini-swe-agent backend):**
|
|
||||||
- `TERMINAL_ENV`: Backend type - `local`, `docker`, `singularity`, `modal`, or `ssh` (default: `local`)
|
|
||||||
- `TERMINAL_DOCKER_IMAGE`: Docker image for docker backend (default: `python:3.11-slim`)
|
|
||||||
- `TERMINAL_SINGULARITY_IMAGE`: Singularity/Apptainer image (can be `docker://...` URL or local `.sif` path)
|
|
||||||
- `TERMINAL_TIMEOUT`: Command timeout in seconds (default: `60`)
|
|
||||||
- `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`)
|
|
||||||
- `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`)
|
|
||||||
- `TERMINAL_SCRATCH_DIR`: Custom scratch directory for sandbox storage (optional, auto-detects `/scratch`)
|
|
||||||
- `SUDO_PASSWORD`: Enable sudo commands by piping password via `sudo -S` (works with all backends)
|
|
||||||
- If unset in CLI mode, you'll be prompted interactively when sudo is needed (45s timeout)
|
|
||||||
|
|
||||||
**SSH Backend Configuration (for remote execution):**
|
|
||||||
- `TERMINAL_SSH_HOST`: Remote server hostname or IP
|
|
||||||
- `TERMINAL_SSH_USER`: SSH username
|
|
||||||
- `TERMINAL_SSH_PORT`: SSH port (default: `22`)
|
|
||||||
- `TERMINAL_SSH_KEY`: Path to SSH private key (optional, uses ssh-agent if not set)
|
|
||||||
|
|
||||||
**Browser Tool Configuration (agent-browser + Browserbase):**
|
|
||||||
- `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution
|
|
||||||
- `BROWSERBASE_PROJECT_ID`: Browserbase project ID
|
|
||||||
- `BROWSER_SESSION_TIMEOUT`: Session timeout in seconds (default: `300`)
|
|
||||||
|
|
||||||
**Legacy Hecate Terminal Backend (optional):**
|
|
||||||
- `MORPH_API_KEY`: For Hecate/MorphCloud terminal backend
|
|
||||||
- `HECATE_VM_LIFETIME_SECONDS`: VM lifetime (default: 300)
|
|
||||||
- `HECATE_DEFAULT_SNAPSHOT_ID`: Default snapshot (default: snapshot_p5294qxt)
|
|
||||||
|
|
||||||
**Debug Options:**
|
|
||||||
- `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging
|
|
||||||
|
|
||||||
## Key Files
|
|
||||||
|
|
||||||
| File | Purpose |
|
|
||||||
|------|---------|
|
|
||||||
| `hermes` | CLI launcher script (run with `./hermes`) |
|
|
||||||
| `cli.py` | Interactive CLI implementation |
|
|
||||||
| `cli-config.yaml` | CLI configuration (copy from `.example`) |
|
|
||||||
| `run_agent.py` | Main agent runner - single query execution |
|
|
||||||
| `batch_runner.py` | Parallel batch processing with checkpointing |
|
|
||||||
| `model_tools.py` | Core tool definitions and handlers |
|
|
||||||
| `toolsets.py` | Toolset definitions and composition |
|
|
||||||
| `toolset_distributions.py` | Probability distributions for data generation |
|
|
||||||
| `trajectory_compressor.py` | Post-process trajectories for training |
|
|
||||||
| `tools/` | Individual tool implementations |
|
|
||||||
| `tools/skills_tool.py` | Skills system with progressive disclosure |
|
|
||||||
| `skills/` | On-demand knowledge documents |
|
|
||||||
| `docs/` | Documentation |
|
|
||||||
| `configs/` | Example batch run scripts |
|
|
||||||
|
|||||||
729
TODO.md
729
TODO.md
@@ -1,729 +0,0 @@
|
|||||||
# Hermes Agent - Future Improvements
|
|
||||||
|
|
||||||
> Ideas for enhancing the agent's capabilities, generated from self-analysis of the codebase.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚨 HIGH PRIORITY - Immediate Fixes
|
|
||||||
|
|
||||||
These items need to be addressed ASAP:
|
|
||||||
|
|
||||||
### 1. SUDO Breaking Terminal Tool 🔐 ✅ COMPLETE
|
|
||||||
- [x] **Problem:** SUDO commands break the terminal tool execution (hangs indefinitely)
|
|
||||||
- [x] **Fix:** Created custom environment wrappers in `tools/terminal_tool.py`
|
|
||||||
- `stdin=subprocess.DEVNULL` prevents hanging on interactive prompts
|
|
||||||
- Sudo fails gracefully with clear error if no password configured
|
|
||||||
- Same UX as Claude Code - agent sees error, tells user to run it themselves
|
|
||||||
- [x] **All 5 environments now have consistent behavior:**
|
|
||||||
- `_LocalEnvironment` - local execution
|
|
||||||
- `_DockerEnvironment` - Docker containers
|
|
||||||
- `_SingularityEnvironment` - Singularity/Apptainer containers
|
|
||||||
- `_ModalEnvironment` - Modal cloud sandboxes
|
|
||||||
- `_SSHEnvironment` - remote SSH execution
|
|
||||||
- [x] **Optional sudo support via `SUDO_PASSWORD` env var:**
|
|
||||||
- Shared `_transform_sudo_command()` helper used by all environments
|
|
||||||
- If set, auto-transforms `sudo cmd` → pipes password via `sudo -S`
|
|
||||||
- Documented in `.env.example`, `cli-config.yaml`, and README
|
|
||||||
- Works for chained commands: `cmd1 && sudo cmd2`
|
|
||||||
- [x] **Interactive sudo prompt in CLI mode:**
|
|
||||||
- When sudo detected and no password configured, prompts user
|
|
||||||
- 45-second timeout (auto-skips if no input)
|
|
||||||
- Hidden password input via `getpass` (password not visible)
|
|
||||||
- Password cached for session (don't ask repeatedly)
|
|
||||||
- Spinner pauses during prompt for clean UX
|
|
||||||
- Uses `HERMES_INTERACTIVE` env var to detect CLI mode
|
|
||||||
|
|
||||||
### 2. Fix `browser_get_images` Tool 🖼️ ✅ VERIFIED WORKING
|
|
||||||
- [x] **Tested:** Tool works correctly on multiple sites
|
|
||||||
- [x] **Results:** Successfully extracts image URLs, alt text, dimensions
|
|
||||||
- [x] **Note:** Some sites (Pixabay, etc.) have Cloudflare bot protection that blocks headless browsers - this is expected behavior, not a bug
|
|
||||||
|
|
||||||
### 3. Better Action Logging for Debugging 📝 ✅ COMPLETE
|
|
||||||
- [x] **Problem:** Need better logging of agent actions for debugging
|
|
||||||
- [x] **Implementation:**
|
|
||||||
- Save full session trajectories to `logs/` directory as JSON
|
|
||||||
- Each session gets a unique file: `session_YYYYMMDD_HHMMSS_UUID.json`
|
|
||||||
- Logs all messages, tool calls with inputs/outputs, timestamps
|
|
||||||
- Structured JSON format for easy parsing and replay
|
|
||||||
- Automatic on CLI runs (configurable)
|
|
||||||
|
|
||||||
### 4. Stream Thinking Summaries in Real-Time 💭 ⏸️ DEFERRED
|
|
||||||
- [ ] **Problem:** Thinking/reasoning summaries not shown while streaming
|
|
||||||
- [ ] **Complexity:** This is a significant refactor - leaving for later
|
|
||||||
|
|
||||||
**OpenRouter Streaming Info:**
|
|
||||||
- Uses `stream=True` with OpenAI SDK
|
|
||||||
- Reasoning comes in `choices[].delta.reasoning_details` chunks
|
|
||||||
- Types: `reasoning.summary`, `reasoning.text`, `reasoning.encrypted`
|
|
||||||
- Tool call arguments stream as partial JSON (need accumulation)
|
|
||||||
- Items paradigm: same ID emitted multiple times with updated content
|
|
||||||
|
|
||||||
**Key Challenges:**
|
|
||||||
- Tool call JSON accumulation (partial `{"query": "wea` → `{"query": "weather"}`)
|
|
||||||
- Multiple concurrent outputs (thinking + tool calls + text simultaneously)
|
|
||||||
- State management for partial responses
|
|
||||||
- Error handling if connection drops mid-stream
|
|
||||||
- Deciding when tool calls are "complete" enough to execute
|
|
||||||
|
|
||||||
**UX Questions to Resolve:**
|
|
||||||
- Show raw thinking text or summarized?
|
|
||||||
- Live expanding text vs. spinner replacement?
|
|
||||||
- Markdown rendering while streaming?
|
|
||||||
- How to handle thinking + tool call display simultaneously?
|
|
||||||
|
|
||||||
**Implementation Options:**
|
|
||||||
- New `run_conversation_streaming()` method (keep non-streaming as fallback)
|
|
||||||
- Wrapper that handles streaming internally
|
|
||||||
- Big refactor of existing `run_conversation()`
|
|
||||||
|
|
||||||
**References:**
|
|
||||||
- https://openrouter.ai/docs/api/reference/streaming
|
|
||||||
- https://openrouter.ai/docs/guides/best-practices/reasoning-tokens#streaming-response
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. Subagent Architecture (Context Isolation) 🎯
|
|
||||||
|
|
||||||
**Problem:** Long-running tools (terminal commands, browser automation, complex file operations) consume massive context. A single `ls -la` can add hundreds of lines. Browser snapshots, debugging sessions, and iterative terminal work quickly bloat the main conversation, leaving less room for actual reasoning.
|
|
||||||
|
|
||||||
**Solution:** The main agent becomes an **orchestrator** that delegates context-heavy tasks to **subagents**.
|
|
||||||
|
|
||||||
**Architecture:**
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────┐
|
|
||||||
│ ORCHESTRATOR (main agent) │
|
|
||||||
│ - Receives user request │
|
|
||||||
│ - Plans approach │
|
|
||||||
│ - Delegates heavy tasks to subagents │
|
|
||||||
│ - Receives summarized results │
|
|
||||||
│ - Maintains clean, focused context │
|
|
||||||
└─────────────────────────────────────────────────────────────────┘
|
|
||||||
│ │ │
|
|
||||||
▼ ▼ ▼
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ TERMINAL AGENT │ │ BROWSER AGENT │ │ CODE AGENT │
|
|
||||||
│ - terminal tool │ │ - browser tools │ │ - file tools │
|
|
||||||
│ - file tools │ │ - web_search │ │ - terminal │
|
|
||||||
│ │ │ - web_extract │ │ │
|
|
||||||
│ Isolated context│ │ Isolated context│ │ Isolated context│
|
|
||||||
│ Returns summary │ │ Returns summary │ │ Returns summary │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**How it works:**
|
|
||||||
1. User asks: "Set up a new Python project with FastAPI and tests"
|
|
||||||
2. Orchestrator plans: "I need to create files, install deps, write code"
|
|
||||||
3. Orchestrator calls: `terminal_task(goal="Create venv, install fastapi pytest", context="New project in ~/myapp")`
|
|
||||||
4. **Subagent spawns** with fresh context, only terminal/file tools
|
|
||||||
5. Subagent iterates (may take 10+ tool calls, lots of output)
|
|
||||||
6. Subagent completes → returns summary: "Created venv, installed fastapi==0.109.0, pytest==8.0.0"
|
|
||||||
7. Orchestrator receives **only the summary**, context stays clean
|
|
||||||
8. Orchestrator continues with next subtask
|
|
||||||
|
|
||||||
**Key tools to implement:**
|
|
||||||
- [ ] `terminal_task(goal, context, cwd?)` - Delegate terminal/shell work
|
|
||||||
- [ ] `browser_task(goal, context, start_url?)` - Delegate web research/automation
|
|
||||||
- [ ] `code_task(goal, context, files?)` - Delegate code writing/modification
|
|
||||||
- [ ] Generic `delegate_task(goal, context, toolsets=[])` - Flexible delegation
|
|
||||||
|
|
||||||
**Implementation details:**
|
|
||||||
- [ ] Subagent uses same `run_agent.py` but with:
|
|
||||||
- Fresh/empty conversation history
|
|
||||||
- Limited toolset (only what's needed)
|
|
||||||
- Smaller max_iterations (focused task)
|
|
||||||
- Task-specific system prompt
|
|
||||||
- [ ] Subagent returns structured result:
|
|
||||||
```python
|
|
||||||
{
|
|
||||||
"success": True,
|
|
||||||
"summary": "Installed 3 packages, created 2 files",
|
|
||||||
"details": "Optional longer explanation if needed",
|
|
||||||
"artifacts": ["~/myapp/requirements.txt", "~/myapp/main.py"], # Files created
|
|
||||||
"errors": [] # Any issues encountered
|
|
||||||
}
|
|
||||||
```
|
|
||||||
- [ ] Orchestrator sees only the summary in its context
|
|
||||||
- [ ] Full subagent transcript saved separately for debugging
|
|
||||||
|
|
||||||
**Benefits:**
|
|
||||||
- 🧹 **Clean context** - Orchestrator stays focused, doesn't drown in tool output
|
|
||||||
- 📊 **Better token efficiency** - 50 terminal outputs → 1 summary paragraph
|
|
||||||
- 🎯 **Focused subagents** - Each agent has just the tools it needs
|
|
||||||
- 🔄 **Parallel potential** - Independent subtasks could run concurrently
|
|
||||||
- 🐛 **Easier debugging** - Each subtask has its own isolated transcript
|
|
||||||
|
|
||||||
**When to use subagents vs direct tools:**
|
|
||||||
- **Subagent**: Multi-step tasks, iteration likely, lots of output expected
|
|
||||||
- **Direct**: Quick one-off commands, simple file reads, user needs to see output
|
|
||||||
|
|
||||||
**Files to modify:** `run_agent.py` (add orchestration mode), new `tools/delegate_tools.py`, new `subagent_runner.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Context Management (complements Subagents)
|
|
||||||
|
|
||||||
**Problem:** Context grows unbounded during long conversations. Trajectory compression exists for training data post-hoc, but live conversations lack intelligent context management.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Incremental summarization** - Compress old tool outputs on-the-fly during conversations
|
|
||||||
- Trigger when context exceeds threshold (e.g., 80% of max tokens)
|
|
||||||
- Preserve recent turns fully, summarize older tool responses
|
|
||||||
- Could reuse logic from `trajectory_compressor.py`
|
|
||||||
|
|
||||||
- [ ] **Semantic memory retrieval** - Vector store for long conversation recall
|
|
||||||
- Embed important facts/findings as conversation progresses
|
|
||||||
- Retrieve relevant memories when needed instead of keeping everything in context
|
|
||||||
- Consider lightweight solutions: ChromaDB, FAISS, or even a simple embedding cache
|
|
||||||
|
|
||||||
- [ ] **Working vs. episodic memory** distinction
|
|
||||||
- Working memory: Current task state, recent tool results (always in context)
|
|
||||||
- Episodic memory: Past findings, tried approaches (retrieved on demand)
|
|
||||||
- Clear eviction policies for each
|
|
||||||
|
|
||||||
**Files to modify:** `run_agent.py` (add memory manager), possibly new `tools/memory_tool.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Self-Reflection & Course Correction 🔄
|
|
||||||
|
|
||||||
**Problem:** Current retry logic handles malformed outputs but not semantic failures. Agent doesn't reason about *why* something failed.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Meta-reasoning after failures** - When a tool returns an error or unexpected result:
|
|
||||||
```
|
|
||||||
Tool failed → Reflect: "Why did this fail? What assumptions were wrong?"
|
|
||||||
→ Adjust approach → Retry with new strategy
|
|
||||||
```
|
|
||||||
- Could be a lightweight LLM call or structured self-prompt
|
|
||||||
|
|
||||||
- [ ] **Planning/replanning module** - For complex multi-step tasks:
|
|
||||||
- Generate plan before execution
|
|
||||||
- After each step, evaluate: "Am I on track? Should I revise the plan?"
|
|
||||||
- Store plan in working memory, update as needed
|
|
||||||
|
|
||||||
- [ ] **Approach memory** - Remember what didn't work:
|
|
||||||
- "I tried X for this type of problem and it failed because Y"
|
|
||||||
- Prevents repeating failed strategies in the same conversation
|
|
||||||
|
|
||||||
**Files to modify:** `run_agent.py` (add reflection hooks in tool loop), new `tools/reflection_tool.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 4. Tool Composition & Learning 🔧
|
|
||||||
|
|
||||||
**Problem:** Tools are atomic. Complex tasks require repeated manual orchestration of the same tool sequences.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Macro tools / Tool chains** - Define reusable tool sequences:
|
|
||||||
```yaml
|
|
||||||
research_topic:
|
|
||||||
description: "Deep research on a topic"
|
|
||||||
steps:
|
|
||||||
- web_search: {query: "$topic"}
|
|
||||||
- web_extract: {urls: "$search_results.urls[:3]"}
|
|
||||||
- summarize: {content: "$extracted"}
|
|
||||||
```
|
|
||||||
- Could be defined in skills or a new `macros/` directory
|
|
||||||
- Agent can invoke macro as single tool call
|
|
||||||
|
|
||||||
- [ ] **Tool failure patterns** - Learn from failures:
|
|
||||||
- Track: tool, input pattern, error type, what worked instead
|
|
||||||
- Before calling a tool, check: "Has this pattern failed before?"
|
|
||||||
- Persistent across sessions (stored in skills or separate DB)
|
|
||||||
|
|
||||||
- [ ] **Parallel tool execution** - When tools are independent, run concurrently:
|
|
||||||
- Detect independence (no data dependencies between calls)
|
|
||||||
- Use `asyncio.gather()` for parallel execution
|
|
||||||
- Already have async support in some tools, just need orchestration
|
|
||||||
|
|
||||||
**Files to modify:** `model_tools.py`, `toolsets.py`, new `tool_macros.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 5. Dynamic Skills Expansion 📚
|
|
||||||
|
|
||||||
**Problem:** Skills system is elegant but static. Skills must be manually created and added.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Skill acquisition from successful tasks** - After completing a complex task:
|
|
||||||
- "This approach worked well. Save as a skill?"
|
|
||||||
- Extract: goal, steps taken, tools used, key decisions
|
|
||||||
- Generate SKILL.md automatically
|
|
||||||
- Store in user's skills directory
|
|
||||||
|
|
||||||
- [ ] **Skill templates** - Common patterns that can be parameterized:
|
|
||||||
```markdown
|
|
||||||
# Debug {language} Error
|
|
||||||
1. Reproduce the error
|
|
||||||
2. Search for error message: `web_search("{error_message} {language}")`
|
|
||||||
3. Check common causes: {common_causes}
|
|
||||||
4. Apply fix and verify
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Skill chaining** - Combine skills for complex workflows:
|
|
||||||
- Skills can reference other skills as dependencies
|
|
||||||
- "To do X, first apply skill Y, then skill Z"
|
|
||||||
- Directed graph of skill dependencies
|
|
||||||
|
|
||||||
**Files to modify:** `tools/skills_tool.py`, `skills/` directory structure, new `skill_generator.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 6. Task Continuation Hints 🎯
|
|
||||||
|
|
||||||
**Problem:** The agent could be more helpful by suggesting logical next steps.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Suggest next steps** - At end of a task, suggest logical continuations:
|
|
||||||
- "Code is written. Want me to also write tests / docs / deploy?"
|
|
||||||
- Based on common workflows for task type
|
|
||||||
- Non-intrusive, just offer options
|
|
||||||
|
|
||||||
**Files to modify:** `run_agent.py`, response generation logic
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 7. Interactive Clarifying Questions Tool ❓
|
|
||||||
|
|
||||||
**Problem:** The agent sometimes makes assumptions or guesses when it should ask the user. Currently it can only ask via text, which gets lost in long outputs.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Multiple-choice prompt tool** - Let agent present structured choices to user:
|
|
||||||
```
|
|
||||||
ask_user_choice(
|
|
||||||
question="Should the language switcher enable only German or all languages?",
|
|
||||||
choices=[
|
|
||||||
"Only enable German - works immediately",
|
|
||||||
"Enable all, mark untranslated - show fallback notice",
|
|
||||||
"Let me specify something else"
|
|
||||||
]
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Renders as interactive terminal UI with arrow key / Tab navigation
|
|
||||||
- User selects option, result returned to agent
|
|
||||||
- Up to 4 choices + optional free-text option
|
|
||||||
|
|
||||||
- [ ] **Implementation:**
|
|
||||||
- Use `inquirer` or `questionary` Python library for rich terminal prompts
|
|
||||||
- Tool returns selected option text (or user's custom input)
|
|
||||||
- **CLI-only** - only works when running via `cli.py` (not API/programmatic use)
|
|
||||||
- Graceful fallback: if not in interactive mode, return error asking agent to rephrase as text
|
|
||||||
|
|
||||||
- [ ] **Use cases:**
|
|
||||||
- Clarify ambiguous requirements before starting work
|
|
||||||
- Confirm destructive operations with clear options
|
|
||||||
- Let user choose between implementation approaches
|
|
||||||
- Checkpoint complex multi-step workflows
|
|
||||||
|
|
||||||
**Files to modify:** New `tools/ask_user_tool.py`, `cli.py` (detect interactive mode), `model_tools.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 8. Resource Awareness & Efficiency 💰
|
|
||||||
|
|
||||||
**Problem:** The agent has no awareness of costs, time, or resource usage. It could be smarter about efficiency.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Tool result caching** - Don't repeat identical operations:
|
|
||||||
- Cache web searches, extractions within a session
|
|
||||||
- Invalidation based on time-sensitivity of query
|
|
||||||
- Hash-based lookup: same input → cached output
|
|
||||||
|
|
||||||
- [ ] **Lazy evaluation** - Don't fetch everything upfront:
|
|
||||||
- Get summaries first, full content only if needed
|
|
||||||
- "I found 5 relevant pages. Want me to deep-dive on any?"
|
|
||||||
|
|
||||||
**Files to modify:** `model_tools.py`, new `resource_tracker.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 9. Collaborative Problem Solving 🤝
|
|
||||||
|
|
||||||
**Problem:** Interaction is command/response. Complex problems benefit from dialogue.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Assumption surfacing** - Make implicit assumptions explicit:
|
|
||||||
- "I'm assuming you want Python 3.11+. Correct?"
|
|
||||||
- "This solution assumes you have sudo access..."
|
|
||||||
- Let user correct before going down wrong path
|
|
||||||
|
|
||||||
- [ ] **Checkpoint & confirm** - For high-stakes operations:
|
|
||||||
- "About to delete 47 files. Here's the list - proceed?"
|
|
||||||
- "This will modify your database. Want a backup first?"
|
|
||||||
- Configurable threshold for when to ask
|
|
||||||
|
|
||||||
**Files to modify:** `run_agent.py`, system prompt configuration
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 10. Project-Local Context 💾
|
|
||||||
|
|
||||||
**Problem:** Valuable context is lost between sessions.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Project awareness** - Remember project-specific context:
|
|
||||||
- Store `.hermes/context.md` in project directory
|
|
||||||
- "This is a Django project using PostgreSQL"
|
|
||||||
- Coding style preferences, deployment setup, etc.
|
|
||||||
- Load automatically when working in that directory
|
|
||||||
|
|
||||||
- [ ] **Handoff notes** - Leave notes for future sessions:
|
|
||||||
- Write to `.hermes/notes.md` in project
|
|
||||||
- "TODO for next session: finish implementing X"
|
|
||||||
- "Known issues: Y doesn't work on Windows"
|
|
||||||
|
|
||||||
**Files to modify:** New `project_context.py`, auto-load in `run_agent.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 11. Graceful Degradation & Robustness 🛡️
|
|
||||||
|
|
||||||
**Problem:** When things go wrong, recovery is limited. The agent should fail gracefully.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Fallback chains** - When primary approach fails, have backups:
|
|
||||||
- `web_extract` fails → try `browser_navigate` → try `web_search` for cached version
|
|
||||||
- Define fallback order per tool type
|
|
||||||
|
|
||||||
- [ ] **Partial progress preservation** - Don't lose work on failure:
|
|
||||||
- Long task fails midway → save what we've got
|
|
||||||
- "I completed 3/5 steps before the error. Here's what I have..."
|
|
||||||
|
|
||||||
- [ ] **Self-healing** - Detect and recover from bad states:
|
|
||||||
- Browser stuck → close and retry
|
|
||||||
- Terminal hung → timeout and reset
|
|
||||||
|
|
||||||
**Files to modify:** `model_tools.py`, tool implementations, new `fallback_manager.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 12. Tools & Skills Wishlist 🧰
|
|
||||||
|
|
||||||
*Things that would need new tool implementations (can't do well with current tools):*
|
|
||||||
|
|
||||||
### High-Impact
|
|
||||||
|
|
||||||
- [ ] **Audio/Video Transcription** 🎬 *(See also: Section 16 for detailed spec)*
|
|
||||||
- Transcribe audio files, podcasts, YouTube videos
|
|
||||||
- Extract key moments from video
|
|
||||||
- Voice memo transcription for messaging integrations
|
|
||||||
- *Provider options: Whisper API, Deepgram, local Whisper*
|
|
||||||
|
|
||||||
- [ ] **Diagram Rendering** 📊
|
|
||||||
- Render Mermaid/PlantUML to actual images
|
|
||||||
- Can generate the code, but rendering requires external service or tool
|
|
||||||
- "Show me how these components connect" → actual visual diagram
|
|
||||||
|
|
||||||
### Medium-Impact
|
|
||||||
|
|
||||||
- [ ] **Canvas / Visual Workspace** 🖼️
|
|
||||||
- Agent-controlled visual panel for rendering interactive UI
|
|
||||||
- Inspired by OpenClaw's Canvas feature
|
|
||||||
- **Capabilities:**
|
|
||||||
- `present` / `hide` - Show/hide the canvas panel
|
|
||||||
- `navigate` - Load HTML files or URLs into the canvas
|
|
||||||
- `eval` - Execute JavaScript in the canvas context
|
|
||||||
- `snapshot` - Capture the rendered UI as an image
|
|
||||||
- **Use cases:**
|
|
||||||
- Display generated HTML/CSS/JS previews
|
|
||||||
- Show interactive data visualizations (charts, graphs)
|
|
||||||
- Render diagrams (Mermaid → rendered output)
|
|
||||||
- Present structured information in rich format
|
|
||||||
- A2UI-style component system for structured agent UI
|
|
||||||
- **Implementation options:**
|
|
||||||
- Electron-based panel for CLI
|
|
||||||
- WebSocket-connected web app
|
|
||||||
- VS Code webview extension
|
|
||||||
- *Would let agent "show" things rather than just describe them*
|
|
||||||
|
|
||||||
- [ ] **Document Generation** 📄
|
|
||||||
- Create styled PDFs, Word docs, presentations
|
|
||||||
- *Can do basic PDF via terminal tools, but limited*
|
|
||||||
|
|
||||||
- [ ] **Diff/Patch Tool** 📝
|
|
||||||
- Surgical code modifications with preview
|
|
||||||
- "Change line 45-50 to X" without rewriting whole file
|
|
||||||
- Show diffs before applying
|
|
||||||
- *Can use `diff`/`patch` but a native tool would be safer*
|
|
||||||
|
|
||||||
### Skills to Create
|
|
||||||
|
|
||||||
- [ ] **Domain-specific skill packs:**
|
|
||||||
- DevOps/Infrastructure (Terraform, K8s, AWS)
|
|
||||||
- Data Science workflows (EDA, model training)
|
|
||||||
- Security/pentesting procedures
|
|
||||||
|
|
||||||
- [ ] **Framework-specific skills:**
|
|
||||||
- React/Vue/Angular patterns
|
|
||||||
- Django/Rails/Express conventions
|
|
||||||
- Database optimization playbooks
|
|
||||||
|
|
||||||
- [ ] **Troubleshooting flowcharts:**
|
|
||||||
- "Docker container won't start" → decision tree
|
|
||||||
- "Production is slow" → systematic diagnosis
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 13. Messaging Platform Integrations 💬
|
|
||||||
|
|
||||||
**Problem:** Agent currently only works via `cli.py` which requires direct terminal access. Users may want to interact via messaging apps from their phone or other devices.
|
|
||||||
|
|
||||||
**Architecture:**
|
|
||||||
- `run_agent.py` already accepts `conversation_history` parameter and returns updated messages ✅
|
|
||||||
- Need: persistent session storage, platform monitors, session key resolution
|
|
||||||
|
|
||||||
**Implementation approach:**
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────┐
|
|
||||||
│ Platform Monitor (e.g., telegram_monitor.py) │
|
|
||||||
│ ├─ Long-running daemon connecting to messaging platform │
|
|
||||||
│ ├─ On message: resolve session key → load history from disk│
|
|
||||||
│ ├─ Call run_agent.py with loaded history │
|
|
||||||
│ ├─ Save updated history back to disk (JSONL) │
|
|
||||||
│ └─ Send response back to platform │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Platform support (each user sets up their own credentials):**
|
|
||||||
- [ ] **Telegram** - via `python-telegram-bot` (or a Python equivalent of `grammy`)
|
|
||||||
- Bot token from @BotFather
|
|
||||||
- Easiest to set up, good for personal use
|
|
||||||
- [ ] **Discord** - via `discord.py`
|
|
||||||
- Bot token from Discord Developer Portal
|
|
||||||
- Can work in servers (group sessions) or DMs
|
|
||||||
- [ ] **WhatsApp** - via `baileys` (WhatsApp Web protocol)
|
|
||||||
- QR code scan to authenticate
|
|
||||||
- More complex, but reaches most people
|
|
||||||
|
|
||||||
**Session management:**
|
|
||||||
- [ ] **Session store** - JSONL persistence per session key
|
|
||||||
- `~/.hermes/sessions/{session_key}.jsonl`
|
|
||||||
- Session keys: `telegram:dm:{user_id}`, `discord:channel:{id}`, etc.
|
|
||||||
- [ ] **Session expiry** - Configurable reset policies
|
|
||||||
- Daily reset (default 4am) OR idle timeout (e.g., 2 hours)
|
|
||||||
- Manual reset via `/reset` or `/new` command in chat
|
|
||||||
- [ ] **Session continuity** - Conversations persist across messages until reset
|
|
||||||
|
|
||||||
**Files to create:** `monitors/telegram_monitor.py`, `monitors/discord_monitor.py`, `monitors/session_store.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 14. Scheduled Tasks / Cron Jobs ⏰
|
|
||||||
|
|
||||||
**Problem:** Agent only runs on-demand. Some tasks benefit from scheduled execution (daily summaries, monitoring, reminders).
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Cron-style scheduler** - Run agent turns on a schedule
|
|
||||||
- Store jobs in `~/.hermes/cron/jobs.json`
|
|
||||||
- Each job: `{ id, schedule, prompt, session_mode, delivery }`
|
|
||||||
- Uses APScheduler or similar Python library
|
|
||||||
|
|
||||||
- [ ] **Session modes:**
|
|
||||||
- `isolated` - Fresh session each run (no history, clean context)
|
|
||||||
- `main` - Append to main session (agent remembers previous scheduled runs)
|
|
||||||
|
|
||||||
- [ ] **Delivery options:**
|
|
||||||
- Write output to file (`~/.hermes/cron/output/{job_id}/{timestamp}.md`)
|
|
||||||
- Send to messaging channel (if integrations enabled)
|
|
||||||
- Both
|
|
||||||
|
|
||||||
- [ ] **CLI interface:**
|
|
||||||
```bash
|
|
||||||
# List scheduled jobs
|
|
||||||
python cli.py --cron list
|
|
||||||
|
|
||||||
# Add a job (runs daily at 9am)
|
|
||||||
python cli.py --cron add "Summarize my email inbox" --schedule "0 9 * * *"
|
|
||||||
|
|
||||||
# Quick syntax for simple intervals
|
|
||||||
python cli.py --cron add "Check server status" --every 30m
|
|
||||||
|
|
||||||
# Remove a job
|
|
||||||
python cli.py --cron remove <job_id>
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Agent self-scheduling** - Let the agent create its own cron jobs
|
|
||||||
- New tool: `schedule_task(prompt, schedule, session_mode)`
|
|
||||||
- "Remind me to check the deployment tomorrow at 9am"
|
|
||||||
- Agent can set follow-up tasks for itself
|
|
||||||
|
|
||||||
- [ ] **In-chat command:** `/cronjob {prompt} {frequency}` when using messaging integrations
|
|
||||||
|
|
||||||
**Files to create:** `cron/scheduler.py`, `cron/jobs.py`, `tools/schedule_tool.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 15. Text-to-Speech (TTS) 🔊
|
|
||||||
|
|
||||||
**Problem:** Agent can only respond with text. Some users prefer audio responses (accessibility, hands-free use, podcasts).
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **TTS tool** - Generate audio files from text
|
|
||||||
```python
|
|
||||||
tts_generate(text="Here's your summary...", voice="nova", output="summary.mp3")
|
|
||||||
```
|
|
||||||
- Returns path to generated audio file
|
|
||||||
- For messaging integrations: can send as voice message
|
|
||||||
|
|
||||||
- [ ] **Provider options:**
|
|
||||||
- Edge TTS (free, good quality, many voices)
|
|
||||||
- OpenAI TTS (paid, excellent quality)
|
|
||||||
- ElevenLabs (paid, best quality, voice cloning)
|
|
||||||
- Local options (Coqui TTS, Bark)
|
|
||||||
|
|
||||||
- [ ] **Modes:**
|
|
||||||
- On-demand: User explicitly asks "read this to me"
|
|
||||||
- Auto-TTS: Configurable to always generate audio for responses
|
|
||||||
- Long-text handling: Summarize or chunk very long responses
|
|
||||||
|
|
||||||
- [ ] **Integration with messaging:**
|
|
||||||
- When enabled, can send voice notes instead of/alongside text
|
|
||||||
- User preference per channel
|
|
||||||
|
|
||||||
**Files to create:** `tools/tts_tool.py`, config in `cli-config.yaml`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 16. Speech-to-Text / Audio Transcription 🎤
|
|
||||||
|
|
||||||
**Problem:** Users may want to send voice memos instead of typing. Agent is blind to audio content.
|
|
||||||
|
|
||||||
**Ideas:**
|
|
||||||
- [ ] **Voice memo transcription** - For messaging integrations
|
|
||||||
- User sends voice message → transcribe → process as text
|
|
||||||
- Seamless: user speaks, agent responds
|
|
||||||
|
|
||||||
- [ ] **Audio/video file transcription** - Existing idea, expanded:
|
|
||||||
- Transcribe local audio files (mp3, wav, m4a)
|
|
||||||
- Transcribe YouTube videos (download audio → transcribe)
|
|
||||||
- Extract key moments with timestamps
|
|
||||||
|
|
||||||
- [ ] **Provider options:**
|
|
||||||
- OpenAI Whisper API (good quality, cheap)
|
|
||||||
- Deepgram (fast, good for real-time)
|
|
||||||
- Local Whisper (free, runs on GPU)
|
|
||||||
- Groq Whisper (fast, free tier available)
|
|
||||||
|
|
||||||
- [ ] **Tool interface:**
|
|
||||||
```python
|
|
||||||
transcribe(source="audio.mp3") # Local file
|
|
||||||
transcribe(source="https://youtube.com/...") # YouTube
|
|
||||||
transcribe(source="voice_message", data=bytes) # Voice memo
|
|
||||||
```
|
|
||||||
|
|
||||||
**Files to create:** `tools/transcribe_tool.py`, integrate with messaging monitors
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Priority Order (Suggested)
|
|
||||||
|
|
||||||
1. **🎯 Subagent Architecture** - Critical for context management, enables everything else
|
|
||||||
2. **Memory & Context Management** - Complements subagents for remaining context
|
|
||||||
3. **Self-Reflection** - Improves reliability and reduces wasted tool calls
|
|
||||||
4. **Project-Local Context** - Practical win, keeps useful info across sessions
|
|
||||||
5. **Messaging Integrations** - Unlocks mobile access, new interaction patterns
|
|
||||||
6. **Scheduled Tasks / Cron Jobs** - Enables automation, reminders, monitoring
|
|
||||||
7. **Tool Composition** - Quality of life, builds on other improvements
|
|
||||||
8. **Dynamic Skills** - Force multiplier for repeated tasks
|
|
||||||
9. **Interactive Clarifying Questions** - Better UX for ambiguous tasks
|
|
||||||
10. **TTS / Audio Transcription** - Accessibility, hands-free use
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Removed Items (Unrealistic)
|
|
||||||
|
|
||||||
The following were removed because they're architecturally impossible:
|
|
||||||
|
|
||||||
- ~~Proactive suggestions / Prefetching~~ - Agent only runs on user request, can't interject
|
|
||||||
- ~~Clipboard integration~~ - No access to user's local system clipboard
|
|
||||||
|
|
||||||
The following **moved to active TODO** (now possible with new architecture):
|
|
||||||
|
|
||||||
- ~~Session save/restore~~ → See **Messaging Integrations** (session persistence)
|
|
||||||
- ~~Voice/TTS playback~~ → See **TTS** (can generate audio files, send via messaging)
|
|
||||||
- ~~Set reminders~~ → See **Scheduled Tasks / Cron Jobs**
|
|
||||||
|
|
||||||
The following were removed because they're **already possible**:
|
|
||||||
|
|
||||||
- ~~HTTP/API Client~~ → Use `curl` or Python `requests` in terminal
|
|
||||||
- ~~Structured Data Manipulation~~ → Use `pandas` in terminal
|
|
||||||
- ~~Git-Native Operations~~ → Use `git` CLI in terminal
|
|
||||||
- ~~Symbolic Math~~ → Use `SymPy` in terminal
|
|
||||||
- ~~Code Quality Tools~~ → Run linters (`eslint`, `black`, `mypy`) in terminal
|
|
||||||
- ~~Testing Framework~~ → Run `pytest`, `jest`, etc. in terminal
|
|
||||||
- ~~Translation~~ → LLM handles this fine, or use translation APIs
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Brainstorm Ideas (Not Yet Fleshed Out)
|
|
||||||
|
|
||||||
*These are early-stage ideas that need more thinking before implementation. Captured here so they don't get lost.*
|
|
||||||
|
|
||||||
### Remote/Distributed Execution 🌐
|
|
||||||
|
|
||||||
**Concept:** Run agent on a powerful remote server while interacting from a thin client.
|
|
||||||
|
|
||||||
**Why interesting:**
|
|
||||||
- Run on beefy GPU server for local LLM inference
|
|
||||||
- Agent has access to remote machine's resources (files, tools, internet)
|
|
||||||
- User interacts via lightweight client (phone, low-power laptop)
|
|
||||||
|
|
||||||
**Open questions:**
|
|
||||||
- How does this differ from simply SSH-ing into the remote machine and running `cli.py` there?
|
|
||||||
- Would need secure communication channel (WebSocket? gRPC?)
|
|
||||||
- How to handle tool outputs that reference remote paths?
|
|
||||||
- Credential management for remote execution
|
|
||||||
- Latency considerations for interactive use
|
|
||||||
|
|
||||||
**Possible architecture:**
|
|
||||||
```
|
|
||||||
┌─────────────┐ ┌─────────────────────────┐
|
|
||||||
│ Thin Client │ ◄─────► │ Remote Hermes Server │
|
|
||||||
│ (phone/web) │ WS/API │ - Full agent + tools │
|
|
||||||
└─────────────┘ │ - GPU for local LLM │
|
|
||||||
│ - Access to server files│
|
|
||||||
└─────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Related to:** Messaging integrations (could be the "server" that monitors receive from)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Multi-Agent Parallel Execution 🤖🤖
|
|
||||||
|
|
||||||
**Concept:** Extension of Subagent Architecture (Section 1) - run multiple subagents in parallel.
|
|
||||||
|
|
||||||
**Why interesting:**
|
|
||||||
- Independent subtasks don't need to wait for each other
|
|
||||||
- "Research X while setting up Y" - both run simultaneously
|
|
||||||
- Faster completion for complex multi-part tasks
|
|
||||||
|
|
||||||
**Open questions:**
|
|
||||||
- How to detect which tasks are truly independent?
|
|
||||||
- Resource management (API rate limits, concurrent connections)
|
|
||||||
- How to merge results when parallel tasks have conflicts?
|
|
||||||
- Cost implications of multiple parallel LLM calls
|
|
||||||
|
|
||||||
*Note: Basic subagent delegation (Section 1) should be implemented first; parallel execution is an optimization on top of it.*
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Plugin/Extension System 🔌
|
|
||||||
|
|
||||||
**Concept:** Allow users to add custom tools/skills without modifying core code.
|
|
||||||
|
|
||||||
**Why interesting:**
|
|
||||||
- Community contributions
|
|
||||||
- Organization-specific tools
|
|
||||||
- Clean separation of core vs. extensions
|
|
||||||
|
|
||||||
**Open questions:**
|
|
||||||
- Security implications of loading arbitrary code
|
|
||||||
- Versioning and compatibility
|
|
||||||
- Discovery and installation UX
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Last updated: $(date +%Y-%m-%d)* 🤖
|
|
||||||
Binary file not shown.
Binary file not shown.
6
agent/__init__.py
Normal file
6
agent/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
"""Agent internals -- extracted modules from run_agent.py.
|
||||||
|
|
||||||
|
These modules contain pure utility functions and self-contained classes
|
||||||
|
that were previously embedded in the 3,600-line run_agent.py. Extracting
|
||||||
|
them makes run_agent.py focused on the AIAgent orchestrator class.
|
||||||
|
"""
|
||||||
600
agent/auxiliary_client.py
Normal file
600
agent/auxiliary_client.py
Normal file
@@ -0,0 +1,600 @@
|
|||||||
|
"""Shared auxiliary OpenAI client for cheap/fast side tasks.
|
||||||
|
|
||||||
|
Provides a single resolution chain so every consumer (context compression,
|
||||||
|
session search, web extraction, vision analysis, browser vision) picks up
|
||||||
|
the best available backend without duplicating fallback logic.
|
||||||
|
|
||||||
|
Resolution order for text tasks (auto mode):
|
||||||
|
1. OpenRouter (OPENROUTER_API_KEY)
|
||||||
|
2. Nous Portal (~/.hermes/auth.json active provider)
|
||||||
|
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
||||||
|
4. Codex OAuth (Responses API via chatgpt.com with gpt-5.3-codex,
|
||||||
|
wrapped to look like a chat.completions client)
|
||||||
|
5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN)
|
||||||
|
— checked via PROVIDER_REGISTRY entries with auth_type='api_key'
|
||||||
|
6. None
|
||||||
|
|
||||||
|
Resolution order for vision/multimodal tasks (auto mode):
|
||||||
|
1. OpenRouter
|
||||||
|
2. Nous Portal
|
||||||
|
3. None (steps 3-5 are skipped — they may not support multimodal)
|
||||||
|
|
||||||
|
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
|
||||||
|
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
|
||||||
|
"openrouter", "nous", "codex", or "main" (= steps 3-5).
|
||||||
|
Default "auto" follows the chains above.
|
||||||
|
|
||||||
|
Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
|
||||||
|
AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
|
||||||
|
than the provider's default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from hermes_constants import OPENROUTER_BASE_URL
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Default auxiliary models for direct API-key providers (cheap/fast for side tasks)
|
||||||
|
_API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
|
||||||
|
"zai": "glm-4.5-flash",
|
||||||
|
"kimi-coding": "kimi-k2-turbo-preview",
|
||||||
|
"minimax": "MiniMax-M2.5-highspeed",
|
||||||
|
"minimax-cn": "MiniMax-M2.5-highspeed",
|
||||||
|
}
|
||||||
|
|
||||||
|
# OpenRouter app attribution headers
|
||||||
|
_OR_HEADERS = {
|
||||||
|
"HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
|
||||||
|
"X-OpenRouter-Title": "Hermes Agent",
|
||||||
|
"X-OpenRouter-Categories": "productivity,cli-agent",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Nous Portal extra_body for product attribution.
|
||||||
|
# Callers should pass this as extra_body in chat.completions.create()
|
||||||
|
# when the auxiliary client is backed by Nous Portal.
|
||||||
|
NOUS_EXTRA_BODY = {"tags": ["product=hermes-agent"]}
|
||||||
|
|
||||||
|
# Set at resolve time — True if the auxiliary client points to Nous Portal
|
||||||
|
auxiliary_is_nous: bool = False
|
||||||
|
|
||||||
|
# Default auxiliary models per provider
|
||||||
|
_OPENROUTER_MODEL = "google/gemini-3-flash-preview"
|
||||||
|
_NOUS_MODEL = "gemini-3-flash"
|
||||||
|
_NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1"
|
||||||
|
_AUTH_JSON_PATH = Path.home() / ".hermes" / "auth.json"
|
||||||
|
|
||||||
|
# Codex fallback: uses the Responses API (the only endpoint the Codex
|
||||||
|
# OAuth token can access) with a fast model for auxiliary tasks.
|
||||||
|
_CODEX_AUX_MODEL = "gpt-5.3-codex"
|
||||||
|
_CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Codex Responses → chat.completions adapter ─────────────────────────────
|
||||||
|
# All auxiliary consumers call client.chat.completions.create(**kwargs) and
|
||||||
|
# read response.choices[0].message.content. This adapter translates those
|
||||||
|
# calls to the Codex Responses API so callers don't need any changes.
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_content_for_responses(content: Any) -> Any:
    """Convert chat.completions content to Responses API format.

    chat.completions uses:
        {"type": "text", "text": "..."}
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}

    Responses API uses:
        {"type": "input_text", "text": "..."}
        {"type": "input_image", "image_url": "data:image/png;base64,..."}

    Args:
        content: A plain string, a list of content-part dicts, or any other
            value (e.g. ``None``).

    Returns:
        * the string itself when ``content`` is a string;
        * ``str(content)`` (or ``""`` when falsy) for non-list values;
        * a list of Responses-style parts for a list input, or ``""`` when
          no part could be converted.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content) if content else ""

    converted: List[Dict[str, Any]] = []
    for part in content:
        if not isinstance(part, dict):
            # Non-dict entries carry no recognizable type; drop them.
            continue
        ptype = part.get("type", "")
        if ptype == "text":
            # ``or ""`` normalizes an explicit null so we never emit text=None.
            converted.append({"type": "input_text", "text": part.get("text") or ""})
        elif ptype == "image_url":
            # chat.completions nests the URL: {"image_url": {"url": "..."}}
            image_data = part.get("image_url")
            if isinstance(image_data, dict):
                url = image_data.get("url") or ""
                detail = image_data.get("detail")
            else:
                # A bare string URL is accepted; a null/empty value must not
                # be stringified into the literal "None".
                url = str(image_data) if image_data else ""
                detail = None
            entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
            # Preserve detail if specified
            if detail:
                entry["detail"] = detail
            converted.append(entry)
        elif ptype in ("input_text", "input_image"):
            # Already in Responses format — pass through
            converted.append(part)
        else:
            # Unknown content type — try to preserve as text
            text = part.get("text", "")
            if text:
                converted.append({"type": "input_text", "text": text})

    return converted or ""
|
||||||
|
|
||||||
|
|
||||||
|
class _CodexCompletionsAdapter:
    """Drop-in shim that accepts chat.completions.create() kwargs and
    routes them through the Codex Responses streaming API."""

    def __init__(self, real_client: "OpenAI", model: str):
        # ``real_client`` must expose ``responses.stream(**kwargs)``;
        # ``model`` is the default used when create() gets no model.
        self._client = real_client
        self._model = model

    @staticmethod
    def _system_text(content: Any) -> str:
        """Flatten a system message's content into plain instruction text.

        A list of content parts is reduced to the newline-joined ``text``
        fields of its dict entries, so the ``instructions`` string never
        contains the Python repr of a list.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            pieces = [
                part.get("text") or ""
                for part in content
                if isinstance(part, dict)
            ]
            return "\n".join(piece for piece in pieces if piece)
        return str(content)

    def create(self, **kwargs) -> Any:
        """Run one chat.completions-style request via the Responses API.

        Recognized kwargs: ``messages``, ``model`` and ``tools``.  Other
        kwargs (``temperature``, ``max_tokens``, ...) are ignored.

        Returns:
            A ``SimpleNamespace`` mimicking a chat.completions response:
            ``choices[0].message.content`` / ``.tool_calls``,
            ``choices[0].finish_reason``, ``model`` and ``usage``.

        Raises:
            Whatever the underlying client raises; the error is logged at
            debug level and re-raised unchanged.
        """
        messages = kwargs.get("messages") or []
        # An explicit ``model=None`` falls back to the adapter default.
        model = kwargs.get("model") or self._model

        # Separate system/instructions from conversation messages.
        # Convert chat.completions multimodal content blocks to Responses
        # API format (input_text / input_image instead of text / image_url).
        instructions = "You are a helpful assistant."
        input_msgs: List[Dict[str, Any]] = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content") or ""
            if role == "system":
                # The last system message wins, as before.
                instructions = self._system_text(content)
            else:
                input_msgs.append({
                    "role": role,
                    "content": _convert_content_for_responses(content),
                })

        resp_kwargs: Dict[str, Any] = {
            "model": model,
            "instructions": instructions,
            "input": input_msgs or [{"role": "user", "content": ""}],
            "store": False,
        }

        # Note: the Codex endpoint (chatgpt.com/backend-api/codex) does NOT
        # support max_output_tokens or temperature — omit to avoid 400 errors.

        # Tools support for flush_memories and similar callers
        tools = kwargs.get("tools")
        if tools:
            converted = []
            for tool in tools:
                fn = (tool.get("function") or {}) if isinstance(tool, dict) else {}
                name = fn.get("name")
                if not name:
                    # A function tool without a name cannot be addressed.
                    continue
                converted.append({
                    "type": "function",
                    "name": name,
                    "description": fn.get("description", ""),
                    "parameters": fn.get("parameters", {}),
                })
            if converted:
                resp_kwargs["tools"] = converted

        # Stream and collect the response
        text_parts: List[str] = []
        tool_calls_raw: List[Any] = []
        usage = None

        try:
            with self._client.responses.stream(**resp_kwargs) as stream:
                # Drain the events; only the final response is used.
                for _event in stream:
                    pass
                final = stream.get_final_response()

            # Extract text and tool calls from the Responses output.
            # ``or []`` guards against attributes that exist but are None.
            for item in getattr(final, "output", None) or []:
                item_type = getattr(item, "type", None)
                if item_type == "message":
                    for part in getattr(item, "content", None) or []:
                        if getattr(part, "type", None) in ("output_text", "text"):
                            text_parts.append(getattr(part, "text", "") or "")
                elif item_type == "function_call":
                    tool_calls_raw.append(SimpleNamespace(
                        id=getattr(item, "call_id", ""),
                        type="function",
                        function=SimpleNamespace(
                            name=getattr(item, "name", ""),
                            arguments=getattr(item, "arguments", "{}"),
                        ),
                    ))

            resp_usage = getattr(final, "usage", None)
            if resp_usage:
                usage = SimpleNamespace(
                    prompt_tokens=getattr(resp_usage, "input_tokens", 0),
                    completion_tokens=getattr(resp_usage, "output_tokens", 0),
                    total_tokens=getattr(resp_usage, "total_tokens", 0),
                )
        except Exception as exc:
            logger.debug("Codex auxiliary Responses API call failed: %s", exc)
            raise

        content = "".join(text_parts).strip() or None

        # Build a response that looks like chat.completions
        message = SimpleNamespace(
            role="assistant",
            content=content,
            tool_calls=tool_calls_raw or None,
        )
        choice = SimpleNamespace(
            index=0,
            message=message,
            finish_reason="stop" if not tool_calls_raw else "tool_calls",
        )
        return SimpleNamespace(
            choices=[choice],
            model=model,
            usage=usage,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class _CodexChatShim:
    """Stand-in for the ``chat`` attribute of an OpenAI client.

    Stores the adapter under ``completions`` so that
    ``client.chat.completions.create(...)`` resolves to the adapter's
    ``create`` method.
    """

    def __init__(self, adapter: _CodexCompletionsAdapter):
        # The only attribute consumers reach for on ``client.chat``.
        self.completions = adapter
|
||||||
|
|
||||||
|
|
||||||
|
class CodexAuxiliaryClient:
    """Wrapper with an OpenAI-client-like surface backed by the Codex Responses API.

    ``client.chat.completions.create(**kwargs)`` is served by a
    ``_CodexCompletionsAdapter``.  ``api_key`` and ``base_url`` are copied
    from the wrapped client so that async wrappers can inspect them.
    """

    def __init__(self, real_client: OpenAI, model: str):
        self._real_client = real_client
        # Mirror the wrapped client's connection details for introspection.
        self.api_key = real_client.api_key
        self.base_url = real_client.base_url
        # chat -> completions -> create(), same attribute path as a real client.
        self.chat = _CodexChatShim(_CodexCompletionsAdapter(real_client, model))

    def close(self):
        """Close the wrapped client."""
        self._real_client.close()
|
||||||
|
|
||||||
|
|
||||||
|
class _AsyncCodexCompletionsAdapter:
    """Awaitable facade over the synchronous Codex Responses adapter.

    ``create`` hands the blocking call to ``asyncio.to_thread()`` so that
    async consumers (web_tools, session_search) can simply ``await`` it.
    """

    def __init__(self, sync_adapter: _CodexCompletionsAdapter):
        self._sync = sync_adapter

    async def create(self, **kwargs) -> Any:
        """Await the sync adapter's ``create(**kwargs)`` in a worker thread."""
        import asyncio

        blocking_create = self._sync.create
        response = await asyncio.to_thread(blocking_create, **kwargs)
        return response
|
||||||
|
|
||||||
|
|
||||||
|
class _AsyncCodexChatShim:
    """Async counterpart of the chat shim: exposes the adapter as ``completions``."""

    def __init__(self, adapter: _AsyncCodexCompletionsAdapter):
        # Reached as ``client.chat.completions`` by async consumers.
        self.completions = adapter
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncCodexAuxiliaryClient:
    """Async wrapper whose ``chat.completions.create()`` can be awaited.

    Built from an existing ``CodexAuxiliaryClient``: it reuses that
    wrapper's sync adapter and copies its ``api_key`` / ``base_url``.
    """

    def __init__(self, sync_wrapper: "CodexAuxiliaryClient"):
        self.api_key = sync_wrapper.api_key
        self.base_url = sync_wrapper.base_url
        # Wrap the sync adapter so each call runs through the async adapter.
        self.chat = _AsyncCodexChatShim(
            _AsyncCodexCompletionsAdapter(sync_wrapper.chat.completions)
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _read_nous_auth() -> Optional[dict]:
    """Read and validate ~/.hermes/auth.json for an active Nous provider.

    Returns:
        The ``providers["nous"]`` dict when the file exists, its
        ``active_provider`` is ``"nous"`` and the entry carries an
        ``agent_key`` or an ``access_token``; otherwise ``None``.  Any
        failure while reading or parsing is logged at debug level and
        also yields ``None`` (best-effort lookup, never raises).
    """
    try:
        if not _AUTH_JSON_PATH.is_file():
            return None
        # JSON files are UTF-8; don't depend on the platform's locale encoding.
        data = json.loads(_AUTH_JSON_PATH.read_text(encoding="utf-8"))
        if data.get("active_provider") != "nous":
            return None
        provider = data.get("providers", {}).get("nous", {})
        # Must have at least an access_token or agent_key
        if not provider.get("agent_key") and not provider.get("access_token"):
            return None
        return provider
    except Exception as exc:
        # Deliberately broad: a malformed auth file must not break resolution.
        logger.debug("Could not read Nous auth: %s", exc)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _nous_api_key(provider: dict) -> str:
    """Extract the best API key from a Nous provider state dict.

    Prefers ``agent_key`` and falls back to ``access_token``.  Always
    returns a string: ``""`` when neither key holds a truthy value,
    including the case where ``access_token`` is present but null.
    """
    return provider.get("agent_key") or provider.get("access_token") or ""
|
||||||
|
|
||||||
|
|
||||||
|
def _nous_base_url() -> str:
    """Resolve the Nous inference base URL from env or default.

    ``NOUS_INFERENCE_BASE_URL`` is used when it holds a non-blank value
    (surrounding whitespace is stripped).  An unset, empty or
    whitespace-only variable falls back to ``_NOUS_DEFAULT_BASE_URL``
    rather than producing an empty base URL.
    """
    override = os.getenv("NOUS_INFERENCE_BASE_URL", "").strip()
    return override or _NOUS_DEFAULT_BASE_URL
|
||||||
|
|
||||||
|
|
||||||
|
def _read_codex_access_token() -> Optional[str]:
    """Return the Codex OAuth access token from the Hermes auth store, if any.

    The token is looked up via ``hermes_cli.auth._read_codex_tokens()``
    under ``tokens.access_token``.  It is returned stripped when it is a
    non-blank string.  ``None`` is returned otherwise, and also when the
    lookup raises (the error is logged at debug level).
    """
    try:
        from hermes_cli.auth import _read_codex_tokens

        candidate = _read_codex_tokens().get("tokens", {}).get("access_token")
        if not isinstance(candidate, str):
            return None
        cleaned = candidate.strip()
        return cleaned if cleaned else None
    except Exception as exc:
        logger.debug("Could not read Codex auth for auxiliary client: %s", exc)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build a client for the first configured API-key provider.

    Walks ``PROVIDER_REGISTRY`` in its iteration order, considering only
    entries whose ``auth_type`` is ``"api_key"``.  The first entry with a
    non-blank API-key environment variable wins.

    Returns:
        ``(client, model)`` for that provider, or ``(None, None)`` when the
        registry cannot be imported or no provider has a key set.
    """
    try:
        from hermes_cli.auth import PROVIDER_REGISTRY
    except ImportError:
        logger.debug("Could not import PROVIDER_REGISTRY for API-key fallback")
        return None, None

    for provider_id, pconfig in PROVIDER_REGISTRY.items():
        if pconfig.auth_type != "api_key":
            continue

        # First non-blank value among the provider's env vars (lazy scan).
        env_values = (os.getenv(name, "").strip() for name in pconfig.api_key_env_vars)
        api_key = next((value for value in env_values if value), "")
        if not api_key:
            continue

        # Base URL: env override, then the Kimi Code special case
        # (sk-kimi- keys need api.kimi.com/coding/v1), then the registry default.
        override = (
            os.getenv(pconfig.base_url_env_var, "").strip()
            if pconfig.base_url_env_var
            else ""
        )
        if override:
            base_url = override.rstrip("/")
        elif provider_id == "kimi-coding" and api_key.startswith("sk-kimi-"):
            base_url = "https://api.kimi.com/coding/v1"
        else:
            base_url = pconfig.inference_base_url

        model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id, "default")
        logger.debug("Auxiliary text client: %s (%s)", pconfig.name, model)

        client_kwargs = {"api_key": api_key, "base_url": base_url}
        if "api.kimi.com" in base_url.lower():
            # Requests to api.kimi.com carry a KimiCLI User-Agent header.
            client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
        return OpenAI(**client_kwargs), model

    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Provider resolution helpers ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _get_auxiliary_provider(task: str = "") -> str:
    """Look up the provider override configured for an auxiliary task.

    For a non-empty ``task`` the environment variables
    ``AUXILIARY_{TASK}_PROVIDER`` and then ``CONTEXT_{TASK}_PROVIDER`` are
    consulted (task name upper-cased).  The first value that is non-blank
    and not ``"auto"`` is returned stripped and lower-cased; it is not
    validated here.  In every other case the result is ``"auto"``.
    """
    if not task:
        return "auto"

    task_key = task.upper()
    for prefix in ("AUXILIARY_", "CONTEXT_"):
        override = os.getenv(f"{prefix}{task_key}_PROVIDER", "").strip().lower()
        if override in ("", "auto"):
            continue
        return override
    return "auto"
|
||||||
|
|
||||||
|
|
||||||
|
def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return an OpenRouter client and its default auxiliary model.

    Yields ``(None, None)`` when ``OPENROUTER_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if api_key:
        logger.debug("Auxiliary client: OpenRouter")
        client = OpenAI(
            api_key=api_key,
            base_url=OPENROUTER_BASE_URL,
            default_headers=_OR_HEADERS,
        )
        return client, _OPENROUTER_MODEL
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def _try_nous() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build a Nous Portal auxiliary client from the stored Nous auth.

    Side effect: on success the module-level ``auxiliary_is_nous`` flag is
    set to ``True``.  This function never clears it.

    Returns:
        ``(client, _NOUS_MODEL)``, or ``(None, None)`` when
        ``_read_nous_auth()`` yields nothing.
    """
    global auxiliary_is_nous

    nous_auth = _read_nous_auth()
    if not nous_auth:
        return None, None

    auxiliary_is_nous = True
    logger.debug("Auxiliary client: Nous Portal")
    client = OpenAI(api_key=_nous_api_key(nous_auth), base_url=_nous_base_url())
    return client, _NOUS_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build a client for the user's custom OpenAI-compatible endpoint.

    Requires both ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``.  The model is
    taken from ``OPENAI_MODEL``, then ``LLM_MODEL``, and finally defaults to
    ``"gpt-4o-mini"``.

    Returns:
        ``(client, model)``, or ``(None, None)`` when either variable is
        missing or empty.
    """
    endpoint = os.getenv("OPENAI_BASE_URL")
    api_key = os.getenv("OPENAI_API_KEY")
    if not (endpoint and api_key):
        return None, None

    # First non-empty model variable wins; otherwise use the built-in default.
    model = "gpt-4o-mini"
    for env_name in ("OPENAI_MODEL", "LLM_MODEL"):
        configured = os.getenv(env_name)
        if configured:
            model = configured
            break

    logger.debug("Auxiliary client: custom endpoint (%s)", model)
    return OpenAI(api_key=api_key, base_url=endpoint), model
|
||||||
|
|
||||||
|
|
||||||
|
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
    """Build a Codex OAuth auxiliary client wrapped in ``CodexAuxiliaryClient``.

    Returns:
        ``(CodexAuxiliaryClient, _CODEX_AUX_MODEL)``, or ``(None, None)`` when
        ``_read_codex_access_token()`` returns nothing.
    """
    access_token = _read_codex_access_token()
    if access_token:
        logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
        underlying = OpenAI(api_key=access_token, base_url=_CODEX_AUX_BASE_URL)
        return CodexAuxiliaryClient(underlying, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
    """Resolve an explicitly requested auxiliary provider.

    Args:
        forced: Provider name, normally the value returned by
            ``_get_auxiliary_provider`` ("openrouter", "nous", "codex" or
            "main").

    Returns:
        ``(client, model)`` for the requested provider.  When a recognised
        provider has no usable credentials a warning is logged and
        ``(None, None)`` is returned.  An unrecognised name is logged and
        resolved through the auto-detection chain (``_resolve_auto``), as the
        warning message states.
    """
    # Providers backed by exactly one resolver, with the warning to emit when
    # that resolver finds no credentials.
    single_provider = {
        "openrouter": (
            _try_openrouter,
            "auxiliary.provider=openrouter but OPENROUTER_API_KEY not set",
        ),
        "nous": (
            _try_nous,
            "auxiliary.provider=nous but Nous Portal not configured (run: hermes login)",
        ),
        "codex": (
            _try_codex,
            "auxiliary.provider=codex but no Codex OAuth token found (run: hermes model)",
        ),
    }
    if forced in single_provider:
        try_fn, missing_msg = single_provider[forced]
        client, model = try_fn()
        if client is None:
            logger.warning(missing_msg)
        return client, model

    if forced == "main":
        # "main" = skip OpenRouter/Nous, use the main chat model's credentials.
        for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider):
            client, model = try_fn()
            if client is not None:
                return client, model
        logger.warning("auxiliary.provider=main but no main endpoint credentials found")
        return None, None

    # Unknown provider name: actually fall back to auto-detection.  Returning
    # (None, None) here would disable the auxiliary client because callers
    # return this function's result unchanged.
    logger.warning("Unknown auxiliary.provider=%r, falling back to auto", forced)
    return _resolve_auto()
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Run the auto-detection chain and return the first available client.

    Order: OpenRouter, Nous Portal, custom endpoint, Codex OAuth, API-key
    providers.  Returns ``(None, None)`` when none of them yields a client.
    """
    detection_chain = (
        _try_openrouter,
        _try_nous,
        _try_custom_endpoint,
        _try_codex,
        _resolve_api_key_provider,
    )
    for probe in detection_chain:
        found_client, found_model = probe()
        if found_client is None:
            continue
        return found_client, found_model

    logger.debug("Auxiliary client: none available")
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return ``(client, default_model_slug)`` for text-only auxiliary tasks.

    Args:
        task: Optional task name (e.g. "compression", "web_extract") used to
            look up a task-specific provider override.

    Returns:
        The forced provider's result when an override other than "auto" is
        configured, otherwise the result of the auto-detection chain.  Either
        may be ``(None, None)``.

    Callers may replace the returned model with a per-task env var
    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
    """
    provider = _get_auxiliary_provider(task)
    if provider == "auto":
        return _resolve_auto()
    return _resolve_forced_provider(provider)
|
||||||
|
|
||||||
|
|
||||||
|
def get_async_text_auxiliary_client(task: str = ""):
    """Return ``(async_client, model_slug)`` for async consumers.

    The sync client is resolved first via ``get_text_auxiliary_client``:

    * no client available -> ``(None, None)``
    * ``CodexAuxiliaryClient`` -> wrapped in ``AsyncCodexAuxiliaryClient``
    * anything else -> a new ``AsyncOpenAI`` built from the sync client's
      ``api_key`` and ``base_url``.  Default headers are re-derived from the
      URL: OpenRouter headers when it contains "openrouter", the Kimi
      User-Agent when it contains "api.kimi.com".
    """
    from openai import AsyncOpenAI

    sync_client, model = get_text_auxiliary_client(task)
    if sync_client is None:
        return None, None
    if isinstance(sync_client, CodexAuxiliaryClient):
        return AsyncCodexAuxiliaryClient(sync_client), model

    api_key = sync_client.api_key
    endpoint = str(sync_client.base_url)
    endpoint_lc = endpoint.lower()

    # Providers that need specific default headers are recognised by URL.
    headers = None
    if "openrouter" in endpoint_lc:
        headers = dict(_OR_HEADERS)
    elif "api.kimi.com" in endpoint_lc:
        headers = {"User-Agent": "KimiCLI/1.0"}

    if headers is None:
        return AsyncOpenAI(api_key=api_key, base_url=endpoint), model
    return AsyncOpenAI(api_key=api_key, base_url=endpoint, default_headers=headers), model
|
||||||
|
|
||||||
|
|
||||||
|
def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return ``(client, default_model_slug)`` for vision/multimodal tasks.

    A provider forced through the "vision" override (e.g.
    AUXILIARY_VISION_PROVIDER) is resolved with ``_resolve_forced_provider``.
    Otherwise providers are tried in this order: OpenRouter, Nous Portal,
    Codex OAuth, and finally the user's custom endpoint.  API-key providers
    are not part of the automatic vision chain.

    Callers may override the returned model with AUXILIARY_VISION_MODEL.

    Returns:
        ``(client, model)`` or ``(None, None)`` when nothing is available.
    """
    provider = _get_auxiliary_provider("vision")
    if provider != "auto":
        return _resolve_forced_provider(provider)

    # The custom endpoint is deliberately last: hosted multimodal providers
    # are preferred, but local models (Qwen-VL, LLaVA, Pixtral, etc.) can
    # handle vision too, and skipping them caused silent failures for
    # local-only users.
    vision_chain = (_try_openrouter, _try_nous, _try_codex, _try_custom_endpoint)
    for probe in vision_chain:
        found_client, found_model = probe()
        if found_client is None:
            continue
        return found_client, found_model

    logger.debug("Auxiliary vision client: none available")
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def get_auxiliary_extra_body() -> dict:
    """Return ``extra_body`` kwargs for auxiliary API calls.

    Returns a fresh copy of ``NOUS_EXTRA_BODY`` while the module-level
    ``auxiliary_is_nous`` flag is set, and an empty dict otherwise.
    """
    if auxiliary_is_nous:
        # Copy so callers can mutate the result without touching the constant.
        return dict(NOUS_EXTRA_BODY)
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def auxiliary_max_tokens_param(value: int) -> dict:
    """Return the max-tokens kwarg appropriate for the auxiliary provider.

    The choice is made from configuration, not from a resolved client:
    ``{"max_completion_tokens": value}`` is returned only when no
    ``OPENROUTER_API_KEY`` is set, ``_read_nous_auth()`` returns ``None`` and
    ``OPENAI_BASE_URL`` points at "api.openai.com".  Every other case,
    including the Codex adapter, gets ``{"max_tokens": value}``.
    """
    endpoint = os.getenv("OPENAI_BASE_URL", "")

    # OpenRouter and Nous Portal both accept the classic parameter.
    if os.getenv("OPENROUTER_API_KEY"):
        return {"max_tokens": value}
    if _read_nous_auth() is not None:
        return {"max_tokens": value}

    # Only a direct OpenAI custom endpoint needs the newer parameter name.
    if "api.openai.com" in endpoint.lower():
        return {"max_completion_tokens": value}
    return {"max_tokens": value}
|
||||||
365
agent/context_compressor.py
Normal file
365
agent/context_compressor.py
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
"""Automatic context window compression for long conversations.
|
||||||
|
|
||||||
|
Self-contained class with its own OpenAI client for summarization.
|
||||||
|
Uses Gemini Flash (cheap/fast) to summarize middle turns while
|
||||||
|
protecting head and tail context.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from agent.auxiliary_client import get_text_auxiliary_client
|
||||||
|
from agent.model_metadata import (
|
||||||
|
get_model_context_length,
|
||||||
|
estimate_messages_tokens_rough,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ContextCompressor:
    """Compress conversation context when approaching the model's context limit.

    Algorithm: keep the first ``protect_first_n`` and the last
    ``protect_last_n`` messages verbatim and replace everything in between
    with one model-generated summary message.  Token tracking uses the usage
    counts reported by API responses (see ``update_from_response``).
    """

    def __init__(
        self,
        model: str,
        threshold_percent: float = 0.85,
        protect_first_n: int = 3,
        protect_last_n: int = 4,
        summary_target_tokens: int = 2500,
        quiet_mode: bool = False,
        summary_model_override: Optional[str] = None,
        base_url: str = "",
    ):
        """Configure thresholds and resolve the summarisation client.

        Args:
            model: Main chat model; used to look up the context length.
            threshold_percent: Fraction of the context length at which
                compression should trigger.
            protect_first_n: Number of leading messages never summarised.
            protect_last_n: Number of trailing messages never summarised.
            summary_target_tokens: Target size requested for the summary; the
                completion cap is twice this value.
            quiet_mode: Suppress console output and sanitizer info logs.
            summary_model_override: Model to use instead of the auxiliary
                client's default.
            base_url: Endpoint passed to the context-length lookup.
        """
        self.model = model
        self.base_url = base_url
        self.threshold_percent = threshold_percent
        self.protect_first_n = protect_first_n
        self.protect_last_n = protect_last_n
        self.summary_target_tokens = summary_target_tokens
        self.quiet_mode = quiet_mode

        self.context_length = get_model_context_length(model, base_url=base_url)
        self.threshold_tokens = int(self.context_length * threshold_percent)
        self.compression_count = 0
        self._context_probed = False  # True after a step-down from context error

        self.last_prompt_tokens = 0
        self.last_completion_tokens = 0
        self.last_total_tokens = 0

        self.client, default_model = get_text_auxiliary_client("compression")
        self.summary_model = summary_model_override or default_model

    def update_from_response(self, usage: Dict[str, Any]):
        """Update tracked token usage from an API response's usage mapping."""
        # ``or 0`` also covers keys that are present but null, which would
        # otherwise break the numeric comparison in should_compress().
        self.last_prompt_tokens = usage.get("prompt_tokens") or 0
        self.last_completion_tokens = usage.get("completion_tokens") or 0
        self.last_total_tokens = usage.get("total_tokens") or 0

    def should_compress(self, prompt_tokens: Optional[int] = None) -> bool:
        """Return True when the prompt size has reached the threshold.

        Uses ``prompt_tokens`` when given, else the last recorded prompt size.
        """
        tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
        return tokens >= self.threshold_tokens

    def should_compress_preflight(self, messages: List[Dict[str, Any]]) -> bool:
        """Quick pre-flight check using a rough estimate (before an API call)."""
        rough_estimate = estimate_messages_tokens_rough(messages)
        return rough_estimate >= self.threshold_tokens

    def get_status(self) -> Dict[str, Any]:
        """Return current compression status for display/logging."""
        if self.context_length:
            usage_percent = self.last_prompt_tokens / self.context_length * 100
        else:
            usage_percent = 0
        return {
            "last_prompt_tokens": self.last_prompt_tokens,
            "threshold_tokens": self.threshold_tokens,
            "context_length": self.context_length,
            "usage_percent": usage_percent,
            "compression_count": self.compression_count,
        }

    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
        """Generate a concise summary of conversation turns.

        Tries the auxiliary model first, then falls back to the user's main
        endpoint.  Returns None if all attempts fail, in which case the
        caller drops the middle turns without a summary rather than injecting
        a useless placeholder.
        """
        parts = []
        for msg in turns_to_summarize:
            role = msg.get("role", "unknown")
            content = msg.get("content") or ""
            if not isinstance(content, str):
                # Structured content (e.g. a list of parts) is flattened to
                # text so the slicing/concatenation below stays string-only.
                content = str(content)
            if len(content) > 2000:
                content = content[:1000] + "\n...[truncated]...\n" + content[-500:]
            tool_calls = msg.get("tool_calls") or []
            if tool_calls:
                tool_names = [self._get_tool_call_name(tc) for tc in tool_calls]
                content += f"\n[Tool calls: {', '.join(tool_names)}]"
            parts.append(f"[{role.upper()}]: {content}")

        content_to_summarize = "\n\n".join(parts)
        prompt = f"""Summarize these conversation turns concisely. This summary will replace these turns in the conversation history.

Write from a neutral perspective describing:
1. What actions were taken (tool calls, searches, file operations)
2. Key information or results obtained
3. Important decisions or findings
4. Relevant data, file names, or outputs

Keep factual and informative. Target ~{self.summary_target_tokens} tokens.

---
TURNS TO SUMMARIZE:
{content_to_summarize}
---

Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""

        # 1. Try the auxiliary model (cheap/fast)
        if self.client:
            try:
                return self._call_summary_model(self.client, self.summary_model, prompt)
            except Exception as e:
                logger.warning("Failed to generate context summary with auxiliary model: %s", e)

        # 2. Fallback: try the user's main model endpoint
        fallback_client, fallback_model = self._get_fallback_client()
        if fallback_client is not None:
            try:
                logger.info("Retrying context summary with main model (%s)", fallback_model)
                summary = self._call_summary_model(fallback_client, fallback_model, prompt)
                # Remember the working endpoint for later compressions.
                self.client = fallback_client
                self.summary_model = fallback_model
                return summary
            except Exception as fallback_err:
                logger.warning("Main model summary also failed: %s", fallback_err)

        # 3. All models failed: return None so the caller drops turns without a summary
        logger.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.")
        return None

    def _call_summary_model(self, client, model: str, prompt: str) -> str:
        """Make the actual LLM call to generate a summary. Raises on failure."""
        kwargs = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3,
            "timeout": 30.0,
        }
        token_cap = self.summary_target_tokens * 2
        # Most providers (OpenRouter, local models) use max_tokens.
        # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
        # requires max_completion_tokens instead, so retry once with it when
        # the error points at the parameter.
        try:
            kwargs["max_tokens"] = token_cap
            response = client.chat.completions.create(**kwargs)
        except Exception as first_err:
            err_text = str(first_err)
            if "max_tokens" not in err_text and "unsupported_parameter" not in err_text:
                raise
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = token_cap
            response = client.chat.completions.create(**kwargs)

        content = response.choices[0].message.content
        if content is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("Summary model returned no content")
        summary = content.strip()
        if not summary.startswith("[CONTEXT SUMMARY]:"):
            summary = "[CONTEXT SUMMARY]: " + summary
        return summary

    def _get_fallback_client(self):
        """Try to build a fallback client from the main model's endpoint config.

        Uses the custom endpoint described by OPENAI_BASE_URL / OPENAI_API_KEY
        so compression can still produce a real summary when the primary
        auxiliary client fails (e.g. stale OpenRouter key).  An endpoint equal
        to the OpenRouter base URL is rejected.

        Returns (client, model) or (None, None).
        """
        custom_base = os.getenv("OPENAI_BASE_URL")
        custom_key = os.getenv("OPENAI_API_KEY")
        if not custom_base or not custom_key:
            return None, None

        # Don't fallback to the same provider that just failed
        from hermes_constants import OPENROUTER_BASE_URL
        if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"):
            return None, None

        model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model
        try:
            from openai import OpenAI as _OpenAI
            client = _OpenAI(api_key=custom_key, base_url=custom_base)
            logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base)
            return client, model
        except Exception as exc:
            logger.debug("Could not build fallback auxiliary client: %s", exc)
            return None, None

    # ------------------------------------------------------------------
    # Tool-call / tool-result pair integrity helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _get_tool_call_id(tc) -> str:
        """Extract the call ID from a tool_call entry (dict or SimpleNamespace)."""
        if isinstance(tc, dict):
            return tc.get("id", "")
        return getattr(tc, "id", "") or ""

    @staticmethod
    def _get_tool_call_name(tc) -> str:
        """Extract the function name from a tool_call entry (dict or object).

        Returns "?" when the name cannot be determined.
        """
        if isinstance(tc, dict):
            function = tc.get("function") or {}
            if isinstance(function, dict):
                return str(function.get("name", "?"))
            return str(getattr(function, "name", None) or "?")
        return str(getattr(getattr(tc, "function", None), "name", None) or "?")

    def _sanitize_tool_pairs(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fix orphaned tool_call / tool_result pairs after compression.

        Two failure modes:
        1. A tool *result* references a call_id whose assistant tool_call was
           removed (summarized/truncated).  The API rejects this with
           "No tool call found for function call output with call_id ...".
        2. An assistant message has tool_calls whose results were dropped.
           The API rejects this because every tool_call must be followed by
           a tool result with the matching call_id.

        Orphaned results are removed and stub results are inserted directly
        after the assistant message for calls without a result.  The input
        list is not mutated; a new list is returned when changes are needed.
        """
        surviving_call_ids: set = set()
        result_call_ids: set = set()
        for msg in messages:
            role = msg.get("role")
            if role == "assistant":
                for tc in msg.get("tool_calls") or []:
                    cid = self._get_tool_call_id(tc)
                    if cid:
                        surviving_call_ids.add(cid)
            elif role == "tool":
                cid = msg.get("tool_call_id")
                if cid:
                    result_call_ids.add(cid)

        # 1. Remove tool results whose call_id has no matching assistant tool_call
        orphaned_results = result_call_ids - surviving_call_ids
        if orphaned_results:
            messages = [
                m for m in messages
                if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results)
            ]
            if not self.quiet_mode:
                logger.info("Compression sanitizer: removed %d orphaned tool result(s)", len(orphaned_results))

        # 2. Add stub results for assistant tool_calls whose results were dropped
        missing_results = surviving_call_ids - result_call_ids
        if missing_results:
            patched: List[Dict[str, Any]] = []
            for msg in messages:
                patched.append(msg)
                if msg.get("role") != "assistant":
                    continue
                for tc in msg.get("tool_calls") or []:
                    cid = self._get_tool_call_id(tc)
                    if cid in missing_results:
                        patched.append({
                            "role": "tool",
                            "content": "[Result from earlier conversation — see context summary above]",
                            "tool_call_id": cid,
                        })
            messages = patched
            if not self.quiet_mode:
                logger.info("Compression sanitizer: added %d stub tool result(s)", len(missing_results))

        return messages

    def _align_boundary_forward(self, messages: List[Dict[str, Any]], idx: int) -> int:
        """Push a compress-start boundary forward past any tool results.

        If ``messages[idx]`` is a tool result, slide forward until we hit a
        non-tool message so the summarised region does not start mid-group.
        """
        while idx < len(messages) and messages[idx].get("role") == "tool":
            idx += 1
        return idx

    def _align_boundary_backward(self, messages: List[Dict[str, Any]], idx: int) -> int:
        """Pull a compress-end boundary backward so it does not split a
        tool_call / result group.

        ``idx`` is the first index of the kept tail.  Two cases move it back:

        * ``messages[idx]`` is a tool result: walk back over the preceding
          run of tool results; if the message before that run is an assistant
          message with tool_calls, the boundary moves to that assistant so
          the assistant and *all* of its results stay together in the tail.
          Without this, results after the boundary would lose their parent
          and be dropped by ``_sanitize_tool_pairs``.
        * the message just before ``idx`` is an assistant message with
          tool_calls: the boundary moves onto that assistant.

        Out-of-range indices are returned unchanged.
        """
        if idx <= 0 or idx >= len(messages):
            return idx

        if messages[idx].get("role") == "tool":
            parent = idx - 1
            while parent >= 0 and messages[parent].get("role") == "tool":
                parent -= 1
            if (
                parent >= 0
                and messages[parent].get("role") == "assistant"
                and messages[parent].get("tool_calls")
            ):
                return parent
            return idx

        prev = messages[idx - 1]
        if prev.get("role") == "assistant" and prev.get("tool_calls"):
            # Keep the calling assistant on the tail side of the boundary.
            return idx - 1
        return idx

    def compress(self, messages: List[Dict[str, Any]], current_tokens: Optional[int] = None) -> List[Dict[str, Any]]:
        """Compress conversation messages by summarizing middle turns.

        Keeps the first N and last N messages, summarizes everything in
        between, then cleans up orphaned tool_call / tool_result pairs so the
        API never receives mismatched IDs.

        Args:
            messages: Full conversation history.
            current_tokens: Token count to report in console output; falls
                back to the last recorded prompt size or a rough estimate.

        Returns:
            A new list of (shallow-copied) messages, or ``messages`` itself
            when there is nothing to compress.
        """
        n_messages = len(messages)
        min_required = self.protect_first_n + self.protect_last_n + 1
        if n_messages <= min_required:
            if not self.quiet_mode:
                print(f"⚠️ Cannot compress: only {n_messages} messages (need > {self.protect_first_n + self.protect_last_n + 1})")
            return messages

        compress_start = self.protect_first_n
        compress_end = n_messages - self.protect_last_n
        if compress_start >= compress_end:
            return messages

        # Adjust boundaries to avoid splitting tool_call/result groups.
        compress_start = self._align_boundary_forward(messages, compress_start)
        compress_end = self._align_boundary_backward(messages, compress_end)
        if compress_start >= compress_end:
            return messages

        turns_to_summarize = messages[compress_start:compress_end]
        display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)

        if not self.quiet_mode:
            print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
            print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
            print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")

        summary = self._generate_summary(turns_to_summarize)

        compressed = []
        for i, original in enumerate(messages[:compress_start]):
            msg = original.copy()
            # Annotate the system prompt once, on the first compression only.
            if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
                msg["content"] = (msg.get("content") or "") + "\n\n[Note: Some earlier conversation turns may be summarized to preserve context space.]"
            compressed.append(msg)

        if summary:
            # Pick the role that alternates with the last protected head message.
            last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
            summary_role = "user" if last_head_role in ("assistant", "tool") else "assistant"
            compressed.append({"role": summary_role, "content": summary})
        elif not self.quiet_mode:
            print(" ⚠️ No summary model available — middle turns dropped without summary")

        compressed.extend(m.copy() for m in messages[compress_end:])

        self.compression_count += 1

        compressed = self._sanitize_tool_pairs(compressed)

        if not self.quiet_mode:
            new_estimate = estimate_messages_tokens_rough(compressed)
            saved_estimate = display_tokens - new_estimate
            print(f" ✅ Compressed: {n_messages} → {len(compressed)} messages (~{saved_estimate:,} tokens saved)")
            print(f" 💡 Compression #{self.compression_count} complete")

        return compressed
|
||||||
469
agent/display.py
Normal file
469
agent/display.py
Normal file
@@ -0,0 +1,469 @@
|
|||||||
|
"""CLI presentation -- spinner, kawaii faces, tool preview formatting.
|
||||||
|
|
||||||
|
Pure display functions and classes with no AIAgent dependency.
|
||||||
|
Used by AIAgent._execute_tool_calls for CLI feedback.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
# ANSI escape codes for coloring tool failure indicators
|
||||||
|
_RED = "\033[31m"
|
||||||
|
_RESET = "\033[0m"
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Tool preview (one-line summary of a tool call's primary argument)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def build_tool_preview(tool_name: str, args: dict, max_len: int = 40) -> "str | None":
    """Build a short preview of a tool call's primary argument for display.

    Args:
        tool_name: Name of the tool being called.
        args: The tool call's arguments.  Values may have any type; text
            fields that are sliced for display are coerced to strings first
            so malformed arguments cannot crash the display code.
        max_len: Maximum length of the generic (primary-argument) preview;
            longer values are cut and suffixed with "...".

    Returns:
        A one-line preview string, or ``None`` when no preview can be built
        (unknown ``rl_`` tool, no usable argument, or an empty value).
    """
    primary_args = {
        "terminal": "command", "web_search": "query", "web_extract": "urls",
        "read_file": "path", "write_file": "path", "patch": "path",
        "search_files": "pattern", "browser_navigate": "url",
        "browser_click": "ref", "browser_type": "text",
        "image_generate": "prompt", "text_to_speech": "text",
        "vision_analyze": "question", "mixture_of_agents": "user_prompt",
        "skill_view": "name", "skills_list": "category",
        "schedule_cronjob": "name",
        "execute_code": "code", "delegate_task": "goal",
        "clarify": "question", "skill_manage": "name",
    }

    if tool_name == "process":
        action = _preview_text(args.get("action", ""))
        sid = _preview_text(args.get("session_id", ""))
        data = _preview_text(args.get("data", ""))
        timeout_val = args.get("timeout")
        parts = [action]
        if sid:
            parts.append(sid[:16])
        if data:
            parts.append(f'"{data[:20]}"')
        if timeout_val and action == "wait":
            parts.append(f"{timeout_val}s")
        # ``parts`` always contains at least the action.
        return " ".join(parts)

    if tool_name == "todo":
        todos_arg = args.get("todos")
        merge = args.get("merge", False)
        if todos_arg is None:
            return "reading task list"
        elif merge:
            return f"updating {len(todos_arg)} task(s)"
        else:
            return f"planning {len(todos_arg)} task(s)"

    if tool_name == "session_search":
        query = _preview_text(args.get("query", ""))
        return f"recall: \"{query[:25]}{'...' if len(query) > 25 else ''}\""

    if tool_name == "memory":
        action = args.get("action", "")
        target = args.get("target", "")
        if action == "add":
            content = _preview_text(args.get("content", ""))
            return f"+{target}: \"{content[:25]}{'...' if len(content) > 25 else ''}\""
        elif action == "replace":
            return f"~{target}: \"{_preview_text(args.get('old_text', ''))[:20]}\""
        elif action == "remove":
            return f"-{target}: \"{_preview_text(args.get('old_text', ''))[:20]}\""
        return action

    if tool_name == "send_message":
        target = args.get("target", "?")
        msg = _preview_text(args.get("message", ""))
        if len(msg) > 20:
            msg = msg[:17] + "..."
        return f"to {target}: \"{msg}\""

    if tool_name.startswith("rl_"):
        # Coerce once up front: the table below is built eagerly, so a
        # non-string run_id would otherwise break every rl_* preview.
        run_id = _preview_text(args.get("run_id", ""))[:16]
        rl_previews = {
            "rl_list_environments": "listing envs",
            "rl_select_environment": args.get("name", ""),
            "rl_get_current_config": "reading config",
            "rl_edit_config": f"{args.get('field', '')}={args.get('value', '')}",
            "rl_start_training": "starting",
            "rl_check_status": run_id,
            "rl_stop_training": f"stopping {run_id}",
            "rl_get_results": run_id,
            "rl_list_runs": "listing runs",
            "rl_test_inference": f"{args.get('num_steps', 3)} steps",
        }
        return rl_previews.get(tool_name)

    key = primary_args.get(tool_name)
    if not key:
        # Unknown tool: use the first commonly named argument that is present.
        for fallback_key in ("query", "text", "command", "path", "name", "prompt", "code", "goal"):
            if fallback_key in args:
                key = fallback_key
                break

    if not key or key not in args:
        return None

    value = args[key]
    if isinstance(value, list):
        # Preview only the first element of list-valued arguments.
        value = value[0] if value else ""

    preview = str(value).strip()
    if not preview:
        return None
    if len(preview) > max_len:
        preview = preview[:max_len - 3] + "..."
    return preview


def _preview_text(value) -> str:
    """Coerce an argument value to display text, mapping ``None`` to ``""``."""
    return "" if value is None else str(value)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# KawaiiSpinner
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class KawaiiSpinner:
    """Background-thread CLI spinner shown while a tool call is running.

    The spinner redraws a single terminal line (carriage return, no newline)
    from a daemon thread until :meth:`stop` is called.  It can be used as a
    context manager: entering starts the animation, leaving stops it.
    """

    SPINNERS = {
        'dots': ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'],
        'bounce': ['⠁', '⠂', '⠄', '⡀', '⢀', '⠠', '⠐', '⠈'],
        'grow': ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█', '▇', '▆', '▅', '▄', '▃', '▂'],
        'arrows': ['←', '↖', '↑', '↗', '→', '↘', '↓', '↙'],
        'star': ['✶', '✷', '✸', '✹', '✺', '✹', '✸', '✷'],
        'moon': ['🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘'],
        'pulse': ['◜', '◠', '◝', '◞', '◡', '◟'],
        'brain': ['🧠', '💭', '💡', '✨', '💫', '🌟', '💡', '💭'],
        'sparkle': ['⁺', '˚', '*', '✧', '✦', '✧', '*', '˚'],
    }

    KAWAII_WAITING = [
        "(。◕‿◕。)", "(◕‿◕✿)", "٩(◕‿◕。)۶", "(✿◠‿◠)", "( ˘▽˘)っ",
        "♪(´ε` )", "(◕ᴗ◕✿)", "ヾ(^∇^)", "(≧◡≦)", "(★ω★)",
    ]

    KAWAII_THINKING = [
        "(。•́︿•̀。)", "(◔_◔)", "(¬‿¬)", "( •_•)>⌐■-■", "(⌐■_■)",
        "(´・_・`)", "◉_◉", "(°ロ°)", "( ˘⌣˘)♡", "ヽ(>∀<☆)☆",
        "٩(๑❛ᴗ❛๑)۶", "(⊙_⊙)", "(¬_¬)", "( ͡° ͜ʖ ͡°)", "ಠ_ಠ",
    ]

    THINKING_VERBS = [
        "pondering", "contemplating", "musing", "cogitating", "ruminating",
        "deliberating", "mulling", "reflecting", "processing", "reasoning",
        "analyzing", "computing", "synthesizing", "formulating", "brainstorming",
    ]

    def __init__(self, message: str = "", spinner_type: str = 'dots'):
        """Prepare a stopped spinner.

        Args:
            message: Text drawn next to the animated frame.
            spinner_type: Key into ``SPINNERS``; unknown keys fall back to
                the ``'dots'`` frames.
        """
        # Remember the stream that is sys.stdout *right now*: a later
        # redirect_stdout(devnull) (e.g. from child agents) must not be able
        # to swallow the spinner output.
        self._out = sys.stdout
        self.message = message
        self.spinner_frames = self.SPINNERS.get(spinner_type, self.SPINNERS['dots'])
        self.frame_idx = 0
        self.last_line_len = 0
        self.start_time = None
        self.thread = None
        self.running = False

    def _write(self, text: str, end: str = '\n', flush: bool = False):
        """Emit ``text + end`` on the stream captured in ``__init__``.

        ValueError / OSError from the stream (for instance a closed file) are
        deliberately ignored: spinner output is best effort.
        """
        stream = self._out
        try:
            stream.write(text + end)
            if flush:
                stream.flush()
        except (ValueError, OSError):
            pass

    def _animate(self):
        """Thread body: redraw the spinner line until ``running`` is cleared."""
        while self.running:
            if not os.getenv("HERMES_SPINNER_PAUSE"):
                glyph = self.spinner_frames[self.frame_idx % len(self.spinner_frames)]
                seconds = time.time() - self.start_time
                rendered = f" {glyph} {self.message} ({seconds:.1f}s)"
                width = len(rendered)
                # Pad with spaces so leftovers of a longer previous line are erased.
                padding = ' ' * max(self.last_line_len - width, 0)
                self._write(f"\r{rendered}{padding}", end='', flush=True)
                self.last_line_len = width
                self.frame_idx += 1
                time.sleep(0.12)
            else:
                # Drawing is suspended while the env var is set; poll again shortly.
                time.sleep(0.1)

    def start(self):
        """Start the animation thread; does nothing if already running."""
        if not self.running:
            self.running = True
            self.start_time = time.time()
            self.thread = threading.Thread(target=self._animate, daemon=True)
            self.thread.start()

    def update_text(self, new_message: str):
        """Replace the message shown next to the spinner frame."""
        self.message = new_message

    def print_above(self, text: str):
        """Print one line of text, leaving the spinner to redraw below it.

        While the spinner is running, the current spinner line is blanked
        first and the animation thread repaints it on its next tick.  When it
        is not running the text is simply written.  Output always goes to the
        stream captured at construction time, not the current ``sys.stdout``.
        """
        if self.running:
            # Blank with spaces rather than \033[K: escape codes come out
            # garbled when prompt_toolkit's patch_stdout is active.
            eraser = ' ' * max(self.last_line_len + 5, 40)
            self._write(f"\r{eraser}\r {text}", flush=True)
        else:
            self._write(f" {text}", flush=True)

    def stop(self, final_message: str = None):
        """Stop the animation, blank the spinner line, optionally print a line.

        Args:
            final_message: If truthy, written (followed by a newline) after
                the spinner line has been cleared.
        """
        self.running = False
        worker = self.thread
        if worker:
            worker.join(timeout=0.5)
        # Spaces instead of \033[K, for the same patch_stdout reason as print_above().
        eraser = ' ' * max(self.last_line_len + 5, 40)
        self._write(f"\r{eraser}\r", end='', flush=True)
        if final_message:
            self._write(f" {final_message}", flush=True)

    def __enter__(self):
        """Start the spinner and return it."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the spinner; exceptions from the ``with`` body propagate."""
        self.stop()
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Kawaii face arrays (used by AIAgent._execute_tool_calls for spinner text)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
# Face pools, one per tool category.  Each list is a plain list of strings.
# NOTE: faces that start with a backslash spell it as "\\" — a bare "\(" is an
# invalid escape sequence (SyntaxWarning on Python 3.12+); the runtime value
# is the same single backslash either way.
KAWAII_SEARCH = [
    "♪(´ε` )", "(。◕‿◕。)", "ヾ(^∇^)", "(◕ᴗ◕✿)", "( ˘▽˘)っ",
    "٩(◕‿◕。)۶", "(✿◠‿◠)", "♪~(´ε` )", "(ノ´ヮ`)ノ*:・゚✧", "\\(◎o◎)/",
]

KAWAII_READ = [
    "φ(゜▽゜*)♪", "( ˘▽˘)っ", "(⌐■_■)", "٩(。•́‿•̀。)۶", "(◕‿◕✿)",
    "ヾ(@⌒ー⌒@)ノ", "(✧ω✧)", "♪(๑ᴖ◡ᴖ๑)♪", "(≧◡≦)", "( ´ ▽ ` )ノ",
]

KAWAII_TERMINAL = [
    "ヽ(>∀<☆)ノ", "(ノ°∀°)ノ", "٩(^ᴗ^)۶", "ヾ(⌐■_■)ノ♪", "(•̀ᴗ•́)و",
    "┗(^0^)┓", "(`・ω・´)", "\\( ̄▽ ̄)/", "(ง •̀_•́)ง", "ヽ(´▽`)/",
]

KAWAII_BROWSER = [
    "(ノ°∀°)ノ", "(☞゚ヮ゚)☞", "( ͡° ͜ʖ ͡°)", "┌( ಠ_ಠ)┘", "(⊙_⊙)?",
    "ヾ(•ω•`)o", "( ̄ω ̄)", "( ˇωˇ )", "(ᵔᴥᵔ)", "\\(◎o◎)/",
]

KAWAII_CREATE = [
    "✧*。٩(ˊᗜˋ*)و✧", "(ノ◕ヮ◕)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "٩(♡ε♡)۶", "(◕‿◕)♡",
    "✿◕ ‿ ◕✿", "(*≧▽≦)", "ヾ(^-^)ノ", "(☆▽☆)", "°˖✧◝(⁰▿⁰)◜✧˖°",
]

KAWAII_SKILL = [
    "ヾ(@⌒ー⌒@)ノ", "(๑˃ᴗ˂)ﻭ", "٩(◕‿◕。)۶", "(✿╹◡╹)", "ヽ(・∀・)ノ",
    "(ノ´ヮ`)ノ*:・゚✧", "♪(๑ᴖ◡ᴖ๑)♪", "(◠‿◠)", "٩(ˊᗜˋ*)و", "(^▽^)",
    "ヾ(^∇^)", "(★ω★)/", "٩(。•́‿•̀。)۶", "(◕ᴗ◕✿)", "\\(◎o◎)/",
    "(✧ω✧)", "ヽ(>∀<☆)ノ", "( ˘▽˘)っ", "(≧◡≦) ♡", "ヾ( ̄▽ ̄)",
]

KAWAII_THINK = [
    "(っ°Д°;)っ", "(;′⌒`)", "(・_・ヾ", "( ´_ゝ`)", "( ̄ヘ ̄)",
    "(。-`ω´-)", "( ˘︹˘ )", "(¬_¬)", "ヽ(ー_ー )ノ", "(;一_一)",
]

KAWAII_GENERIC = [
    "♪(´ε` )", "(◕‿◕✿)", "ヾ(^∇^)", "٩(◕‿◕。)۶", "(✿◠‿◠)",
    "(ノ´ヮ`)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "(☆▽☆)", "( ˘▽˘)っ", "(≧◡≦)",
]
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Cute tool message (completion line that replaces the spinner)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]:
|
||||||
|
"""Inspect a tool result string for signs of failure.
|
||||||
|
|
||||||
|
Returns ``(is_failure, suffix)`` where *suffix* is an informational tag
|
||||||
|
like ``" [exit 1]"`` for terminal failures, or ``" [error]"`` for generic
|
||||||
|
failures. On success, returns ``(False, "")``.
|
||||||
|
"""
|
||||||
|
if result is None:
|
||||||
|
return False, ""
|
||||||
|
|
||||||
|
if tool_name == "terminal":
|
||||||
|
try:
|
||||||
|
data = json.loads(result)
|
||||||
|
exit_code = data.get("exit_code")
|
||||||
|
if exit_code is not None and exit_code != 0:
|
||||||
|
return True, f" [exit {exit_code}]"
|
||||||
|
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||||
|
pass
|
||||||
|
return False, ""
|
||||||
|
|
||||||
|
# Memory-specific: distinguish "full" from real errors
|
||||||
|
if tool_name == "memory":
|
||||||
|
try:
|
||||||
|
data = json.loads(result)
|
||||||
|
if data.get("success") is False and "exceed the limit" in data.get("error", ""):
|
||||||
|
return True, " [full]"
|
||||||
|
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Generic heuristic for non-terminal tools
|
||||||
|
lower = result[:500].lower()
|
||||||
|
if '"error"' in lower or '"failed"' in lower or result.startswith("Error"):
|
||||||
|
return True, " [error]"
|
||||||
|
|
||||||
|
return False, ""
|
||||||
|
|
||||||
|
|
||||||
|
def get_cute_tool_message(
    tool_name: str, args: dict, duration: float, result: str | None = None,
) -> str:
    """Build the one-line completion message shown for a finished tool call.

    Shape of the line: ``┊ {emoji} {verb} {detail} {duration}``.

    Args:
        tool_name: Name of the tool that ran.
        args: The arguments the tool was called with.
        duration: Elapsed time in seconds (rendered with one decimal).
        result: Optional raw tool result.  It is passed to
            ``_detect_tool_failure``; when that reports a failure, its suffix
            (e.g. ``" [exit 1]"``) is appended to the line.

    Returns:
        The formatted line.  Tools without a dedicated format fall back to a
        generic line built from ``build_tool_preview``.
    """
    dur = f"{duration:.1f}s"
    is_failure, failure_suffix = _detect_tool_failure(tool_name, result)

    def _trunc(s, n=40):
        """Cut *s* to at most *n* characters, ending in "..." when shortened."""
        s = str(s)
        return (s[:n-3] + "...") if len(s) > n else s

    def _path(p, n=35):
        """Shorten a path from the left, keeping the trailing part visible."""
        p = str(p)
        return ("..." + p[-(n-3):]) if len(p) > n else p

    def _domain(url) -> str:
        """Return the host part of *url* (scheme stripped, path dropped)."""
        # str(... or "") tolerates a missing/None/non-string URL argument.
        return str(url or "").replace("https://", "").replace("http://", "").split("/")[0]

    def _wrap(line: str) -> str:
        """Append failure suffix when the tool failed."""
        if not is_failure:
            return line
        return f"{line}{failure_suffix}"

    if tool_name == "web_search":
        return _wrap(f"┊ 🔍 search {_trunc(args.get('query', ''), 42)} {dur}")
    if tool_name == "web_extract":
        urls = args.get("urls", [])
        if urls:
            if isinstance(urls, str):
                # A bare string is ONE url; without this, len() below would
                # count its characters and report bogus "+N" extra pages.
                urls = [urls]
            if isinstance(urls, (list, tuple)):
                first, more = urls[0], len(urls) - 1
            else:
                first, more = urls, 0
            extra = f" +{more}" if more > 0 else ""
            return _wrap(f"┊ 📄 fetch {_trunc(_domain(first), 35)}{extra} {dur}")
        return _wrap(f"┊ 📄 fetch pages {dur}")
    if tool_name == "web_crawl":
        return _wrap(f"┊ 🕸️ crawl {_trunc(_domain(args.get('url', '')), 35)} {dur}")
    if tool_name == "terminal":
        return _wrap(f"┊ 💻 $ {_trunc(args.get('command', ''), 42)} {dur}")
    if tool_name == "process":
        action = args.get("action", "?")
        # str(... or "") so a None / non-string session id cannot break slicing.
        sid = str(args.get("session_id") or "")[:12]
        labels = {"list": "ls processes", "poll": f"poll {sid}", "log": f"log {sid}",
                  "wait": f"wait {sid}", "kill": f"kill {sid}", "write": f"write {sid}", "submit": f"submit {sid}"}
        return _wrap(f"┊ ⚙️ proc {labels.get(action, f'{action} {sid}')} {dur}")
    if tool_name == "read_file":
        return _wrap(f"┊ 📖 read {_path(args.get('path', ''))} {dur}")
    if tool_name == "write_file":
        return _wrap(f"┊ ✍️ write {_path(args.get('path', ''))} {dur}")
    if tool_name == "patch":
        return _wrap(f"┊ 🔧 patch {_path(args.get('path', ''))} {dur}")
    if tool_name == "search_files":
        pattern = _trunc(args.get("pattern", ""), 35)
        target = args.get("target", "content")
        verb = "find" if target == "files" else "grep"
        return _wrap(f"┊ 🔎 {verb:9} {pattern} {dur}")
    if tool_name == "browser_navigate":
        return _wrap(f"┊ 🌐 navigate {_trunc(_domain(args.get('url', '')), 35)} {dur}")
    if tool_name == "browser_snapshot":
        mode = "full" if args.get("full") else "compact"
        return _wrap(f"┊ 📸 snapshot {mode} {dur}")
    if tool_name == "browser_click":
        return _wrap(f"┊ 👆 click {args.get('ref', '?')} {dur}")
    if tool_name == "browser_type":
        return _wrap(f"┊ ⌨️ type \"{_trunc(args.get('text', ''), 30)}\" {dur}")
    if tool_name == "browser_scroll":
        d = args.get("direction", "down")
        arrow = {"down": "↓", "up": "↑", "right": "→", "left": "←"}.get(d, "↓")
        return _wrap(f"┊ {arrow} scroll {d} {dur}")
    if tool_name == "browser_back":
        return _wrap(f"┊ ◀️ back {dur}")
    if tool_name == "browser_press":
        return _wrap(f"┊ ⌨️ press {args.get('key', '?')} {dur}")
    if tool_name == "browser_close":
        return _wrap(f"┊ 🚪 close browser {dur}")
    if tool_name == "browser_get_images":
        return _wrap(f"┊ 🖼️ images extracting {dur}")
    if tool_name == "browser_vision":
        return _wrap(f"┊ 👁️ vision analyzing page {dur}")
    if tool_name == "todo":
        todos_arg = args.get("todos")
        merge = args.get("merge", False)
        if todos_arg is None:
            return _wrap(f"┊ 📋 plan reading tasks {dur}")
        elif merge:
            return _wrap(f"┊ 📋 plan update {len(todos_arg)} task(s) {dur}")
        else:
            return _wrap(f"┊ 📋 plan {len(todos_arg)} task(s) {dur}")
    if tool_name == "session_search":
        return _wrap(f"┊ 🔍 recall \"{_trunc(args.get('query', ''), 35)}\" {dur}")
    if tool_name == "memory":
        action = args.get("action", "?")
        target = args.get("target", "")
        if action == "add":
            return _wrap(f"┊ 🧠 memory +{target}: \"{_trunc(args.get('content', ''), 30)}\" {dur}")
        elif action == "replace":
            return _wrap(f"┊ 🧠 memory ~{target}: \"{_trunc(args.get('old_text', ''), 20)}\" {dur}")
        elif action == "remove":
            return _wrap(f"┊ 🧠 memory -{target}: \"{_trunc(args.get('old_text', ''), 20)}\" {dur}")
        return _wrap(f"┊ 🧠 memory {action} {dur}")
    if tool_name == "skills_list":
        return _wrap(f"┊ 📚 skills list {args.get('category', 'all')} {dur}")
    if tool_name == "skill_view":
        return _wrap(f"┊ 📚 skill {_trunc(args.get('name', ''), 30)} {dur}")
    if tool_name == "image_generate":
        return _wrap(f"┊ 🎨 create {_trunc(args.get('prompt', ''), 35)} {dur}")
    if tool_name == "text_to_speech":
        return _wrap(f"┊ 🔊 speak {_trunc(args.get('text', ''), 30)} {dur}")
    if tool_name == "vision_analyze":
        return _wrap(f"┊ 👁️ vision {_trunc(args.get('question', ''), 30)} {dur}")
    if tool_name == "mixture_of_agents":
        return _wrap(f"┊ 🧠 reason {_trunc(args.get('user_prompt', ''), 30)} {dur}")
    if tool_name == "send_message":
        return _wrap(f"┊ 📨 send {args.get('target', '?')}: \"{_trunc(args.get('message', ''), 25)}\" {dur}")
    if tool_name == "schedule_cronjob":
        return _wrap(f"┊ ⏰ schedule {_trunc(args.get('name', args.get('prompt', 'task')), 30)} {dur}")
    if tool_name == "list_cronjobs":
        return _wrap(f"┊ ⏰ jobs listing {dur}")
    if tool_name == "remove_cronjob":
        return _wrap(f"┊ ⏰ remove job {args.get('job_id', '?')} {dur}")
    if tool_name.startswith("rl_"):
        # Every entry of the table is built eagerly, so the run id is
        # stringified once up front: a non-string run_id must not crash
        # unrelated rl_* tools such as rl_list_environments.
        run_id = str(args.get('run_id', '?'))[:12]
        rl = {
            "rl_list_environments": "list envs", "rl_select_environment": f"select {args.get('name', '')}",
            "rl_get_current_config": "get config", "rl_edit_config": f"set {args.get('field', '?')}",
            "rl_start_training": "start training", "rl_check_status": f"status {run_id}",
            "rl_stop_training": f"stop {run_id}", "rl_get_results": f"results {run_id}",
            "rl_list_runs": "list runs", "rl_test_inference": "test inference",
        }
        return _wrap(f"┊ 🧪 rl {rl.get(tool_name, tool_name.replace('rl_', ''))} {dur}")
    if tool_name == "execute_code":
        code = str(args.get("code") or "").strip()
        first_line = code.split("\n")[0] if code else ""
        return _wrap(f"┊ 🐍 exec {_trunc(first_line, 35)} {dur}")
    if tool_name == "delegate_task":
        tasks = args.get("tasks")
        if tasks and isinstance(tasks, list):
            return _wrap(f"┊ 🔀 delegate {len(tasks)} parallel tasks {dur}")
        return _wrap(f"┊ 🔀 delegate {_trunc(args.get('goal', ''), 35)} {dur}")

    # No dedicated format: generic line from the shared preview helper.
    preview = build_tool_preview(tool_name, args) or ""
    return _wrap(f"┊ ⚡ {tool_name[:9]:9} {_trunc(preview, 35)} {dur}")
|
||||||
818
agent/insights.py
Normal file
818
agent/insights.py
Normal file
@@ -0,0 +1,818 @@
|
|||||||
|
"""
|
||||||
|
Session Insights Engine for Hermes Agent.
|
||||||
|
|
||||||
|
Analyzes historical session data from the SQLite state database to produce
|
||||||
|
comprehensive usage insights — token consumption, cost estimates, tool usage
|
||||||
|
patterns, activity trends, model/platform breakdowns, and session metrics.
|
||||||
|
|
||||||
|
Inspired by Claude Code's /insights command, adapted for Hermes Agent's
|
||||||
|
multi-platform architecture with additional cost estimation and platform
|
||||||
|
breakdown capabilities.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from agent.insights import InsightsEngine
|
||||||
|
engine = InsightsEngine(db)
|
||||||
|
report = engine.generate(days=30)
|
||||||
|
print(engine.format_terminal(report))
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Model pricing (USD per million tokens) — approximate as of early 2026
|
||||||
|
# =========================================================================
|
||||||
|
MODEL_PRICING = {
|
||||||
|
# OpenAI
|
||||||
|
"gpt-4o": {"input": 2.50, "output": 10.00},
|
||||||
|
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
|
||||||
|
"gpt-4.1": {"input": 2.00, "output": 8.00},
|
||||||
|
"gpt-4.1-mini": {"input": 0.40, "output": 1.60},
|
||||||
|
"gpt-4.1-nano": {"input": 0.10, "output": 0.40},
|
||||||
|
"gpt-4.5-preview": {"input": 75.00, "output": 150.00},
|
||||||
|
"gpt-5": {"input": 10.00, "output": 30.00},
|
||||||
|
"gpt-5.4": {"input": 10.00, "output": 30.00},
|
||||||
|
"o3": {"input": 10.00, "output": 40.00},
|
||||||
|
"o3-mini": {"input": 1.10, "output": 4.40},
|
||||||
|
"o4-mini": {"input": 1.10, "output": 4.40},
|
||||||
|
# Anthropic
|
||||||
|
"claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
|
||||||
|
"claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
|
||||||
|
"claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
|
||||||
|
"claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
|
||||||
|
"claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
|
||||||
|
"claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
|
||||||
|
# DeepSeek
|
||||||
|
"deepseek-chat": {"input": 0.14, "output": 0.28},
|
||||||
|
"deepseek-reasoner": {"input": 0.55, "output": 2.19},
|
||||||
|
# Google
|
||||||
|
"gemini-2.5-pro": {"input": 1.25, "output": 10.00},
|
||||||
|
"gemini-2.5-flash": {"input": 0.15, "output": 0.60},
|
||||||
|
"gemini-2.0-flash": {"input": 0.10, "output": 0.40},
|
||||||
|
# Meta (via providers)
|
||||||
|
"llama-4-maverick": {"input": 0.50, "output": 0.70},
|
||||||
|
"llama-4-scout": {"input": 0.20, "output": 0.30},
|
||||||
|
# Z.AI / GLM (direct provider — pricing not published externally, treat as local)
|
||||||
|
"glm-5": {"input": 0.0, "output": 0.0},
|
||||||
|
"glm-4.7": {"input": 0.0, "output": 0.0},
|
||||||
|
"glm-4.5": {"input": 0.0, "output": 0.0},
|
||||||
|
"glm-4.5-flash": {"input": 0.0, "output": 0.0},
|
||||||
|
# Kimi / Moonshot (direct provider — pricing not published externally, treat as local)
|
||||||
|
"kimi-k2.5": {"input": 0.0, "output": 0.0},
|
||||||
|
"kimi-k2-thinking": {"input": 0.0, "output": 0.0},
|
||||||
|
"kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
|
||||||
|
"kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
|
||||||
|
# MiniMax (direct provider — pricing not published externally, treat as local)
|
||||||
|
"MiniMax-M2.5": {"input": 0.0, "output": 0.0},
|
||||||
|
"MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
|
||||||
|
"MiniMax-M2.1": {"input": 0.0, "output": 0.0},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fallback: unknown/custom models get zero cost (we can't assume pricing
|
||||||
|
# for self-hosted models, custom OAI endpoints, local inference, etc.)
|
||||||
|
_DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
|
||||||
|
|
||||||
|
|
||||||
|
def _has_known_pricing(model_name: str) -> bool:
|
||||||
|
"""Check if a model has known pricing (vs unknown/custom endpoint)."""
|
||||||
|
return _get_pricing(model_name) is not _DEFAULT_PRICING
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pricing(model_name: str) -> Dict[str, float]:
|
||||||
|
"""Look up pricing for a model. Uses fuzzy matching on model name.
|
||||||
|
|
||||||
|
Returns _DEFAULT_PRICING (zero cost) for unknown/custom models —
|
||||||
|
we can't assume costs for self-hosted endpoints, local inference, etc.
|
||||||
|
"""
|
||||||
|
if not model_name:
|
||||||
|
return _DEFAULT_PRICING
|
||||||
|
|
||||||
|
# Strip provider prefix (e.g., "anthropic/claude-..." -> "claude-...")
|
||||||
|
bare = model_name.split("/")[-1].lower()
|
||||||
|
|
||||||
|
# Exact match first
|
||||||
|
if bare in MODEL_PRICING:
|
||||||
|
return MODEL_PRICING[bare]
|
||||||
|
|
||||||
|
# Fuzzy prefix match — prefer the LONGEST matching key to avoid
|
||||||
|
# e.g. "gpt-4o" matching before "gpt-4o-mini" for "gpt-4o-mini-2024-07-18"
|
||||||
|
best_match = None
|
||||||
|
best_len = 0
|
||||||
|
for key, price in MODEL_PRICING.items():
|
||||||
|
if bare.startswith(key) and len(key) > best_len:
|
||||||
|
best_match = price
|
||||||
|
best_len = len(key)
|
||||||
|
if best_match:
|
||||||
|
return best_match
|
||||||
|
|
||||||
|
# Keyword heuristics (checked in most-specific-first order)
|
||||||
|
if "opus" in bare:
|
||||||
|
return {"input": 15.00, "output": 75.00}
|
||||||
|
if "sonnet" in bare:
|
||||||
|
return {"input": 3.00, "output": 15.00}
|
||||||
|
if "haiku" in bare:
|
||||||
|
return {"input": 0.80, "output": 4.00}
|
||||||
|
if "gpt-4o-mini" in bare:
|
||||||
|
return {"input": 0.15, "output": 0.60}
|
||||||
|
if "gpt-4o" in bare:
|
||||||
|
return {"input": 2.50, "output": 10.00}
|
||||||
|
if "gpt-5" in bare:
|
||||||
|
return {"input": 10.00, "output": 30.00}
|
||||||
|
if "deepseek" in bare:
|
||||||
|
return {"input": 0.14, "output": 0.28}
|
||||||
|
if "gemini" in bare:
|
||||||
|
return {"input": 0.15, "output": 0.60}
|
||||||
|
|
||||||
|
return _DEFAULT_PRICING
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the estimated USD cost of a token count for *model*.

    Rates come from ``_get_pricing`` and are per million tokens, hence the
    final division.
    """
    rates = _get_pricing(model)
    scaled_total = input_tokens * rates["input"] + output_tokens * rates["output"]
    return scaled_total / 1_000_000
|
||||||
|
|
||||||
|
|
||||||
|
def _format_duration(seconds: float) -> str:
    """Render a number of seconds as a short human-readable duration.

    Output forms: ``"45s"``, ``"12m"``, ``"3h"`` / ``"3h 20m"`` and ``"1.5d"``,
    chosen by magnitude (under a minute, under an hour, under a day, longer).
    """
    if seconds < 60:
        return f"{seconds:.0f}s"

    minutes = seconds / 60
    if minutes < 60:
        return f"{minutes:.0f}m"

    hours = minutes / 60
    if hours < 24:
        whole_hours = int(hours)
        leftover_minutes = int(minutes % 60)
        if leftover_minutes:
            return f"{whole_hours}h {leftover_minutes}m"
        return f"{whole_hours}h"

    return f"{hours / 24:.1f}d"
|
||||||
|
|
||||||
|
|
||||||
|
def _bar_chart(values: List[int], max_width: int = 20) -> List[str]:
    """Turn numbers into horizontal bars of ``█`` scaled to the largest value.

    Args:
        values: The numbers to chart.
        max_width: Bar length given to the largest value.

    Returns:
        One string per input value.  Positive values always get at least one
        block; zero or negative values get an empty string.
    """
    peak = max(values) if values else 1
    if peak == 0:
        return ["" for _ in values]

    bars = []
    for value in values:
        if value > 0:
            width = int(value / peak * max_width)
            bars.append("█" * max(1, width))
        else:
            bars.append("")
    return bars
|
||||||
|
|
||||||
|
|
||||||
|
class InsightsEngine:
|
||||||
|
"""
|
||||||
|
Analyzes session history and produces usage insights.
|
||||||
|
|
||||||
|
Works directly with a SessionDB instance (or raw sqlite3 connection)
|
||||||
|
to query session and message data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, db):
    """
    Initialize with a SessionDB instance.

    Args:
        db: A SessionDB instance (from hermes_state.py)
    """
    self.db = db
    # Every query in this class runs directly on the DB's underlying
    # connection.  NOTE(review): rows are read by column name and converted
    # with dict(row), so the connection presumably uses a mapping-style
    # row factory (e.g. sqlite3.Row) — confirm in SessionDB.
    self._conn = db._conn
|
||||||
|
|
||||||
|
def generate(self, days: int = 30, source: str = None) -> Dict[str, Any]:
    """
    Build the full insights report for a look-back window.

    Args:
        days: How many days back from now to include (default: 30)
        source: If given, restrict the report to this source platform

    Returns:
        Dict with the report.  When no session falls inside the window,
        "empty" is True and every section is an empty container; otherwise
        "empty" is False and "generated_at" holds the creation timestamp.
    """
    cutoff = time.time() - (days * 86400)

    # Raw data for the window
    sessions = self._get_sessions(cutoff, source)
    tool_usage = self._get_tool_usage(cutoff, source)
    message_stats = self._get_message_stats(cutoff, source)

    header = {"days": days, "source_filter": source}

    if not sessions:
        return {
            **header,
            "empty": True,
            "overview": {},
            "models": [],
            "platforms": [],
            "tools": [],
            "activity": {},
            "top_sessions": [],
        }

    # Sections are computed before the timestamp is taken, in report order.
    sections = {
        "overview": self._compute_overview(sessions, message_stats),
        "models": self._compute_model_breakdown(sessions),
        "platforms": self._compute_platform_breakdown(sessions),
        "tools": self._compute_tool_breakdown(tool_usage),
        "activity": self._compute_activity_patterns(sessions),
        "top_sessions": self._compute_top_sessions(sessions),
    }

    return {
        **header,
        "empty": False,
        "generated_at": time.time(),
        **sections,
    }
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Data gathering (SQL queries)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
# Columns we actually need (skip system_prompt, model_config blobs)
# This constant is interpolated into the SELECT text by _get_sessions();
# it is a fixed string, never caller-supplied input.
_SESSION_COLS = ("id, source, model, started_at, ended_at, "
                 "message_count, tool_call_count, input_tokens, output_tokens")
|
||||||
|
|
||||||
|
def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]:
    """Return sessions started at or after *cutoff*, newest first.

    Args:
        cutoff: Lower bound compared against ``started_at``.
        source: If truthy, only sessions with this ``source`` are returned.

    Returns:
        One dict per session holding the ``_SESSION_COLS`` columns.
    """
    if source:
        query = f"""SELECT {self._SESSION_COLS} FROM sessions
                    WHERE started_at >= ? AND source = ?
                    ORDER BY started_at DESC"""
        params = (cutoff, source)
    else:
        query = f"""SELECT {self._SESSION_COLS} FROM sessions
                    WHERE started_at >= ?
                    ORDER BY started_at DESC"""
        params = (cutoff,)

    rows = self._conn.execute(query, params).fetchall()
    return list(map(dict, rows))
|
||||||
|
|
||||||
|
def _get_tool_usage(self, cutoff: float, source: str = None) -> List[Dict]:
    """Count tool invocations in sessions started at or after *cutoff*.

    Two sources are combined:
      1. the ``tool_name`` column on 'tool' role messages (set by gateway);
      2. the ``tool_calls`` JSON on 'assistant' role messages (covers CLI
         sessions where ``tool_name`` is not populated on tool responses).

    The sources may describe the same calls, so for each tool the HIGHER of
    the two counts is used rather than their sum.

    Args:
        cutoff: Lower bound compared against the session's ``started_at``.
        source: If truthy, only sessions with this ``source`` are considered.

    Returns:
        ``[{"tool_name": ..., "count": ...}, ...]`` ordered by descending count.
    """
    # The optional filter is a fixed SQL fragment; the value itself is always
    # passed as a bound parameter.
    source_clause = " AND s.source = ?" if source else ""
    params = (cutoff, source) if source else (cutoff,)

    # Source 1: explicit tool_name on tool response messages
    named_counts = Counter()
    cursor = self._conn.execute(
        f"""SELECT m.tool_name, COUNT(*) as count
            FROM messages m
            JOIN sessions s ON s.id = m.session_id
            WHERE s.started_at >= ?{source_clause}
              AND m.role = 'tool' AND m.tool_name IS NOT NULL
            GROUP BY m.tool_name
            ORDER BY count DESC""",
        params,
    )
    for row in cursor.fetchall():
        named_counts[row["tool_name"]] += row["count"]

    # Source 2: function names inside the tool_calls JSON of assistant messages
    call_counts = Counter()
    cursor = self._conn.execute(
        f"""SELECT m.tool_calls
            FROM messages m
            JOIN sessions s ON s.id = m.session_id
            WHERE s.started_at >= ?{source_clause}
              AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""",
        params,
    )
    for row in cursor.fetchall():
        calls = row["tool_calls"]
        if isinstance(calls, str):
            try:
                calls = json.loads(calls)
            except json.JSONDecodeError:
                continue  # unparsable payload: skip this message
        if not isinstance(calls, list):
            continue
        for call in calls:
            # Validate each entry on its own so that one malformed call
            # (e.g. "function": null) does not discard the valid calls that
            # follow it in the same message.
            func = call.get("function") if isinstance(call, dict) else None
            name = func.get("name") if isinstance(func, dict) else None
            if isinstance(name, str) and name:
                call_counts[name] += 1

    # Counter union keeps, per tool, the larger of the two counts — and is
    # simply the non-empty side when only one source has data.
    merged = named_counts | call_counts

    return [
        {"tool_name": name, "count": count}
        for name, count in merged.most_common()
    ]
|
||||||
|
|
||||||
|
def _get_message_stats(self, cutoff: float, source: str = None) -> Dict:
    """Count messages, in total and per role, for sessions in the window.

    Args:
        cutoff: Lower bound compared against the session's ``started_at``.
        source: If truthy, only sessions with this ``source`` are considered.

    Returns:
        Dict with integer ``total_messages``, ``user_messages``,
        ``assistant_messages`` and ``tool_messages``.  All four are 0 when
        no message matches.
    """
    keys = ("total_messages", "user_messages", "assistant_messages", "tool_messages")

    # The optional filter is a fixed SQL fragment; the value itself is always
    # passed as a bound parameter.
    source_clause = " AND s.source = ?" if source else ""
    params = (cutoff, source) if source else (cutoff,)

    cursor = self._conn.execute(
        f"""SELECT
                COUNT(*) as total_messages,
                SUM(CASE WHEN m.role = 'user' THEN 1 ELSE 0 END) as user_messages,
                SUM(CASE WHEN m.role = 'assistant' THEN 1 ELSE 0 END) as assistant_messages,
                SUM(CASE WHEN m.role = 'tool' THEN 1 ELSE 0 END) as tool_messages
            FROM messages m
            JOIN sessions s ON s.id = m.session_id
            WHERE s.started_at >= ?{source_clause}""",
        params,
    )
    row = cursor.fetchone()
    if row is None:
        return dict.fromkeys(keys, 0)

    # An aggregate query yields a row even when nothing matches, and SUM()
    # over zero rows is NULL — normalise those Nones to 0 so callers always
    # receive integers.
    stats = dict(row)
    return {key: stats.get(key) or 0 for key in keys}
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Computation
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _compute_overview(self, sessions: List[Dict], message_stats: Dict) -> Dict:
    """Summarise the whole reporting window in a single dict.

    Totals (tokens, tool calls, messages), the estimated cost, duration
    figures and the first/last session start are derived from ``sessions``;
    the per-role message counts are copied from ``message_stats``.
    Missing or ``None`` numeric fields count as 0.
    """
    session_count = len(sessions)

    def _total(field):
        # Sum one numeric session field, treating None/missing as 0.
        return sum(entry.get(field) or 0 for entry in sessions)

    input_sum = _total("input_tokens")
    output_sum = _total("output_tokens")
    token_sum = input_sum + output_sum
    tool_call_sum = _total("tool_call_count")
    message_sum = _total("message_count")

    # Cost is estimated per session so each model's own pricing applies.
    cost_sum = 0.0
    priced_models = set()
    unpriced_models = set()
    for entry in sessions:
        model_id = entry.get("model") or ""
        cost_sum += _estimate_cost(
            model_id,
            entry.get("input_tokens") or 0,
            entry.get("output_tokens") or 0,
        )
        if "/" in model_id:
            label = model_id.rpartition("/")[2]
        else:
            label = model_id or "unknown"
        target = priced_models if _has_known_pricing(model_id) else unpriced_models
        target.add(label)

    # Only strictly positive durations are kept (guards against clock drift).
    spans = []
    for entry in sessions:
        began, finished = entry.get("started_at"), entry.get("ended_at")
        if began and finished and finished > began:
            spans.append(finished - began)

    if spans:
        hours_total = sum(spans) / 3600
        mean_span = sum(spans) / len(spans)
    else:
        hours_total = 0
        mean_span = 0

    starts = [entry["started_at"] for entry in sessions if entry.get("started_at")]
    first_start = min(starts) if starts else None
    last_start = max(starts) if starts else None

    return {
        "total_sessions": session_count,
        "total_messages": message_sum,
        "total_tool_calls": tool_call_sum,
        "total_input_tokens": input_sum,
        "total_output_tokens": output_sum,
        "total_tokens": token_sum,
        "estimated_cost": cost_sum,
        "total_hours": hours_total,
        "avg_session_duration": mean_span,
        "avg_messages_per_session": message_sum / session_count if sessions else 0,
        "avg_tokens_per_session": token_sum / session_count if sessions else 0,
        "user_messages": message_stats.get("user_messages") or 0,
        "assistant_messages": message_stats.get("assistant_messages") or 0,
        "tool_messages": message_stats.get("tool_messages") or 0,
        "date_range_start": first_start,
        "date_range_end": last_start,
        "models_with_pricing": sorted(priced_models),
        "models_without_pricing": sorted(unpriced_models),
    }
|
||||||
|
|
||||||
|
def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]:
    """Aggregate session usage per model.

    Sessions are grouped by the model name with any provider prefix
    (everything up to the last ``/``) removed; a missing model is grouped
    under ``"unknown"``.  The rows are ordered by total tokens, then by
    session count, both descending.
    """
    per_model = {}
    for entry in sessions:
        full_name = entry.get("model") or "unknown"
        # rpartition returns the whole string when there is no "/".
        short_name = full_name.rpartition("/")[2] if "/" in full_name else full_name
        if short_name not in per_model:
            per_model[short_name] = {
                "sessions": 0, "input_tokens": 0, "output_tokens": 0,
                "total_tokens": 0, "tool_calls": 0, "cost": 0.0,
            }
        bucket = per_model[short_name]
        bucket["sessions"] += 1
        tokens_in = entry.get("input_tokens") or 0
        tokens_out = entry.get("output_tokens") or 0
        bucket["input_tokens"] += tokens_in
        bucket["output_tokens"] += tokens_out
        bucket["total_tokens"] += tokens_in + tokens_out
        bucket["tool_calls"] += entry.get("tool_call_count") or 0
        bucket["cost"] += _estimate_cost(full_name, tokens_in, tokens_out)
        # Reflects the most recently seen session of this display name.
        bucket["has_pricing"] = _has_known_pricing(full_name)

    rows = [{"model": name, **stats} for name, stats in per_model.items()]
    # Tokens decide the order; session count breaks ties (e.g. all-zero tokens).
    return sorted(
        rows,
        key=lambda row: (row["total_tokens"], row["sessions"]),
        reverse=True,
    )
|
||||||
|
|
||||||
|
def _compute_platform_breakdown(self, sessions: List[Dict]) -> List[Dict]:
|
||||||
|
"""Break down usage by platform/source."""
|
||||||
|
platform_data = defaultdict(lambda: {
|
||||||
|
"sessions": 0, "messages": 0, "input_tokens": 0,
|
||||||
|
"output_tokens": 0, "total_tokens": 0, "tool_calls": 0,
|
||||||
|
})
|
||||||
|
|
||||||
|
for s in sessions:
|
||||||
|
source = s.get("source") or "unknown"
|
||||||
|
d = platform_data[source]
|
||||||
|
d["sessions"] += 1
|
||||||
|
d["messages"] += s.get("message_count") or 0
|
||||||
|
inp = s.get("input_tokens") or 0
|
||||||
|
out = s.get("output_tokens") or 0
|
||||||
|
d["input_tokens"] += inp
|
||||||
|
d["output_tokens"] += out
|
||||||
|
d["total_tokens"] += inp + out
|
||||||
|
d["tool_calls"] += s.get("tool_call_count") or 0
|
||||||
|
|
||||||
|
result = [
|
||||||
|
{"platform": platform, **data}
|
||||||
|
for platform, data in platform_data.items()
|
||||||
|
]
|
||||||
|
result.sort(key=lambda x: x["sessions"], reverse=True)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _compute_tool_breakdown(self, tool_usage: List[Dict]) -> List[Dict]:
|
||||||
|
"""Process tool usage data into a ranked list with percentages."""
|
||||||
|
total_calls = sum(t["count"] for t in tool_usage) if tool_usage else 0
|
||||||
|
result = []
|
||||||
|
for t in tool_usage:
|
||||||
|
pct = (t["count"] / total_calls * 100) if total_calls else 0
|
||||||
|
result.append({
|
||||||
|
"tool": t["tool_name"],
|
||||||
|
"count": t["count"],
|
||||||
|
"percentage": pct,
|
||||||
|
})
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _compute_activity_patterns(self, sessions: List[Dict]) -> Dict:
|
||||||
|
"""Analyze activity patterns by day of week and hour."""
|
||||||
|
day_counts = Counter() # 0=Monday ... 6=Sunday
|
||||||
|
hour_counts = Counter()
|
||||||
|
daily_counts = Counter() # date string -> count
|
||||||
|
|
||||||
|
for s in sessions:
|
||||||
|
ts = s.get("started_at")
|
||||||
|
if not ts:
|
||||||
|
continue
|
||||||
|
dt = datetime.fromtimestamp(ts)
|
||||||
|
day_counts[dt.weekday()] += 1
|
||||||
|
hour_counts[dt.hour] += 1
|
||||||
|
daily_counts[dt.strftime("%Y-%m-%d")] += 1
|
||||||
|
|
||||||
|
day_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
|
||||||
|
day_breakdown = [
|
||||||
|
{"day": day_names[i], "count": day_counts.get(i, 0)}
|
||||||
|
for i in range(7)
|
||||||
|
]
|
||||||
|
|
||||||
|
hour_breakdown = [
|
||||||
|
{"hour": i, "count": hour_counts.get(i, 0)}
|
||||||
|
for i in range(24)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Busiest day and hour
|
||||||
|
busiest_day = max(day_breakdown, key=lambda x: x["count"]) if day_breakdown else None
|
||||||
|
busiest_hour = max(hour_breakdown, key=lambda x: x["count"]) if hour_breakdown else None
|
||||||
|
|
||||||
|
# Active days (days with at least one session)
|
||||||
|
active_days = len(daily_counts)
|
||||||
|
|
||||||
|
# Streak calculation
|
||||||
|
if daily_counts:
|
||||||
|
all_dates = sorted(daily_counts.keys())
|
||||||
|
current_streak = 1
|
||||||
|
max_streak = 1
|
||||||
|
for i in range(1, len(all_dates)):
|
||||||
|
d1 = datetime.strptime(all_dates[i - 1], "%Y-%m-%d")
|
||||||
|
d2 = datetime.strptime(all_dates[i], "%Y-%m-%d")
|
||||||
|
if (d2 - d1).days == 1:
|
||||||
|
current_streak += 1
|
||||||
|
max_streak = max(max_streak, current_streak)
|
||||||
|
else:
|
||||||
|
current_streak = 1
|
||||||
|
else:
|
||||||
|
max_streak = 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
"by_day": day_breakdown,
|
||||||
|
"by_hour": hour_breakdown,
|
||||||
|
"busiest_day": busiest_day,
|
||||||
|
"busiest_hour": busiest_hour,
|
||||||
|
"active_days": active_days,
|
||||||
|
"max_streak": max_streak,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _compute_top_sessions(self, sessions: List[Dict]) -> List[Dict]:
|
||||||
|
"""Find notable sessions (longest, most messages, most tokens)."""
|
||||||
|
top = []
|
||||||
|
|
||||||
|
# Longest by duration
|
||||||
|
sessions_with_duration = [
|
||||||
|
s for s in sessions
|
||||||
|
if s.get("started_at") and s.get("ended_at")
|
||||||
|
]
|
||||||
|
if sessions_with_duration:
|
||||||
|
longest = max(
|
||||||
|
sessions_with_duration,
|
||||||
|
key=lambda s: (s["ended_at"] - s["started_at"]),
|
||||||
|
)
|
||||||
|
dur = longest["ended_at"] - longest["started_at"]
|
||||||
|
top.append({
|
||||||
|
"label": "Longest session",
|
||||||
|
"session_id": longest["id"][:16],
|
||||||
|
"value": _format_duration(dur),
|
||||||
|
"date": datetime.fromtimestamp(longest["started_at"]).strftime("%b %d"),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Most messages
|
||||||
|
most_msgs = max(sessions, key=lambda s: s.get("message_count") or 0)
|
||||||
|
if (most_msgs.get("message_count") or 0) > 0:
|
||||||
|
top.append({
|
||||||
|
"label": "Most messages",
|
||||||
|
"session_id": most_msgs["id"][:16],
|
||||||
|
"value": f"{most_msgs['message_count']} msgs",
|
||||||
|
"date": datetime.fromtimestamp(most_msgs["started_at"]).strftime("%b %d") if most_msgs.get("started_at") else "?",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Most tokens
|
||||||
|
most_tokens = max(
|
||||||
|
sessions,
|
||||||
|
key=lambda s: (s.get("input_tokens") or 0) + (s.get("output_tokens") or 0),
|
||||||
|
)
|
||||||
|
token_total = (most_tokens.get("input_tokens") or 0) + (most_tokens.get("output_tokens") or 0)
|
||||||
|
if token_total > 0:
|
||||||
|
top.append({
|
||||||
|
"label": "Most tokens",
|
||||||
|
"session_id": most_tokens["id"][:16],
|
||||||
|
"value": f"{token_total:,} tokens",
|
||||||
|
"date": datetime.fromtimestamp(most_tokens["started_at"]).strftime("%b %d") if most_tokens.get("started_at") else "?",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Most tool calls
|
||||||
|
most_tools = max(sessions, key=lambda s: s.get("tool_call_count") or 0)
|
||||||
|
if (most_tools.get("tool_call_count") or 0) > 0:
|
||||||
|
top.append({
|
||||||
|
"label": "Most tool calls",
|
||||||
|
"session_id": most_tools["id"][:16],
|
||||||
|
"value": f"{most_tools['tool_call_count']} calls",
|
||||||
|
"date": datetime.fromtimestamp(most_tools["started_at"]).strftime("%b %d") if most_tools.get("started_at") else "?",
|
||||||
|
})
|
||||||
|
|
||||||
|
return top
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Formatting
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def format_terminal(self, report: Dict) -> str:
    """Render an insights report as plain text for the terminal (CLI).

    An ``empty`` report produces a single "no sessions" sentence.  Otherwise
    the output consists of a boxed header followed by the overview and the
    optional model, platform, tool, activity and notable-session sections,
    joined with newlines.
    """
    if report.get("empty"):
        window = report.get("days", 30)
        suffix = ""
        if report.get("source_filter"):
            suffix = f" (source: {report['source_filter']})"
        return f" No sessions found in the last {window} days{suffix}."

    out = []
    emit = out.append
    ov = report["overview"]
    window = report["days"]
    source_name = report.get("source_filter")
    rule = " " + "─" * 56

    # Header box; the period label is centred inside a 58-column frame.
    title = f"Last {window} days"
    if source_name:
        title += f" ({source_name})"
    slack = 58 - len(title) - 2
    pad_left = slack // 2
    pad_right = slack - pad_left
    out.extend([
        "",
        " ╔══════════════════════════════════════════════════════════╗",
        " ║ 📊 Hermes Insights ║",
        f" ║{' ' * pad_left} {title} {' ' * pad_right}║",
        " ╚══════════════════════════════════════════════════════════╝",
        "",
    ])

    # Date range
    if ov.get("date_range_start") and ov.get("date_range_end"):
        first_day = datetime.fromtimestamp(ov["date_range_start"]).strftime("%b %d, %Y")
        last_day = datetime.fromtimestamp(ov["date_range_end"]).strftime("%b %d, %Y")
        emit(f" Period: {first_day} — {last_day}")
        emit("")

    # Overview
    emit(" 📋 Overview")
    emit(rule)
    emit(f" Sessions: {ov['total_sessions']:<12} Messages: {ov['total_messages']:,}")
    emit(f" Tool calls: {ov['total_tool_calls']:<12,} User messages: {ov['user_messages']:,}")
    emit(f" Input tokens: {ov['total_input_tokens']:<12,} Output tokens: {ov['total_output_tokens']:,}")
    cost_text = f"${ov['estimated_cost']:.2f}"
    if ov.get("models_without_pricing"):
        # The asterisk points at the footnote under the model table.
        cost_text += " *"
    emit(f" Total tokens: {ov['total_tokens']:<12,} Est. cost: {cost_text}")
    if ov["total_hours"] > 0:
        emit(f" Active time: ~{_format_duration(ov['total_hours'] * 3600):<11} Avg session: ~{_format_duration(ov['avg_session_duration'])}")
    emit(f" Avg msgs/session: {ov['avg_messages_per_session']:.1f}")
    emit("")

    # Model breakdown
    if report["models"]:
        emit(" 🤖 Models Used")
        emit(rule)
        emit(f" {'Model':<30} {'Sessions':>8} {'Tokens':>12} {'Cost':>8}")
        for row in report["models"]:
            short_model = row["model"][:28]
            price_cell = f"${row['cost']:>6.2f}" if row.get("has_pricing") else " N/A"
            emit(f" {short_model:<30} {row['sessions']:>8} {row['total_tokens']:>12,} {price_cell}")
        if ov.get("models_without_pricing"):
            emit(" * Cost N/A for custom/self-hosted models")
        emit("")

    # Platform breakdown (hidden when the only platform is the CLI itself)
    if len(report["platforms"]) > 1 or (report["platforms"] and report["platforms"][0]["platform"] != "cli"):
        emit(" 📱 Platforms")
        emit(rule)
        emit(f" {'Platform':<14} {'Sessions':>8} {'Messages':>10} {'Tokens':>14}")
        for row in report["platforms"]:
            emit(f" {row['platform']:<14} {row['sessions']:>8} {row['messages']:>10,} {row['total_tokens']:>14,}")
        emit("")

    # Tool usage (top 15)
    if report["tools"]:
        emit(" 🔧 Top Tools")
        emit(rule)
        emit(f" {'Tool':<28} {'Calls':>8} {'%':>8}")
        for row in report["tools"][:15]:
            emit(f" {row['tool']:<28} {row['count']:>8,} {row['percentage']:>7.1f}%")
        if len(report["tools"]) > 15:
            emit(f" ... and {len(report['tools']) - 15} more tools")
        emit("")

    # Activity patterns
    activity = report.get("activity", {})
    if activity.get("by_day"):
        emit(" 📅 Activity Patterns")
        emit(rule)

        # Day of week chart
        day_bars = _bar_chart([item["count"] for item in activity["by_day"]], max_width=15)
        for index, item in enumerate(activity["by_day"]):
            emit(f" {item['day']} {day_bars[index]:<15} {item['count']}")

        emit("")

        # Peak hours: the five busiest hours that saw any session at all
        ranked_hours = sorted(activity["by_hour"], key=lambda item: item["count"], reverse=True)
        peak_hours = [item for item in ranked_hours if item["count"] > 0][:5]
        if peak_hours:
            labels = [
                f"{item['hour'] % 12 or 12}{'AM' if item['hour'] < 12 else 'PM'} ({item['count']})"
                for item in peak_hours
            ]
            emit(f" Peak hours: {', '.join(labels)}")

        if activity.get("active_days"):
            emit(f" Active days: {activity['active_days']}")
        if activity.get("max_streak") and activity["max_streak"] > 1:
            emit(f" Best streak: {activity['max_streak']} consecutive days")
        emit("")

    # Notable sessions
    if report.get("top_sessions"):
        emit(" 🏆 Notable Sessions")
        emit(rule)
        for item in report["top_sessions"]:
            emit(f" {item['label']:<20} {item['value']:<18} ({item['date']}, {item['session_id']})")
        emit("")

    return "\n".join(out)
|
||||||
|
|
||||||
|
def format_gateway(self, report: Dict) -> str:
    """Render an insights report as compact markdown for gateway/messaging.

    An ``empty`` report produces a single "no sessions" sentence.  Otherwise
    the text holds the overview followed by the top 5 models, the platforms
    (only when there is more than one), the top 8 tools and an activity
    summary, joined with newlines.
    """
    if report.get("empty"):
        window = report.get("days", 30)
        return f"No sessions found in the last {window} days."

    ov = report["overview"]
    window = report["days"]
    out = [f"📊 **Hermes Insights** — Last {window} days\n"]

    # Overview
    out.append(f"**Sessions:** {ov['total_sessions']} | **Messages:** {ov['total_messages']:,} | **Tool calls:** {ov['total_tool_calls']:,}")
    out.append(f"**Tokens:** {ov['total_tokens']:,} (in: {ov['total_input_tokens']:,} / out: {ov['total_output_tokens']:,})")
    pricing_note = " _(excludes custom/self-hosted models)_" if ov.get("models_without_pricing") else ""
    out.append(f"**Est. cost:** ${ov['estimated_cost']:.2f}{pricing_note}")
    if ov["total_hours"] > 0:
        out.append(f"**Active time:** ~{_format_duration(ov['total_hours'] * 3600)} | **Avg session:** ~{_format_duration(ov['avg_session_duration'])}")
    out.append("")

    # Models (top 5)
    if report["models"]:
        out.append("**🤖 Models:**")
        for row in report["models"][:5]:
            price = f"${row['cost']:.2f}" if row.get("has_pricing") else "N/A"
            out.append(f" {row['model'][:25]} — {row['sessions']} sessions, {row['total_tokens']:,} tokens, {price}")
        out.append("")

    # Platforms (if multi-platform)
    if len(report["platforms"]) > 1:
        out.append("**📱 Platforms:**")
        out.extend(
            f" {row['platform']} — {row['sessions']} sessions, {row['messages']:,} msgs"
            for row in report["platforms"]
        )
        out.append("")

    # Tools (top 8)
    if report["tools"]:
        out.append("**🔧 Top Tools:**")
        out.extend(
            f" {row['tool']} — {row['count']:,} calls ({row['percentage']:.1f}%)"
            for row in report["tools"][:8]
        )
        out.append("")

    # Activity summary
    activity = report.get("activity", {})
    if activity.get("busiest_day") and activity.get("busiest_hour"):
        peak = activity["busiest_hour"]["hour"]
        meridiem = "AM" if peak < 12 else "PM"
        clock_hour = peak % 12 or 12  # 0 and 12 are both shown as 12
        out.append(f"**📅 Busiest:** {activity['busiest_day']['day']}s ({activity['busiest_day']['count']} sessions), {clock_hour}{meridiem} ({activity['busiest_hour']['count']} sessions)")
        if activity.get("active_days"):
            out.append(f"**Active days:** {activity['active_days']}")
        if activity.get("max_streak", 0) > 1:
            out.append(f"**Best streak:** {activity['max_streak']} consecutive days")

    return "\n".join(out)
|
||||||
224
agent/model_metadata.py
Normal file
224
agent/model_metadata.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""Model metadata, context lengths, and token estimation utilities.
|
||||||
|
|
||||||
|
Pure utility functions with no AIAgent dependency. Used by ContextCompressor
|
||||||
|
and run_agent.py for pre-flight context checks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from hermes_constants import OPENROUTER_MODELS_URL
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# In-process cache of model metadata keyed by model id; filled and returned by
# fetch_model_metadata().
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
# time.time() value recorded when the cache was last filled (0 = never).
_model_metadata_cache_time: float = 0
# Cache lifetime in seconds (one hour).
_MODEL_CACHE_TTL = 3600

# Descending tiers for context length probing when the model is unknown.
# We start high and step down on context-length errors until one works.
CONTEXT_PROBE_TIERS = [
    2_000_000,
    1_000_000,
    512_000,
    200_000,
    128_000,
    64_000,
    32_000,
]

# Hardcoded fallback context lengths (in tokens) keyed by model name.
# get_model_context_length() consults this table with substring matching
# when neither the persistent cache nor the fetched metadata knows the model.
DEFAULT_CONTEXT_LENGTHS = {
    "anthropic/claude-opus-4": 200000,
    "anthropic/claude-opus-4.5": 200000,
    "anthropic/claude-opus-4.6": 200000,
    "anthropic/claude-sonnet-4": 200000,
    "anthropic/claude-sonnet-4-20250514": 200000,
    "anthropic/claude-haiku-4.5": 200000,
    "openai/gpt-4o": 128000,
    "openai/gpt-4-turbo": 128000,
    "openai/gpt-4o-mini": 128000,
    "google/gemini-2.0-flash": 1048576,
    "google/gemini-2.5-pro": 1048576,
    "meta-llama/llama-3.3-70b-instruct": 131072,
    "deepseek/deepseek-chat-v3": 65536,
    "qwen/qwen-2.5-72b-instruct": 32768,
    "glm-4.7": 202752,
    "glm-5": 202752,
    "glm-4.5": 131072,
    "glm-4.5-flash": 131072,
    "kimi-k2.5": 262144,
    "kimi-k2-thinking": 262144,
    "kimi-k2-turbo-preview": 262144,
    "kimi-k2-0905-preview": 131072,
    "MiniMax-M2.5": 204800,
    "MiniMax-M2.5-highspeed": 204800,
    "MiniMax-M2.1": 204800,
}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from OpenRouter (cached for 1 hour).

    Args:
        force_refresh: Skip the freshness check and always hit the network.

    Returns:
        A dict keyed by model id (and, when different, by the model's
        ``canonical_slug``) whose values hold ``context_length``,
        ``max_completion_tokens``, ``name`` and ``pricing``.  Fields that
        are missing *or null* in the response fall back to their defaults
        (128000 / 4096 / the model id / ``{}``).  On any failure the last
        cached result (possibly empty) is returned; this function never
        raises.
    """
    global _model_metadata_cache, _model_metadata_cache_time

    cache_is_fresh = (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL
    if not force_refresh and _model_metadata_cache and cache_is_fresh:
        return _model_metadata_cache

    try:
        response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
        response.raise_for_status()
        data = response.json()

        cache = {}
        for model in data.get("data") or []:
            model_id = model.get("id") or ""
            if not model_id:
                # An entry without an id cannot be looked up; skip it.
                continue
            # ``or`` instead of a .get() default: a key that is present
            # with a null value must not leak None into the cache, and a
            # null ``top_provider`` must not abort the whole refresh.
            top_provider = model.get("top_provider") or {}
            entry = {
                "context_length": model.get("context_length") or 128000,
                "max_completion_tokens": top_provider.get("max_completion_tokens") or 4096,
                "name": model.get("name") or model_id,
                "pricing": model.get("pricing") or {},
            }
            cache[model_id] = entry
            canonical = model.get("canonical_slug") or ""
            if canonical and canonical != model_id:
                cache[canonical] = entry

        _model_metadata_cache = cache
        _model_metadata_cache_time = time.time()
        logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
        return cache

    except Exception as e:
        # Best effort by design: metadata is optional, so fall back to the
        # stale cache.  Logged through the module logger, not the root one.
        logger.warning("Failed to fetch model metadata from OpenRouter: %s", e)
        return _model_metadata_cache or {}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_context_cache_path() -> Path:
|
||||||
|
"""Return path to the persistent context length cache file."""
|
||||||
|
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
||||||
|
return hermes_home / "context_length_cache.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_context_cache() -> Dict[str, int]:
    """Load the model+provider → context_length cache from disk.

    Returns:
        The mapping stored under the ``context_lengths`` key of the cache
        file.  An empty dict is returned when the file does not exist,
        cannot be read or parsed, or does not contain a mapping there
        (e.g. ``context_lengths: null``), so callers can always use dict
        methods on the result.
    """
    path = _get_context_cache_path()
    if not path.exists():
        return {}
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best effort: an unreadable cache only costs a re-probe.
        logger.debug("Failed to load context length cache: %s", e)
        return {}

    lengths = data.get("context_lengths") if isinstance(data, dict) else None
    return lengths if isinstance(lengths, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def save_context_length(model: str, base_url: str, length: int) -> None:
    """Record a discovered context length for one model/provider pair.

    Entries are keyed by ``model@base_url`` so that one model name can have
    different limits on different providers.  Nothing is written when the
    stored value already equals ``length``; write failures are logged at
    debug level and otherwise ignored.
    """
    cache_key = f"{model}@{base_url}"
    known = _load_context_cache()
    if known.get(cache_key) == length:
        # Already stored; avoid rewriting the file.
        return

    known[cache_key] = length
    target = _get_context_cache_path()
    payload = {"context_lengths": known}
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w") as handle:
            yaml.dump(payload, handle, default_flow_style=False)
        logger.info("Cached context length %s → %s tokens", cache_key, f"{length:,}")
    except Exception as exc:
        logger.debug("Failed to save context length cache: %s", exc)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    """Return the stored context length for ``model@base_url``, or None if absent."""
    return _load_context_cache().get(f"{model}@{base_url}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_next_probe_tier(current_length: int) -> Optional[int]:
    """Pick the first entry of CONTEXT_PROBE_TIERS below ``current_length``.

    Returns None when no tier is smaller than ``current_length``.
    """
    return next(
        (candidate for candidate in CONTEXT_PROBE_TIERS if candidate < current_length),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    """Try to extract the actual context limit from an API error message.

    Many providers include the limit in their error text, e.g.:
    - "maximum context length is 32768 tokens"
    - "context_length_exceeded: 131072"
    - "Maximum context size 32768 exceeded"
    - "model's max context length is 65536"

    Args:
        error_msg: The provider's error text (matched case-insensitively).

    Returns:
        The first number of at least four digits found next to a
        context-related keyword that lies in the range 1024..10,000,000,
        or None when the message is empty or contains no such number.
    """
    if not error_msg:
        return None
    error_lower = error_msg.lower()
    # Patterns are tried in order; the first one yielding a sane value wins.
    patterns = [
        r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
        # Tolerates snake_case / hyphenated error codes such as
        # "context_length_exceeded: 131072", which plain \s* cannot cross.
        r'context[\s_-]*(?:length|size|window)(?:[\s_-]*exceeded)?\s*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
        r'>\s*(\d{4,})\s*(?:max|limit|token)',  # "250000 tokens > 200000 maximum"
        r'(\d{4,})\s*(?:max(?:imum)?)\b',  # "200000 maximum"
    ]
    for pattern in patterns:
        match = re.search(pattern, error_lower)
        if match:
            limit = int(match.group(1))
            # Sanity check: must be a reasonable context length
            if 1024 <= limit <= 10_000_000:
                return limit
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_context_length(model: str, base_url: str = "") -> int:
    """Get the context length for a model.

    Resolution order:
    1. Persistent cache (previously discovered via probing)
    2. OpenRouter API metadata
    3. Hardcoded DEFAULT_CONTEXT_LENGTHS (exact, then most specific fuzzy match)
    4. First probe tier (2M) — will be narrowed on first context error

    Args:
        model: Model name as configured; may carry a provider prefix.
        base_url: Provider base URL; the persistent cache is only consulted
            when this is non-empty.

    Returns:
        The context length in tokens.
    """
    # 1. Check persistent cache (model+provider)
    if base_url:
        cached = get_cached_context_length(model, base_url)
        if cached is not None:
            return cached

    # 2. OpenRouter API metadata
    metadata = fetch_model_metadata()
    if model in metadata:
        # ``or`` also covers an entry whose context_length is present but null.
        return metadata[model].get("context_length") or 128000

    # 3. Hardcoded defaults.  An empty model name is skipped: "" is a
    # substring of every key and would always hit the first table entry.
    if model:
        if model in DEFAULT_CONTEXT_LENGTHS:
            return DEFAULT_CONTEXT_LENGTHS[model]
        # Known name embedded in the requested one: the longest key is the
        # most specific (e.g. "glm-4.5-flash" beats "glm-4.5").
        contained = [name for name in DEFAULT_CONTEXT_LENGTHS if name in model]
        if contained:
            return DEFAULT_CONTEXT_LENGTHS[max(contained, key=len)]
        # Requested name embedded in a known one (e.g. provider prefix
        # omitted): the shortest key is the closest match.
        containing = [name for name in DEFAULT_CONTEXT_LENGTHS if model in name]
        if containing:
            return DEFAULT_CONTEXT_LENGTHS[min(containing, key=len)]

    # 4. Unknown model — start at highest probe tier
    return CONTEXT_PROBE_TIERS[0]
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_tokens_rough(text: str) -> int:
    """Approximate the token count of ``text`` at ~4 characters per token.

    Intended for pre-flight checks only; empty or ``None`` input gives 0.
    """
    return len(text) // 4 if text else 0
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Approximate the token count of a message list (pre-flight only).

    Each message is measured by the length of its ``str()`` form and the
    combined length is divided by 4 (~4 characters per token).
    """
    char_count = 0
    for message in messages:
        char_count += len(str(message))
    return char_count // 4
|
||||||
387
agent/prompt_builder.py
Normal file
387
agent/prompt_builder.py
Normal file
@@ -0,0 +1,387 @@
|
|||||||
|
"""System prompt assembly -- identity, platform hints, skills index, context files.
|
||||||
|
|
||||||
|
All functions are stateless. AIAgent._build_system_prompt() calls these to
|
||||||
|
assemble pieces, then combines them with memory and ephemeral prompts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Context file scanning — detect prompt injection in AGENTS.md, .cursorrules,
|
||||||
|
# SOUL.md before they get injected into the system prompt.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# (regex, finding id) pairs.  _scan_context_content() searches each regex
# case-insensitively and reports the id of every pattern that matches.
_CONTEXT_THREAT_PATTERNS = [
    (r'ignore\s+(previous|all|above|prior)\s+instructions', "prompt_injection"),
    (r'do\s+not\s+tell\s+the\s+user', "deception_hide"),
    (r'system\s+prompt\s+override', "sys_prompt_override"),
    (r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)', "disregard_rules"),
    (r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', "bypass_restrictions"),
    (r'<!--[^>]*(?:ignore|override|system|secret|hidden)[^>]*-->', "html_comment_injection"),
    (r'<\s*div\s+style\s*=\s*["\'].*display\s*:\s*none', "hidden_div"),
    (r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', "translate_execute"),
    (r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl"),
    (r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass)', "read_secrets"),
]

# Invisible / direction-control code points; the presence of any of them in a
# context file is reported as a finding by _scan_context_content().
_CONTEXT_INVISIBLE_CHARS = {
    '\u200b', '\u200c', '\u200d', '\u2060', '\ufeff',
    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e',
}
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_context_content(content: str, filename: str) -> str:
    """Scan context file content for prompt-injection markers.

    Args:
        content: Text of the context file.
        filename: Name used in the log line and in the replacement message.

    Returns:
        ``content`` unchanged when nothing suspicious is found; otherwise a
        one-line ``[BLOCKED: ...]`` placeholder listing every finding, so the
        file's text never reaches the system prompt.
    """
    # Iterate the set in sorted order: set iteration order for strings varies
    # between interpreter runs, and the resulting message ends up in the
    # system prompt, which should be byte-stable for identical input.
    findings = [
        f"invisible unicode U+{ord(char):04X}"
        for char in sorted(_CONTEXT_INVISIBLE_CHARS)
        if char in content
    ]

    # Threat patterns are reported by their short identifier, in list order.
    findings.extend(
        pid
        for pattern, pid in _CONTEXT_THREAT_PATTERNS
        if re.search(pattern, content, re.IGNORECASE)
    )

    if not findings:
        return content

    summary = ", ".join(findings)
    logger.warning("Context file %s blocked: %s", filename, summary)
    return f"[BLOCKED: {filename} contained potential prompt injection ({summary}). Content not loaded.]"
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Constants
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
# Default identity paragraph describing the agent and its general demeanor.
DEFAULT_AGENT_IDENTITY = (
    "You are Hermes Agent, an intelligent AI assistant created by Nous Research. "
    "You are helpful, knowledgeable, and direct. You assist users with a wide "
    "range of tasks including answering questions, writing and editing code, "
    "analyzing information, creative work, and executing actions via your tools. "
    "You communicate clearly, admit uncertainty when appropriate, and prioritize "
    "being genuinely useful over being verbose unless otherwise directed below. "
    "Be targeted and efficient in your exploration and investigations."
)

# Guidance text about proactively using the memory tool.
# NOTE(review): presumably only included when that tool is available -- confirm in the caller.
MEMORY_GUIDANCE = (
    "You have persistent memory across sessions. Proactively save important things "
    "you learn (user preferences, environment details, useful approaches) and do "
    "(like a diary!) using the memory tool -- don't wait to be asked."
)

# Guidance text about recalling earlier conversations with session_search.
SESSION_SEARCH_GUIDANCE = (
    "When the user references something from a past conversation or you suspect "
    "relevant prior context exists, use session_search to recall it before asking "
    "them to repeat themselves."
)

# Guidance text about saving reusable workflows with skill_manage.
SKILLS_GUIDANCE = (
    "After completing a complex task (5+ tool calls), fixing a tricky error, "
    "or discovering a non-trivial workflow, consider saving the approach as a "
    "skill with skill_manage so you can reuse it next time."
)
|
||||||
|
|
||||||
|
# Per-platform hint text keyed by platform name. Each hint tells the model how
# output is rendered on that surface and how to deliver media files
# (MEDIA:/absolute/path/to/file, or an image URL in markdown image syntax).
PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
        ".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
        "files arrive as downloadable documents. You can also include image "
        "URLs in markdown format ![alt](url) and they will be sent as photos."
    ),
    "telegram": (
        "You are on a text messaging communication platform, Telegram. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio (.ogg) sends as voice "
        "bubbles, and videos (.mp4) play inline. You can also include image "
        "URLs in markdown format ![alt](url) and they will be sent as native photos."
    ),
    "discord": (
        "You are in a Discord server or group chat communicating with your user. "
        "You can send media files natively: include MEDIA:/absolute/path/to/file "
        "in your response. Images (.png, .jpg, .webp) are sent as photo "
        "attachments, audio as file attachments. You can also include image URLs "
        "in markdown format ![alt](url) and they will be sent as attachments."
    ),
    "slack": (
        "You are in a Slack workspace communicating with your user. "
        "You can send media files natively: include MEDIA:/absolute/path/to/file "
        "in your response. Images (.png, .jpg, .webp) are uploaded as photo "
        "attachments, audio as file attachments. You can also include image URLs "
        "in markdown format ![alt](url) and they will be uploaded as attachments."
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
        "files arrive as downloadable documents. You can also include image "
        "URLs in markdown format ![alt](url) and they will be sent as photos."
    ),
    "cli": (
        "You are a CLI AI Agent. Try not to use markdown but simple text "
        "renderable inside a terminal."
    ),
}
|
||||||
|
|
||||||
|
# Default character budget used by _truncate_content() for a context section.
CONTEXT_FILE_MAX_CHARS = 20_000
# Fractions of the budget kept from the start and the end of oversized content.
# They sum to 0.9; the remainder is not used for content
# (NOTE(review): presumably headroom for the truncation marker -- confirm).
CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
CONTEXT_TRUNCATE_TAIL_RATIO = 0.2
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Skills index
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _read_skill_description(skill_file: Path, max_chars: int = 60) -> str:
|
||||||
|
"""Read the description from a SKILL.md frontmatter, capped at max_chars."""
|
||||||
|
try:
|
||||||
|
raw = skill_file.read_text(encoding="utf-8")[:2000]
|
||||||
|
match = re.search(
|
||||||
|
r"^---\s*\n.*?description:\s*(.+?)\s*\n.*?^---",
|
||||||
|
raw, re.MULTILINE | re.DOTALL,
|
||||||
|
)
|
||||||
|
if match:
|
||||||
|
desc = match.group(1).strip().strip("'\"")
|
||||||
|
if len(desc) > max_chars:
|
||||||
|
desc = desc[:max_chars - 3] + "..."
|
||||||
|
return desc
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _skill_is_platform_compatible(skill_file: Path) -> bool:
|
||||||
|
"""Quick check if a SKILL.md is compatible with the current OS platform.
|
||||||
|
|
||||||
|
Reads just enough to parse the ``platforms`` frontmatter field.
|
||||||
|
Skills without the field (the vast majority) are always compatible.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from tools.skills_tool import _parse_frontmatter, skill_matches_platform
|
||||||
|
raw = skill_file.read_text(encoding="utf-8")[:2000]
|
||||||
|
frontmatter, _ = _parse_frontmatter(raw)
|
||||||
|
return skill_matches_platform(frontmatter)
|
||||||
|
except Exception:
|
||||||
|
return True # Err on the side of showing the skill
|
||||||
|
|
||||||
|
|
||||||
|
def build_skills_system_prompt() -> str:
    """Build a compact skill index for the system prompt.

    Scans ``$HERMES_HOME/skills`` (default ``~/.hermes/skills``) for SKILL.md
    files and groups them by category, where the category is the directory
    path between the skills root and the skill's own folder
    (``mlops/training/axolotl/SKILL.md`` -> category ``mlops/training``,
    skill ``axolotl``). Each skill is listed with its frontmatter description
    so the model can match skills by meaning, not just name.

    Skills are left out when they live under ``.git``, ``.github`` or
    ``.hub`` (the same directories scan_skill_commands() ignores) or when
    they are incompatible with the current OS platform.

    Returns:
        The formatted "## Skills (mandatory)" section, or "" when the skills
        directory is missing or contains no usable skill.
    """
    hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
    skills_dir = hermes_home / "skills"

    if not skills_dir.exists():
        return ""

    excluded_dirs = ('.git', '.github', '.hub')
    frontmatter_description = r"^---\s*\n.*?description:\s*(.+?)\s*\n.*?^---"

    # category -> [(skill_name, description), ...]
    skills_by_category: dict[str, list[tuple[str, str]]] = {}
    # Sorted traversal keeps the result independent of filesystem ordering,
    # which matters when two folders provide a skill with the same name.
    for skill_file in sorted(skills_dir.rglob("SKILL.md")):
        parts = skill_file.relative_to(skills_dir).parts
        # Skip VCS metadata and hub bookkeeping directories.
        if any(part in excluded_dirs for part in parts):
            continue
        # Skip skills incompatible with the current OS platform.
        if not _skill_is_platform_compatible(skill_file):
            continue
        if len(parts) >= 2:
            # parts = ("mlops", "training", "axolotl", "SKILL.md")
            #   -> category "mlops/training", skill "axolotl"
            # parts = ("github", "SKILL.md")
            #   -> category "github", skill "github"
            skill_name = parts[-2]
            category = "/".join(parts[:-2]) if len(parts) > 2 else parts[0]
        else:
            # SKILL.md sitting directly in the skills root.
            category = "general"
            skill_name = skill_file.parent.name
        desc = _read_skill_description(skill_file)
        skills_by_category.setdefault(category, []).append((skill_name, desc))

    if not skills_by_category:
        return ""

    # Optional per-category blurb from <category>/DESCRIPTION.md frontmatter.
    category_descriptions = {}
    for category in skills_by_category:
        desc_file = skills_dir / Path(category) / "DESCRIPTION.md"
        if not desc_file.exists():
            continue
        try:
            content = desc_file.read_text(encoding="utf-8")
            match = re.search(frontmatter_description, content, re.MULTILINE | re.DOTALL)
            if match:
                category_descriptions[category] = match.group(1).strip()
        except Exception as e:
            logger.debug("Could not read skill description %s: %s", desc_file, e)

    index_lines = []
    for category in sorted(skills_by_category):
        cat_desc = category_descriptions.get(category, "")
        if cat_desc:
            index_lines.append(f" {category}: {cat_desc}")
        else:
            index_lines.append(f" {category}:")
        # Deduplicate by name (first occurrence wins) and list alphabetically.
        seen = set()
        for name, desc in sorted(skills_by_category[category], key=lambda x: x[0]):
            if name in seen:
                continue
            seen.add(name)
            if desc:
                index_lines.append(f" - {name}: {desc}")
            else:
                index_lines.append(f" - {name}")

    return (
        "## Skills (mandatory)\n"
        "Before replying, scan the skills below. If one clearly matches your task, "
        "load it with skill_view(name) and follow its instructions. "
        "If a skill has issues, fix it with skill_manage(action='patch').\n"
        "\n"
        "<available_skills>\n"
        + "\n".join(index_lines) + "\n"
        "</available_skills>\n"
        "\n"
        "If none match, proceed normally without loading a skill."
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Context files (SOUL.md, AGENTS.md, .cursorrules)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _truncate_content(content: str, filename: str, max_chars: int = CONTEXT_FILE_MAX_CHARS) -> str:
    """Shorten oversized content by keeping its head and tail around a marker.

    Args:
        content: Text to cap.
        filename: Name shown in the truncation marker.
        max_chars: Size above which truncation applies.

    Returns:
        ``content`` unchanged when it fits in ``max_chars``; otherwise the
        first ``max_chars * CONTEXT_TRUNCATE_HEAD_RATIO`` characters, a marker
        describing what was kept, and the last
        ``max_chars * CONTEXT_TRUNCATE_TAIL_RATIO`` characters.
    """
    if len(content) <= max_chars:
        return content
    head_chars = int(max_chars * CONTEXT_TRUNCATE_HEAD_RATIO)
    tail_chars = int(max_chars * CONTEXT_TRUNCATE_TAIL_RATIO)
    head = content[:head_chars]
    # content[-0:] is the WHOLE string, so a tail size that rounds down to
    # zero (very small max_chars) must yield an empty tail explicitly.
    tail = content[-tail_chars:] if tail_chars > 0 else ""
    marker = f"\n\n[...truncated {filename}: kept {head_chars}+{tail_chars} of {len(content)} chars. Use file tools to read the full file.]\n\n"
    return head + marker + tail
|
||||||
|
|
||||||
|
|
||||||
|
def build_context_files_prompt(cwd: Optional[str] = None) -> str:
    """Discover and load project context files for the system prompt.

    Sections are produced in this order, each scanned with
    _scan_context_content() and capped with _truncate_content():

    1. AGENTS.md -- only when one exists at the top of ``cwd``; then every
       AGENTS.md below it is included, shallowest first.
    2. .cursorrules and .cursor/rules/*.mdc.
    3. SOUL.md -- from ``cwd``, falling back to ``$HERMES_HOME/SOUL.md``
       (default ``~/.hermes/SOUL.md``).

    Args:
        cwd: Directory to search; defaults to the process working directory.

    Returns:
        A "# Project Context" block, or "" when no context file was loaded.
    """
    if cwd is None:
        cwd = os.getcwd()
    cwd_path = Path(cwd).resolve()

    candidates = (
        _agents_md_section(cwd_path),
        _cursor_rules_section(cwd_path),
        _soul_md_section(cwd_path),
    )
    sections = [section for section in candidates if section]
    if not sections:
        return ""
    return "# Project Context\n\nThe following project context files have been loaded and should be followed:\n\n" + "\n".join(sections)


def _agents_md_section(cwd_path: Path) -> str:
    """Return the combined AGENTS.md section for ``cwd_path`` ("" if none)."""
    # The recursive walk only happens when a top-level AGENTS.md exists.
    if not any((cwd_path / name).exists() for name in ("AGENTS.md", "agents.md")):
        return ""

    agents_files = []
    for root, dirs, files in os.walk(cwd_path):
        # Prune in place so os.walk does not descend into hidden/vendor dirs.
        dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ('node_modules', '__pycache__', 'venv', '.venv')]
        agents_files.extend(Path(root) / f for f in files if f.lower() == "agents.md")
    # Shallowest first; the path tie-break keeps the prompt identical across
    # runs regardless of the order the filesystem returns entries in.
    agents_files.sort(key=lambda p: (len(p.parts), str(p)))

    combined = ""
    for agents_path in agents_files:
        try:
            content = agents_path.read_text(encoding="utf-8").strip()
            if content:
                rel_path = agents_path.relative_to(cwd_path)
                content = _scan_context_content(content, str(rel_path))
                combined += f"## {rel_path}\n\n{content}\n\n"
        except Exception as e:
            logger.debug("Could not read %s: %s", agents_path, e)

    return _truncate_content(combined, "AGENTS.md") if combined else ""


def _cursor_rules_section(cwd_path: Path) -> str:
    """Return the combined .cursorrules / .cursor/rules section ("" if none)."""
    combined = ""

    cursorrules_file = cwd_path / ".cursorrules"
    if cursorrules_file.exists():
        try:
            content = cursorrules_file.read_text(encoding="utf-8").strip()
            if content:
                content = _scan_context_content(content, ".cursorrules")
                combined += f"## .cursorrules\n\n{content}\n\n"
        except Exception as e:
            logger.debug("Could not read .cursorrules: %s", e)

    cursor_rules_dir = cwd_path / ".cursor" / "rules"
    if cursor_rules_dir.exists() and cursor_rules_dir.is_dir():
        for mdc_file in sorted(cursor_rules_dir.glob("*.mdc")):
            try:
                content = mdc_file.read_text(encoding="utf-8").strip()
                if content:
                    content = _scan_context_content(content, f".cursor/rules/{mdc_file.name}")
                    combined += f"## .cursor/rules/{mdc_file.name}\n\n{content}\n\n"
            except Exception as e:
                logger.debug("Could not read %s: %s", mdc_file, e)

    return _truncate_content(combined, ".cursorrules") if combined else ""


def _soul_md_section(cwd_path: Path) -> str:
    """Return the SOUL.md section from cwd or the Hermes home ("" if none)."""
    soul_path = None
    for name in ("SOUL.md", "soul.md"):
        candidate = cwd_path / name
        if candidate.exists():
            soul_path = candidate
            break
    if soul_path is None:
        # Honor HERMES_HOME here as well, matching build_skills_system_prompt().
        hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
        global_soul = hermes_home / "SOUL.md"
        if not global_soul.exists():
            return ""
        soul_path = global_soul

    try:
        content = soul_path.read_text(encoding="utf-8").strip()
        if not content:
            return ""
        content = _scan_context_content(content, "SOUL.md")
        content = _truncate_content(content, "SOUL.md")
        return (
            f"## SOUL.md\n\nIf SOUL.md is present, embody its persona and tone. "
            f"Avoid stiff, generic replies; follow its guidance unless higher-priority "
            f"instructions override it.\n\n{content}"
        )
    except Exception as e:
        logger.debug("Could not read SOUL.md from %s: %s", soul_path, e)
        return ""
|
||||||
68
agent/prompt_caching.py
Normal file
68
agent/prompt_caching.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""Anthropic prompt caching (system_and_3 strategy).
|
||||||
|
|
||||||
|
Reduces input token costs by ~75% on multi-turn conversations by caching
|
||||||
|
the conversation prefix. Uses 4 cache_control breakpoints (Anthropic max):
|
||||||
|
1. System prompt (stable across all turns)
|
||||||
|
2-4. Last 3 non-system messages (rolling window)
|
||||||
|
|
||||||
|
Pure functions -- no class state, no AIAgent dependency.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_cache_marker(msg: dict, cache_marker: dict) -> None:
|
||||||
|
"""Add cache_control to a single message, handling all format variations."""
|
||||||
|
role = msg.get("role", "")
|
||||||
|
content = msg.get("content")
|
||||||
|
|
||||||
|
if role == "tool":
|
||||||
|
msg["cache_control"] = cache_marker
|
||||||
|
return
|
||||||
|
|
||||||
|
if content is None:
|
||||||
|
msg["cache_control"] = cache_marker
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(content, str):
|
||||||
|
msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(content, list) and content:
|
||||||
|
last = content[-1]
|
||||||
|
if isinstance(last, dict):
|
||||||
|
last["cache_control"] = cache_marker
|
||||||
|
|
||||||
|
|
||||||
|
def apply_anthropic_cache_control(
    api_messages: List[Dict[str, Any]],
    cache_ttl: str = "5m",
) -> List[Dict[str, Any]]:
    """Return a copy of the messages with system_and_3 cache breakpoints.

    At most four breakpoints are placed: one on the first message when it is
    a system message, and the rest on the trailing non-system messages (three
    when the system prompt was marked, four otherwise).

    Args:
        api_messages: Messages to annotate; never mutated.
        cache_ttl: "1h" adds a one-hour ttl to the marker; any other value
            leaves the marker without a ttl field.

    Returns:
        Deep copy of ``api_messages`` with cache_control markers injected.
    """
    messages = copy.deepcopy(api_messages)
    if not messages:
        return messages

    if cache_ttl == "1h":
        marker = {"type": "ephemeral", "ttl": "1h"}
    else:
        marker = {"type": "ephemeral"}

    # Breakpoint 1: the leading system prompt, when there is one.
    system_first = messages[0].get("role") == "system"
    if system_first:
        _apply_cache_marker(messages[0], marker)

    # Remaining breakpoints: the most recent non-system messages.
    budget = 3 if system_first else 4
    conversational = [m for m in messages if m.get("role") != "system"]
    for recent in conversational[-budget:]:
        _apply_cache_marker(recent, marker)

    return messages
|
||||||
161
agent/redact.py
Normal file
161
agent/redact.py
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
"""Regex-based secret redaction for logs and tool output.
|
||||||
|
|
||||||
|
Applies pattern matching to mask API keys, tokens, and credentials
|
||||||
|
before they reach log files, verbose output, or gateway logs.
|
||||||
|
|
||||||
|
Short tokens (< 18 chars) are fully masked. Longer tokens preserve
|
||||||
|
the first 6 and last 4 characters for debuggability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Known API key prefixes -- each pattern matches the vendor prefix followed by
# a contiguous run of token characters. The list is joined into a single
# alternation (_PREFIX_RE) further down.
_PREFIX_PATTERNS = [
    r"sk-[A-Za-z0-9_-]{10,}",  # OpenAI / OpenRouter / Anthropic (sk-ant-*)
    r"ghp_[A-Za-z0-9]{10,}",  # GitHub PAT (classic)
    r"github_pat_[A-Za-z0-9_]{10,}",  # GitHub PAT (fine-grained)
    r"xox[baprs]-[A-Za-z0-9-]{10,}",  # Slack tokens
    r"AIza[A-Za-z0-9_-]{30,}",  # Google API keys
    r"pplx-[A-Za-z0-9]{10,}",  # Perplexity
    r"fal_[A-Za-z0-9_-]{10,}",  # Fal.ai
    r"fc-[A-Za-z0-9]{10,}",  # Firecrawl
    r"bb_live_[A-Za-z0-9_-]{10,}",  # BrowserBase
    r"gAAAA[A-Za-z0-9_=-]{20,}",  # Codex encrypted tokens
    r"AKIA[A-Z0-9]{16}",  # AWS Access Key ID
    r"sk_live_[A-Za-z0-9]{10,}",  # Stripe secret key (live)
    r"sk_test_[A-Za-z0-9]{10,}",  # Stripe secret key (test)
    r"rk_live_[A-Za-z0-9]{10,}",  # Stripe restricted key
    r"SG\.[A-Za-z0-9_-]{10,}",  # SendGrid API key
    r"hf_[A-Za-z0-9]{10,}",  # HuggingFace token
    r"r8_[A-Za-z0-9]{10,}",  # Replicate API token
    r"npm_[A-Za-z0-9]{10,}",  # npm access token
    r"pypi-[A-Za-z0-9_-]{10,}",  # PyPI API token
    r"dop_v1_[A-Za-z0-9]{10,}",  # DigitalOcean PAT
    r"doo_v1_[A-Za-z0-9]{10,}",  # DigitalOcean OAuth
    r"am_[A-Za-z0-9_-]{10,}",  # AgentMail API key
]

# ENV assignment patterns: KEY=value where KEY contains a secret-like name.
# Groups: (1) variable name, (2) optional opening quote, (3) value; the \2
# backreference requires the same quote (or none) after the value.
_SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
_ENV_ASSIGN_RE = re.compile(
    rf"([A-Z_]*{_SECRET_ENV_NAMES}[A-Z_]*)\s*=\s*(['\"]?)(\S+)\2",
    re.IGNORECASE,
)

# JSON field patterns: "apiKey": "value", "token": "value", etc.
# Groups: (1) the quoted key, (2) the string value.
_JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer)"
_JSON_FIELD_RE = re.compile(
    rf'("{_JSON_KEY_NAMES}")\s*:\s*"([^"]+)"',
    re.IGNORECASE,
)

# Authorization headers: group (1) is the "Authorization: Bearer " lead-in,
# group (2) the credential that follows it.
_AUTH_HEADER_RE = re.compile(
    r"(Authorization:\s*Bearer\s+)(\S+)",
    re.IGNORECASE,
)

# Telegram bot tokens: bot<digits>:<token> or <digits>:<alphanum>.
# Groups: (1) optional "bot" prefix, (2) numeric id, (3) secret part.
_TELEGRAM_RE = re.compile(
    r"(bot)?(\d{8,}):([-A-Za-z0-9_]{30,})",
)

# Private key blocks: -----BEGIN RSA PRIVATE KEY----- ... -----END RSA PRIVATE KEY-----
# [\s\S]*? spans newlines without needing re.DOTALL and stops at the first END line.
_PRIVATE_KEY_RE = re.compile(
    r"-----BEGIN[A-Z ]*PRIVATE KEY-----[\s\S]*?-----END[A-Z ]*PRIVATE KEY-----"
)

# Database connection strings: protocol://user:PASSWORD@host
# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password.
# Groups: (1) everything up to and including "user:", (2) password, (3) "@".
_DB_CONNSTR_RE = re.compile(
    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
    re.IGNORECASE,
)

# E.164 phone numbers: +<country><number>, 7-15 digits.
# Negative lookahead prevents matching hex strings or identifiers.
_SIGNAL_PHONE_RE = re.compile(r"(\+[1-9]\d{6,14})(?![A-Za-z0-9])")

# Compile known prefix patterns into one alternation. The lookbehind and
# lookahead require that the token is not embedded in a longer run of
# token characters.
_PREFIX_RE = re.compile(
    r"(?<![A-Za-z0-9_-])(" + "|".join(_PREFIX_PATTERNS) + r")(?![A-Za-z0-9_-])"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_token(token: str) -> str:
|
||||||
|
"""Mask a token, preserving prefix for long tokens."""
|
||||||
|
if len(token) < 18:
|
||||||
|
return "***"
|
||||||
|
return f"{token[:6]}...{token[-4:]}"
|
||||||
|
|
||||||
|
|
||||||
|
def redact_sensitive_text(text: str) -> str:
    """Mask secrets found in ``text`` and return the result.

    Patterns are applied in a fixed order: known key prefixes, ENV-style
    assignments, JSON fields, Authorization headers, Telegram bot tokens,
    private key blocks, database connection-string passwords, and E.164
    phone numbers. Text that matches nothing passes through unchanged.

    Redaction is skipped when ``text`` is empty or when the
    HERMES_REDACT_SECRETS environment variable is "0", "false", "no" or
    "off" (case-insensitive).
    NOTE(review): presumably that variable mirrors security.redact_secrets
    in config.yaml -- confirm where it is set.
    """
    if not text:
        return text
    if os.getenv("HERMES_REDACT_SECRETS", "").lower() in ("0", "false", "no", "off"):
        return text

    def _env_repl(hit):
        # NAME=<quote>masked<quote>; whitespace around "=" is not kept.
        quote = hit.group(2)
        return f"{hit.group(1)}={quote}{_mask_token(hit.group(3))}{quote}"

    def _json_repl(hit):
        return f'{hit.group(1)}: "{_mask_token(hit.group(2))}"'

    def _telegram_repl(hit):
        # Keep the optional "bot" prefix and numeric id; hide the secret part.
        lead = hit.group(1) or ""
        return f"{lead}{hit.group(2)}:***"

    def _phone_repl(hit):
        number = hit.group(1)
        # Short numbers expose two characters per side, longer ones four.
        keep = 2 if len(number) <= 8 else 4
        return number[:keep] + "****" + number[-keep:]

    # Known prefixes (sk-, ghp_, etc.)
    redacted = _PREFIX_RE.sub(lambda hit: _mask_token(hit.group(1)), text)
    # ENV assignments: OPENAI_API_KEY=sk-abc...
    redacted = _ENV_ASSIGN_RE.sub(_env_repl, redacted)
    # JSON fields: "apiKey": "value"
    redacted = _JSON_FIELD_RE.sub(_json_repl, redacted)
    # Authorization headers
    redacted = _AUTH_HEADER_RE.sub(
        lambda hit: hit.group(1) + _mask_token(hit.group(2)),
        redacted,
    )
    # Telegram bot tokens
    redacted = _TELEGRAM_RE.sub(_telegram_repl, redacted)
    # Private key blocks
    redacted = _PRIVATE_KEY_RE.sub("[REDACTED PRIVATE KEY]", redacted)
    # Database connection string passwords
    redacted = _DB_CONNSTR_RE.sub(lambda hit: f"{hit.group(1)}***{hit.group(3)}", redacted)
    # E.164 phone numbers (Signal, WhatsApp)
    redacted = _SIGNAL_PHONE_RE.sub(_phone_repl, redacted)

    return redacted
|
||||||
|
|
||||||
|
|
||||||
|
class RedactingFormatter(logging.Formatter):
    """logging.Formatter that passes every formatted record through redact_sensitive_text()."""

    def __init__(self, fmt=None, datefmt=None, style='%', **kwargs):
        # Plain pass-through to logging.Formatter; no extra state is kept.
        super().__init__(fmt=fmt, datefmt=datefmt, style=style, **kwargs)

    def format(self, record: logging.LogRecord) -> str:
        """Format ``record`` normally, then redact secrets in the resulting text."""
        rendered = super().format(record)
        return redact_sensitive_text(rendered)
|
||||||
116
agent/skill_commands.py
Normal file
116
agent/skill_commands.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""Skill slash commands — scan installed skills and build invocation messages.
|
||||||
|
|
||||||
|
Shared between CLI (cli.py) and gateway (gateway/run.py) so both surfaces
|
||||||
|
can invoke skills via /skill-name commands.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Result of the most recent scan: "/command" -> skill info dict.
_skill_commands: Dict[str, Dict[str, Any]] = {}


def scan_skill_commands() -> Dict[str, Dict[str, Any]]:
    """Scan the skills directory and rebuild the /command -> skill info mapping.

    Every SKILL.md under ``tools.skills_tool.SKILLS_DIR`` is considered,
    except those inside ``.git``, ``.github`` or ``.hub`` and those that do
    not match the current OS platform. The command name is the skill name
    lower-cased with spaces and underscores turned into hyphens. When the
    frontmatter has no description, the first non-heading line of the body
    (up to 80 characters) is used.

    The scan is best-effort: a skill that cannot be read or parsed is
    skipped, and a failure of the scan as a whole leaves whatever was
    collected so far. Both cases are logged at debug level.

    Returns:
        Dict mapping "/skill-name" to {name, description, skill_md_path, skill_dir}.
    """
    global _skill_commands
    _skill_commands = {}
    try:
        from tools.skills_tool import SKILLS_DIR, _parse_frontmatter, skill_matches_platform
        if not SKILLS_DIR.exists():
            return _skill_commands
        # Sorted so that, when two skills map to the same command, the
        # winner does not depend on filesystem ordering.
        for skill_md in sorted(SKILLS_DIR.rglob("SKILL.md")):
            if any(part in ('.git', '.github', '.hub') for part in skill_md.parts):
                continue
            try:
                content = skill_md.read_text(encoding='utf-8')
                frontmatter, body = _parse_frontmatter(content)
                # Skip skills incompatible with the current OS platform
                if not skill_matches_platform(frontmatter):
                    continue
                name = frontmatter.get('name', skill_md.parent.name)
                description = frontmatter.get('description', '')
                if not description:
                    # Fall back to the first non-heading line of the body.
                    for line in body.strip().split('\n'):
                        line = line.strip()
                        if line and not line.startswith('#'):
                            description = line[:80]
                            break
                cmd_name = name.lower().replace(' ', '-').replace('_', '-')
                _skill_commands[f"/{cmd_name}"] = {
                    "name": name,
                    "description": description or f"Invoke the {name} skill",
                    "skill_md_path": str(skill_md),
                    "skill_dir": str(skill_md.parent),
                }
            except Exception as e:
                logger.debug("Skipping skill %s: %s", skill_md, e)
                continue
    except Exception as e:
        logger.debug("Skill command scan failed: %s", e)
    return _skill_commands
|
||||||
|
|
||||||
|
|
||||||
|
def get_skill_commands() -> Dict[str, Dict[str, Any]]:
    """Return the cached command mapping, running a scan first when it is empty."""
    if _skill_commands:
        return _skill_commands
    scan_skill_commands()
    # The scan rebinds the module-level mapping, so read it again here.
    return _skill_commands
|
||||||
|
|
||||||
|
|
||||||
|
def build_skill_invocation_message(cmd_key: str, user_instruction: str = "") -> Optional[str]:
    """Compose the user-message text for a skill slash command.

    The message consists of a [SYSTEM: ...] preamble naming the skill, the
    full SKILL.md text, a list of files found under the skill's
    references/, templates/, scripts/ and assets/ folders (if any), and the
    user's trailing instruction (if any).

    Args:
        cmd_key: The command key including leading slash (e.g., "/gif-search").
        user_instruction: Optional text the user typed after the command.

    Returns:
        The formatted message, ``None`` when ``cmd_key`` is not a known
        command, or a "[Failed to load skill: ...]" string when SKILL.md
        cannot be read.
    """
    skill_info = get_skill_commands().get(cmd_key)
    if not skill_info:
        return None

    skill_name = skill_info["name"]
    skill_dir = Path(skill_info["skill_dir"])

    try:
        skill_text = Path(skill_info["skill_md_path"]).read_text(encoding='utf-8')
    except Exception:
        return f"[Failed to load skill: {skill_name}]"

    lines = [
        f'[SYSTEM: The user has invoked the "{skill_name}" skill, indicating they want you to follow its instructions. The full skill content is loaded below.]',
        "",
        skill_text.strip(),
    ]

    # Relative paths of every file in the recognised support folders.
    extras = [
        str(entry.relative_to(skill_dir))
        for folder in ("references", "templates", "scripts", "assets")
        if (skill_dir / folder).exists()
        for entry in sorted((skill_dir / folder).rglob("*"))
        if entry.is_file()
    ]

    if extras:
        lines.append("")
        lines.append("[This skill has supporting files you can load with the skill_view tool:]")
        lines.extend(f"- {extra}" for extra in extras)
        lines.append(f'\nTo view any of these, use: skill_view(name="{skill_name}", file="<path>")')

    if user_instruction:
        lines.append("")
        lines.append(f"The user has provided the following instruction alongside the skill invocation: {user_instruction}")

    return "\n".join(lines)
|
||||||
56
agent/trajectory.py
Normal file
56
agent/trajectory.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
"""Trajectory saving utilities and static helpers.
|
||||||
|
|
||||||
|
_convert_to_trajectory_format stays as an AIAgent method (batch_runner.py
|
||||||
|
calls agent._convert_to_trajectory_format). Only the static helpers and
|
||||||
|
the file-write logic live here.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_scratchpad_to_think(content: str) -> str:
    """Rewrite <REASONING_SCRATCHPAD> markers as <think> markers.

    Falsy input (``None`` or an empty string) and text that contains no
    opening scratchpad tag are handed back untouched. Otherwise every
    opening and closing scratchpad tag is replaced by its ``<think>``
    counterpart.
    """
    opening_tag = "<REASONING_SCRATCHPAD>"
    if content and opening_tag in content:
        swapped = content.replace(opening_tag, "<think>")
        return swapped.replace("</REASONING_SCRATCHPAD>", "</think>")
    # Nothing to convert: note that a stray closing tag alone is left as-is.
    return content
|
||||||
|
|
||||||
|
|
||||||
|
def has_incomplete_scratchpad(content: str) -> bool:
    """Check whether content ends inside an unclosed <REASONING_SCRATCHPAD>.

    Args:
        content: Text to inspect. ``None`` or an empty string is treated as
            having no scratchpad at all.

    Returns:
        True when the last opening tag in ``content`` is not followed by a
        closing tag, False otherwise.
    """
    if not content:
        return False

    last_open = content.rfind("<REASONING_SCRATCHPAD>")
    if last_open == -1:
        return False

    # Compare positions instead of mere presence: a closed scratchpad that is
    # followed by a second, unclosed one must still count as incomplete.
    # str.rfind yields -1 when no closing tag exists, which is < last_open.
    return content.rfind("</REASONING_SCRATCHPAD>") < last_open
|
||||||
|
|
||||||
|
|
||||||
|
def save_trajectory(trajectory: List[Dict[str, Any]], model: str,
                    completed: bool, filename: str = None):
    """Append one trajectory record as a line of a JSONL file.

    Args:
        trajectory: The ShareGPT-format conversation list.
        model: Model name stored alongside the conversation.
        completed: Whether the conversation completed successfully.
        filename: Output path override. When omitted, the record goes to
            trajectory_samples.jsonl if ``completed`` is true and to
            failed_trajectories.jsonl otherwise.

    Any exception raised while opening, serializing, or writing is logged as
    a warning and not propagated.
    """
    if filename is None:
        if completed:
            filename = "trajectory_samples.jsonl"
        else:
            filename = "failed_trajectories.jsonl"

    record = {
        "conversations": trajectory,
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "completed": completed,
    }

    try:
        with open(filename, "a", encoding="utf-8") as out:
            # Serialize after opening, exactly one JSON object per line.
            line = json.dumps(record, ensure_ascii=False)
            out.write(line + "\n")
        logger.info("Trajectory saved to %s", filename)
    except Exception as exc:
        logger.warning("Failed to save trajectory: %s", exc)
|
||||||
BIN
assets/banner.png
Normal file
BIN
assets/banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 12 KiB |
300
batch_runner.py
300
batch_runner.py
@@ -27,38 +27,29 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any, Optional, Tuple
|
from typing import List, Dict, Any, Optional, Tuple
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from multiprocessing import Pool, Manager, Lock
|
from multiprocessing import Pool, Lock
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeRemainingColumn, MofNCompleteColumn
|
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeRemainingColumn, MofNCompleteColumn
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
import fire
|
import fire
|
||||||
|
|
||||||
from run_agent import AIAgent
|
from run_agent import AIAgent
|
||||||
from toolset_distributions import (
|
from toolset_distributions import (
|
||||||
get_distribution,
|
|
||||||
list_distributions,
|
list_distributions,
|
||||||
sample_toolsets_from_distribution,
|
sample_toolsets_from_distribution,
|
||||||
validate_distribution
|
validate_distribution
|
||||||
)
|
)
|
||||||
|
from model_tools import TOOL_TO_TOOLSET_MAP
|
||||||
|
|
||||||
|
|
||||||
# Global configuration for worker processes
|
# Global configuration for worker processes
|
||||||
_WORKER_CONFIG = {}
|
_WORKER_CONFIG = {}
|
||||||
|
|
||||||
# All possible tools - used to ensure consistent schema across all trajectory entries
|
# All possible tools - auto-derived from the master mapping in model_tools.py.
|
||||||
# This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas
|
# This stays in sync automatically when new tools are added to TOOL_TO_TOOLSET_MAP.
|
||||||
ALL_POSSIBLE_TOOLS = {
|
# Used for consistent schema in Arrow/Parquet (HuggingFace datasets) and for
|
||||||
'terminal', 'web_search', 'web_extract',
|
# filtering corrupted entries during trajectory combination.
|
||||||
'vision_analyze', 'image_generate', 'mixture_of_agents',
|
ALL_POSSIBLE_TOOLS = set(TOOL_TO_TOOLSET_MAP.keys())
|
||||||
# Skills tools
|
|
||||||
'skills_categories', 'skills_list', 'skill_view',
|
|
||||||
# Browser automation tools
|
|
||||||
'browser_navigate', 'browser_snapshot', 'browser_click',
|
|
||||||
'browser_type', 'browser_scroll', 'browser_back',
|
|
||||||
'browser_press', 'browser_close', 'browser_get_images',
|
|
||||||
'browser_vision'
|
|
||||||
}
|
|
||||||
|
|
||||||
# Default stats for tools that weren't used
|
# Default stats for tools that weren't used
|
||||||
DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0}
|
DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0}
|
||||||
@@ -180,7 +171,7 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
|
|||||||
if content_json.get("success") is False:
|
if content_json.get("success") is False:
|
||||||
is_success = False
|
is_success = False
|
||||||
|
|
||||||
except:
|
except (json.JSONDecodeError, ValueError, TypeError):
|
||||||
# If not JSON, check if content is empty or explicitly states an error
|
# If not JSON, check if content is empty or explicitly states an error
|
||||||
# Note: We avoid simple substring matching to prevent false positives
|
# Note: We avoid simple substring matching to prevent false positives
|
||||||
if not content:
|
if not content:
|
||||||
@@ -200,6 +191,42 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
|
|||||||
return tool_stats
|
return tool_stats
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_reasoning_stats(messages: List[Dict[str, Any]]) -> Dict[str, int]:
|
||||||
|
"""
|
||||||
|
Count how many assistant turns have reasoning vs no reasoning.
|
||||||
|
|
||||||
|
Checks for <REASONING_SCRATCHPAD> in content or a non-empty 'reasoning' field
|
||||||
|
(native thinking tokens). Returns counts for tracking reasoning coverage.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
messages: Message history
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with 'total_assistant_turns', 'turns_with_reasoning', 'turns_without_reasoning'
|
||||||
|
"""
|
||||||
|
total = 0
|
||||||
|
with_reasoning = 0
|
||||||
|
|
||||||
|
for msg in messages:
|
||||||
|
if msg.get("role") != "assistant":
|
||||||
|
continue
|
||||||
|
total += 1
|
||||||
|
|
||||||
|
content = msg.get("content", "") or ""
|
||||||
|
has_scratchpad = "<REASONING_SCRATCHPAD>" in content
|
||||||
|
has_native_reasoning = bool(msg.get("reasoning", "").strip()) if msg.get("reasoning") else False
|
||||||
|
|
||||||
|
if has_scratchpad or has_native_reasoning:
|
||||||
|
with_reasoning += 1
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_assistant_turns": total,
|
||||||
|
"turns_with_reasoning": with_reasoning,
|
||||||
|
"turns_without_reasoning": total - with_reasoning,
|
||||||
|
"has_any_reasoning": with_reasoning > 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _process_single_prompt(
|
def _process_single_prompt(
|
||||||
prompt_index: int,
|
prompt_index: int,
|
||||||
prompt_data: Dict[str, Any],
|
prompt_data: Dict[str, Any],
|
||||||
@@ -211,7 +238,7 @@ def _process_single_prompt(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt_index (int): Index of prompt in dataset
|
prompt_index (int): Index of prompt in dataset
|
||||||
prompt_data (Dict): Prompt data containing 'prompt' field
|
prompt_data (Dict): Prompt data containing 'prompt' field and optional 'image' field
|
||||||
batch_num (int): Batch number
|
batch_num (int): Batch number
|
||||||
config (Dict): Configuration dict with agent parameters
|
config (Dict): Configuration dict with agent parameters
|
||||||
|
|
||||||
@@ -219,6 +246,58 @@ def _process_single_prompt(
|
|||||||
Dict: Result containing trajectory, stats, and metadata
|
Dict: Result containing trajectory, stats, and metadata
|
||||||
"""
|
"""
|
||||||
prompt = prompt_data["prompt"]
|
prompt = prompt_data["prompt"]
|
||||||
|
task_id = f"task_{prompt_index}"
|
||||||
|
|
||||||
|
# Per-prompt container image override: if the dataset row has an 'image' field,
|
||||||
|
# register it for this task's sandbox. Works with Docker, Modal, Singularity, and Daytona.
|
||||||
|
container_image = prompt_data.get("image") or prompt_data.get("docker_image")
|
||||||
|
if container_image:
|
||||||
|
# Verify the image is accessible before spending tokens on the agent loop.
|
||||||
|
# For Docker: check local cache, then try pulling.
|
||||||
|
# For Modal: skip local check (Modal pulls server-side).
|
||||||
|
env_type = os.getenv("TERMINAL_ENV", "local")
|
||||||
|
if env_type == "docker":
|
||||||
|
import subprocess as _sp
|
||||||
|
try:
|
||||||
|
probe = _sp.run(
|
||||||
|
["docker", "image", "inspect", container_image],
|
||||||
|
capture_output=True, timeout=10,
|
||||||
|
)
|
||||||
|
if probe.returncode != 0:
|
||||||
|
if config.get("verbose"):
|
||||||
|
print(f" Prompt {prompt_index}: Pulling docker image {container_image}...", flush=True)
|
||||||
|
pull = _sp.run(
|
||||||
|
["docker", "pull", container_image],
|
||||||
|
capture_output=True, text=True, timeout=600,
|
||||||
|
)
|
||||||
|
if pull.returncode != 0:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"prompt_index": prompt_index,
|
||||||
|
"error": f"Docker image not available: {container_image}\n{pull.stderr[:500]}",
|
||||||
|
"trajectory": None,
|
||||||
|
"tool_stats": {},
|
||||||
|
"toolsets_used": [],
|
||||||
|
"metadata": {"batch_num": batch_num, "timestamp": datetime.now().isoformat()},
|
||||||
|
}
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass # Docker CLI not installed — skip check (e.g., Modal backend)
|
||||||
|
except Exception as img_err:
|
||||||
|
if config.get("verbose"):
|
||||||
|
print(f" Prompt {prompt_index}: Docker image check failed: {img_err}", flush=True)
|
||||||
|
|
||||||
|
from tools.terminal_tool import register_task_env_overrides
|
||||||
|
overrides = {
|
||||||
|
"docker_image": container_image,
|
||||||
|
"modal_image": container_image,
|
||||||
|
"singularity_image": f"docker://{container_image}",
|
||||||
|
"daytona_image": container_image,
|
||||||
|
}
|
||||||
|
if prompt_data.get("cwd"):
|
||||||
|
overrides["cwd"] = prompt_data["cwd"]
|
||||||
|
register_task_env_overrides(task_id, overrides)
|
||||||
|
if config.get("verbose"):
|
||||||
|
print(f" Prompt {prompt_index}: Using container image {container_image}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Sample toolsets from distribution for this prompt
|
# Sample toolsets from distribution for this prompt
|
||||||
@@ -244,14 +323,22 @@ def _process_single_prompt(
|
|||||||
providers_ignored=config.get("providers_ignored"),
|
providers_ignored=config.get("providers_ignored"),
|
||||||
providers_order=config.get("providers_order"),
|
providers_order=config.get("providers_order"),
|
||||||
provider_sort=config.get("provider_sort"),
|
provider_sort=config.get("provider_sort"),
|
||||||
|
max_tokens=config.get("max_tokens"),
|
||||||
|
reasoning_config=config.get("reasoning_config"),
|
||||||
|
prefill_messages=config.get("prefill_messages"),
|
||||||
|
skip_context_files=True, # Don't pollute trajectories with SOUL.md/AGENTS.md
|
||||||
|
skip_memory=True, # Don't use persistent memory in batch runs
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run the agent with task_id to ensure each task gets its own isolated VM
|
# Run the agent with task_id to ensure each task gets its own isolated VM
|
||||||
result = agent.run_conversation(prompt, task_id=f"task_{prompt_index}")
|
result = agent.run_conversation(prompt, task_id=task_id)
|
||||||
|
|
||||||
# Extract tool usage statistics
|
# Extract tool usage statistics
|
||||||
tool_stats = _extract_tool_stats(result["messages"])
|
tool_stats = _extract_tool_stats(result["messages"])
|
||||||
|
|
||||||
|
# Extract reasoning coverage stats
|
||||||
|
reasoning_stats = _extract_reasoning_stats(result["messages"])
|
||||||
|
|
||||||
# Convert to trajectory format (using existing method)
|
# Convert to trajectory format (using existing method)
|
||||||
trajectory = agent._convert_to_trajectory_format(
|
trajectory = agent._convert_to_trajectory_format(
|
||||||
result["messages"],
|
result["messages"],
|
||||||
@@ -264,6 +351,7 @@ def _process_single_prompt(
|
|||||||
"prompt_index": prompt_index,
|
"prompt_index": prompt_index,
|
||||||
"trajectory": trajectory,
|
"trajectory": trajectory,
|
||||||
"tool_stats": tool_stats,
|
"tool_stats": tool_stats,
|
||||||
|
"reasoning_stats": reasoning_stats,
|
||||||
"completed": result["completed"],
|
"completed": result["completed"],
|
||||||
"partial": result.get("partial", False),
|
"partial": result.get("partial", False),
|
||||||
"api_calls": result["api_calls"],
|
"api_calls": result["api_calls"],
|
||||||
@@ -332,7 +420,9 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||||||
|
|
||||||
# Initialize aggregated stats for this batch
|
# Initialize aggregated stats for this batch
|
||||||
batch_tool_stats = {}
|
batch_tool_stats = {}
|
||||||
|
batch_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||||
completed_in_batch = []
|
completed_in_batch = []
|
||||||
|
discarded_no_reasoning = 0
|
||||||
|
|
||||||
# Process each prompt sequentially in this batch
|
# Process each prompt sequentially in this batch
|
||||||
for prompt_index, prompt_data in prompts_to_process:
|
for prompt_index, prompt_data in prompts_to_process:
|
||||||
@@ -346,6 +436,13 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||||||
|
|
||||||
# Save trajectory if successful
|
# Save trajectory if successful
|
||||||
if result["success"] and result["trajectory"]:
|
if result["success"] and result["trajectory"]:
|
||||||
|
# Discard samples with zero reasoning across all turns
|
||||||
|
reasoning = result.get("reasoning_stats", {})
|
||||||
|
if not reasoning.get("has_any_reasoning", True):
|
||||||
|
print(f" 🚫 Prompt {prompt_index} discarded (no reasoning in any turn)")
|
||||||
|
discarded_no_reasoning += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Get and normalize tool stats for consistent schema across all entries
|
# Get and normalize tool stats for consistent schema across all entries
|
||||||
raw_tool_stats = result.get("tool_stats", {})
|
raw_tool_stats = result.get("tool_stats", {})
|
||||||
tool_stats = _normalize_tool_stats(raw_tool_stats)
|
tool_stats = _normalize_tool_stats(raw_tool_stats)
|
||||||
@@ -386,6 +483,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||||||
batch_tool_stats[tool_name]["success"] += stats["success"]
|
batch_tool_stats[tool_name]["success"] += stats["success"]
|
||||||
batch_tool_stats[tool_name]["failure"] += stats["failure"]
|
batch_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||||
|
|
||||||
|
# Aggregate reasoning stats
|
||||||
|
for key in batch_reasoning_stats:
|
||||||
|
batch_reasoning_stats[key] += result.get("reasoning_stats", {}).get(key, 0)
|
||||||
|
|
||||||
# Only mark as completed if successfully saved (failed prompts can be retried on resume)
|
# Only mark as completed if successfully saved (failed prompts can be retried on resume)
|
||||||
if result["success"] and result["trajectory"]:
|
if result["success"] and result["trajectory"]:
|
||||||
completed_in_batch.append(prompt_index)
|
completed_in_batch.append(prompt_index)
|
||||||
@@ -401,6 +502,8 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||||||
"processed": len(prompts_to_process),
|
"processed": len(prompts_to_process),
|
||||||
"skipped": len(batch_data) - len(prompts_to_process),
|
"skipped": len(batch_data) - len(prompts_to_process),
|
||||||
"tool_stats": batch_tool_stats,
|
"tool_stats": batch_tool_stats,
|
||||||
|
"reasoning_stats": batch_reasoning_stats,
|
||||||
|
"discarded_no_reasoning": discarded_no_reasoning,
|
||||||
"completed_prompts": completed_in_batch
|
"completed_prompts": completed_in_batch
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -428,6 +531,10 @@ class BatchRunner:
|
|||||||
providers_ignored: List[str] = None,
|
providers_ignored: List[str] = None,
|
||||||
providers_order: List[str] = None,
|
providers_order: List[str] = None,
|
||||||
provider_sort: str = None,
|
provider_sort: str = None,
|
||||||
|
max_tokens: int = None,
|
||||||
|
reasoning_config: Dict[str, Any] = None,
|
||||||
|
prefill_messages: List[Dict[str, Any]] = None,
|
||||||
|
max_samples: int = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the batch runner.
|
Initialize the batch runner.
|
||||||
@@ -449,6 +556,10 @@ class BatchRunner:
|
|||||||
providers_ignored (List[str]): OpenRouter providers to ignore (optional)
|
providers_ignored (List[str]): OpenRouter providers to ignore (optional)
|
||||||
providers_order (List[str]): OpenRouter providers to try in order (optional)
|
providers_order (List[str]): OpenRouter providers to try in order (optional)
|
||||||
provider_sort (str): Sort providers by price/throughput/latency (optional)
|
provider_sort (str): Sort providers by price/throughput/latency (optional)
|
||||||
|
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||||
|
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
|
||||||
|
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
|
||||||
|
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||||
"""
|
"""
|
||||||
self.dataset_file = Path(dataset_file)
|
self.dataset_file = Path(dataset_file)
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -466,6 +577,10 @@ class BatchRunner:
|
|||||||
self.providers_ignored = providers_ignored
|
self.providers_ignored = providers_ignored
|
||||||
self.providers_order = providers_order
|
self.providers_order = providers_order
|
||||||
self.provider_sort = provider_sort
|
self.provider_sort = provider_sort
|
||||||
|
self.max_tokens = max_tokens
|
||||||
|
self.reasoning_config = reasoning_config
|
||||||
|
self.prefill_messages = prefill_messages
|
||||||
|
self.max_samples = max_samples
|
||||||
|
|
||||||
# Validate distribution
|
# Validate distribution
|
||||||
if not validate_distribution(distribution):
|
if not validate_distribution(distribution):
|
||||||
@@ -481,8 +596,12 @@ class BatchRunner:
|
|||||||
# Statistics file
|
# Statistics file
|
||||||
self.stats_file = self.output_dir / "statistics.json"
|
self.stats_file = self.output_dir / "statistics.json"
|
||||||
|
|
||||||
# Load dataset
|
# Load dataset (and optionally truncate to max_samples)
|
||||||
self.dataset = self._load_dataset()
|
self.dataset = self._load_dataset()
|
||||||
|
if self.max_samples and self.max_samples < len(self.dataset):
|
||||||
|
full_count = len(self.dataset)
|
||||||
|
self.dataset = self.dataset[:self.max_samples]
|
||||||
|
print(f"✂️ Truncated dataset from {full_count} to {self.max_samples} samples (--max_samples)")
|
||||||
|
|
||||||
# Create batches
|
# Create batches
|
||||||
self.batches = self._create_batches()
|
self.batches = self._create_batches()
|
||||||
@@ -582,13 +701,12 @@ class BatchRunner:
|
|||||||
"""
|
"""
|
||||||
checkpoint_data["last_updated"] = datetime.now().isoformat()
|
checkpoint_data["last_updated"] = datetime.now().isoformat()
|
||||||
|
|
||||||
|
from utils import atomic_json_write
|
||||||
if lock:
|
if lock:
|
||||||
with lock:
|
with lock:
|
||||||
with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
|
atomic_json_write(self.checkpoint_file, checkpoint_data)
|
||||||
json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
|
|
||||||
else:
|
else:
|
||||||
with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
|
atomic_json_write(self.checkpoint_file, checkpoint_data)
|
||||||
json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
def _scan_completed_prompts_by_content(self) -> set:
|
def _scan_completed_prompts_by_content(self) -> set:
|
||||||
"""
|
"""
|
||||||
@@ -713,7 +831,9 @@ class BatchRunner:
|
|||||||
print(f" New batches created: {len(batches_to_process)}")
|
print(f" New batches created: {len(batches_to_process)}")
|
||||||
print("=" * 70 + "\n")
|
print("=" * 70 + "\n")
|
||||||
|
|
||||||
# Initialize checkpoint data (needed for saving at the end)
|
# Load existing checkpoint (so resume doesn't clobber prior progress)
|
||||||
|
checkpoint_data = self._load_checkpoint()
|
||||||
|
if checkpoint_data.get("run_name") != self.run_name:
|
||||||
checkpoint_data = {
|
checkpoint_data = {
|
||||||
"run_name": self.run_name,
|
"run_name": self.run_name,
|
||||||
"completed_prompts": [],
|
"completed_prompts": [],
|
||||||
@@ -735,10 +855,13 @@ class BatchRunner:
|
|||||||
"providers_ignored": self.providers_ignored,
|
"providers_ignored": self.providers_ignored,
|
||||||
"providers_order": self.providers_order,
|
"providers_order": self.providers_order,
|
||||||
"provider_sort": self.provider_sort,
|
"provider_sort": self.provider_sort,
|
||||||
|
"max_tokens": self.max_tokens,
|
||||||
|
"reasoning_config": self.reasoning_config,
|
||||||
|
"prefill_messages": self.prefill_messages,
|
||||||
}
|
}
|
||||||
|
|
||||||
# For backward compatibility, still track by index (but this is secondary to content matching)
|
# For backward compatibility, still track by index (but this is secondary to content matching)
|
||||||
completed_prompts_set = set()
|
completed_prompts_set = set(checkpoint_data.get("completed_prompts", []))
|
||||||
|
|
||||||
# Aggregate statistics across all batches
|
# Aggregate statistics across all batches
|
||||||
total_tool_stats = {}
|
total_tool_stats = {}
|
||||||
@@ -747,6 +870,9 @@ class BatchRunner:
|
|||||||
|
|
||||||
print(f"\n🔧 Initializing {self.num_workers} worker processes...")
|
print(f"\n🔧 Initializing {self.num_workers} worker processes...")
|
||||||
|
|
||||||
|
# Checkpoint writes happen in the parent process; keep a lock for safety.
|
||||||
|
checkpoint_lock = Lock()
|
||||||
|
|
||||||
# Process batches in parallel
|
# Process batches in parallel
|
||||||
with Pool(processes=self.num_workers) as pool:
|
with Pool(processes=self.num_workers) as pool:
|
||||||
# Create tasks for each batch
|
# Create tasks for each batch
|
||||||
@@ -792,11 +918,35 @@ class BatchRunner:
|
|||||||
for result in pool.imap_unordered(_process_batch_worker, tasks):
|
for result in pool.imap_unordered(_process_batch_worker, tasks):
|
||||||
results.append(result)
|
results.append(result)
|
||||||
progress.update(task, advance=1)
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
|
# Incremental checkpoint update (so resume works after crash)
|
||||||
|
try:
|
||||||
|
batch_num = result.get('batch_num')
|
||||||
|
completed = result.get('completed_prompts', []) or []
|
||||||
|
completed_prompts_set.update(completed)
|
||||||
|
|
||||||
|
if isinstance(batch_num, int):
|
||||||
|
checkpoint_data.setdefault('batch_stats', {})[str(batch_num)] = {
|
||||||
|
'processed': result.get('processed', 0),
|
||||||
|
'skipped': result.get('skipped', 0),
|
||||||
|
'discarded_no_reasoning': result.get('discarded_no_reasoning', 0),
|
||||||
|
}
|
||||||
|
|
||||||
|
checkpoint_data['completed_prompts'] = sorted(completed_prompts_set)
|
||||||
|
self._save_checkpoint(checkpoint_data, lock=checkpoint_lock)
|
||||||
|
except Exception as ckpt_err:
|
||||||
|
# Don't fail the run if checkpoint write fails
|
||||||
|
print(f"⚠️ Warning: Failed to save incremental checkpoint: {ckpt_err}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Batch worker failed: %s", e, exc_info=True)
|
||||||
|
raise
|
||||||
finally:
|
finally:
|
||||||
root_logger.setLevel(original_level)
|
root_logger.setLevel(original_level)
|
||||||
|
|
||||||
# Aggregate all batch statistics and update checkpoint
|
# Aggregate all batch statistics and update checkpoint
|
||||||
all_completed_prompts = list(completed_prompts_set)
|
all_completed_prompts = list(completed_prompts_set)
|
||||||
|
total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||||
|
|
||||||
for batch_result in results:
|
for batch_result in results:
|
||||||
# Add newly completed prompts
|
# Add newly completed prompts
|
||||||
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
|
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
|
||||||
@@ -814,9 +964,16 @@ class BatchRunner:
|
|||||||
total_tool_stats[tool_name]["success"] += stats["success"]
|
total_tool_stats[tool_name]["success"] += stats["success"]
|
||||||
total_tool_stats[tool_name]["failure"] += stats["failure"]
|
total_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||||
|
|
||||||
# Save final checkpoint
|
# Aggregate reasoning stats
|
||||||
|
for key in total_reasoning_stats:
|
||||||
|
total_reasoning_stats[key] += batch_result.get("reasoning_stats", {}).get(key, 0)
|
||||||
|
|
||||||
|
# Save final checkpoint (best-effort; incremental writes already happened)
|
||||||
|
try:
|
||||||
checkpoint_data["completed_prompts"] = all_completed_prompts
|
checkpoint_data["completed_prompts"] = all_completed_prompts
|
||||||
self._save_checkpoint(checkpoint_data)
|
self._save_checkpoint(checkpoint_data, lock=checkpoint_lock)
|
||||||
|
except Exception as ckpt_err:
|
||||||
|
print(f"âš ï¸ Warning: Failed to save final checkpoint: {ckpt_err}")
|
||||||
|
|
||||||
# Calculate success rates
|
# Calculate success rates
|
||||||
for tool_name in total_tool_stats:
|
for tool_name in total_tool_stats:
|
||||||
@@ -835,15 +992,8 @@ class BatchRunner:
|
|||||||
combined_file = self.output_dir / "trajectories.jsonl"
|
combined_file = self.output_dir / "trajectories.jsonl"
|
||||||
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
|
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
|
||||||
|
|
||||||
VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze',
|
# Valid tools auto-derived from model_tools.py — no manual updates needed
|
||||||
'image_generate', 'mixture_of_agents',
|
VALID_TOOLS = ALL_POSSIBLE_TOOLS
|
||||||
# Skills tools
|
|
||||||
'skills_categories', 'skills_list', 'skill_view',
|
|
||||||
# Browser automation tools
|
|
||||||
'browser_navigate', 'browser_snapshot', 'browser_click',
|
|
||||||
'browser_type', 'browser_scroll', 'browser_back',
|
|
||||||
'browser_press', 'browser_close', 'browser_get_images',
|
|
||||||
'browser_vision'}
|
|
||||||
|
|
||||||
total_entries = 0
|
total_entries = 0
|
||||||
filtered_entries = 0
|
filtered_entries = 0
|
||||||
@@ -892,7 +1042,8 @@ class BatchRunner:
|
|||||||
"model": self.model,
|
"model": self.model,
|
||||||
"completed_at": datetime.now().isoformat(),
|
"completed_at": datetime.now().isoformat(),
|
||||||
"duration_seconds": round(time.time() - start_time, 2),
|
"duration_seconds": round(time.time() - start_time, 2),
|
||||||
"tool_statistics": total_tool_stats
|
"tool_statistics": total_tool_stats,
|
||||||
|
"reasoning_statistics": total_reasoning_stats,
|
||||||
}
|
}
|
||||||
|
|
||||||
with open(self.stats_file, 'w', encoding='utf-8') as f:
|
with open(self.stats_file, 'w', encoding='utf-8') as f:
|
||||||
@@ -930,6 +1081,25 @@ class BatchRunner:
|
|||||||
else:
|
else:
|
||||||
print("No tool calls were made during this run.")
|
print("No tool calls were made during this run.")
|
||||||
|
|
||||||
|
# Print reasoning coverage stats
|
||||||
|
total_discarded = sum(r.get("discarded_no_reasoning", 0) for r in results)
|
||||||
|
|
||||||
|
print(f"\n🧠 Reasoning Coverage:")
|
||||||
|
print("-" * 70)
|
||||||
|
total_turns = total_reasoning_stats["total_assistant_turns"]
|
||||||
|
with_reasoning = total_reasoning_stats["turns_with_reasoning"]
|
||||||
|
without_reasoning = total_reasoning_stats["turns_without_reasoning"]
|
||||||
|
if total_turns > 0:
|
||||||
|
pct_with = round(with_reasoning / total_turns * 100, 1)
|
||||||
|
pct_without = round(without_reasoning / total_turns * 100, 1)
|
||||||
|
print(f" Total assistant turns: {total_turns:,}")
|
||||||
|
print(f" With reasoning: {with_reasoning:,} ({pct_with}%)")
|
||||||
|
print(f" Without reasoning: {without_reasoning:,} ({pct_without}%)")
|
||||||
|
else:
|
||||||
|
print(" No assistant turns recorded.")
|
||||||
|
if total_discarded > 0:
|
||||||
|
print(f" 🚫 Samples discarded (zero reasoning): {total_discarded:,}")
|
||||||
|
|
||||||
print(f"\n💾 Results saved to: {self.output_dir}")
|
print(f"\n💾 Results saved to: {self.output_dir}")
|
||||||
print(f" - Trajectories: trajectories.jsonl (combined)")
|
print(f" - Trajectories: trajectories.jsonl (combined)")
|
||||||
print(f" - Individual batches: batch_*.jsonl (for debugging)")
|
print(f" - Individual batches: batch_*.jsonl (for debugging)")
|
||||||
@@ -942,7 +1112,7 @@ def main(
|
|||||||
batch_size: int = None,
|
batch_size: int = None,
|
||||||
run_name: str = None,
|
run_name: str = None,
|
||||||
distribution: str = "default",
|
distribution: str = "default",
|
||||||
model: str = "anthropic/claude-sonnet-4-20250514",
|
model: str = "anthropic/claude-sonnet-4.6",
|
||||||
api_key: str = None,
|
api_key: str = None,
|
||||||
base_url: str = "https://openrouter.ai/api/v1",
|
base_url: str = "https://openrouter.ai/api/v1",
|
||||||
max_turns: int = 10,
|
max_turns: int = 10,
|
||||||
@@ -956,6 +1126,11 @@ def main(
|
|||||||
providers_ignored: str = None,
|
providers_ignored: str = None,
|
||||||
providers_order: str = None,
|
providers_order: str = None,
|
||||||
provider_sort: str = None,
|
provider_sort: str = None,
|
||||||
|
max_tokens: int = None,
|
||||||
|
reasoning_effort: str = None,
|
||||||
|
reasoning_disabled: bool = False,
|
||||||
|
prefill_messages_file: str = None,
|
||||||
|
max_samples: int = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Run batch processing of agent prompts from a dataset.
|
Run batch processing of agent prompts from a dataset.
|
||||||
@@ -979,6 +1154,11 @@ def main(
|
|||||||
providers_ignored (str): Comma-separated list of OpenRouter providers to ignore (e.g. "together,deepinfra")
|
providers_ignored (str): Comma-separated list of OpenRouter providers to ignore (e.g. "together,deepinfra")
|
||||||
providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
|
providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
|
||||||
provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
|
provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
|
||||||
|
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||||
|
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium")
|
||||||
|
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
|
||||||
|
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
|
||||||
|
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
# Basic usage
|
# Basic usage
|
||||||
@@ -990,9 +1170,13 @@ def main(
|
|||||||
# Use specific distribution
|
# Use specific distribution
|
||||||
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=image_test --distribution=image_gen
|
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=image_test --distribution=image_gen
|
||||||
|
|
||||||
# With ephemeral system prompt (not saved to dataset)
|
# With disabled reasoning and max tokens
|
||||||
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
|
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
|
||||||
--ephemeral_system_prompt="You are a helpful assistant focused on image generation."
|
--reasoning_disabled --max_tokens=128000
|
||||||
|
|
||||||
|
# With prefill messages from file
|
||||||
|
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
|
||||||
|
--prefill_messages_file=configs/prefill_opus.json
|
||||||
|
|
||||||
# List available distributions
|
# List available distributions
|
||||||
python batch_runner.py --list_distributions
|
python batch_runner.py --list_distributions
|
||||||
@@ -1031,6 +1215,36 @@ def main(
|
|||||||
providers_ignored_list = [p.strip() for p in providers_ignored.split(",")] if providers_ignored else None
|
providers_ignored_list = [p.strip() for p in providers_ignored.split(",")] if providers_ignored else None
|
||||||
providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None
|
providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None
|
||||||
|
|
||||||
|
# Build reasoning_config from CLI flags
|
||||||
|
# --reasoning_disabled takes priority, then --reasoning_effort, then default (medium)
|
||||||
|
reasoning_config = None
|
||||||
|
if reasoning_disabled:
|
||||||
|
# Completely disable reasoning/thinking tokens
|
||||||
|
reasoning_config = {"effort": "none"}
|
||||||
|
print("🧠 Reasoning: DISABLED (effort=none)")
|
||||||
|
elif reasoning_effort:
|
||||||
|
# Use specified effort level
|
||||||
|
valid_efforts = ["xhigh", "high", "medium", "low", "minimal", "none"]
|
||||||
|
if reasoning_effort not in valid_efforts:
|
||||||
|
print(f"❌ Error: --reasoning_effort must be one of: {', '.join(valid_efforts)}")
|
||||||
|
return
|
||||||
|
reasoning_config = {"enabled": True, "effort": reasoning_effort}
|
||||||
|
print(f"🧠 Reasoning effort: {reasoning_effort}")
|
||||||
|
|
||||||
|
# Load prefill messages from JSON file if provided
|
||||||
|
prefill_messages = None
|
||||||
|
if prefill_messages_file:
|
||||||
|
try:
|
||||||
|
with open(prefill_messages_file, 'r', encoding='utf-8') as f:
|
||||||
|
prefill_messages = json.load(f)
|
||||||
|
if not isinstance(prefill_messages, list):
|
||||||
|
print(f"❌ Error: prefill_messages_file must contain a JSON array of messages")
|
||||||
|
return
|
||||||
|
print(f"💬 Loaded {len(prefill_messages)} prefill messages from {prefill_messages_file}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error loading prefill messages: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
# Initialize and run batch runner
|
# Initialize and run batch runner
|
||||||
try:
|
try:
|
||||||
runner = BatchRunner(
|
runner = BatchRunner(
|
||||||
@@ -1050,6 +1264,10 @@ def main(
|
|||||||
providers_ignored=providers_ignored_list,
|
providers_ignored=providers_ignored_list,
|
||||||
providers_order=providers_order_list,
|
providers_order=providers_order_list,
|
||||||
provider_sort=provider_sort,
|
provider_sort=provider_sort,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
reasoning_config=reasoning_config,
|
||||||
|
prefill_messages=prefill_messages,
|
||||||
|
max_samples=max_samples,
|
||||||
)
|
)
|
||||||
|
|
||||||
runner.run(resume=resume)
|
runner.run(resume=resume)
|
||||||
|
|||||||
@@ -7,12 +7,59 @@
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
model:
|
model:
|
||||||
# Default model to use (can be overridden with --model flag)
|
# Default model to use (can be overridden with --model flag)
|
||||||
default: "anthropic/claude-sonnet-4"
|
default: "anthropic/claude-opus-4.6"
|
||||||
|
|
||||||
|
# Inference provider selection:
|
||||||
|
# "auto" - Use Nous Portal if logged in, otherwise OpenRouter/env vars (default)
|
||||||
|
# "openrouter" - Always use OpenRouter API key from OPENROUTER_API_KEY
|
||||||
|
# "nous" - Always use Nous Portal (requires: hermes login)
|
||||||
|
# "zai" - Use z.ai / ZhipuAI GLM models (requires: GLM_API_KEY)
|
||||||
|
# "kimi-coding"- Use Kimi / Moonshot AI models (requires: KIMI_API_KEY)
|
||||||
|
# "minimax" - Use MiniMax global endpoint (requires: MINIMAX_API_KEY)
|
||||||
|
# "minimax-cn" - Use MiniMax China endpoint (requires: MINIMAX_CN_API_KEY)
|
||||||
|
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
|
||||||
|
provider: "auto"
|
||||||
|
|
||||||
# API configuration (falls back to OPENROUTER_API_KEY env var)
|
# API configuration (falls back to OPENROUTER_API_KEY env var)
|
||||||
# api_key: "your-key-here" # Uncomment to set here instead of .env
|
# api_key: "your-key-here" # Uncomment to set here instead of .env
|
||||||
base_url: "https://openrouter.ai/api/v1"
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# OpenRouter Provider Routing (only applies when using OpenRouter)
|
||||||
|
# =============================================================================
|
||||||
|
# Control how requests are routed across providers on OpenRouter.
|
||||||
|
# See: https://openrouter.ai/docs/guides/routing/provider-selection
|
||||||
|
#
|
||||||
|
# provider_routing:
|
||||||
|
# # Sort strategy: "price" (default), "throughput", or "latency"
|
||||||
|
# # Append :nitro to model name for a shortcut to throughput sorting.
|
||||||
|
# sort: "throughput"
|
||||||
|
#
|
||||||
|
# # Only allow these providers (provider slugs from OpenRouter)
|
||||||
|
# # only: ["anthropic", "google"]
|
||||||
|
#
|
||||||
|
# # Skip these providers entirely
|
||||||
|
# # ignore: ["deepinfra", "fireworks"]
|
||||||
|
#
|
||||||
|
# # Try providers in this order (overrides default load balancing)
|
||||||
|
# # order: ["anthropic", "google", "together"]
|
||||||
|
#
|
||||||
|
# # Require providers to support all parameters in your request
|
||||||
|
# # require_parameters: true
|
||||||
|
#
|
||||||
|
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
|
||||||
|
# # data_collection: "deny"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Git Worktree Isolation
|
||||||
|
# =============================================================================
|
||||||
|
# When enabled, each CLI session creates an isolated git worktree so multiple
|
||||||
|
# agents can work on the same repo concurrently without file collisions.
|
||||||
|
# Equivalent to always passing --worktree / -w on the command line.
|
||||||
|
#
|
||||||
|
# worktree: true # Always create a worktree when in a git repo
|
||||||
|
# worktree: false # Default — only create when -w flag is passed
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Terminal Tool Configuration
|
# Terminal Tool Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -23,9 +70,12 @@ model:
|
|||||||
# OPTION 1: Local execution (default)
|
# OPTION 1: Local execution (default)
|
||||||
# Commands run directly on your machine in the current directory
|
# Commands run directly on your machine in the current directory
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
# Working directory behavior:
|
||||||
|
# - CLI (`hermes` command): Uses "." (current directory where you run hermes)
|
||||||
|
# - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
|
||||||
terminal:
|
terminal:
|
||||||
env_type: "local"
|
backend: "local"
|
||||||
cwd: "." # Use "." for current directory, or specify absolute path
|
cwd: "." # For local backend: "." = current directory. Ignored for remote backends.
|
||||||
timeout: 180
|
timeout: 180
|
||||||
lifetime_seconds: 300
|
lifetime_seconds: 300
|
||||||
# sudo_password: "" # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!
|
# sudo_password: "" # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!
|
||||||
@@ -36,8 +86,8 @@ terminal:
|
|||||||
# Great for: keeping agent isolated from its own code, using powerful remote hardware
|
# Great for: keeping agent isolated from its own code, using powerful remote hardware
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# terminal:
|
# terminal:
|
||||||
# env_type: "ssh"
|
# backend: "ssh"
|
||||||
# cwd: "/home/myuser/project"
|
# cwd: "/home/myuser/project" # Path on the REMOTE server
|
||||||
# timeout: 180
|
# timeout: 180
|
||||||
# lifetime_seconds: 300
|
# lifetime_seconds: 300
|
||||||
# ssh_host: "my-server.example.com"
|
# ssh_host: "my-server.example.com"
|
||||||
@@ -51,11 +101,11 @@ terminal:
|
|||||||
# Great for: reproducible environments, testing, isolation
|
# Great for: reproducible environments, testing, isolation
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# terminal:
|
# terminal:
|
||||||
# env_type: "docker"
|
# backend: "docker"
|
||||||
# cwd: "/workspace"
|
# cwd: "/workspace" # Path INSIDE the container (default: /)
|
||||||
# timeout: 180
|
# timeout: 180
|
||||||
# lifetime_seconds: 300
|
# lifetime_seconds: 300
|
||||||
# docker_image: "python:3.11"
|
# docker_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# OPTION 4: Singularity/Apptainer container
|
# OPTION 4: Singularity/Apptainer container
|
||||||
@@ -63,11 +113,11 @@ terminal:
|
|||||||
# Great for: HPC clusters, shared compute environments
|
# Great for: HPC clusters, shared compute environments
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# terminal:
|
# terminal:
|
||||||
# env_type: "singularity"
|
# backend: "singularity"
|
||||||
# cwd: "/workspace"
|
# cwd: "/workspace" # Path INSIDE the container (default: /root)
|
||||||
# timeout: 180
|
# timeout: 180
|
||||||
# lifetime_seconds: 300
|
# lifetime_seconds: 300
|
||||||
# singularity_image: "docker://python:3.11"
|
# singularity_image: "docker://nikolaik/python-nodejs:python3.11-nodejs20"
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# OPTION 5: Modal cloud execution
|
# OPTION 5: Modal cloud execution
|
||||||
@@ -75,11 +125,34 @@ terminal:
|
|||||||
# Great for: GPU access, scalable compute, serverless execution
|
# Great for: GPU access, scalable compute, serverless execution
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# terminal:
|
# terminal:
|
||||||
# env_type: "modal"
|
# backend: "modal"
|
||||||
# cwd: "/workspace"
|
# cwd: "/workspace" # Path INSIDE the sandbox (default: /root)
|
||||||
# timeout: 180
|
# timeout: 180
|
||||||
# lifetime_seconds: 300
|
# lifetime_seconds: 300
|
||||||
# modal_image: "python:3.11"
|
# modal_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# OPTION 6: Daytona cloud execution
|
||||||
|
# Commands run in Daytona cloud sandboxes
|
||||||
|
# Great for: Cloud dev environments, persistent workspaces, team collaboration
|
||||||
|
# Requires: pip install daytona, DAYTONA_API_KEY env var
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# terminal:
|
||||||
|
# backend: "daytona"
|
||||||
|
# cwd: "~"
|
||||||
|
# timeout: 180
|
||||||
|
# lifetime_seconds: 300
|
||||||
|
# daytona_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||||
|
# container_disk: 10240 # Daytona max is 10GB per sandbox
|
||||||
|
|
||||||
|
#
|
||||||
|
# --- Container resource limits (docker, singularity, modal, daytona -- ignored for local/ssh) ---
|
||||||
|
# These settings apply to all container backends. They control the resources
|
||||||
|
# allocated to the sandbox and whether its filesystem persists across sessions.
|
||||||
|
container_cpu: 1 # CPU cores
|
||||||
|
container_memory: 5120 # Memory in MB (5120 = 5GB)
|
||||||
|
container_disk: 51200 # Disk in MB (51200 = 50GB)
|
||||||
|
container_persistent: true # Persist filesystem across sessions (false = ephemeral)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# SUDO SUPPORT (works with ALL backends above)
|
# SUDO SUPPORT (works with ALL backends above)
|
||||||
@@ -112,19 +185,167 @@ browser:
|
|||||||
# after this period of no activity between agent loops (default: 120 = 2 minutes)
|
# after this period of no activity between agent loops (default: 120 = 2 minutes)
|
||||||
inactivity_timeout: 120
|
inactivity_timeout: 120
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Context Compression (Auto-shrinks long conversations)
|
||||||
|
# =============================================================================
|
||||||
|
# When a conversation approaches the model's context limit, middle turns are
|
||||||
|
# automatically summarized to free up space while preserving important context.
|
||||||
|
#
|
||||||
|
# HOW IT WORKS:
|
||||||
|
# 1. Tracks actual token usage from API responses (not estimates)
|
||||||
|
# 2. When prompt_tokens >= threshold x the model's context_length (e.g. 0.85 = 85%), compression is triggered
|
||||||
|
# 3. Protects first 3 turns (system prompt, initial request, first response)
|
||||||
|
# 4. Protects last 4 turns (recent context is most relevant)
|
||||||
|
# 5. Summarizes middle turns using a fast/cheap model
|
||||||
|
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||||
|
#
|
||||||
|
compression:
|
||||||
|
# Enable automatic context compression (default: true)
|
||||||
|
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Trigger compression at this fraction of the model's context limit (default: 0.85 = 85%)
|
||||||
|
# Lower values = more aggressive compression, higher values = compress later
|
||||||
|
threshold: 0.85
|
||||||
|
|
||||||
|
# Model to use for generating summaries (fast/cheap recommended)
|
||||||
|
# This model compresses the middle turns into a concise summary.
|
||||||
|
# IMPORTANT: it receives the full middle section of the conversation, so it
|
||||||
|
# MUST support a context length at least as large as your main model's.
|
||||||
|
summary_model: "google/gemini-3-flash-preview"
|
||||||
|
|
||||||
|
# Provider for the summary model (default: "auto")
|
||||||
|
# Options: "auto", "openrouter", "nous", "main"
|
||||||
|
# summary_provider: "auto"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Auxiliary Models (Advanced — Experimental)
|
||||||
|
# =============================================================================
|
||||||
|
# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
|
||||||
|
# browser screenshot analysis, web page summarization, and context compression.
|
||||||
|
#
|
||||||
|
# By default these use Gemini Flash via OpenRouter or Nous Portal and are
|
||||||
|
# auto-detected from your credentials. You do NOT need to change anything
|
||||||
|
# here for normal usage.
|
||||||
|
#
|
||||||
|
# WARNING: Overriding these with providers other than OpenRouter or Nous Portal
|
||||||
|
# is EXPERIMENTAL and may not work. Not all models/providers support vision,
|
||||||
|
# produce usable summaries, or accept the same API format. Change at your own
|
||||||
|
# risk — if things break, reset to "auto" / empty values.
|
||||||
|
#
|
||||||
|
# Each task has its own provider + model pair so you can mix providers.
|
||||||
|
# For example: OpenRouter for vision (needs multimodal), but your main
|
||||||
|
# local endpoint for compression (just needs text).
|
||||||
|
#
|
||||||
|
# Provider options:
|
||||||
|
# "auto" - Best available: OpenRouter → Nous Portal → main endpoint (default)
|
||||||
|
# "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
|
||||||
|
# "nous" - Force Nous Portal (requires: hermes login)
|
||||||
|
# "codex" - Force Codex OAuth (requires: hermes model → Codex).
|
||||||
|
# Uses gpt-5.3-codex which supports vision.
|
||||||
|
# "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY).
|
||||||
|
# Works with OpenAI API, local models, or any OpenAI-compatible
|
||||||
|
# endpoint. Also falls back to Codex OAuth and API-key providers.
|
||||||
|
#
|
||||||
|
# Model: leave empty to use the provider's default. When empty, OpenRouter
|
||||||
|
# uses "google/gemini-3-flash-preview" and Nous uses "gemini-3-flash".
|
||||||
|
# Other providers pick a sensible default automatically.
|
||||||
|
#
|
||||||
|
# auxiliary:
|
||||||
|
# # Image analysis: vision_analyze tool + browser screenshots
|
||||||
|
# vision:
|
||||||
|
# provider: "auto"
|
||||||
|
# model: "" # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
|
||||||
|
#
|
||||||
|
# # Web page scraping / summarization + browser page text extraction
|
||||||
|
# web_extract:
|
||||||
|
# provider: "auto"
|
||||||
|
# model: ""
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Persistent Memory
|
||||||
|
# =============================================================================
|
||||||
|
# Bounded curated memory injected into the system prompt every session.
|
||||||
|
# Two stores: MEMORY.md (agent's notes) and USER.md (user profile).
|
||||||
|
# Character limits keep the memory small and focused. The agent manages
|
||||||
|
# pruning -- when at the limit, it must consolidate or replace entries.
|
||||||
|
# Disabled by default in batch_runner and RL environments.
|
||||||
|
#
|
||||||
|
memory:
|
||||||
|
# Agent's personal notes: environment facts, conventions, things learned
|
||||||
|
memory_enabled: true
|
||||||
|
|
||||||
|
# User profile: preferences, communication style, expectations
|
||||||
|
user_profile_enabled: true
|
||||||
|
|
||||||
|
# Character limits (~2.75 chars per token, model-independent)
|
||||||
|
memory_char_limit: 2200 # ~800 tokens
|
||||||
|
user_char_limit: 1375 # ~500 tokens
|
||||||
|
|
||||||
|
# Periodic memory nudge: remind the agent to consider saving memories
|
||||||
|
# every N user turns. Set to 0 to disable. Only active when memory is enabled.
|
||||||
|
nudge_interval: 10 # Nudge every 10 user turns (0 = disabled)
|
||||||
|
|
||||||
|
# Memory flush: give the agent one turn to save memories before context is
|
||||||
|
# lost (compression, /new, /reset, exit). Set to 0 to disable.
|
||||||
|
# For exit/reset, only fires if the session had at least this many user turns.
|
||||||
|
flush_min_turns: 6 # Min user turns to trigger flush on exit/reset (0 = disabled)
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Session Reset Policy (Messaging Platforms)
|
||||||
|
# =============================================================================
|
||||||
|
# Controls when messaging sessions (Telegram, Discord, WhatsApp, Slack) are
|
||||||
|
# automatically cleared. Without resets, conversation context grows indefinitely
|
||||||
|
# which increases API costs with every message.
|
||||||
|
#
|
||||||
|
# When a reset triggers, the agent first saves important information to its
|
||||||
|
# persistent memory — but the conversation context is wiped. The agent starts
|
||||||
|
# fresh but retains learned facts via its memory system.
|
||||||
|
#
|
||||||
|
# Users can always manually reset with /reset or /new in chat.
|
||||||
|
#
|
||||||
|
# Modes:
|
||||||
|
# "both" - Reset on EITHER inactivity timeout or daily boundary (recommended)
|
||||||
|
# "idle" - Reset only after N minutes of inactivity
|
||||||
|
# "daily" - Reset only at a fixed hour each day
|
||||||
|
# "none" - Never auto-reset; context lives until /reset or compression kicks in
|
||||||
|
#
|
||||||
|
# When a reset triggers, the agent gets one turn to save important memories and
|
||||||
|
# skills before the context is wiped. Persistent memory carries across sessions.
|
||||||
|
#
|
||||||
|
session_reset:
|
||||||
|
mode: both # "both", "idle", "daily", or "none"
|
||||||
|
idle_minutes: 1440 # Inactivity timeout in minutes (default: 1440 = 24 hours)
|
||||||
|
at_hour: 4 # Daily reset hour, 0-23 local time (default: 4 AM)
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Skills Configuration
|
||||||
|
# =============================================================================
|
||||||
|
# Skills are reusable procedures the agent can load and follow. The agent can
|
||||||
|
# also create new skills after completing complex tasks.
|
||||||
|
#
|
||||||
|
skills:
|
||||||
|
# Nudge the agent to create skills after complex tasks.
|
||||||
|
# Every N tool-calling iterations, remind the model to consider saving a skill.
|
||||||
|
# Set to 0 to disable.
|
||||||
|
creation_nudge_interval: 15
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Agent Behavior
|
# Agent Behavior
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
agent:
|
agent:
|
||||||
# Maximum conversation turns before stopping
|
# Maximum tool-calling iterations per conversation
|
||||||
max_turns: 20
|
# Higher = more room for complex tasks, but costs more tokens
|
||||||
|
# Recommended: 20-30 for focused tasks, 50-100 for open exploration
|
||||||
|
max_turns: 60
|
||||||
|
|
||||||
# Enable verbose logging
|
# Enable verbose logging
|
||||||
verbose: false
|
verbose: false
|
||||||
|
|
||||||
# Custom system prompt (personality, instructions, etc.)
|
# Reasoning effort level (OpenRouter and Nous Portal)
|
||||||
# Leave empty or remove to use default agent behavior
|
# Controls how much "thinking" the model does before responding.
|
||||||
system_prompt: ""
|
# Options: "xhigh" (max), "high", "medium", "low", "minimal", "none" (disable)
|
||||||
|
reasoning_effort: "medium"
|
||||||
|
|
||||||
# Predefined personalities (use with /personality command)
|
# Predefined personalities (use with /personality command)
|
||||||
personalities:
|
personalities:
|
||||||
@@ -149,19 +370,107 @@ agent:
|
|||||||
# Control which tools the agent has access to.
|
# Control which tools the agent has access to.
|
||||||
# Use "all" to enable everything, or specify individual toolsets.
|
# Use "all" to enable everything, or specify individual toolsets.
|
||||||
|
|
||||||
# Available toolsets:
|
# =============================================================================
|
||||||
|
# Platform Toolsets (per-platform tool configuration)
|
||||||
|
# =============================================================================
|
||||||
|
# Override which toolsets are available on each platform.
|
||||||
|
# If a platform isn't listed here, its built-in default is used.
|
||||||
|
#
|
||||||
|
# You can use EITHER:
|
||||||
|
# - A preset like "hermes-cli" or "hermes-telegram" (curated tool set)
|
||||||
|
# - A list of individual toolsets to compose your own (see list below)
|
||||||
|
#
|
||||||
|
# Supported platform keys: cli, telegram, discord, whatsapp, slack
|
||||||
|
#
|
||||||
|
# Examples:
|
||||||
|
#
|
||||||
|
# # Use presets (same as defaults):
|
||||||
|
# platform_toolsets:
|
||||||
|
# cli: [hermes-cli]
|
||||||
|
# telegram: [hermes-telegram]
|
||||||
|
#
|
||||||
|
# # Custom: give Telegram only web + terminal + file + planning:
|
||||||
|
# platform_toolsets:
|
||||||
|
# telegram: [web, terminal, file, todo]
|
||||||
|
#
|
||||||
|
# # Custom: CLI without browser or image gen:
|
||||||
|
# platform_toolsets:
|
||||||
|
# cli: [web, terminal, file, skills, todo, tts, cronjob]
|
||||||
|
#
|
||||||
|
# # Restrictive: Discord gets read-only tools only:
|
||||||
|
# platform_toolsets:
|
||||||
|
# discord: [web, vision, skills, todo]
|
||||||
|
#
|
||||||
|
# If not set, defaults are:
|
||||||
|
# cli: hermes-cli (everything + cronjob management)
|
||||||
|
# telegram: hermes-telegram (terminal, file, web, vision, image_gen, tts, browser, skills, todo, cronjob, messaging)
|
||||||
|
# discord: hermes-discord (same as telegram)
|
||||||
|
# whatsapp: hermes-whatsapp (same as telegram)
|
||||||
|
# slack: hermes-slack (same as telegram)
|
||||||
|
#
|
||||||
|
platform_toolsets:
|
||||||
|
cli: [hermes-cli]
|
||||||
|
telegram: [hermes-telegram]
|
||||||
|
discord: [hermes-discord]
|
||||||
|
whatsapp: [hermes-whatsapp]
|
||||||
|
slack: [hermes-slack]
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Available toolsets (use these names in platform_toolsets or the toolsets list)
|
||||||
|
#
|
||||||
|
# Run `hermes chat --list-toolsets` to see all toolsets and their tools.
|
||||||
|
# Run `hermes chat --list-tools` to see every individual tool with descriptions.
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
#
|
||||||
|
# INDIVIDUAL TOOLSETS (compose your own):
|
||||||
|
# web - web_search, web_extract
|
||||||
|
# search - web_search only (no scraping)
|
||||||
|
# terminal - terminal, process
|
||||||
|
# file - read_file, write_file, patch, search
|
||||||
|
# browser - browser_navigate, browser_snapshot, browser_click, browser_type,
|
||||||
|
# browser_scroll, browser_back, browser_press, browser_close,
|
||||||
|
# browser_get_images, browser_vision (requires BROWSERBASE_API_KEY)
|
||||||
|
# vision - vision_analyze (requires OPENROUTER_API_KEY)
|
||||||
|
# image_gen - image_generate (requires FAL_KEY)
|
||||||
|
# skills - skills_list, skill_view
|
||||||
|
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||||
|
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||||
|
# todo - todo (in-memory task planning, no deps)
|
||||||
|
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI key)
|
||||||
|
# cronjob - schedule_cronjob, list_cronjobs, remove_cronjob
|
||||||
|
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||||
|
#
|
||||||
|
# PRESETS (curated bundles):
|
||||||
|
# hermes-cli - All of the above except rl + send_message
|
||||||
|
# hermes-telegram - terminal, file, web, vision, image_gen, tts, browser,
|
||||||
|
# skills, todo, cronjob, send_message
|
||||||
|
# hermes-discord - Same as hermes-telegram
|
||||||
|
# hermes-whatsapp - Same as hermes-telegram
|
||||||
|
# hermes-slack - Same as hermes-telegram
|
||||||
|
#
|
||||||
|
# COMPOSITE:
|
||||||
|
# debugging - terminal + web + file
|
||||||
|
# safe - web + vision + moa (no terminal access)
|
||||||
|
# all - Everything available
|
||||||
#
|
#
|
||||||
# web - Web search and content extraction (web_search, web_extract)
|
# web - Web search and content extraction (web_search, web_extract)
|
||||||
# search - Web search only, no scraping (web_search)
|
# search - Web search only, no scraping (web_search)
|
||||||
# terminal - Command execution (terminal)
|
# terminal - Command execution and process management (terminal, process)
|
||||||
|
# file - File operations: read, write, patch, search
|
||||||
# browser - Full browser automation (navigate, click, type, screenshot, etc.)
|
# browser - Full browser automation (navigate, click, type, screenshot, etc.)
|
||||||
# vision - Image analysis (vision_analyze)
|
# vision - Image analysis (vision_analyze)
|
||||||
# image_gen - Image generation with FLUX (image_generate)
|
# image_gen - Image generation with FLUX (image_generate)
|
||||||
# skills - Load skill documents (skills_categories, skills_list, skill_view)
|
# skills - Load skill documents (skills_list, skill_view)
|
||||||
# moa - Mixture of Agents reasoning (mixture_of_agents)
|
# moa - Mixture of Agents reasoning (mixture_of_agents)
|
||||||
|
# todo - Task planning and tracking for multi-step work
|
||||||
|
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||||
|
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||||
|
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI)
|
||||||
|
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||||
|
# rl - RL training tools (Tinker-Atropos)
|
||||||
#
|
#
|
||||||
# Composite toolsets:
|
# Composite toolsets:
|
||||||
# debugging - terminal + web (for troubleshooting)
|
# debugging - terminal + web + file (for troubleshooting)
|
||||||
# safe - web + vision + moa (no terminal access)
|
# safe - web + vision + moa (no terminal access)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@@ -212,6 +521,74 @@ toolsets:
|
|||||||
# toolsets:
|
# toolsets:
|
||||||
# - safe
|
# - safe
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MCP (Model Context Protocol) Servers
|
||||||
|
# =============================================================================
|
||||||
|
# Connect to external MCP servers to add tools from the MCP ecosystem.
|
||||||
|
# Each server's tools are automatically discovered and registered.
|
||||||
|
# See docs/mcp.md for full documentation.
|
||||||
|
#
|
||||||
|
# Stdio servers (spawn a subprocess):
|
||||||
|
# command: the executable to run
|
||||||
|
# args: command-line arguments
|
||||||
|
# env: environment variables (only these + safe defaults passed to subprocess)
|
||||||
|
#
|
||||||
|
# HTTP servers (connect to a URL):
|
||||||
|
# url: the MCP server endpoint
|
||||||
|
# headers: HTTP headers (e.g., for authentication)
|
||||||
|
#
|
||||||
|
# Optional per-server settings:
|
||||||
|
# timeout: tool call timeout in seconds (default: 120)
|
||||||
|
# connect_timeout: initial connection timeout (default: 60)
|
||||||
|
#
|
||||||
|
# mcp_servers:
|
||||||
|
# time:
|
||||||
|
# command: uvx
|
||||||
|
# args: ["mcp-server-time"]
|
||||||
|
# filesystem:
|
||||||
|
# command: npx
|
||||||
|
# args: ["-y", "@modelcontextprotocol/server-filesystem", "/home/user"]
|
||||||
|
# notion:
|
||||||
|
# url: https://mcp.notion.com/mcp
|
||||||
|
# github:
|
||||||
|
# command: npx
|
||||||
|
# args: ["-y", "@modelcontextprotocol/server-github"]
|
||||||
|
# env:
|
||||||
|
# GITHUB_PERSONAL_ACCESS_TOKEN: "ghp_..."
|
||||||
|
#
|
||||||
|
# Sampling (server-initiated LLM requests) — enabled by default.
|
||||||
|
# Per-server config under the 'sampling' key:
|
||||||
|
# analysis:
|
||||||
|
# command: npx
|
||||||
|
# args: ["-y", "analysis-server"]
|
||||||
|
# sampling:
|
||||||
|
# enabled: true # default: true
|
||||||
|
# model: "gemini-3-flash" # override model (optional)
|
||||||
|
# max_tokens_cap: 4096 # max tokens per request
|
||||||
|
# timeout: 30 # LLM call timeout (seconds)
|
||||||
|
# max_rpm: 10 # max requests per minute
|
||||||
|
# allowed_models: [] # model whitelist (empty = all)
|
||||||
|
# max_tool_rounds: 5 # tool loop limit (0 = disable)
|
||||||
|
# log_level: "info" # audit verbosity
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Voice Transcription (Speech-to-Text)
|
||||||
|
# =============================================================================
|
||||||
|
# Automatically transcribe voice messages on messaging platforms.
|
||||||
|
# Requires OPENAI_API_KEY in .env (uses OpenAI Whisper API directly).
|
||||||
|
stt:
|
||||||
|
enabled: true
|
||||||
|
model: "whisper-1" # whisper-1 (cheapest) | gpt-4o-mini-transcribe | gpt-4o-transcribe
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Response Pacing (Messaging Platforms)
|
||||||
|
# =============================================================================
|
||||||
|
# Add human-like delays between message chunks.
|
||||||
|
# human_delay:
|
||||||
|
# mode: "off" # "off" | "natural" | "custom"
|
||||||
|
# min_ms: 800 # Min delay (custom mode only)
|
||||||
|
# max_ms: 2500 # Max delay (custom mode only)
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Session Logging
|
# Session Logging
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -227,9 +604,54 @@ toolsets:
|
|||||||
# No configuration needed - logging is always enabled.
|
# No configuration needed - logging is always enabled.
|
||||||
# To disable, you would need to modify the source code.
|
# To disable, you would need to modify the source code.
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Code Execution Sandbox (Programmatic Tool Calling)
|
||||||
|
# =============================================================================
|
||||||
|
# The execute_code tool runs Python scripts that call Hermes tools via RPC.
|
||||||
|
# Intermediate tool results stay out of the LLM's context window.
|
||||||
|
code_execution:
|
||||||
|
timeout: 300 # Max seconds per script before kill (default: 300 = 5 min)
|
||||||
|
max_tool_calls: 50 # Max RPC tool calls per execution (default: 50)
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Subagent Delegation
|
||||||
|
# =============================================================================
|
||||||
|
# The delegate_task tool spawns child agents with isolated context.
|
||||||
|
# Supports single tasks and batch mode (up to 3 parallel).
|
||||||
|
delegation:
|
||||||
|
max_iterations: 50 # Max tool-calling turns per child (default: 50)
|
||||||
|
default_toolsets: ["terminal", "file", "web"] # Default toolsets for subagents
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Honcho Integration (Cross-Session User Modeling)
|
||||||
|
# =============================================================================
|
||||||
|
# AI-native persistent memory via Honcho (https://honcho.dev/).
|
||||||
|
# Builds a deeper understanding of the user across sessions and tools.
|
||||||
|
# Runs alongside USER.md — additive, not a replacement.
|
||||||
|
#
|
||||||
|
# Requires: pip install honcho-ai
|
||||||
|
# Config: ~/.honcho/config.json (shared with Claude Code, Cursor, etc.)
|
||||||
|
# API key: HONCHO_API_KEY in ~/.hermes/.env or ~/.honcho/config.json
|
||||||
|
#
|
||||||
|
# Hermes-specific overrides (optional — most config comes from ~/.honcho/config.json):
|
||||||
|
# honcho: {}
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Display
|
# Display
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
display:
|
display:
|
||||||
# Use compact banner mode
|
# Use compact banner mode
|
||||||
compact: false
|
compact: false
|
||||||
|
|
||||||
|
# Tool progress display level (CLI and gateway)
|
||||||
|
# off: Silent — no tool activity shown, just the final response
|
||||||
|
# new: Show a tool indicator only when the tool changes (skip repeats)
|
||||||
|
# all: Show every tool call with a short preview (default)
|
||||||
|
# verbose: Full args, results, and debug logs (same as /verbose)
|
||||||
|
# Toggle at runtime with /verbose in the CLI
|
||||||
|
tool_progress: all
|
||||||
|
|
||||||
|
# Play terminal bell when agent finishes a response.
|
||||||
|
# Useful for long-running tasks — your terminal will ding when the agent is done.
|
||||||
|
# Works over SSH. Most terminals can be configured to flash the taskbar or play a sound.
|
||||||
|
bell_on_complete: false
|
||||||
|
|||||||
@@ -1,42 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Browser-focused data generation run
|
|
||||||
# Uses browser-use-tasks.jsonl (6504 tasks)
|
|
||||||
# Distribution: browser 97%, web 20%, vision 12%, terminal 15%
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
echo "🌐 Running browser-focused tasks with browser_tasks distribution"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="browser-use-tasks.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="browser_tasks" \
|
|
||||||
--distribution="browser_tasks" \
|
|
||||||
--model="moonshotai/kimi-k2.5" \
|
|
||||||
--verbose \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--num_workers=50 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--resume \
|
|
||||||
--ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.
|
|
||||||
|
|
||||||
IMPORTANT GUIDELINES:
|
|
||||||
|
|
||||||
1. SEARCHING: Do NOT try to search directly on Google or other search engines via the browser - they block automated searches. Instead, ALWAYS use the web_search tool first to find URLs for any pages you need to visit, then use browser tools to navigate to those URLs.
|
|
||||||
|
|
||||||
2. COOKIE/PRIVACY DIALOGS: After navigating to a page, ALWAYS check if there are cookie consent dialogs, privacy popups, or overlay modals blocking the page. These appear in snapshots as 'dialog' elements with buttons like 'Close', 'Accept', 'Accept All', 'Decline', 'I Agree', 'Got it', 'OK', or 'X'. You MUST dismiss these dialogs FIRST by clicking the appropriate button before trying to interact with other page elements. After dismissing a dialog, take a fresh browser_snapshot to get updated element references.
|
|
||||||
|
|
||||||
3. HANDLING TIMEOUTS: If an action times out, it often means the element is blocked by an overlay or the page state has changed. Take a new snapshot to see the current page state and look for any dialogs or popups that need to be dismissed. If there is no dialog box to bypass, then try a new method or report the error to the user and complete the task.
|
|
||||||
|
|
||||||
4. GENERAL: Use browser tools to click elements, fill forms, extract information, and perform web-based tasks. If terminal is available, use it for any local file operations or computations needed to support your web tasks. Be thorough in verifying your actions and handle any errors gracefully by retrying or trying alternative approaches." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
|
|
||||||
# --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate a timestamp for the log file
|
|
||||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
|
||||||
LOG_FILE="logs/imagen_eval_gpt5_${TIMESTAMP}.log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/hermes-agent-imagen-data/hermes_agent_imagen_train_sft.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="imagen_train_sft_glm4.7" \
|
|
||||||
--distribution="image_gen" \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=50 \
|
|
||||||
--max_turns=25 \
|
|
||||||
--ephemeral_system_prompt="When generating an image for the user view the image by using the vision_analyze tool to ensure it is what the user wanted. If it isn't feel free to retry a few times. If none are perfect, choose the best option that is the closest match, and explain its imperfections. If the image generation tool fails, try again a few times. If the vision analyze tool fails, provide the image to the user and explain it is your best effort attempt." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
# --verbose \
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/glm4.7-thinking-sft1_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_sft_2.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="megascience_glm4.7-thinking-sft2" \
|
|
||||||
--distribution="science" \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=15 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain focused context." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
|
|
||||||
# --verbose \
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/glm4.7-thinking-sft1-10k_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/hermes-agent-megascience-data/hermes_agent_megascience_sft_train_1_10k.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="megascience_glm4.7-thinking-sft1" \
|
|
||||||
--distribution="science" \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=50 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--resume \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used for furthering results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain a focused context." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
|
|
||||||
# --verbose \
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/glm4.7-terminal-tasks_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/raw_tasks_prompts.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="terminal-tasks-glm4.7-thinking" \
|
|
||||||
--distribution="default" \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=50 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
|
|
||||||
# --verbose \
|
|
||||||
# --resume \
|
|
||||||
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
python batch_runner.py \
|
|
||||||
--dataset_file="hermes-agent-megascience-data/hermes_agent_megascience_eval.jsonl" \
|
|
||||||
--batch_size=10 \
|
|
||||||
--run_name="megascience_eval_gpt5_2" \
|
|
||||||
--distribution="science" \
|
|
||||||
--model="gpt-5" \
|
|
||||||
--base_url="https://api.openai.com/v1" \
|
|
||||||
--api_key="${OPENAI_API_KEY}" \
|
|
||||||
--num_workers=5 \
|
|
||||||
--max_turns=30 \
|
|
||||||
--verbose \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use a tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should not be confident in your own reasoning, knowledge, or calculations without using a tool to verify or validate your work."
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
|
|
||||||
--batch_size=50 \
|
|
||||||
--run_name="megascience_sft_minimax-m2.1-thinking-2-eval" \
|
|
||||||
--distribution="science" \
|
|
||||||
--model="minimax/minimax-m2.1" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="minimax" \
|
|
||||||
--num_workers=1 \
|
|
||||||
--max_turns=40 \
|
|
||||||
--verbose \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12."
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/glm4.7-terminal-tasks-newterm_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
|
|
||||||
--batch_size=1 \
|
|
||||||
--run_name="terminal-tasks-test-newterm" \
|
|
||||||
--distribution="terminal_only" \
|
|
||||||
--verbose \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=5 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
|
|
||||||
# --verbose \
|
|
||||||
# --resume \
|
|
||||||
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Terminal-only evaluation run using Modal sandboxes
|
|
||||||
# Uses 10 sample tasks from nous-terminal-tasks
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/terminal_eval_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
echo "🔧 Using Modal sandboxes (TERMINAL_ENV=modal)"
|
|
||||||
|
|
||||||
# Set terminal to use Modal
|
|
||||||
export TERMINAL_ENV=modal
|
|
||||||
export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
|
||||||
export TERMINAL_TIMEOUT=300
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="nous-terminal-tasks_eval.jsonl" \
|
|
||||||
--batch_size=5 \
|
|
||||||
--run_name="terminal_eval" \
|
|
||||||
--distribution="terminal_only" \
|
|
||||||
--model="z-ai/glm-4.7" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
|
|
||||||
--num_workers=2 \
|
|
||||||
--max_turns=30 \
|
|
||||||
--ephemeral_system_prompt="You have access to a terminal tool for executing commands. Use it to complete the task. Install any packages you need with apt-get or pip (use --break-system-packages if needed). Do not use interactive tools (vim, nano, python repl). If git output is large, pipe to cat." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Mixed browser+terminal data generation run
|
|
||||||
# Uses mixed-browser-terminal-tasks.jsonl (200 tasks)
|
|
||||||
# Distribution: browser 92%, terminal 92%, web 35%, vision 15%, image_gen 15%
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/mixed_tasks_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
echo "🔀 Running mixed browser+terminal tasks with mixed_tasks distribution"
|
|
||||||
|
|
||||||
# Set terminal environment
|
|
||||||
# SIF images are automatically built/cached by terminal_tool.py
|
|
||||||
export TERMINAL_ENV=singularity
|
|
||||||
export TERMINAL_SINGULARITY_IMAGE="docker://nikolaik/python-nodejs:python3.11-nodejs20"
|
|
||||||
export TERMINAL_TIMEOUT=300
|
|
||||||
|
|
||||||
# Set up Apptainer cache directories (use /scratch if available, otherwise /tmp)
|
|
||||||
if [ -d "/scratch" ] && [ -w "/scratch" ]; then
|
|
||||||
CACHE_BASE="/scratch/$USER/.apptainer"
|
|
||||||
else
|
|
||||||
CACHE_BASE="/tmp/$USER/.apptainer"
|
|
||||||
fi
|
|
||||||
export APPTAINER_CACHEDIR="$CACHE_BASE"
|
|
||||||
export APPTAINER_TMPDIR="$CACHE_BASE/tmp"
|
|
||||||
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
|
|
||||||
|
|
||||||
echo "📁 Apptainer cache: $APPTAINER_CACHEDIR"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="mixed-browser-terminal-tasks.jsonl" \
|
|
||||||
--batch_size=20 \
|
|
||||||
--run_name="mixed_tasks" \
|
|
||||||
--distribution="mixed_tasks" \
|
|
||||||
--model="moonshotai/kimi-k2.5" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--num_workers=25 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--ephemeral_system_prompt="You are an AI assistant capable of both browser automation and terminal operations. Use browser tools to navigate websites, interact with web pages, fill forms, and extract information. Use terminal tools to execute commands, write and run code, install packages (use --break-system-packages with pip if needed), and perform local computations. When web search is available, use it to find URLs, documentation, or current information. If vision is available, use it to analyze images or screenshots. If image generation is available, use it when the task requires creating images. Combine browser and terminal capabilities effectively - for example, you might use the browser to fetch data from a website and terminal to process or analyze it. Always verify your work and handle errors gracefully. Whenever you can do something in a terminal instead of a web browser, you should choose to do so, as it's much cheaper." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Terminal-focused data generation run
|
|
||||||
# Uses nous-terminal-tasks.jsonl (597 tasks)
|
|
||||||
# Distribution: terminal 97%, web 15%, browser 0%, vision 8%, image_gen 3%
|
|
||||||
|
|
||||||
# Create logs directory if it doesn't exist
|
|
||||||
mkdir -p logs
|
|
||||||
|
|
||||||
# Generate log filename with timestamp
|
|
||||||
LOG_FILE="logs/terminal_tasks_$(date +%Y%m%d_%H%M%S).log"
|
|
||||||
|
|
||||||
echo "📝 Logging output to: $LOG_FILE"
|
|
||||||
echo "💻 Running terminal-focused tasks with terminal_tasks distribution"
|
|
||||||
|
|
||||||
# Set terminal environment
|
|
||||||
# SIF images are automatically built/cached by terminal_tool.py
|
|
||||||
export TERMINAL_ENV=singularity
|
|
||||||
export TERMINAL_SINGULARITY_IMAGE="docker://nikolaik/python-nodejs:python3.11-nodejs20"
|
|
||||||
export TERMINAL_TIMEOUT=300
|
|
||||||
|
|
||||||
# Set up Apptainer cache directories (use /scratch if available, otherwise /tmp)
|
|
||||||
if [ -d "/scratch" ] && [ -w "/scratch" ]; then
|
|
||||||
CACHE_BASE="/scratch/$USER/.apptainer"
|
|
||||||
else
|
|
||||||
CACHE_BASE="/tmp/$USER/.apptainer"
|
|
||||||
fi
|
|
||||||
export APPTAINER_CACHEDIR="$CACHE_BASE"
|
|
||||||
export APPTAINER_TMPDIR="$CACHE_BASE/tmp"
|
|
||||||
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
|
|
||||||
|
|
||||||
echo "📁 Apptainer cache: $APPTAINER_CACHEDIR"
|
|
||||||
echo "🐳 Image: $TERMINAL_SINGULARITY_IMAGE (auto-converted to SIF on first use)"
|
|
||||||
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file="nous-terminal-tasks.jsonl" \
|
|
||||||
--batch_size=5 \
|
|
||||||
--run_name="terminal_tasks-kimi-k2.5" \
|
|
||||||
--distribution="terminal_tasks" \
|
|
||||||
--model="moonshotai/kimi-k2.5" \
|
|
||||||
--verbose \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--num_workers=80 \
|
|
||||||
--max_turns=60 \
|
|
||||||
--providers_ignored="Novita" \
|
|
||||||
--resume \
|
|
||||||
--ephemeral_system_prompt="You have access to a terminal tool for executing commands and completing coding, system administration, and computing tasks. Use the terminal to write code, run scripts, install packages (use --break-system-packages with pip if needed), manipulate files, and verify your work. Always test and validate code you create. Do not use interactive tools like vim, nano, or python REPL. If git output is large, pipe to cat. When web search is available, use it to look up documentation, APIs, or best practices. If browser tools are available, use them for web interactions that require page manipulation. Do not use the terminal to communicate with the user - only your final response will be shown to them." \
|
|
||||||
2>&1 | tee "$LOG_FILE"
|
|
||||||
|
|
||||||
echo "✅ Log saved to: $LOG_FILE"
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Check if a prompt argument was provided
|
|
||||||
if [ $# -eq 0 ]; then
|
|
||||||
echo "Error: Please provide a prompt as an argument"
|
|
||||||
echo "Usage: $0 \"your prompt here\""
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Get the prompt from the first argument
|
|
||||||
PROMPT="$1"
|
|
||||||
|
|
||||||
# Set debug mode for web tools
|
|
||||||
export WEB_TOOLS_DEBUG=true
|
|
||||||
|
|
||||||
# Run the agent with the provided prompt
|
|
||||||
python run_agent.py \
|
|
||||||
--query "$PROMPT" \
|
|
||||||
--max_turns 30 \
|
|
||||||
--model claude-sonnet-4-5-20250929 \
|
|
||||||
--base_url https://api.anthropic.com/v1/ \
|
|
||||||
--api_key $ANTHROPIC_API_KEY \
|
|
||||||
--save_trajectories
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Test skills tool with Kimi K2.5
|
|
||||||
# Usage: ./configs/test_skills_kimi.sh "your query here"
|
|
||||||
# Example: ./configs/test_skills_kimi.sh "List available skills and show me the vllm skill"
|
|
||||||
|
|
||||||
# Default query if none provided
|
|
||||||
QUERY="${1:-List all available skills. Then show me the axolotl skill and view one of its reference files.}"
|
|
||||||
|
|
||||||
echo "🎯 Testing Skills Tool with Kimi K2.5"
|
|
||||||
echo "📝 Query: $QUERY"
|
|
||||||
echo "="
|
|
||||||
|
|
||||||
python run_agent.py \
|
|
||||||
--enabled_toolsets=skills \
|
|
||||||
--model="moonshotai/kimi-k2.5" \
|
|
||||||
--base_url="https://openrouter.ai/api/v1" \
|
|
||||||
--max_turns=10 \
|
|
||||||
--verbose \
|
|
||||||
--save_sample \
|
|
||||||
--query="$QUERY"
|
|
||||||
35
cron/__init__.py
Normal file
35
cron/__init__.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""
|
||||||
|
Cron job scheduling system for Hermes Agent.
|
||||||
|
|
||||||
|
This module provides scheduled task execution, allowing the agent to:
|
||||||
|
- Run automated tasks on schedules (cron expressions, intervals, one-shot)
|
||||||
|
- Self-schedule reminders and follow-up tasks
|
||||||
|
- Execute tasks in isolated sessions (no prior context)
|
||||||
|
|
||||||
|
Cron jobs are executed automatically by the gateway daemon:
|
||||||
|
hermes gateway install # Install as system service (recommended)
|
||||||
|
hermes gateway # Or run in foreground
|
||||||
|
|
||||||
|
The gateway ticks the scheduler every 60 seconds. A file lock prevents
|
||||||
|
duplicate execution if multiple processes overlap.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from cron.jobs import (
|
||||||
|
create_job,
|
||||||
|
get_job,
|
||||||
|
list_jobs,
|
||||||
|
remove_job,
|
||||||
|
update_job,
|
||||||
|
JOBS_FILE,
|
||||||
|
)
|
||||||
|
from cron.scheduler import tick
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"create_job",
|
||||||
|
"get_job",
|
||||||
|
"list_jobs",
|
||||||
|
"remove_job",
|
||||||
|
"update_job",
|
||||||
|
"tick",
|
||||||
|
"JOBS_FILE",
|
||||||
|
]
|
||||||
410
cron/jobs.py
Normal file
410
cron/jobs.py
Normal file
@@ -0,0 +1,410 @@
|
|||||||
|
"""
|
||||||
|
Cron job storage and management.
|
||||||
|
|
||||||
|
Jobs are stored in ~/.hermes/cron/jobs.json
|
||||||
|
Output is saved to ~/.hermes/cron/output/{job_id}/{timestamp}.md
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Dict, List, Any
|
||||||
|
|
||||||
|
from hermes_time import now as _hermes_now
|
||||||
|
|
||||||
|
try:
|
||||||
|
from croniter import croniter
|
||||||
|
HAS_CRONITER = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_CRONITER = False
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Configuration
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Resolve the Hermes home directory, honoring the HERMES_HOME environment
# override. The scheduler resolves its lock file and .env from the same
# variable, so job storage must follow it or the two end up in different
# directories. Falls back to ~/.hermes when the variable is unset.
HERMES_DIR = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
# Root of all cron state.
CRON_DIR = HERMES_DIR / "cron"
# JSON document holding every job definition.
JOBS_FILE = CRON_DIR / "jobs.json"
# Per-job run outputs are written below this directory.
OUTPUT_DIR = CRON_DIR / "output"
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dirs():
    """Create the cron storage and output directories when they are missing."""
    for directory in (CRON_DIR, OUTPUT_DIR):
        # parents/exist_ok make this safe to call repeatedly.
        directory.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Schedule Parsing
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def parse_duration(s: str) -> int:
    """
    Convert a human-friendly duration into a whole number of minutes.

    The input is trimmed and lower-cased, then must consist of an integer
    followed by a minute, hour or day unit (short or long spelling).

    Examples:
        "30m" -> 30
        "2h"  -> 120
        "1d"  -> 1440

    Raises:
        ValueError: if the text is not a recognised duration.
    """
    normalized = s.strip().lower()
    parsed = re.match(r'^(\d+)\s*(m|min|mins|minute|minutes|h|hr|hrs|hour|hours|d|day|days)$', normalized)
    if parsed is None:
        raise ValueError(f"Invalid duration: '{normalized}'. Use format like '30m', '2h', or '1d'")

    amount, unit_word = parsed.groups()
    # Every accepted spelling starts with m, h or d, so the first letter
    # is enough to select the multiplier.
    minutes_per_unit = {'m': 1, 'h': 60, 'd': 1440}
    return int(amount) * minutes_per_unit[unit_word[0]]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_schedule(schedule: str) -> Dict[str, Any]:
    """
    Parse schedule string into structured format.

    Returns dict with:
        - kind: "once" | "interval" | "cron"
        - For "once": "run_at" (ISO timestamp)
        - For "interval": "minutes" (int)
        - For "cron": "expr" (cron expression)
        - display: human-readable rendering of the schedule

    Forms are tried in this order: "every X" interval, cron expression,
    plain duration, ISO timestamp.

    Examples:
        "30m"              -> once in 30 minutes
        "2h"               -> once in 2 hours
        "every 30m"        -> recurring every 30 minutes
        "every 2h"         -> recurring every 2 hours
        "0 9 * * *"        -> cron expression
        "2026-02-03T14:00" -> once at timestamp

    Raises:
        ValueError: if the string matches none of the supported forms, a
            cron expression is given without croniter installed, or a
            cron/timestamp value fails validation.
    """
    schedule = schedule.strip()
    original = schedule
    schedule_lower = schedule.lower()

    # "every X" pattern -> recurring interval
    if schedule_lower.startswith("every "):
        duration_str = schedule[6:].strip()
        minutes = parse_duration(duration_str)
        return {
            "kind": "interval",
            "minutes": minutes,
            "display": f"every {minutes}m"
        }

    # Check for cron expression (5 or 6 space-separated fields)
    # Cron fields: minute hour day month weekday [year]
    parts = schedule.split()
    if len(parts) >= 5 and all(
        re.match(r'^[\d\*\-,/]+$', p) for p in parts[:5]
    ):
        if not HAS_CRONITER:
            raise ValueError("Cron expressions require 'croniter' package. Install with: pip install croniter")
        # Validate cron expression
        try:
            croniter(schedule)
        except Exception as e:
            raise ValueError(f"Invalid cron expression '{schedule}': {e}") from e
        return {
            "kind": "cron",
            "expr": schedule,
            "display": schedule
        }

    # Duration like "30m", "2h", "1d" -> one-shot from now.
    # This must run before the timestamp heuristic below: parse_duration is
    # case-insensitive, and an upper-case unit such as "30 MINUTES" contains
    # a 'T' that would otherwise be misread as an ISO timestamp and rejected.
    try:
        minutes = parse_duration(schedule)
    except ValueError:
        minutes = None
    if minutes is not None:
        run_at = _hermes_now() + timedelta(minutes=minutes)
        return {
            "kind": "once",
            "run_at": run_at.isoformat(),
            "display": f"once in {original}"
        }

    # ISO timestamp (contains T or looks like date)
    if 'T' in schedule or re.match(r'^\d{4}-\d{2}-\d{2}', schedule):
        try:
            # Parse and validate; a trailing "Z" is rewritten to an explicit
            # UTC offset before calling fromisoformat().
            dt = datetime.fromisoformat(schedule.replace('Z', '+00:00'))
            return {
                "kind": "once",
                "run_at": dt.isoformat(),
                "display": f"once at {dt.strftime('%Y-%m-%d %H:%M')}"
            }
        except ValueError as e:
            raise ValueError(f"Invalid timestamp '{schedule}': {e}") from e

    raise ValueError(
        f"Invalid schedule '{original}'. Use:\n"
        f"  - Duration: '30m', '2h', '1d' (one-shot)\n"
        f"  - Interval: 'every 30m', 'every 2h' (recurring)\n"
        f"  - Cron: '0 9 * * *' (cron expression)\n"
        f"  - Timestamp: '2026-02-03T14:00:00' (one-shot at time)"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_aware(dt: datetime) -> datetime:
    """Return *dt* with a timezone attached.

    Already-aware datetimes are returned untouched. A naive datetime (as
    stored before timezone support existed) is stamped with the tzinfo of
    the current Hermes clock, i.e. it is assumed to have been recorded in
    the currently configured timezone, so later comparisons do not fail.
    """
    if dt.tzinfo is not None:
        return dt
    return dt.replace(tzinfo=_hermes_now().tzinfo)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None) -> Optional[str]:
    """
    Work out when a schedule should fire next.

    Args:
        schedule: Parsed schedule dict (see parse_schedule).
        last_run_at: ISO timestamp of the previous run, if any. Only used
            by interval schedules.

    Returns:
        ISO timestamp string of the next run, or None when the schedule
        has no further runs (or cannot be evaluated).
    """
    now = _hermes_now()
    kind = schedule["kind"]

    if kind == "once":
        # A one-shot only has a "next run" while its time is still ahead.
        run_at = _ensure_aware(datetime.fromisoformat(schedule["run_at"]))
        if run_at > now:
            return schedule["run_at"]
        return None

    if kind == "interval":
        minutes = schedule["minutes"]
        # Anchor on the previous run when there is one, otherwise on "now".
        anchor = _ensure_aware(datetime.fromisoformat(last_run_at)) if last_run_at else now
        return (anchor + timedelta(minutes=minutes)).isoformat()

    if kind == "cron":
        if not HAS_CRONITER:
            return None
        return croniter(schedule["expr"], now).get_next(datetime).isoformat()

    return None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Job CRUD Operations
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def load_jobs() -> List[Dict[str, Any]]:
    """Load all jobs from storage.

    Returns:
        The list stored under the "jobs" key of the jobs file. An empty
        list is returned when the file does not exist, cannot be read or
        parsed, or does not have the expected shape (a JSON object whose
        "jobs" value is a list).
    """
    ensure_dirs()
    if not JOBS_FILE.exists():
        return []

    try:
        with open(JOBS_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError):
        return []

    # json.load happily returns None, a list, a number, ... for valid JSON
    # that is not an object; calling .get() on those would raise
    # AttributeError instead of degrading like the other failure modes.
    if not isinstance(data, dict):
        return []

    jobs = data.get("jobs", [])
    # Callers iterate and append to the result, so never hand back a
    # non-list (e.g. "jobs": null).
    return jobs if isinstance(jobs, list) else []
|
||||||
|
|
||||||
|
|
||||||
|
def save_jobs(jobs: List[Dict[str, Any]]):
    """Persist the full job list, replacing the jobs file in one step.

    The document is written to a temporary file in the same directory,
    flushed and fsynced, and then moved over the jobs file with
    os.replace. On any failure the temporary file is removed and the
    original exception is re-raised.
    """
    ensure_dirs()
    handle, scratch_path = tempfile.mkstemp(dir=str(JOBS_FILE.parent), suffix='.tmp', prefix='.jobs_')
    try:
        with os.fdopen(handle, 'w', encoding='utf-8') as out:
            document = {"jobs": jobs, "updated_at": _hermes_now().isoformat()}
            json.dump(document, out, indent=2)
            # Push the data to disk before the rename makes it visible.
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch_path, JOBS_FILE)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original
        # error is what the caller needs to see.
        try:
            os.unlink(scratch_path)
        except OSError:
            pass
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def create_job(
    prompt: str,
    schedule: str,
    name: Optional[str] = None,
    repeat: Optional[int] = None,
    deliver: Optional[str] = None,
    origin: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Build a new cron job record, append it to storage and return it.

    Args:
        prompt: The prompt to run (must be self-contained)
        schedule: Schedule string (see parse_schedule)
        name: Optional friendly name; defaults to the first 50 characters
            of the prompt, stripped
        repeat: How many times to run (None = forever, 1 = once)
        deliver: Where to deliver output ("origin", "local", "telegram", etc.)
        origin: Source info where job was created (for "origin" delivery)

    Returns:
        The created job dict

    Raises:
        ValueError: propagated from parse_schedule for an invalid schedule.
    """
    parsed_schedule = parse_schedule(schedule)

    # A one-shot schedule with no explicit repeat count runs exactly once.
    if repeat is None and parsed_schedule["kind"] == "once":
        repeat = 1

    # With no explicit target, reply to where the job came from if known,
    # otherwise keep the output local.
    if deliver is None:
        if origin:
            deliver = "origin"
        else:
            deliver = "local"

    job_id = uuid.uuid4().hex[:12]
    created_at = _hermes_now().isoformat()
    display_name = name or prompt[:50].strip()
    repeat_state = {
        "times": repeat,  # None = forever
        "completed": 0
    }

    job = {
        "id": job_id,
        "name": display_name,
        "prompt": prompt,
        "schedule": parsed_schedule,
        "schedule_display": parsed_schedule.get("display", schedule),
        "repeat": repeat_state,
        "enabled": True,
        "created_at": created_at,
        "next_run_at": compute_next_run(parsed_schedule),
        "last_run_at": None,
        "last_status": None,
        "last_error": None,
        # Delivery configuration
        "deliver": deliver,
        "origin": origin,  # Tracks where job was created for "origin" delivery
    }

    stored = load_jobs()
    stored.append(job)
    save_jobs(stored)

    return job
|
||||||
|
|
||||||
|
|
||||||
|
def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Return the first stored job whose "id" equals *job_id*, or None."""
    return next((job for job in load_jobs() if job["id"] == job_id), None)
|
||||||
|
|
||||||
|
|
||||||
|
def list_jobs(include_disabled: bool = False) -> List[Dict[str, Any]]:
    """Return stored jobs; disabled ones are filtered out unless requested.

    A job without an "enabled" key counts as enabled.
    """
    jobs = load_jobs()
    if include_disabled:
        return jobs
    return [job for job in jobs if job.get("enabled", True)]
|
||||||
|
|
||||||
|
|
||||||
|
def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Shallow-merge *updates* into the job with the given ID and persist it.

    Returns:
        The merged job dict, or None when no job has that ID (in which
        case nothing is written).
    """
    jobs = load_jobs()
    position = next((idx for idx, job in enumerate(jobs) if job["id"] == job_id), None)
    if position is None:
        return None

    # Keys in `updates` win over the stored values.
    merged = {**jobs[position], **updates}
    jobs[position] = merged
    save_jobs(jobs)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def remove_job(job_id: str) -> bool:
    """Delete every stored job with the given ID.

    Returns:
        True if at least one job was removed (storage is rewritten),
        False if no job matched (storage is left untouched).
    """
    jobs = load_jobs()
    remaining = [job for job in jobs if job["id"] != job_id]
    if len(remaining) == len(jobs):
        return False
    save_jobs(remaining)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    """
    Record the outcome of a job run and reschedule or retire the job.

    For the matching job this sets last_run_at / last_status / last_error,
    bumps the repeat "completed" counter, removes the job once its repeat
    limit is reached, and otherwise recomputes next_run_at (disabling the
    job when there is no further run). The job list is saved in every
    case, including when no job matches *job_id*.

    Args:
        job_id: ID of the job that ran.
        success: Whether the run succeeded.
        error: Error text; stored only when the run failed.
    """
    jobs = load_jobs()
    for index, job in enumerate(jobs):
        if job["id"] != job_id:
            continue

        stamp = _hermes_now().isoformat()
        job["last_run_at"] = stamp
        job["last_status"] = "ok" if success else "error"
        job["last_error"] = None if success else error

        repeat = job.get("repeat")
        if repeat:
            # Count this run, then retire the job if the quota is used up.
            repeat["completed"] = repeat.get("completed", 0) + 1
            limit = repeat.get("times")
            if limit is not None and repeat["completed"] >= limit:
                jobs.pop(index)
                save_jobs(jobs)
                return

        job["next_run_at"] = compute_next_run(job["schedule"], stamp)
        # No next run (e.g. a finished one-shot) means the job is done.
        if job["next_run_at"] is None:
            job["enabled"] = False

        save_jobs(jobs)
        return

    save_jobs(jobs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_due_jobs() -> List[Dict[str, Any]]:
    """Return the enabled jobs whose next_run_at is at or before the current time.

    Jobs that are disabled or have no next_run_at are ignored. Naive
    stored timestamps are made timezone-aware before comparing.
    """
    now = _hermes_now()

    def _is_due(job: Dict[str, Any]) -> bool:
        if not job.get("enabled", True):
            return False
        scheduled = job.get("next_run_at")
        if not scheduled:
            return False
        return _ensure_aware(datetime.fromisoformat(scheduled)) <= now

    return [job for job in load_jobs() if _is_due(job)]
|
||||||
|
|
||||||
|
|
||||||
|
def save_job_output(job_id: str, output: str):
    """Write one run's output to OUTPUT_DIR/<job_id>/<timestamp>.md.

    The file name is the current Hermes time formatted to the second.

    Returns:
        Path of the file that was written.
    """
    ensure_dirs()
    target_dir = OUTPUT_DIR / job_id
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = _hermes_now().strftime("%Y-%m-%d_%H-%M-%S")
    destination = target_dir / f"{stamp}.md"
    destination.write_text(output, encoding='utf-8')
    return destination
|
||||||
390
cron/scheduler.py
Normal file
390
cron/scheduler.py
Normal file
@@ -0,0 +1,390 @@
|
|||||||
|
"""
|
||||||
|
Cron job scheduler - executes due jobs.
|
||||||
|
|
||||||
|
Provides tick() which checks for due jobs and runs them. The gateway
|
||||||
|
calls this every 60 seconds from a background thread.
|
||||||
|
|
||||||
|
Uses a file-based lock (~/.hermes/cron/.tick.lock) so only one tick
|
||||||
|
runs at a time if multiple processes overlap.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
# fcntl is Unix-only; on Windows use msvcrt for file locking
|
||||||
|
try:
|
||||||
|
import fcntl
|
||||||
|
except ImportError:
|
||||||
|
fcntl = None
|
||||||
|
try:
|
||||||
|
import msvcrt
|
||||||
|
except ImportError:
|
||||||
|
msvcrt = None
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from hermes_time import now as _hermes_now
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from cron.jobs import get_due_jobs, mark_job_run, save_job_output
|
||||||
|
|
||||||
|
# Resolve Hermes home directory (respects HERMES_HOME override; defaults to
# ~/.hermes when the environment variable is not set)
_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))

# File-based lock prevents concurrent ticks from gateway + daemon + systemd timer.
# The lock file lives in the "cron" directory under the Hermes home.
_LOCK_DIR = _hermes_home / "cron"
_LOCK_FILE = _LOCK_DIR / ".tick.lock"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_origin(job: dict) -> Optional[dict]:
    """Return the job's origin dict when it is usable for delivery, else None.

    An origin is usable only if it is present and carries both a truthy
    "platform" and a truthy "chat_id"; it is returned unchanged (so it may
    also carry extra keys such as "chat_name").
    """
    origin = job.get("origin")
    if origin and origin.get("platform") and origin.get("chat_id"):
        return origin
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _deliver_result(job: dict, content: str) -> None:
    """
    Deliver job output to the configured target (origin chat, specific platform, etc.).

    Uses the standalone platform send functions from send_message_tool so delivery
    works whether or not the gateway is running.

    The job's "deliver" value selects the target:
        - "local" (or missing): nothing is sent.
        - "origin": the platform/chat stored in the job's origin.
        - "<platform>:<chat_id>": that explicit chat.
        - "<platform>": the origin chat if it is on that platform, otherwise
          the <PLATFORM>_HOME_CHANNEL environment variable.

    Every failure (unknown or disabled platform, config load error, send
    error) is logged and swallowed; this function does not raise for them.

    Args:
        job: Job dict; reads "id", "deliver" and "origin".
        content: Text to send.
    """
    deliver = job.get("deliver", "local")
    if deliver == "local":
        return

    origin = _resolve_origin(job)

    # Resolve target platform + chat_id
    if deliver == "origin":
        if not origin:
            logger.warning("Job '%s' deliver=origin but no origin stored, skipping delivery", job["id"])
            return
        platform_name = origin["platform"]
        chat_id = origin["chat_id"]
    elif ":" in deliver:
        platform_name, chat_id = deliver.split(":", 1)
    else:
        # Bare platform name like "telegram" — need to resolve to origin or home channel
        platform_name = deliver
        if origin and origin.get("platform") == platform_name:
            chat_id = origin["chat_id"]
        else:
            # Fall back to home channel
            chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
            if not chat_id:
                logger.warning("Job '%s' deliver=%s but no chat_id or home channel. Set via: hermes config set %s_HOME_CHANNEL <channel_id>", job["id"], deliver, platform_name.upper())
                return

    from tools.send_message_tool import _send_to_platform
    from gateway.config import load_gateway_config, Platform

    platform_map = {
        "telegram": Platform.TELEGRAM,
        "discord": Platform.DISCORD,
        "slack": Platform.SLACK,
        "whatsapp": Platform.WHATSAPP,
        "signal": Platform.SIGNAL,
    }
    platform = platform_map.get(platform_name.lower())
    if not platform:
        logger.warning("Job '%s': unknown platform '%s' for delivery", job["id"], platform_name)
        return

    try:
        config = load_gateway_config()
    except Exception as e:
        logger.error("Job '%s': failed to load gateway config for delivery: %s", job["id"], e)
        return

    pconfig = config.platforms.get(platform)
    if not pconfig or not pconfig.enabled:
        logger.warning("Job '%s': platform '%s' not configured/enabled", job["id"], platform_name)
        return

    def _send_blocking():
        # The coroutine is created inside the thread that runs it, so it is
        # never left un-awaited.
        return asyncio.run(_send_to_platform(platform, pconfig, chat_id, content))

    # asyncio.run() refuses to start inside a thread that already has a
    # running loop. Detect that up front instead of catching RuntimeError
    # from asyncio.run(): a RuntimeError raised by the send itself must not
    # be mistaken for "loop already running" and trigger a second send.
    try:
        asyncio.get_running_loop()
        loop_running = True
    except RuntimeError:
        loop_running = False

    try:
        if loop_running:
            # Run the send on a fresh loop in a helper thread.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                result = pool.submit(_send_blocking).result(timeout=30)
        else:
            result = _send_blocking()
    except Exception as e:
        logger.error("Job '%s': delivery to %s:%s failed: %s", job["id"], platform_name, chat_id, e)
        return

    if result and result.get("error"):
        logger.error("Job '%s': delivery error: %s", job["id"], result["error"])
    else:
        logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
        # Mirror the delivered content into the target's gateway session.
        # Mirroring is best-effort: a failure must not undo a successful
        # delivery, but it is recorded for debugging.
        try:
            from gateway.mirror import mirror_to_session
            mirror_to_session(platform_name, chat_id, content, source_label="cron")
        except Exception:
            logger.debug("Job '%s': mirroring to gateway session failed", job["id"], exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
    """
    Execute a single cron job.

    Builds a fresh AIAgent from the current .env / config.yaml, runs the
    job's prompt through it, and renders a markdown report of the run.
    Any exception raised while configuring or running the agent is caught
    and reported through the return value rather than propagated.

    Args:
        job: Job dict; reads "id", "name", "prompt", "origin" and
            "schedule_display".

    Returns:
        Tuple of (success, full_output_doc, final_response, error_message).
        On success error_message is None; on failure final_response is ""
        and error_message is "<ExceptionType>: <message>".
    """
    from run_agent import AIAgent

    job_id = job["id"]
    job_name = job["name"]
    prompt = job["prompt"]
    origin = _resolve_origin(job)

    logger.info("Running job '%s' (ID: %s)", job_name, job_id)
    # Only the first 100 characters of the prompt are logged.
    logger.info("Prompt: %s", prompt[:100])

    # Inject origin context so the agent's send_message tool knows the chat
    if origin:
        os.environ["HERMES_SESSION_PLATFORM"] = origin["platform"]
        os.environ["HERMES_SESSION_CHAT_ID"] = str(origin["chat_id"])
        if origin.get("chat_name"):
            os.environ["HERMES_SESSION_CHAT_NAME"] = origin["chat_name"]

    try:
        # Re-read .env and config.yaml fresh every run so provider/key
        # changes take effect without a gateway restart.
        from dotenv import load_dotenv
        try:
            load_dotenv(str(_hermes_home / ".env"), override=True, encoding="utf-8")
        except UnicodeDecodeError:
            # Retry with a permissive single-byte encoding if the file is not valid UTF-8.
            load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1")

        # Environment-provided model, with a hard-coded fallback; config.yaml
        # (below) takes precedence over this value when it names a model.
        model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6"

        # Load config.yaml for model, reasoning, prefill, toolsets, provider routing
        _cfg = {}
        try:
            import yaml
            _cfg_path = str(_hermes_home / "config.yaml")
            if os.path.exists(_cfg_path):
                with open(_cfg_path) as _f:
                    _cfg = yaml.safe_load(_f) or {}
                # "model" may be a plain string or a mapping with a "default" key.
                _model_cfg = _cfg.get("model", {})
                if isinstance(_model_cfg, str):
                    model = _model_cfg
                elif isinstance(_model_cfg, dict):
                    model = _model_cfg.get("default", model)
        except Exception:
            # Config problems are ignored here; the values gathered so far are used.
            pass

        # Reasoning config from env or config.yaml
        reasoning_config = None
        effort = os.getenv("HERMES_REASONING_EFFORT", "")
        if not effort:
            effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
        if effort and effort.lower() != "none":
            valid = ("xhigh", "high", "medium", "low", "minimal")
            # Unrecognised effort values leave reasoning_config as None.
            if effort.lower() in valid:
                reasoning_config = {"enabled": True, "effort": effort.lower()}
        elif effort.lower() == "none":
            reasoning_config = {"enabled": False}

        # Prefill messages from env or config.yaml
        prefill_messages = None
        prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "")
        if prefill_file:
            import json as _json
            pfpath = Path(prefill_file).expanduser()
            # Relative paths are resolved against the Hermes home directory.
            if not pfpath.is_absolute():
                pfpath = _hermes_home / pfpath
            if pfpath.exists():
                try:
                    with open(pfpath, "r", encoding="utf-8") as _pf:
                        prefill_messages = _json.load(_pf)
                    # Only a JSON list is accepted as prefill content.
                    if not isinstance(prefill_messages, list):
                        prefill_messages = None
                except Exception:
                    prefill_messages = None

        # Max iterations
        max_iterations = _cfg.get("agent", {}).get("max_turns") or _cfg.get("max_turns") or 90

        # Provider routing
        pr = _cfg.get("provider_routing", {})

        from hermes_cli.runtime_provider import (
            resolve_runtime_provider,
            format_runtime_provider_error,
        )
        try:
            runtime = resolve_runtime_provider(
                requested=os.getenv("HERMES_INFERENCE_PROVIDER"),
            )
        except Exception as exc:
            # Re-raise with a formatted message; the outer handler turns it
            # into a failed-job report.
            message = format_runtime_provider_error(exc)
            raise RuntimeError(message) from exc

        agent = AIAgent(
            model=model,
            api_key=runtime.get("api_key"),
            base_url=runtime.get("base_url"),
            provider=runtime.get("provider"),
            api_mode=runtime.get("api_mode"),
            max_iterations=max_iterations,
            reasoning_config=reasoning_config,
            prefill_messages=prefill_messages,
            providers_allowed=pr.get("only"),
            providers_ignored=pr.get("ignore"),
            providers_order=pr.get("order"),
            provider_sort=pr.get("sort"),
            quiet_mode=True,
            # Session ID combines the job ID with the run's start time.
            session_id=f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}"
        )

        result = agent.run_conversation(prompt)

        final_response = result.get("final_response", "")
        if not final_response:
            final_response = "(No response generated)"

        # Markdown report of the successful run.
        output = f"""# Cron Job: {job_name}

**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}

## Prompt

{prompt}

## Response

{final_response}
"""

        logger.info("Job '%s' completed successfully", job_name)
        return True, output, final_response, None

    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)}"
        logger.error("Job '%s' failed: %s", job_name, error_msg)

        # Markdown report of the failed run, including the traceback.
        output = f"""# Cron Job: {job_name} (FAILED)

**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}

## Prompt

{prompt}

## Error

```
{error_msg}

{traceback.format_exc()}
```
"""
        return False, output, "", error_msg

    finally:
        # Clean up injected env vars so they don't leak to other jobs
        for key in ("HERMES_SESSION_PLATFORM", "HERMES_SESSION_CHAT_ID", "HERMES_SESSION_CHAT_NAME"):
            os.environ.pop(key, None)
|
||||||
|
|
||||||
|
|
||||||
|
def tick(verbose: bool = True) -> int:
    """
    Check and run all due jobs.

    Uses a file lock so only one tick runs at a time, even if the gateway's
    in-process ticker and a standalone daemon or manual tick overlap.

    For each due job this runs it, saves the output document, delivers the
    final response (or a failure notice) to the configured target, and
    records the run via mark_job_run.

    Args:
        verbose: Whether to log status messages (jobs due, output paths)

    Returns:
        Number of jobs executed (0 if another tick is already running)
    """
    _LOCK_DIR.mkdir(parents=True, exist_ok=True)

    # Cross-platform file locking: fcntl on Unix, msvcrt on Windows
    lock_fd = None
    try:
        lock_fd = open(_LOCK_FILE, "w")
        if fcntl:
            # Exclusive, non-blocking: fails immediately if the lock is held.
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        elif msvcrt:
            msvcrt.locking(lock_fd.fileno(), msvcrt.LK_NBLCK, 1)
    except (OSError, IOError):
        # Any OS-level failure here (including failing to open the lock
        # file) is treated as "someone else is ticking".
        logger.debug("Tick skipped — another instance holds the lock")
        if lock_fd is not None:
            lock_fd.close()
        return 0

    try:
        due_jobs = get_due_jobs()

        if verbose and not due_jobs:
            logger.info("%s - No jobs due", _hermes_now().strftime('%H:%M:%S'))
            return 0

        if verbose:
            logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))

        executed = 0
        for job in due_jobs:
            try:
                success, output, final_response, error = run_job(job)

                output_file = save_job_output(job["id"], output)
                if verbose:
                    logger.info("Output saved to: %s", output_file)

                # Deliver the final response to the origin/target chat
                deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
                if deliver_content:
                    try:
                        _deliver_result(job, deliver_content)
                    except Exception as de:
                        # A delivery failure does not change the job's recorded status.
                        logger.error("Delivery failed for job %s: %s", job["id"], de)

                mark_job_run(job["id"], success, error)
                executed += 1

            except Exception as e:
                # Failures outside run_job's own handling (saving output,
                # bookkeeping) are recorded as a failed run; the job is not
                # counted in `executed`.
                logger.error("Error processing job %s: %s", job['id'], e)
                mark_job_run(job["id"], False, str(e))

        return executed
    finally:
        # Always release the lock and close the lock file, including on
        # the early "no jobs due" return above.
        if fcntl:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        elif msvcrt:
            try:
                msvcrt.locking(lock_fd.fileno(), msvcrt.LK_UNLCK, 1)
            except (OSError, IOError):
                pass
        lock_fd.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
tick(verbose=True)
|
||||||
5
datagen-config-examples/example_browser_tasks.jsonl
Normal file
5
datagen-config-examples/example_browser_tasks.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"prompt": "Go to https://news.ycombinator.com and find the top 5 posts on the front page. For each post, get the title, URL, points, and number of comments. Return the results as a formatted summary."}
|
||||||
|
{"prompt": "Navigate to https://en.wikipedia.org/wiki/Hermes and extract the first paragraph of the article, the image caption, and the list of items in the infobox. Summarize what you find."}
|
||||||
|
{"prompt": "Go to https://github.com/trending and find the top 3 trending repositories today. For each repo, get the name, description, language, and star count. Write the results to a file called trending_repos.md."}
|
||||||
|
{"prompt": "Visit https://httpbin.org/forms/post and fill out the form with sample data (customer name: Jane Doe, size: Medium, topping: Bacon, delivery time: 12:00). Submit the form and report what the response page shows."}
|
||||||
|
{"prompt": "Navigate to https://books.toscrape.com, browse to the Travel category, find the highest-rated book, and extract its title, price, availability, and description."}
|
||||||
65
datagen-config-examples/run_browser_tasks.sh
Executable file
65
datagen-config-examples/run_browser_tasks.sh
Executable file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Example: Browser-Focused Data Generation
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# Generates tool-calling trajectories for browser automation tasks.
|
||||||
|
# The agent navigates websites, fills forms, extracts information, etc.
|
||||||
|
#
|
||||||
|
# Distribution: browser 97%, web 20%, vision 12%, terminal 15%
|
||||||
|
#
|
||||||
|
# Prerequisites:
|
||||||
|
# - OPENROUTER_API_KEY in ~/.hermes/.env
|
||||||
|
# - BROWSERBASE_API_KEY in ~/.hermes/.env (for browser tools)
|
||||||
|
# - A dataset JSONL file with one {"prompt": "..."} per line
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# cd ~/.hermes/hermes-agent
|
||||||
|
# bash datagen-config-examples/run_browser_tasks.sh
|
||||||
|
#
|
||||||
|
# Output: data/browser_tasks_example/trajectories.jsonl
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
mkdir -p logs
|
||||||
|
|
||||||
|
LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
|
||||||
|
echo "📝 Logging to: $LOG_FILE"
|
||||||
|
|
||||||
|
# Point to the example dataset in this directory
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
|
||||||
|
python batch_runner.py \
|
||||||
|
--dataset_file="$SCRIPT_DIR/example_browser_tasks.jsonl" \
|
||||||
|
--batch_size=5 \
|
||||||
|
--run_name="browser_tasks_example" \
|
||||||
|
--distribution="browser_tasks" \
|
||||||
|
--model="anthropic/claude-sonnet-4" \
|
||||||
|
--base_url="https://openrouter.ai/api/v1" \
|
||||||
|
--num_workers=3 \
|
||||||
|
--max_turns=30 \
|
||||||
|
--ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.
|
||||||
|
|
||||||
|
IMPORTANT GUIDELINES:
|
||||||
|
|
||||||
|
1. SEARCHING: Do NOT search directly on Google via the browser — they block automated searches. Use the web_search tool first to find URLs, then navigate to them with browser tools.
|
||||||
|
|
||||||
|
2. COOKIE/PRIVACY DIALOGS: After navigating to a page, check for cookie consent or privacy popups. Dismiss them by clicking Accept/Close/OK before interacting with other elements. Take a fresh browser_snapshot afterward.
|
||||||
|
|
||||||
|
3. HANDLING TIMEOUTS: If an action times out, the element may be blocked by an overlay. Take a new snapshot and look for dialogs to dismiss. If none, try an alternative approach or report the issue.
|
||||||
|
|
||||||
|
4. GENERAL: Use browser tools to click, fill forms, and extract information. Use terminal for local file operations. Verify your actions and handle errors gracefully." \
|
||||||
|
2>&1 | tee "$LOG_FILE"
|
||||||
|
|
||||||
|
echo "✅ Done. Log: $LOG_FILE"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Common options you can add:
|
||||||
|
#
|
||||||
|
# --resume Resume from checkpoint if interrupted
|
||||||
|
# --verbose Enable detailed logging
|
||||||
|
# --max_tokens=63000 Set max response tokens
|
||||||
|
# --reasoning_disabled Disable model thinking/reasoning tokens
|
||||||
|
# --providers_allowed="anthropic,google" Restrict to specific providers
|
||||||
|
# --prefill_messages_file="configs/prefill.json" Few-shot priming
|
||||||
|
# =============================================================================
|
||||||
46
datagen-config-examples/web_research.yaml
Normal file
46
datagen-config-examples/web_research.yaml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# datagen-config-examples/web_research.yaml
|
||||||
|
#
|
||||||
|
# Batch data generation config for WebResearchEnv.
|
||||||
|
# Generates tool-calling trajectories for multi-step web research tasks.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python batch_runner.py \
|
||||||
|
# --config datagen-config-examples/web_research.yaml \
|
||||||
|
# --run_name web_research_v1
|
||||||
|
|
||||||
|
environment: web-research
|
||||||
|
|
||||||
|
# Toolsets available to the agent during data generation
|
||||||
|
toolsets:
|
||||||
|
- web
|
||||||
|
- file
|
||||||
|
|
||||||
|
# How many parallel workers to use
|
||||||
|
num_workers: 4
|
||||||
|
|
||||||
|
# Questions per batch
|
||||||
|
batch_size: 20
|
||||||
|
|
||||||
|
# Total trajectories to generate (comment out to run full dataset)
|
||||||
|
max_items: 500
|
||||||
|
|
||||||
|
# Model to use for generation (override with --model flag)
|
||||||
|
model: openrouter/nousresearch/hermes-3-llama-3.1-405b
|
||||||
|
|
||||||
|
# System prompt additions (ephemeral — not saved to trajectories)
|
||||||
|
ephemeral_system_prompt: |
|
||||||
|
You are a highly capable research agent. When asked a factual question,
|
||||||
|
always use web_search to find current, accurate information before answering.
|
||||||
|
Cite at least 2 sources. Be concise and accurate.
|
||||||
|
|
||||||
|
# Output directory
|
||||||
|
output_dir: data/web_research_v1
|
||||||
|
|
||||||
|
# Trajectory compression settings (for fitting into training token budgets)
|
||||||
|
compression:
|
||||||
|
enabled: true
|
||||||
|
target_max_tokens: 16000
|
||||||
|
|
||||||
|
# Eval settings
|
||||||
|
eval_every: 100 # Run eval every N trajectories
|
||||||
|
eval_size: 25 # Number of held-out questions per eval run
|
||||||
104
docs/agents.md
104
docs/agents.md
@@ -1,104 +0,0 @@
|
|||||||
# Agents
|
|
||||||
|
|
||||||
The agent is the core loop that orchestrates LLM calls and tool execution.
|
|
||||||
|
|
||||||
## AIAgent Class
|
|
||||||
|
|
||||||
The main agent is implemented in `run_agent.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class AIAgent:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
model: str = "anthropic/claude-sonnet-4",
|
|
||||||
api_key: str = None,
|
|
||||||
base_url: str = "https://openrouter.ai/api/v1",
|
|
||||||
max_turns: int = 20,
|
|
||||||
enabled_toolsets: list = None,
|
|
||||||
disabled_toolsets: list = None,
|
|
||||||
verbose_logging: bool = False,
|
|
||||||
):
|
|
||||||
# Initialize OpenAI client, load tools based on toolsets
|
|
||||||
...
|
|
||||||
|
|
||||||
def chat(self, user_message: str, task_id: str = None) -> str:
|
|
||||||
# Main entry point - runs the agent loop
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
## Agent Loop
|
|
||||||
|
|
||||||
The core loop in `_run_agent_loop()`:
|
|
||||||
|
|
||||||
```
|
|
||||||
1. Add user message to conversation
|
|
||||||
2. Call LLM with tools
|
|
||||||
3. If LLM returns tool calls:
|
|
||||||
- Execute each tool
|
|
||||||
- Add tool results to conversation
|
|
||||||
- Go to step 2
|
|
||||||
4. If LLM returns text response:
|
|
||||||
- Return response to user
|
|
||||||
```
|
|
||||||
|
|
||||||
```python
|
|
||||||
while turns < max_turns:
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model=model,
|
|
||||||
messages=messages,
|
|
||||||
tools=tool_schemas,
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.tool_calls:
|
|
||||||
for tool_call in response.tool_calls:
|
|
||||||
result = await execute_tool(tool_call)
|
|
||||||
messages.append(tool_result_message(result))
|
|
||||||
turns += 1
|
|
||||||
else:
|
|
||||||
return response.content
|
|
||||||
```
|
|
||||||
|
|
||||||
## Conversation Management
|
|
||||||
|
|
||||||
Messages are stored as a list of dicts following the OpenAI chat format:
|
|
||||||
|
|
||||||
```python
|
|
||||||
messages = [
|
|
||||||
{"role": "system", "content": "You are a helpful assistant..."},
|
|
||||||
{"role": "user", "content": "Search for Python tutorials"},
|
|
||||||
{"role": "assistant", "content": None, "tool_calls": [...]},
|
|
||||||
{"role": "tool", "tool_call_id": "...", "content": "..."},
|
|
||||||
{"role": "assistant", "content": "Here's what I found..."},
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
## Reasoning Context
|
|
||||||
|
|
||||||
For models that support reasoning (chain-of-thought), the agent:
|
|
||||||
1. Extracts `reasoning_content` from API responses
|
|
||||||
2. Stores it in `assistant_msg["reasoning"]` for trajectory export
|
|
||||||
3. Passes it back via `reasoning_content` field on subsequent turns
|
|
||||||
|
|
||||||
## Trajectory Export
|
|
||||||
|
|
||||||
Conversations can be exported for training:
|
|
||||||
|
|
||||||
```python
|
|
||||||
agent = AIAgent(save_trajectories=True)
|
|
||||||
agent.chat("Do something")
|
|
||||||
# Saves to trajectories/*.jsonl in ShareGPT format
|
|
||||||
```
|
|
||||||
|
|
||||||
## Batch Processing
|
|
||||||
|
|
||||||
For processing multiple prompts, use `batch_runner.py`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python batch_runner.py \
|
|
||||||
--dataset_file=prompts.jsonl \
|
|
||||||
--batch_size=20 \
|
|
||||||
--num_workers=4 \
|
|
||||||
--run_name=my_run
|
|
||||||
```
|
|
||||||
|
|
||||||
See `batch_runner.py` for parallel execution with checkpointing.
|
|
||||||
264
docs/cli.md
264
docs/cli.md
@@ -1,264 +0,0 @@
|
|||||||
# CLI
|
|
||||||
|
|
||||||
The Hermes Agent CLI provides an interactive terminal interface for working with the agent.
|
|
||||||
|
|
||||||
## Running the CLI
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Basic usage
|
|
||||||
./hermes
|
|
||||||
|
|
||||||
# With specific model
|
|
||||||
./hermes --model "anthropic/claude-sonnet-4"
|
|
||||||
|
|
||||||
# With specific toolsets
|
|
||||||
./hermes --toolsets "web,terminal,skills"
|
|
||||||
|
|
||||||
# Verbose mode
|
|
||||||
./hermes --verbose
|
|
||||||
```
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
The CLI is implemented in `cli.py` and uses:
|
|
||||||
|
|
||||||
- **Rich** - Welcome banner with ASCII art and styled panels
|
|
||||||
- **prompt_toolkit** - Fixed input area with command history
|
|
||||||
- **KawaiiSpinner** - Animated feedback during operations
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ HERMES-AGENT ASCII Logo │
|
|
||||||
│ ┌─────────────┐ ┌────────────────────────────┐ │
|
|
||||||
│ │ Caduceus │ │ Model: claude-opus-4.5 │ │
|
|
||||||
│ │ ASCII Art │ │ Terminal: local │ │
|
|
||||||
│ │ │ │ Working Dir: /home/user │ │
|
|
||||||
│ │ │ │ Available Tools: 19 │ │
|
|
||||||
│ │ │ │ Available Skills: 12 │ │
|
|
||||||
│ └─────────────┘ └────────────────────────────┘ │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
│ Conversation output scrolls here... │
|
|
||||||
│ │
|
|
||||||
│ User: Hello! │
|
|
||||||
│ ────────────────────────────────────────────── │
|
|
||||||
│ (◕‿◕✿) 🧠 pondering... (2.3s) │
|
|
||||||
│ ✧٩(ˊᗜˋ*)و✧ got it! (2.3s) │
|
|
||||||
│ │
|
|
||||||
│ Assistant: Hello! How can I help you today? │
|
|
||||||
├─────────────────────────────────────────────────┤
|
|
||||||
│ ❯ [Fixed input area at bottom] │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Commands
|
|
||||||
|
|
||||||
| Command | Description |
|
|
||||||
|---------|-------------|
|
|
||||||
| `/help` | Show available commands |
|
|
||||||
| `/tools` | List available tools grouped by toolset |
|
|
||||||
| `/toolsets` | List available toolsets with descriptions |
|
|
||||||
| `/model [name]` | Show or change the current model |
|
|
||||||
| `/prompt [text]` | View/set/clear custom system prompt |
|
|
||||||
| `/personality [name]` | Set a predefined personality |
|
|
||||||
| `/clear` | Clear screen and reset conversation |
|
|
||||||
| `/reset` | Reset conversation only (keep screen) |
|
|
||||||
| `/history` | Show conversation history |
|
|
||||||
| `/save` | Save current conversation to file |
|
|
||||||
| `/config` | Show current configuration |
|
|
||||||
| `/quit` | Exit the CLI (also: `/exit`, `/q`) |
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
The CLI is configured via `cli-config.yaml`. Copy from `cli-config.yaml.example`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cp cli-config.yaml.example cli-config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
### Model Configuration
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
model:
|
|
||||||
default: "anthropic/claude-opus-4.5"
|
|
||||||
base_url: "https://openrouter.ai/api/v1"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Terminal Configuration
|
|
||||||
|
|
||||||
The CLI supports multiple terminal backends:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# Local execution (default)
|
|
||||||
terminal:
|
|
||||||
env_type: "local"
|
|
||||||
cwd: "." # Current directory
|
|
||||||
|
|
||||||
# SSH remote execution (sandboxed - agent can't touch its own code)
|
|
||||||
terminal:
|
|
||||||
env_type: "ssh"
|
|
||||||
cwd: "/home/myuser/project"
|
|
||||||
ssh_host: "my-server.example.com"
|
|
||||||
ssh_user: "myuser"
|
|
||||||
ssh_key: "~/.ssh/id_rsa"
|
|
||||||
|
|
||||||
# Docker container
|
|
||||||
terminal:
|
|
||||||
env_type: "docker"
|
|
||||||
docker_image: "python:3.11"
|
|
||||||
|
|
||||||
# Singularity/Apptainer (HPC)
|
|
||||||
terminal:
|
|
||||||
env_type: "singularity"
|
|
||||||
singularity_image: "docker://python:3.11"
|
|
||||||
|
|
||||||
# Modal cloud
|
|
||||||
terminal:
|
|
||||||
env_type: "modal"
|
|
||||||
modal_image: "python:3.11"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Sudo Support
|
|
||||||
|
|
||||||
The CLI supports interactive sudo prompts:
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────────────────────────────────────────┐
|
|
||||||
│ 🔐 SUDO PASSWORD REQUIRED │
|
|
||||||
├──────────────────────────────────────────────────────────┤
|
|
||||||
│ Enter password below (input is hidden), or: │
|
|
||||||
│ • Press Enter to skip (command fails gracefully) │
|
|
||||||
│ • Wait 45s to auto-skip │
|
|
||||||
└──────────────────────────────────────────────────────────┘
|
|
||||||
|
|
||||||
Password (hidden):
|
|
||||||
```
|
|
||||||
|
|
||||||
**Options:**
|
|
||||||
- **Interactive**: Leave `sudo_password` unset - you'll be prompted when needed
|
|
||||||
- **Configured**: Set `sudo_password` in `cli-config.yaml` to auto-fill
|
|
||||||
- **Environment**: Set `SUDO_PASSWORD` in `.env` for all runs
|
|
||||||
|
|
||||||
Password is cached for the session once entered.
|
|
||||||
|
|
||||||
### Toolsets
|
|
||||||
|
|
||||||
Control which tools are available:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# Enable all tools
|
|
||||||
toolsets:
|
|
||||||
- all
|
|
||||||
|
|
||||||
# Or enable specific toolsets
|
|
||||||
toolsets:
|
|
||||||
- web
|
|
||||||
- terminal
|
|
||||||
- skills
|
|
||||||
```
|
|
||||||
|
|
||||||
Available toolsets: `web`, `search`, `terminal`, `browser`, `vision`, `image_gen`, `skills`, `moa`, `debugging`, `safe`
|
|
||||||
|
|
||||||
### Personalities
|
|
||||||
|
|
||||||
Predefined personalities for the `/personality` command:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
agent:
|
|
||||||
personalities:
|
|
||||||
helpful: "You are a helpful, friendly AI assistant."
|
|
||||||
kawaii: "You are a kawaii assistant! Use cute expressions..."
|
|
||||||
pirate: "Arrr! Ye be talkin' to Captain Hermes..."
|
|
||||||
# Add your own!
|
|
||||||
```
|
|
||||||
|
|
||||||
Built-in personalities:
|
|
||||||
- `helpful`, `concise`, `technical`, `creative`, `teacher`
|
|
||||||
- `kawaii`, `catgirl`, `pirate`, `shakespeare`, `surfer`
|
|
||||||
- `noir`, `uwu`, `philosopher`, `hype`
|
|
||||||
|
|
||||||
## Animated Feedback
|
|
||||||
|
|
||||||
The CLI provides animated feedback during operations:
|
|
||||||
|
|
||||||
### Thinking Animation
|
|
||||||
|
|
||||||
During API calls, shows animated spinner with thinking verbs:
|
|
||||||
```
|
|
||||||
◜ (。•́︿•̀。) pondering... (1.2s)
|
|
||||||
◠ (⊙_⊙) contemplating... (2.4s)
|
|
||||||
✧٩(ˊᗜˋ*)و✧ got it! (3.1s)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Tool Execution Animation
|
|
||||||
|
|
||||||
Each tool type has unique animations:
|
|
||||||
```
|
|
||||||
⠋ (◕‿◕✿) 🔍 web_search... (0.8s)
|
|
||||||
▅ (≧◡≦) 💻 terminal... (1.2s)
|
|
||||||
🌓 (★ω★) 🌐 browser_navigate... (2.1s)
|
|
||||||
✧ (✿◠‿◠) 🎨 image_generate... (4.5s)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Multi-line Input
|
|
||||||
|
|
||||||
For multi-line input, end a line with `\` to continue:
|
|
||||||
|
|
||||||
```
|
|
||||||
❯ Write a function that:\
|
|
||||||
1. Takes a list of numbers\
|
|
||||||
2. Returns the sum
|
|
||||||
```
|
|
||||||
|
|
||||||
## Environment Variable Priority
|
|
||||||
|
|
||||||
For terminal settings, `cli-config.yaml` takes precedence over `.env`:
|
|
||||||
|
|
||||||
1. `cli-config.yaml` (highest priority in CLI)
|
|
||||||
2. `.env` file
|
|
||||||
3. System environment variables
|
|
||||||
4. Default values
|
|
||||||
|
|
||||||
This allows you to have different terminal configs for CLI vs batch processing.
|
|
||||||
|
|
||||||
## Session Management
|
|
||||||
|
|
||||||
- **History**: Command history is saved to `~/.hermes_history`
|
|
||||||
- **Conversations**: Use `/save` to export conversations
|
|
||||||
- **Reset**: Use `/clear` to clear the screen and reset the conversation, or `/reset` to reset the conversation while keeping the screen
|
|
||||||
- **Session Logs**: Every session automatically logs to `logs/session_{session_id}.json`
|
|
||||||
|
|
||||||
### Session Logging
|
|
||||||
|
|
||||||
Sessions are automatically logged to the `logs/` directory:
|
|
||||||
|
|
||||||
```
|
|
||||||
logs/
|
|
||||||
├── session_20260201_143052_a1b2c3.json
|
|
||||||
├── session_20260201_150217_d4e5f6.json
|
|
||||||
└── ...
|
|
||||||
```
|
|
||||||
|
|
||||||
The session ID is displayed in the welcome banner and follows the format `YYYYMMDD_HHMMSS_UUID`, where the `UUID` part is a short unique suffix (for example, `20260201_143052_a1b2c3`).
|
|
||||||
|
|
||||||
Log files contain:
|
|
||||||
- Full conversation history in trajectory format
|
|
||||||
- Timestamps for session start and last update
|
|
||||||
- Model and message count metadata
|
|
||||||
|
|
||||||
This is useful for:
|
|
||||||
- Debugging agent behavior
|
|
||||||
- Replaying conversations
|
|
||||||
- Training data inspection
|
|
||||||
|
|
||||||
## Quiet Mode
|
|
||||||
|
|
||||||
The CLI runs in "quiet mode" (`HERMES_QUIET=1`) by default, which:
|
|
||||||
- Suppresses verbose logging from tools
|
|
||||||
- Enables kawaii-style animated feedback
|
|
||||||
- Hides terminal environment warnings
|
|
||||||
- Keeps output clean and user-friendly
|
|
||||||
|
|
||||||
For verbose output (debugging), use:
|
|
||||||
```bash
|
|
||||||
./hermes --verbose
|
|
||||||
```
|
|
||||||
@@ -1,124 +0,0 @@
|
|||||||
# LLM Client
|
|
||||||
|
|
||||||
Hermes Agent uses the OpenAI Python SDK with OpenRouter as the backend, providing access to many models through a single API.
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
```python
|
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
client = OpenAI(
|
|
||||||
api_key=os.getenv("OPENROUTER_API_KEY"),
|
|
||||||
base_url="https://openrouter.ai/api/v1"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Supported Models
|
|
||||||
|
|
||||||
Any model available on [OpenRouter](https://openrouter.ai/models):
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Anthropic
|
|
||||||
model = "anthropic/claude-sonnet-4"
|
|
||||||
model = "anthropic/claude-opus-4"
|
|
||||||
|
|
||||||
# OpenAI
|
|
||||||
model = "openai/gpt-4o"
|
|
||||||
model = "openai/o1"
|
|
||||||
|
|
||||||
# Google
|
|
||||||
model = "google/gemini-2.0-flash"
|
|
||||||
|
|
||||||
# Open models
|
|
||||||
model = "meta-llama/llama-3.3-70b-instruct"
|
|
||||||
model = "deepseek/deepseek-chat-v3"
|
|
||||||
model = "moonshotai/kimi-k2.5"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Tool Calling
|
|
||||||
|
|
||||||
Standard OpenAI function calling format:
|
|
||||||
|
|
||||||
```python
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model=model,
|
|
||||||
messages=messages,
|
|
||||||
tools=[
|
|
||||||
{
|
|
||||||
"type": "function",
|
|
||||||
"function": {
|
|
||||||
"name": "web_search",
|
|
||||||
"description": "Search the web",
|
|
||||||
"parameters": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"query": {"type": "string"}
|
|
||||||
},
|
|
||||||
"required": ["query"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check for tool calls
|
|
||||||
if response.choices[0].message.tool_calls:
|
|
||||||
for tool_call in response.choices[0].message.tool_calls:
|
|
||||||
name = tool_call.function.name
|
|
||||||
args = json.loads(tool_call.function.arguments)
|
|
||||||
# Execute tool...
|
|
||||||
```
|
|
||||||
|
|
||||||
## Reasoning Models
|
|
||||||
|
|
||||||
Some models return reasoning/thinking content:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Access reasoning if available
|
|
||||||
message = response.choices[0].message
|
|
||||||
if hasattr(message, 'reasoning_content') and message.reasoning_content:
|
|
||||||
reasoning = message.reasoning_content
|
|
||||||
# Store for trajectory export
|
|
||||||
```
|
|
||||||
|
|
||||||
## Provider Selection
|
|
||||||
|
|
||||||
OpenRouter allows selecting specific providers:
|
|
||||||
|
|
||||||
```python
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model=model,
|
|
||||||
messages=messages,
|
|
||||||
extra_body={
|
|
||||||
"provider": {
|
|
||||||
"order": ["Anthropic", "Google"], # Preferred providers
|
|
||||||
"ignore": ["Novita"], # Providers to skip
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
|
|
||||||
Common errors and handling:
|
|
||||||
|
|
||||||
```python
|
|
||||||
try:
|
|
||||||
response = client.chat.completions.create(...)
|
|
||||||
except openai.RateLimitError:
|
|
||||||
# Back off and retry
|
|
||||||
except openai.APIError as e:
|
|
||||||
# Check e.code for specific errors
|
|
||||||
# 400 = bad request (often provider-specific)
|
|
||||||
# 502 = bad gateway (retry with different provider)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Cost Tracking
|
|
||||||
|
|
||||||
OpenRouter returns usage info:
|
|
||||||
|
|
||||||
```python
|
|
||||||
usage = response.usage
|
|
||||||
print(f"Tokens: {usage.prompt_tokens} + {usage.completion_tokens}")
|
|
||||||
print(f"Cost: ${usage.cost:.6f}") # If available
|
|
||||||
```
|
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
# Message Format & Trajectories
|
|
||||||
|
|
||||||
Hermes Agent uses two message formats: the **API format** for LLM calls and the **trajectory format** for training data export.
|
|
||||||
|
|
||||||
## API Message Format
|
|
||||||
|
|
||||||
Standard OpenAI chat format used during execution:
|
|
||||||
|
|
||||||
```python
|
|
||||||
messages = [
|
|
||||||
# System prompt
|
|
||||||
{"role": "system", "content": "You are a helpful assistant with tools..."},
|
|
||||||
|
|
||||||
# User query
|
|
||||||
{"role": "user", "content": "Search for Python tutorials"},
|
|
||||||
|
|
||||||
# Assistant with tool call
|
|
||||||
{
|
|
||||||
"role": "assistant",
|
|
||||||
"content": None,
|
|
||||||
"tool_calls": [{
|
|
||||||
"id": "call_abc123",
|
|
||||||
"type": "function",
|
|
||||||
"function": {
|
|
||||||
"name": "web_search",
|
|
||||||
"arguments": "{\"query\": \"Python tutorials\"}"
|
|
||||||
}
|
|
||||||
}]
|
|
||||||
},
|
|
||||||
|
|
||||||
# Tool result
|
|
||||||
{
|
|
||||||
"role": "tool",
|
|
||||||
"tool_call_id": "call_abc123",
|
|
||||||
"content": "{\"results\": [...]}"
|
|
||||||
},
|
|
||||||
|
|
||||||
# Final response
|
|
||||||
{"role": "assistant", "content": "Here's what I found..."}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
## Trajectory Format (ShareGPT)
|
|
||||||
|
|
||||||
Exported for training in ShareGPT format:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"conversations": [
|
|
||||||
{"from": "system", "value": "You are a helpful assistant..."},
|
|
||||||
{"from": "human", "value": "Search for Python tutorials"},
|
|
||||||
{"from": "gpt", "value": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"Python tutorials\"}}\n</tool_call>"},
|
|
||||||
{"from": "tool", "value": "<tool_response>\n{\"results\": [...]}\n</tool_response>"},
|
|
||||||
{"from": "gpt", "value": "Here's what I found..."}
|
|
||||||
],
|
|
||||||
"tools": "[{\"type\": \"function\", \"function\": {...}}]",
|
|
||||||
"source": "hermes-agent"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Reasoning Content
|
|
||||||
|
|
||||||
For models that output reasoning/chain-of-thought:
|
|
||||||
|
|
||||||
**During execution** (API format):
|
|
||||||
```python
|
|
||||||
# Stored internally but not sent back to model in content
|
|
||||||
assistant_msg = {
|
|
||||||
"role": "assistant",
|
|
||||||
"content": "Here's what I found...",
|
|
||||||
"reasoning": "Let me think about this step by step..." # Internal only
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**In trajectory export** (reasoning wrapped in tags):
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"from": "gpt",
|
|
||||||
"value": "<think>\nLet me think about this step by step...\n</think>\nHere's what I found..."
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Conversion Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
API Response → Internal Storage → Trajectory Export
|
|
||||||
↓ ↓ ↓
|
|
||||||
tool_calls reasoning field <tool_call> tags
|
|
||||||
reasoning_content <think> tags
|
|
||||||
```
|
|
||||||
|
|
||||||
The conversion happens in `_convert_to_trajectory_format()` in `run_agent.py`.
|
|
||||||
|
|
||||||
## Ephemeral System Prompts
|
|
||||||
|
|
||||||
Batch processing supports ephemeral system prompts that guide behavior during execution but are NOT saved to trajectories:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# During execution: full system prompt + ephemeral guidance
|
|
||||||
messages = [
|
|
||||||
{"role": "system", "content": SYSTEM_PROMPT + "\n\n" + ephemeral_prompt},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
|
|
||||||
# In saved trajectory: only the base system prompt
|
|
||||||
trajectory = {
|
|
||||||
"conversations": [
|
|
||||||
{"from": "system", "value": SYSTEM_PROMPT}, # No ephemeral
|
|
||||||
...
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Trajectory Compression
|
|
||||||
|
|
||||||
Long trajectories can be compressed for training using `trajectory_compressor.py`:
|
|
||||||
|
|
||||||
- Protects first/last N turns
|
|
||||||
- Summarizes middle turns with LLM
|
|
||||||
- Targets specific token budget
|
|
||||||
- See `configs/trajectory_compression.yaml` for settings
|
|
||||||
159
docs/tools.md
159
docs/tools.md
@@ -1,159 +0,0 @@
|
|||||||
# Tools
|
|
||||||
|
|
||||||
Tools are functions that extend the agent's capabilities. Each tool is defined with an OpenAI-compatible JSON schema and an async handler function.
|
|
||||||
|
|
||||||
## Tool Structure
|
|
||||||
|
|
||||||
Each tool module in `tools/` exports:
|
|
||||||
1. **Schema definitions** - OpenAI function-calling format
|
|
||||||
2. **Handler functions** - Async functions that execute the tool
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Example: tools/web_tools.py
|
|
||||||
|
|
||||||
# Schema definition
|
|
||||||
WEB_SEARCH_SCHEMA = {
|
|
||||||
"type": "function",
|
|
||||||
"function": {
|
|
||||||
"name": "web_search",
|
|
||||||
"description": "Search the web for information",
|
|
||||||
"parameters": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"query": {"type": "string", "description": "Search query"}
|
|
||||||
},
|
|
||||||
"required": ["query"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Handler function
|
|
||||||
async def web_search(query: str) -> dict:
|
|
||||||
"""Execute web search and return results."""
|
|
||||||
# Implementation...
|
|
||||||
return {"results": [...]}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Tool Categories
|
|
||||||
|
|
||||||
| Category | Module | Tools |
|
|
||||||
|----------|--------|-------|
|
|
||||||
| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
|
|
||||||
| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal/ssh backends) |
|
|
||||||
| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
|
|
||||||
| **Vision** | `vision_tools.py` | `vision_analyze` |
|
|
||||||
| **Image Gen** | `image_generation_tool.py` | `image_generate` |
|
|
||||||
| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
|
|
||||||
| **Skills** | `skills_tool.py` | `skills_categories`, `skills_list`, `skill_view` |
|
|
||||||
|
|
||||||
## Tool Registration
|
|
||||||
|
|
||||||
Tools are registered in `model_tools.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# model_tools.py
|
|
||||||
TOOL_SCHEMAS = [
|
|
||||||
*WEB_TOOL_SCHEMAS,
|
|
||||||
*TERMINAL_TOOL_SCHEMAS,
|
|
||||||
*BROWSER_TOOL_SCHEMAS,
|
|
||||||
# ...
|
|
||||||
]
|
|
||||||
|
|
||||||
TOOL_HANDLERS = {
|
|
||||||
"web_search": web_search,
|
|
||||||
"terminal": terminal_tool,
|
|
||||||
"browser_navigate": browser_navigate,
|
|
||||||
# ...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Toolsets
|
|
||||||
|
|
||||||
Tools are grouped into **toolsets** for logical organization (see `toolsets.py`):
|
|
||||||
|
|
||||||
```python
|
|
||||||
TOOLSETS = {
|
|
||||||
"web": {
|
|
||||||
"description": "Web search and content extraction",
|
|
||||||
"tools": ["web_search", "web_extract", "web_crawl"]
|
|
||||||
},
|
|
||||||
"terminal": {
|
|
||||||
"description": "Command execution",
|
|
||||||
"tools": ["terminal"]
|
|
||||||
},
|
|
||||||
# ...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Adding a New Tool
|
|
||||||
|
|
||||||
1. Create handler function in `tools/your_tool.py`
|
|
||||||
2. Define JSON schema following OpenAI format
|
|
||||||
3. Register in `model_tools.py` (schemas and handlers)
|
|
||||||
4. Add to appropriate toolset in `toolsets.py`
|
|
||||||
5. Update `tools/__init__.py` exports
|
|
||||||
|
|
||||||
## Stateful Tools
|
|
||||||
|
|
||||||
Some tools maintain state across calls within a session:
|
|
||||||
|
|
||||||
- **Terminal**: Keeps container/sandbox running between commands
|
|
||||||
- **Browser**: Maintains browser session for multi-step navigation
|
|
||||||
|
|
||||||
State is managed per `task_id` and cleaned up automatically.
|
|
||||||
|
|
||||||
## Terminal Backends
|
|
||||||
|
|
||||||
The terminal tool supports multiple execution backends:
|
|
||||||
|
|
||||||
| Backend | Description | Use Case |
|
|
||||||
|---------|-------------|----------|
|
|
||||||
| `local` | Direct execution on host | Development, simple tasks |
|
|
||||||
| `ssh` | Remote execution via SSH | Sandboxing (agent can't modify its own code) |
|
|
||||||
| `docker` | Docker container | Isolation, reproducibility |
|
|
||||||
| `singularity` | Singularity/Apptainer | HPC clusters, rootless containers |
|
|
||||||
| `modal` | Modal cloud | Scalable cloud compute, GPUs |
|
|
||||||
|
|
||||||
Configure via environment variables or `cli-config.yaml`:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# SSH backend example (in cli-config.yaml)
|
|
||||||
terminal:
|
|
||||||
env_type: "ssh"
|
|
||||||
ssh_host: "my-server.example.com"
|
|
||||||
ssh_user: "myuser"
|
|
||||||
ssh_key: "~/.ssh/id_rsa"
|
|
||||||
cwd: "/home/myuser/project"
|
|
||||||
```
|
|
||||||
|
|
||||||
The SSH backend uses ControlMaster for connection persistence, making subsequent commands fast.
|
|
||||||
|
|
||||||
## Skills Tools (Progressive Disclosure)
|
|
||||||
|
|
||||||
Skills are on-demand knowledge documents. They use **progressive disclosure** to minimize tokens:
|
|
||||||
|
|
||||||
```
|
|
||||||
Level 0: skills_categories() → ["mlops", "devops"] (~50 tokens)
|
|
||||||
Level 1: skills_list(category) → [{name, description}, ...] (~3k tokens)
|
|
||||||
Level 2: skill_view(name) → Full content + metadata (varies)
|
|
||||||
Level 3: skill_view(name, path) → Specific reference file (varies)
|
|
||||||
```
|
|
||||||
|
|
||||||
Skill directory structure:
|
|
||||||
```
|
|
||||||
skills/
|
|
||||||
└── mlops/
|
|
||||||
└── axolotl/
|
|
||||||
├── SKILL.md # Main instructions (required)
|
|
||||||
├── references/ # Additional docs
|
|
||||||
└── templates/ # Output formats, configs
|
|
||||||
```
|
|
||||||
|
|
||||||
SKILL.md uses YAML frontmatter:
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: axolotl
|
|
||||||
description: Fine-tuning LLMs with Axolotl
|
|
||||||
tags: [Fine-Tuning, LoRA, DPO]
|
|
||||||
---
|
|
||||||
```
|
|
||||||
334
environments/README.md
Normal file
334
environments/README.md
Normal file
@@ -0,0 +1,334 @@
|
|||||||
|
# Hermes-Agent Atropos Environments
|
||||||
|
|
||||||
|
This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation.
|
||||||
|
|
||||||
|
## Architecture Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
Atropos Framework
|
||||||
|
┌───────────────────────┐
|
||||||
|
│ BaseEnv │ (atroposlib)
|
||||||
|
│ - Server management │
|
||||||
|
│ - Worker scheduling │
|
||||||
|
│ - Wandb logging │
|
||||||
|
│ - CLI (serve/process/ │
|
||||||
|
│ evaluate) │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│ inherits
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ HermesAgentBaseEnv │ hermes_base_env.py
|
||||||
|
│ - Terminal backend │
|
||||||
|
│ - Tool resolution │
|
||||||
|
│ - Agent loop │
|
||||||
|
│ - ToolContext │
|
||||||
|
│ - Async patches │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│ inherits
|
||||||
|
┌─────────────────┼─────────────────┐
|
||||||
|
│ │ │
|
||||||
|
TerminalTestEnv HermesSweEnv TerminalBench2EvalEnv
|
||||||
|
(stack testing) (SWE training) (TB2 benchmark eval)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inheritance Chain
|
||||||
|
|
||||||
|
**BaseEnv** (from `atroposlib`) is the Atropos base class. It provides:
|
||||||
|
- Server management (OpenAI-compatible API servers, VLLM, SGLang)
|
||||||
|
- Worker scheduling for parallel rollouts
|
||||||
|
- Wandb integration for metrics and rollout logging
|
||||||
|
- CLI interface with three subcommands: `serve`, `process`, `evaluate`
|
||||||
|
- `evaluate_log()` for saving eval results to JSON + samples.jsonl
|
||||||
|
|
||||||
|
**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics:
|
||||||
|
- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, modal, daytona, ssh, singularity)
|
||||||
|
- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` which queries `tools/registry.py`)
|
||||||
|
- Implements `collect_trajectory()` which runs the full agent loop and computes rewards
|
||||||
|
- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer)
|
||||||
|
- Applies monkey patches for async-safe tool operation at import time
|
||||||
|
|
||||||
|
Concrete environments inherit from `HermesAgentBaseEnv` and implement:
|
||||||
|
- `setup()` -- Load dataset, initialize state
|
||||||
|
- `get_next_item()` -- Return the next item for rollout
|
||||||
|
- `format_prompt()` -- Convert a dataset item into the user message
|
||||||
|
- `compute_reward()` -- Score the rollout using ToolContext
|
||||||
|
- `evaluate()` -- Periodic evaluation logic
|
||||||
|
|
||||||
|
## Core Components
|
||||||
|
|
||||||
|
### Agent Loop (`agent_loop.py`)
|
||||||
|
|
||||||
|
`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`:
|
||||||
|
|
||||||
|
1. Send messages + tools to the API via `server.chat_completion()`
|
||||||
|
2. If the response contains `tool_calls`, execute each one via `handle_function_call()` (which delegates to `tools/registry.py`'s `dispatch()`)
|
||||||
|
3. Append tool results to the conversation and go back to step 1
|
||||||
|
4. If the response has no tool_calls, the agent is done
|
||||||
|
|
||||||
|
Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker, Daytona) don't deadlock inside Atropos's event loop.
|
||||||
|
|
||||||
|
Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2).
|
||||||
|
|
||||||
|
### Tool Context (`tool_context.py`)
|
||||||
|
|
||||||
|
`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved.
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def compute_reward(self, item, result, ctx: ToolContext):
|
||||||
|
# Run tests in the model's terminal sandbox
|
||||||
|
test = ctx.terminal("pytest -v")
|
||||||
|
if test["exit_code"] == 0:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
# Check if a file was created
|
||||||
|
content = ctx.read_file("/workspace/solution.py")
|
||||||
|
if content.get("content"):
|
||||||
|
return 0.5
|
||||||
|
|
||||||
|
# Download files locally for verification (binary-safe)
|
||||||
|
ctx.download_file("/remote/output.bin", "/local/output.bin")
|
||||||
|
|
||||||
|
return 0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Available methods:
|
||||||
|
- **Terminal**: `terminal(command, timeout)` -- run shell commands
|
||||||
|
- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)`
|
||||||
|
- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox
|
||||||
|
- **Web**: `web_search(query)`, `web_extract(urls)`
|
||||||
|
- **Browser**: `browser_navigate(url)`, `browser_snapshot()`
|
||||||
|
- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name
|
||||||
|
- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`)
|
||||||
|
|
||||||
|
### Patches (`patches.py`)
|
||||||
|
|
||||||
|
**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., mini-swe-agent's Modal backend via SWE-ReX). This crashes when called from inside Atropos's event loop because `asyncio.run()` cannot be nested.
|
||||||
|
|
||||||
|
**Solution**: `patches.py` monkey-patches `SwerexModalEnvironment` to use a dedicated background thread (`_AsyncWorker`) with its own event loop. The calling code sees the same sync interface, but internally the async work happens on a separate thread that doesn't conflict with Atropos's loop.
|
||||||
|
|
||||||
|
What gets patched:
|
||||||
|
- `SwerexModalEnvironment.__init__` -- creates Modal deployment on a background thread
|
||||||
|
- `SwerexModalEnvironment.execute` -- runs commands on the same background thread
|
||||||
|
- `SwerexModalEnvironment.stop` -- stops deployment on the background thread
|
||||||
|
|
||||||
|
The patches are:
|
||||||
|
- **Idempotent** -- calling `apply_patches()` multiple times is safe
|
||||||
|
- **Transparent** -- same interface and behavior, only the internal async execution changes
|
||||||
|
- **Universal** -- works identically in normal CLI use (no running event loop)
|
||||||
|
|
||||||
|
Applied automatically at import time by `hermes_base_env.py`.
|
||||||
|
|
||||||
|
### Tool Call Parsers (`tool_call_parsers/`)
|
||||||
|
|
||||||
|
Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing.
|
||||||
|
|
||||||
|
Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types.
|
||||||
|
|
||||||
|
Available parsers:
|
||||||
|
- `hermes` -- Hermes/ChatML `<tool_call>` XML format
|
||||||
|
- `mistral` -- Mistral `[TOOL_CALLS]` format
|
||||||
|
- `llama3_json` -- Llama 3 JSON tool calling
|
||||||
|
- `qwen` -- Qwen tool calling format
|
||||||
|
- `qwen3_coder` -- Qwen3 Coder format
|
||||||
|
- `deepseek_v3` -- DeepSeek V3 format
|
||||||
|
- `deepseek_v3_1` -- DeepSeek V3.1 format
|
||||||
|
- `kimi_k2` -- Kimi K2 format
|
||||||
|
- `longcat` -- Longcat format
|
||||||
|
- `glm45` / `glm47` -- GLM model formats
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
```python
|
||||||
|
from environments.tool_call_parsers import get_parser
|
||||||
|
|
||||||
|
parser = get_parser("hermes")
|
||||||
|
content, tool_calls = parser.parse(raw_model_output)
|
||||||
|
```
|
||||||
|
|
||||||
|
In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively.
|
||||||
|
|
||||||
|
## Two-Phase Operation
|
||||||
|
|
||||||
|
### Phase 1: OpenAI Server (Evaluation / SFT Data Generation)
|
||||||
|
|
||||||
|
Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`.
|
||||||
|
|
||||||
|
- Good for: evaluation, SFT data generation, testing
|
||||||
|
- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands
|
||||||
|
- Placeholder tokens are created for the Atropos pipeline
|
||||||
|
|
||||||
|
### Phase 2: VLLM ManagedServer (Full RL Training)
|
||||||
|
|
||||||
|
Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
|
||||||
|
|
||||||
|
- Good for: full RL training with GRPO/PPO
|
||||||
|
- Run with: `serve` subcommand
|
||||||
|
- Real tokens, masks, and logprobs flow through the pipeline
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
environments/
|
||||||
|
├── README.md # This file
|
||||||
|
├── __init__.py # Package exports
|
||||||
|
├── hermes_base_env.py # Abstract base (HermesAgentBaseEnv)
|
||||||
|
├── agent_loop.py # Multi-turn agent engine (HermesAgentLoop)
|
||||||
|
├── tool_context.py # Per-rollout tool access for reward functions
|
||||||
|
├── patches.py # Async-safety patches for Modal backend
|
||||||
|
│
|
||||||
|
├── tool_call_parsers/ # Phase 2 client-side parsers
|
||||||
|
│ ├── __init__.py # Registry + base class
|
||||||
|
│ ├── hermes_parser.py
|
||||||
|
│ ├── mistral_parser.py
|
||||||
|
│ ├── llama_parser.py
|
||||||
|
│ ├── qwen_parser.py
|
||||||
|
│ ├── qwen3_coder_parser.py
|
||||||
|
│ ├── deepseek_v3_parser.py
|
||||||
|
│ ├── deepseek_v3_1_parser.py
|
||||||
|
│ ├── kimi_k2_parser.py
|
||||||
|
│ ├── longcat_parser.py
|
||||||
|
│ ├── glm45_parser.py
|
||||||
|
│ └── glm47_parser.py
|
||||||
|
│
|
||||||
|
├── terminal_test_env/ # Stack validation environment
|
||||||
|
│ └── terminal_test_env.py
|
||||||
|
│
|
||||||
|
├── hermes_swe_env/ # SWE-bench style training environment
|
||||||
|
│ └── hermes_swe_env.py
|
||||||
|
│
|
||||||
|
└── benchmarks/ # Evaluation benchmarks
|
||||||
|
├── terminalbench_2/ # 89 terminal tasks, Modal sandboxes
|
||||||
|
│ └── terminalbench2_env.py
|
||||||
|
├── tblite/ # 100 calibrated tasks (fast TB2 proxy)
|
||||||
|
│ └── tblite_env.py
|
||||||
|
└── yc_bench/ # Long-horizon strategic benchmark
|
||||||
|
└── yc_bench_env.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Concrete Environments
|
||||||
|
|
||||||
|
### TerminalTestEnv (`terminal_test_env/`)
|
||||||
|
|
||||||
|
A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks that the file's content matches the expected value.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Serve mode (needs run-api)
|
||||||
|
run-api
|
||||||
|
python environments/terminal_test_env/terminal_test_env.py serve
|
||||||
|
|
||||||
|
# Process mode (no run-api, saves to JSONL)
|
||||||
|
python environments/terminal_test_env/terminal_test_env.py process \
|
||||||
|
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
### HermesSweEnv (`hermes_swe_env/`)
|
||||||
|
|
||||||
|
SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||||
|
--openai.model_name YourModel \
|
||||||
|
--env.dataset_name bigcode/humanevalpack \
|
||||||
|
--env.terminal_backend modal
|
||||||
|
```
|
||||||
|
|
||||||
|
### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`)
|
||||||
|
|
||||||
|
**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness.
|
||||||
|
|
||||||
|
Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.):
|
||||||
|
- Run via `evaluate` subcommand (no `run-api` needed)
|
||||||
|
- `setup()` loads the dataset, `evaluate()` runs all tasks
|
||||||
|
- `rollout_and_score_eval()` handles per-task agent loop + test verification
|
||||||
|
- Downloads verifier output locally for reliable reward checking (Harbor pattern)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run full benchmark
|
||||||
|
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||||
|
--openai.model_name anthropic/claude-opus-4.6
|
||||||
|
|
||||||
|
# Run subset of tasks
|
||||||
|
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||||
|
--openai.model_name anthropic/claude-opus-4.6 \
|
||||||
|
--env.task_filter fix-git,git-multibranch
|
||||||
|
|
||||||
|
# Skip specific tasks
|
||||||
|
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||||
|
--openai.model_name anthropic/claude-opus-4.6 \
|
||||||
|
--env.skip_tasks heavy-task,slow-task
|
||||||
|
```
|
||||||
|
|
||||||
|
## Creating a New Environment
|
||||||
|
|
||||||
|
### Training Environment
|
||||||
|
|
||||||
|
1. Create a new directory under `environments/`
|
||||||
|
2. Create your env file inheriting from `HermesAgentBaseEnv`
|
||||||
|
3. Implement the four abstract methods (`setup()`, `get_next_item()`, `format_prompt()`, `compute_reward()`) plus `evaluate()`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
|
||||||
|
class MyEnvConfig(HermesAgentEnvConfig):
|
||||||
|
pass # Add custom fields as needed
|
||||||
|
|
||||||
|
class MyEnv(HermesAgentBaseEnv):
|
||||||
|
name = "my-env"
|
||||||
|
env_config_cls = MyEnvConfig
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def config_init(cls):
|
||||||
|
env_config = MyEnvConfig(
|
||||||
|
enabled_toolsets=["terminal", "file"],
|
||||||
|
terminal_backend="modal",
|
||||||
|
# ... other config
|
||||||
|
)
|
||||||
|
server_configs = [APIServerConfig(...)]
|
||||||
|
return env_config, server_configs
|
||||||
|
|
||||||
|
async def setup(self):
|
||||||
|
self.dataset = load_dataset(...)
|
||||||
|
self.iter = 0
|
||||||
|
|
||||||
|
async def get_next_item(self):
|
||||||
|
item = self.dataset[self.iter % len(self.dataset)]
|
||||||
|
self.iter += 1
|
||||||
|
return item
|
||||||
|
|
||||||
|
def format_prompt(self, item):
|
||||||
|
return item["instruction"]
|
||||||
|
|
||||||
|
async def compute_reward(self, item, result, ctx):
|
||||||
|
# ctx gives you full tool access to the rollout's sandbox
|
||||||
|
test = ctx.terminal("pytest -v")
|
||||||
|
return 1.0 if test["exit_code"] == 0 else 0.0
|
||||||
|
|
||||||
|
async def evaluate(self, *args, **kwargs):
|
||||||
|
# Periodic evaluation logic
|
||||||
|
...
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
MyEnv.cli()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Eval-Only Environment (Benchmark)
|
||||||
|
|
||||||
|
For eval benchmarks, follow the pattern in `terminalbench2_env.py`:
|
||||||
|
1. Create under `environments/benchmarks/your-benchmark/`
|
||||||
|
2. Inherit from `HermesAgentBaseEnv`
|
||||||
|
3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1`
|
||||||
|
4. Stub the training methods (`collect_trajectories`, `score`)
|
||||||
|
5. Implement `rollout_and_score_eval()` and `evaluate()`
|
||||||
|
6. Run with `evaluate` subcommand
|
||||||
|
|
||||||
|
## Key Config Fields
|
||||||
|
|
||||||
|
| Field | Description | Default |
|
||||||
|
|-------|-------------|---------|
|
||||||
|
| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) |
|
||||||
|
| `disabled_toolsets` | Toolsets to disable | `None` |
|
||||||
|
| `distribution` | Probabilistic toolset distribution name | `None` |
|
||||||
|
| `max_agent_turns` | Max LLM calls per rollout | `30` |
|
||||||
|
| `agent_temperature` | Sampling temperature | `1.0` |
|
||||||
|
| `terminal_backend` | `local`, `docker`, `modal`, `daytona`, `ssh`, `singularity` | `local` |
|
||||||
|
| `system_prompt` | System message for the agent | `None` |
|
||||||
|
| `tool_call_parser` | Parser name for Phase 2 | `hermes` |
|
||||||
|
| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` |
|
||||||
31
environments/__init__.py
Normal file
31
environments/__init__.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
"""
|
||||||
|
Hermes-Agent Atropos Environments
|
||||||
|
|
||||||
|
Provides a layered integration between hermes-agent's tool-calling capabilities
|
||||||
|
and the Atropos RL training framework.
|
||||||
|
|
||||||
|
Core layers:
|
||||||
|
- agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
|
||||||
|
- tool_context: Per-rollout tool access handle for reward/verification functions
|
||||||
|
- hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
|
||||||
|
- tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
|
||||||
|
|
||||||
|
Concrete environments:
|
||||||
|
- terminal_test_env/: Simple file-creation tasks for testing the stack
|
||||||
|
- hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
|
||||||
|
|
||||||
|
Benchmarks (eval-only):
|
||||||
|
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
|
||||||
|
"""
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AgentResult",
|
||||||
|
"HermesAgentLoop",
|
||||||
|
"ToolContext",
|
||||||
|
"HermesAgentBaseEnv",
|
||||||
|
"HermesAgentEnvConfig",
|
||||||
|
]
|
||||||
453
environments/agent_loop.py
Normal file
453
environments/agent_loop.py
Normal file
@@ -0,0 +1,453 @@
|
|||||||
|
"""
|
||||||
|
HermesAgentLoop -- Reusable Multi-Turn Agent Engine
|
||||||
|
|
||||||
|
Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
|
||||||
|
Works with any server that returns ChatCompletion objects with tool_calls:
|
||||||
|
- Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
|
||||||
|
- Phase 2: ManagedServer with client-side tool call parser
|
||||||
|
|
||||||
|
The loop passes tools= and checks response.choices[0].message.tool_calls,
|
||||||
|
identical to hermes-agent's run_agent.py. Tool execution is dispatched via
|
||||||
|
handle_function_call() from model_tools.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import concurrent.futures
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, List, Optional, Set
|
||||||
|
|
||||||
|
from model_tools import handle_function_call
|
||||||
|
|
||||||
|
# Defined ahead of resize_tool_pool() so the function's logging dependency is
# visible where it is used.
logger = logging.getLogger(__name__)

# Thread pool for running sync tool calls that internally use asyncio.run()
# (e.g., mini-swe-agent's modal/docker/daytona backends). Running them in a separate
# thread gives them a clean event loop so they don't deadlock inside Atropos's loop.
# Size must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all
# making tool calls). Too small = thread pool starvation, tasks queue for minutes.
# Resized at runtime by HermesAgentBaseEnv.__init__ via resize_tool_pool().
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=128)


def resize_tool_pool(max_workers: int):
    """
    Replace the global tool executor with a new one of the given size.

    Called by HermesAgentBaseEnv.__init__ based on config.tool_pool_size.
    Intended to be called before any tasks are submitted.

    The executor being replaced is shut down (without waiting) so that any
    worker threads it already started are released instead of lingering for
    the rest of the process. Work already queued on it still runs to
    completion; it only stops accepting new submissions.

    Args:
        max_workers: Number of worker threads for the new pool.

    Raises:
        ValueError: If max_workers is not a positive integer (raised by
            ThreadPoolExecutor). The current pool is left untouched.
    """
    global _tool_executor
    previous = _tool_executor
    # Build the replacement first: if the size is invalid this raises before
    # the working pool has been swapped out or shut down.
    _tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    # wait=False: never block the caller on tool calls that may still be running.
    previous.shutdown(wait=False)
    logger.info("Tool thread pool resized to %d workers", max_workers)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ToolError:
    """One failed tool invocation captured while the agent loop was running."""

    # Turn on which the failure happened
    turn: int
    # Name of the tool the model tried to call
    tool_name: str
    # Arguments supplied with the call (truncated)
    arguments: str
    # Error message describing the failure
    error: str
    # Raw result string that was handed back to the model
    tool_result: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AgentResult:
    """Outcome of one complete run of the agent loop."""

    # Conversation transcript as OpenAI-format message dicts
    messages: List[Dict[str, Any]]
    # ManagedServer.get_state() snapshot when available (Phase 2), else None
    managed_state: Optional[Dict[str, Any]] = None
    # Number of LLM calls that were made
    turns_used: int = 0
    # True when the model stopped calling tools on its own rather than
    # running into max_turns
    finished_naturally: bool = False
    # Reasoning content extracted for each turn (helpers from PR #297)
    reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
    # Tool errors recorded while the loop ran
    tool_errors: List[ToolError] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_reasoning_from_message(message) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Extract reasoning content from a ChatCompletion message.
|
||||||
|
|
||||||
|
Handles multiple provider formats:
|
||||||
|
1. message.reasoning_content field (some providers)
|
||||||
|
2. message.reasoning field (some providers)
|
||||||
|
3. message.reasoning_details[].text (OpenRouter style)
|
||||||
|
|
||||||
|
Note: <think> block extraction from content is NOT done here -- that's
|
||||||
|
handled by the response already in Phase 1 (server does it) or by
|
||||||
|
ManagedServer's patch in Phase 2.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message: The assistant message from ChatCompletion response
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Extracted reasoning text, or None if not found
|
||||||
|
"""
|
||||||
|
# Check reasoning_content field (common across providers)
|
||||||
|
if hasattr(message, "reasoning_content") and message.reasoning_content:
|
||||||
|
return message.reasoning_content
|
||||||
|
|
||||||
|
# Check reasoning field
|
||||||
|
if hasattr(message, "reasoning") and message.reasoning:
|
||||||
|
return message.reasoning
|
||||||
|
|
||||||
|
# Check reasoning_details (OpenRouter style)
|
||||||
|
if hasattr(message, "reasoning_details") and message.reasoning_details:
|
||||||
|
for detail in message.reasoning_details:
|
||||||
|
if hasattr(detail, "text") and detail.text:
|
||||||
|
return detail.text
|
||||||
|
if isinstance(detail, dict) and detail.get("text"):
|
||||||
|
return detail["text"]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class HermesAgentLoop:
|
||||||
|
"""
|
||||||
|
Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
|
||||||
|
|
||||||
|
Same pattern as run_agent.py:
|
||||||
|
- Pass tools= to the API
|
||||||
|
- Check response.choices[0].message.tool_calls
|
||||||
|
- Dispatch via handle_function_call()
|
||||||
|
|
||||||
|
Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
|
||||||
|
or ManagedServer with a parser. The server determines how tool_calls get
|
||||||
|
populated on the response.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
server,
|
||||||
|
tool_schemas: List[Dict[str, Any]],
|
||||||
|
valid_tool_names: Set[str],
|
||||||
|
max_turns: int = 30,
|
||||||
|
task_id: Optional[str] = None,
|
||||||
|
temperature: float = 1.0,
|
||||||
|
max_tokens: Optional[int] = None,
|
||||||
|
extra_body: Optional[Dict[str, Any]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the agent loop.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server: Server object with chat_completion() method (OpenAIServer,
|
||||||
|
ManagedServer, ServerManager, etc.)
|
||||||
|
tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
|
||||||
|
valid_tool_names: Set of tool names the model is allowed to call
|
||||||
|
max_turns: Maximum number of LLM calls before stopping
|
||||||
|
task_id: Unique ID for terminal/browser session isolation
|
||||||
|
temperature: Sampling temperature for generation
|
||||||
|
max_tokens: Max tokens per generation (None for server default)
|
||||||
|
extra_body: Extra parameters passed to the OpenAI client's create() call.
|
||||||
|
Used for OpenRouter provider preferences, transforms, etc.
|
||||||
|
e.g. {"provider": {"ignore": ["DeepInfra"]}}
|
||||||
|
"""
|
||||||
|
self.server = server
|
||||||
|
self.tool_schemas = tool_schemas
|
||||||
|
self.valid_tool_names = valid_tool_names
|
||||||
|
self.max_turns = max_turns
|
||||||
|
self.task_id = task_id or str(uuid.uuid4())
|
||||||
|
self.temperature = temperature
|
||||||
|
self.max_tokens = max_tokens
|
||||||
|
self.extra_body = extra_body
|
||||||
|
|
||||||
|
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
|
||||||
|
"""
|
||||||
|
Execute the full agent loop using standard OpenAI tool calling.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
messages: Initial conversation messages (system + user).
|
||||||
|
Modified in-place as the conversation progresses.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
AgentResult with full conversation history, managed state, and metadata
|
||||||
|
"""
|
||||||
|
reasoning_per_turn = []
|
||||||
|
tool_errors: List[ToolError] = []
|
||||||
|
|
||||||
|
# Per-loop TodoStore for the todo tool (ephemeral, dies with the loop)
|
||||||
|
from tools.todo_tool import TodoStore, todo_tool as _todo_tool
|
||||||
|
_todo_store = TodoStore()
|
||||||
|
|
||||||
|
# Extract user task from first user message for browser_snapshot context
|
||||||
|
_user_task = None
|
||||||
|
for msg in messages:
|
||||||
|
if msg.get("role") == "user":
|
||||||
|
content = msg.get("content", "")
|
||||||
|
if isinstance(content, str) and content.strip():
|
||||||
|
_user_task = content.strip()[:500] # Cap to avoid huge strings
|
||||||
|
break
|
||||||
|
|
||||||
|
import time as _time
|
||||||
|
|
||||||
|
for turn in range(self.max_turns):
|
||||||
|
turn_start = _time.monotonic()
|
||||||
|
|
||||||
|
# Build the chat_completion kwargs
|
||||||
|
chat_kwargs = {
|
||||||
|
"messages": messages,
|
||||||
|
"n": 1,
|
||||||
|
"temperature": self.temperature,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Only pass tools if we have them
|
||||||
|
if self.tool_schemas:
|
||||||
|
chat_kwargs["tools"] = self.tool_schemas
|
||||||
|
|
||||||
|
# Only pass max_tokens if explicitly set
|
||||||
|
if self.max_tokens is not None:
|
||||||
|
chat_kwargs["max_tokens"] = self.max_tokens
|
||||||
|
|
||||||
|
# Inject extra_body for provider-specific params (e.g., OpenRouter
|
||||||
|
# provider preferences like banned/preferred providers, transforms)
|
||||||
|
if self.extra_body:
|
||||||
|
chat_kwargs["extra_body"] = self.extra_body
|
||||||
|
|
||||||
|
# Make the API call -- standard OpenAI spec
|
||||||
|
api_start = _time.monotonic()
|
||||||
|
try:
|
||||||
|
response = await self.server.chat_completion(**chat_kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
api_elapsed = _time.monotonic() - api_start
|
||||||
|
logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
|
||||||
|
return AgentResult(
|
||||||
|
messages=messages,
|
||||||
|
managed_state=self._get_managed_state(),
|
||||||
|
turns_used=turn + 1,
|
||||||
|
finished_naturally=False,
|
||||||
|
reasoning_per_turn=reasoning_per_turn,
|
||||||
|
tool_errors=tool_errors,
|
||||||
|
)
|
||||||
|
|
||||||
|
api_elapsed = _time.monotonic() - api_start
|
||||||
|
|
||||||
|
if not response or not response.choices:
|
||||||
|
logger.warning("Empty response on turn %d (api=%.1fs)", turn + 1, api_elapsed)
|
||||||
|
return AgentResult(
|
||||||
|
messages=messages,
|
||||||
|
managed_state=self._get_managed_state(),
|
||||||
|
turns_used=turn + 1,
|
||||||
|
finished_naturally=False,
|
||||||
|
reasoning_per_turn=reasoning_per_turn,
|
||||||
|
tool_errors=tool_errors,
|
||||||
|
)
|
||||||
|
|
||||||
|
assistant_msg = response.choices[0].message
|
||||||
|
|
||||||
|
# Extract reasoning content from the response (all provider formats)
|
||||||
|
reasoning = _extract_reasoning_from_message(assistant_msg)
|
||||||
|
reasoning_per_turn.append(reasoning)
|
||||||
|
|
||||||
|
# Check for tool calls -- standard OpenAI spec
|
||||||
|
if assistant_msg.tool_calls:
|
||||||
|
# Build the assistant message dict for conversation history
|
||||||
|
msg_dict: Dict[str, Any] = {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": assistant_msg.content or "",
|
||||||
|
"tool_calls": [
|
||||||
|
{
|
||||||
|
"id": tc.id,
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": tc.function.name,
|
||||||
|
"arguments": tc.function.arguments,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for tc in assistant_msg.tool_calls
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Preserve reasoning_content for multi-turn chat template handling
|
||||||
|
# (e.g., Kimi-K2's template renders <think> blocks differently
|
||||||
|
# for history vs. the latest turn based on this field)
|
||||||
|
if reasoning:
|
||||||
|
msg_dict["reasoning_content"] = reasoning
|
||||||
|
|
||||||
|
messages.append(msg_dict)
|
||||||
|
|
||||||
|
# Execute each tool call via hermes-agent's dispatch
|
||||||
|
for tc in assistant_msg.tool_calls:
|
||||||
|
tool_name = tc.function.name
|
||||||
|
tool_args_raw = tc.function.arguments
|
||||||
|
|
||||||
|
# Validate tool name
|
||||||
|
if tool_name not in self.valid_tool_names:
|
||||||
|
tool_result = json.dumps(
|
||||||
|
{
|
||||||
|
"error": f"Unknown tool '{tool_name}'. "
|
||||||
|
f"Available tools: {sorted(self.valid_tool_names)}"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
tool_errors.append(ToolError(
|
||||||
|
turn=turn + 1, tool_name=tool_name,
|
||||||
|
arguments=tool_args_raw[:200],
|
||||||
|
error=f"Unknown tool '{tool_name}'",
|
||||||
|
tool_result=tool_result,
|
||||||
|
))
|
||||||
|
logger.warning(
|
||||||
|
"Model called unknown tool '%s' on turn %d",
|
||||||
|
tool_name, turn + 1,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Parse arguments and dispatch
|
||||||
|
try:
|
||||||
|
args = json.loads(tool_args_raw)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
args = {}
|
||||||
|
logger.warning(
|
||||||
|
"Invalid JSON in tool call arguments for '%s': %s",
|
||||||
|
tool_name, tool_args_raw[:200],
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
if tool_name == "terminal":
|
||||||
|
backend = os.getenv("TERMINAL_ENV", "local")
|
||||||
|
cmd_preview = args.get("command", "")[:80]
|
||||||
|
logger.info(
|
||||||
|
"[%s] $ %s", self.task_id[:8], cmd_preview,
|
||||||
|
)
|
||||||
|
|
||||||
|
tool_submit_time = _time.monotonic()
|
||||||
|
|
||||||
|
# Todo tool -- handle locally (needs per-loop TodoStore)
|
||||||
|
if tool_name == "todo":
|
||||||
|
tool_result = _todo_tool(
|
||||||
|
todos=args.get("todos"),
|
||||||
|
merge=args.get("merge", False),
|
||||||
|
store=_todo_store,
|
||||||
|
)
|
||||||
|
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||||
|
elif tool_name == "memory":
|
||||||
|
tool_result = json.dumps({"error": "Memory is not available in RL environments."})
|
||||||
|
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||||
|
elif tool_name == "session_search":
|
||||||
|
tool_result = json.dumps({"error": "Session search is not available in RL environments."})
|
||||||
|
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||||
|
else:
|
||||||
|
# Run tool calls in a thread pool so backends that
|
||||||
|
# use asyncio.run() internally (modal, docker, daytona) get
|
||||||
|
# a clean event loop instead of deadlocking.
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
# Capture current tool_name/args for the lambda
|
||||||
|
_tn, _ta, _tid = tool_name, args, self.task_id
|
||||||
|
tool_result = await loop.run_in_executor(
|
||||||
|
_tool_executor,
|
||||||
|
lambda: handle_function_call(
|
||||||
|
_tn, _ta, task_id=_tid,
|
||||||
|
user_task=_user_task,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||||
|
|
||||||
|
# Log slow tools and thread pool stats for debugging
|
||||||
|
pool_active = _tool_executor._work_queue.qsize()
|
||||||
|
if tool_elapsed > 30:
|
||||||
|
logger.warning(
|
||||||
|
"[%s] turn %d: %s took %.1fs (pool queue=%d)",
|
||||||
|
self.task_id[:8], turn + 1, tool_name,
|
||||||
|
tool_elapsed, pool_active,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
tool_result = json.dumps(
|
||||||
|
{"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
|
||||||
|
)
|
||||||
|
tool_errors.append(ToolError(
|
||||||
|
turn=turn + 1, tool_name=tool_name,
|
||||||
|
arguments=tool_args_raw[:200],
|
||||||
|
error=f"{type(e).__name__}: {str(e)}",
|
||||||
|
tool_result=tool_result,
|
||||||
|
))
|
||||||
|
logger.error(
|
||||||
|
"Tool '%s' execution failed on turn %d: %s",
|
||||||
|
tool_name, turn + 1, e,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Also check if the tool returned an error in its JSON result
|
||||||
|
try:
|
||||||
|
result_data = json.loads(tool_result)
|
||||||
|
if isinstance(result_data, dict):
|
||||||
|
err = result_data.get("error")
|
||||||
|
exit_code = result_data.get("exit_code")
|
||||||
|
if err and exit_code and exit_code < 0:
|
||||||
|
tool_errors.append(ToolError(
|
||||||
|
turn=turn + 1, tool_name=tool_name,
|
||||||
|
arguments=tool_args_raw[:200],
|
||||||
|
error=str(err),
|
||||||
|
tool_result=tool_result[:500],
|
||||||
|
))
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Add tool response to conversation
|
||||||
|
messages.append(
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"tool_call_id": tc.id,
|
||||||
|
"content": tool_result,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
turn_elapsed = _time.monotonic() - turn_start
|
||||||
|
logger.info(
|
||||||
|
"[%s] turn %d: api=%.1fs, %d tools, turn_total=%.1fs",
|
||||||
|
self.task_id[:8], turn + 1, api_elapsed,
|
||||||
|
len(assistant_msg.tool_calls), turn_elapsed,
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# No tool calls -- model is done
|
||||||
|
msg_dict = {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": assistant_msg.content or "",
|
||||||
|
}
|
||||||
|
if reasoning:
|
||||||
|
msg_dict["reasoning_content"] = reasoning
|
||||||
|
messages.append(msg_dict)
|
||||||
|
|
||||||
|
turn_elapsed = _time.monotonic() - turn_start
|
||||||
|
logger.info(
|
||||||
|
"[%s] turn %d: api=%.1fs, no tools (finished), turn_total=%.1fs",
|
||||||
|
self.task_id[:8], turn + 1, api_elapsed, turn_elapsed,
|
||||||
|
)
|
||||||
|
|
||||||
|
return AgentResult(
|
||||||
|
messages=messages,
|
||||||
|
managed_state=self._get_managed_state(),
|
||||||
|
turns_used=turn + 1,
|
||||||
|
finished_naturally=True,
|
||||||
|
reasoning_per_turn=reasoning_per_turn,
|
||||||
|
tool_errors=tool_errors,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Hit max turns without the model stopping
|
||||||
|
logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
|
||||||
|
return AgentResult(
|
||||||
|
messages=messages,
|
||||||
|
managed_state=self._get_managed_state(),
|
||||||
|
turns_used=self.max_turns,
|
||||||
|
finished_naturally=False,
|
||||||
|
reasoning_per_turn=reasoning_per_turn,
|
||||||
|
tool_errors=tool_errors,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_managed_state(self) -> Optional[Dict[str, Any]]:
    """
    Return the ManagedServer state when the server exposes one.

    The value is whatever ``self.server.get_state()`` returns (a state dict
    with SequenceNodes containing tokens/logprobs/masks). Servers without a
    ``get_state`` attribute (e.g., a regular OpenAI server) yield None.
    """
    server = self.server
    # Guard first: plain servers simply have nothing to report.
    if not hasattr(server, "get_state"):
        return None
    return server.get_state()
|
||||||
0
environments/benchmarks/__init__.py
Normal file
0
environments/benchmarks/__init__.py
Normal file
73
environments/benchmarks/tblite/README.md
Normal file
73
environments/benchmarks/tblite/README.md
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# OpenThoughts-TBLite Evaluation Environment
|
||||||
|
|
||||||
|
This environment evaluates terminal agents on the [OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) benchmark, a difficulty-calibrated subset of [Terminal-Bench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0).
|
||||||
|
|
||||||
|
## Source
|
||||||
|
|
||||||
|
OpenThoughts-TBLite was created by the [OpenThoughts](https://www.openthoughts.ai/) Agent team in collaboration with [Snorkel AI](https://snorkel.ai/) and [Bespoke Labs](https://bespokelabs.ai/). The original dataset and documentation live at:
|
||||||
|
|
||||||
|
- **Dataset (source):** [open-thoughts/OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite)
|
||||||
|
- **GitHub:** [open-thoughts/OpenThoughts-TBLite](https://github.com/open-thoughts/OpenThoughts-TBLite)
|
||||||
|
- **Blog post:** [openthoughts.ai/blog/openthoughts-tblite](https://www.openthoughts.ai/blog/openthoughts-tblite)
|
||||||
|
|
||||||
|
## Our Dataset
|
||||||
|
|
||||||
|
We converted the source into the same schema used by our Terminal-Bench 2.0 environment (pre-built Docker Hub images, base64-encoded test tarballs, etc.) and published it as:
|
||||||
|
|
||||||
|
- **Dataset (ours):** [NousResearch/openthoughts-tblite](https://huggingface.co/datasets/NousResearch/openthoughts-tblite)
|
||||||
|
- **Docker images:** `nousresearch/tblite-<task-name>:latest` on Docker Hub (100 images)
|
||||||
|
|
||||||
|
The conversion script is at `scripts/prepare_tblite_dataset.py`.
|
||||||
|
|
||||||
|
## Why TBLite?
|
||||||
|
|
||||||
|
Terminal-Bench 2.0 is one of the strongest frontier evaluations for terminal agents, but when a model scores near the floor (e.g., Qwen 3 8B at <1%), many changes look identical in aggregate score. TBLite addresses this by calibrating task difficulty using Claude Haiku 4.5 as a reference:
|
||||||
|
|
||||||
|
| Difficulty | Pass Rate Range | Tasks |
|
||||||
|
|------------|----------------|-------|
|
||||||
|
| Easy | >= 70% | 40 |
|
||||||
|
| Medium | 40-69% | 26 |
|
||||||
|
| Hard | 10-39% | 26 |
|
||||||
|
| Extreme | < 10% | 8 |
|
||||||
|
|
||||||
|
This gives enough solvable tasks to detect small improvements quickly, while preserving enough hard tasks to avoid saturation. The correlation between TBLite and TB2 scores is **r = 0.911**.
|
||||||
|
|
||||||
|
TBLite also runs 2.6-8x faster than the full TB2, making it practical for iteration loops.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run the full benchmark
|
||||||
|
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||||
|
|
||||||
|
# Filter to specific tasks
|
||||||
|
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
--env.task_filter "broken-python,pandas-etl"
|
||||||
|
|
||||||
|
# Use a different model
|
||||||
|
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
--openai.model_name "qwen/qwen3-30b"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
`TBLiteEvalEnv` is a thin subclass of `TerminalBench2EvalEnv`. All evaluation logic (agent loop, Docker sandbox management, test verification, metrics) is inherited. Only the defaults differ:
|
||||||
|
|
||||||
|
| Setting | TB2 | TBLite |
|
||||||
|
|----------------|----------------------------------|-----------------------------------------|
|
||||||
|
| Dataset | `NousResearch/terminal-bench-2` | `NousResearch/openthoughts-tblite` |
|
||||||
|
| Tasks | 89 | 100 |
|
||||||
|
| Task timeout | 1800s (30 min) | 1200s (20 min) |
|
||||||
|
| Wandb name | `terminal-bench-2` | `openthoughts-tblite` |
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@software{OpenThoughts-TBLite,
|
||||||
|
author = {OpenThoughts-Agent team, Snorkel AI, Bespoke Labs},
|
||||||
|
month = Feb,
|
||||||
|
title = {{OpenThoughts-TBLite: A High-Signal Benchmark for Iterating on Terminal Agents}},
|
||||||
|
howpublished = {https://www.openthoughts.ai/blog/openthoughts-tblite},
|
||||||
|
year = {2026}
|
||||||
|
}
|
||||||
|
```
|
||||||
0
environments/benchmarks/tblite/__init__.py
Normal file
0
environments/benchmarks/tblite/__init__.py
Normal file
39
environments/benchmarks/tblite/default.yaml
Normal file
39
environments/benchmarks/tblite/default.yaml
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# OpenThoughts-TBLite Evaluation -- Default Configuration
|
||||||
|
#
|
||||||
|
# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
|
||||||
|
# terminal tasks, a faster proxy for Terminal-Bench 2.0).
|
||||||
|
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||||
|
# and OpenRouter for inference.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/tblite/default.yaml
|
||||||
|
#
|
||||||
|
# # Override model:
|
||||||
|
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/tblite/default.yaml \
|
||||||
|
# --openai.model_name anthropic/claude-sonnet-4
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal", "file"]
|
||||||
|
max_agent_turns: 60
|
||||||
|
max_token_length: 32000
|
||||||
|
agent_temperature: 0.8
|
||||||
|
terminal_backend: "modal"
|
||||||
|
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||||
|
tool_pool_size: 128 # thread pool for 100 parallel tasks
|
||||||
|
dataset_name: "NousResearch/openthoughts-tblite"
|
||||||
|
test_timeout: 600
|
||||||
|
task_timeout: 1200 # 20 min wall-clock per task (TBLite tasks are faster)
|
||||||
|
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||||
|
use_wandb: true
|
||||||
|
wandb_name: "openthoughts-tblite"
|
||||||
|
ensure_scores_are_not_same: false
|
||||||
|
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
model_name: "anthropic/claude-opus-4.6"
|
||||||
|
server_type: "openai"
|
||||||
|
health_check: false
|
||||||
|
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||||
42
environments/benchmarks/tblite/run_eval.sh
Executable file
42
environments/benchmarks/tblite/run_eval.sh
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/bin/bash

# OpenThoughts-TBLite Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/tblite/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --env.task_filter broken-python,pandas-etl
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

set -euo pipefail

# The entry point and its config sit next to this script. Resolve them from
# the script's own location so the documented repo-root invocation works
# (bare "tblite_env.py" / "default.yaml" only resolve from inside this dir).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Mirrors data_dir_to_save_evals in default.yaml.
EVAL_DIR="environments/benchmarks/evals/openthoughts-tblite"

mkdir -p logs "$EVAL_DIR"
LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"

echo "OpenThoughts-TBLite Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO

python "$SCRIPT_DIR/tblite_env.py" evaluate \
    --config "$SCRIPT_DIR/default.yaml" \
    "$@" \
    2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
echo "Eval results: $EVAL_DIR/"
|
||||||
119
environments/benchmarks/tblite/tblite_env.py
Normal file
119
environments/benchmarks/tblite/tblite_env.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
"""
|
||||||
|
OpenThoughts-TBLite Evaluation Environment
|
||||||
|
|
||||||
|
A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
|
||||||
|
agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
|
||||||
|
to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
|
||||||
|
tasks vs TB2's 89 harder tasks).
|
||||||
|
|
||||||
|
TBLite tasks are a curated subset of TB2 with a difficulty distribution
|
||||||
|
designed to give meaningful signal even for smaller models:
|
||||||
|
- Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5
|
||||||
|
- Medium (26 tasks): 40-69% pass rate
|
||||||
|
- Hard (26 tasks): 10-39% pass rate
|
||||||
|
- Extreme (8 tasks): < 10% pass rate
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||||
|
|
||||||
|
# Filter to specific tasks:
|
||||||
|
python environments/benchmarks/tblite/tblite_env.py evaluate \\
|
||||||
|
--env.task_filter "broken-python,pandas-etl"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from atroposlib.envs.base import EvalHandlingEnum
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
|
||||||
|
from environments.benchmarks.terminalbench_2.terminalbench2_env import (
|
||||||
|
TerminalBench2EvalConfig,
|
||||||
|
TerminalBench2EvalEnv,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TBLiteEvalConfig(TerminalBench2EvalConfig):
    """Settings for the OpenThoughts-TBLite evaluation environment.

    Every TB2 config field is inherited as-is; this subclass only replaces
    the default dataset and the default per-task timeout.
    """

    # HuggingFace dataset id used when none is given on the CLI/YAML.
    dataset_name: str = Field(
        description="HuggingFace dataset containing TBLite tasks.",
        default="NousResearch/openthoughts-tblite",
    )

    # Per-task wall-clock budget in seconds (1200s = 20 minutes).
    task_timeout: int = Field(
        description=(
            "Maximum wall-clock seconds per task. TBLite tasks are generally "
            "faster than TB2, so 20 minutes is usually sufficient."
        ),
        default=1200,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TBLiteEvalEnv(TerminalBench2EvalEnv):
    """Evaluation environment for the OpenThoughts-TBLite benchmark.

    Nothing is overridden except the environment name, the config class and
    the default configuration; the evaluation logic itself (agent loop, test
    verification, Docker image resolution, metrics, wandb logging) comes from
    TerminalBench2EvalEnv.
    """

    name = "openthoughts-tblite"
    env_config_cls = TBLiteEvalConfig

    @classmethod
    def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
        """Return the default ``(env_config, server_configs)`` pair for TBLite."""
        tblite_config = TBLiteEvalConfig(
            # Toolsets exposed to the agent
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings
            max_agent_turns=60,
            max_token_length=16000,
            agent_temperature=0.6,
            system_prompt=None,
            # Terminal backend
            terminal_backend="modal",
            terminal_timeout=300,
            # Test-suite timeout (seconds)
            test_timeout=180,
            # 100 tasks in parallel
            tool_pool_size=128,
            # Eval-only Atropos settings
            eval_handling=EvalHandlingEnum.STOP_TRAIN,
            group_size=1,
            steps_per_eval=1,
            total_steps=1,
            # Tokenizer / logging
            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
            use_wandb=True,
            wandb_name="openthoughts-tblite",
            ensure_scores_are_not_same=False,
        )

        # Single OpenRouter endpoint; the key is read from the environment.
        openrouter_server = APIServerConfig(
            base_url="https://openrouter.ai/api/v1",
            model_name="anthropic/claude-sonnet-4",
            server_type="openai",
            api_key=os.getenv("OPENROUTER_API_KEY", ""),
            health_check=False,
        )

        return tblite_config, [openrouter_server]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
TBLiteEvalEnv.cli()
|
||||||
0
environments/benchmarks/terminalbench_2/__init__.py
Normal file
0
environments/benchmarks/terminalbench_2/__init__.py
Normal file
38
environments/benchmarks/terminalbench_2/default.yaml
Normal file
38
environments/benchmarks/terminalbench_2/default.yaml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
# Terminal-Bench 2.0 Evaluation -- Default Configuration
|
||||||
|
#
|
||||||
|
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
|
||||||
|
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||||
|
# and OpenRouter for inference.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/terminalbench_2/default.yaml
|
||||||
|
#
|
||||||
|
# # Override model:
|
||||||
|
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/terminalbench_2/default.yaml \
|
||||||
|
# --openai.model_name anthropic/claude-sonnet-4
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal", "file"]
|
||||||
|
max_agent_turns: 60
|
||||||
|
max_token_length: 32000
|
||||||
|
agent_temperature: 0.8
|
||||||
|
terminal_backend: "modal"
|
||||||
|
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||||
|
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||||
|
dataset_name: "NousResearch/terminal-bench-2"
|
||||||
|
test_timeout: 600
|
||||||
|
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||||
|
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||||
|
use_wandb: true
|
||||||
|
wandb_name: "terminal-bench-2"
|
||||||
|
ensure_scores_are_not_same: false
|
||||||
|
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
model_name: "anthropic/claude-opus-4.6"
|
||||||
|
server_type: "openai"
|
||||||
|
health_check: false
|
||||||
|
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||||
42
environments/benchmarks/terminalbench_2/run_eval.sh
Executable file
42
environments/benchmarks/terminalbench_2/run_eval.sh
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/bin/bash

# Terminal-Bench 2.0 Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
#       --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
#       --env.task_filter fix-git,git-multibranch
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

set -euo pipefail

# The entry point and its config sit next to this script. Resolve them from
# the script's own location so the documented repo-root invocation works
# (bare "terminalbench2_env.py" / "default.yaml" only resolve from inside
# this directory).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Mirrors data_dir_to_save_evals in default.yaml.
EVAL_DIR="environments/benchmarks/evals/terminal-bench-2"

mkdir -p logs "$EVAL_DIR"
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"

echo "Terminal-Bench 2.0 Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO

python "$SCRIPT_DIR/terminalbench2_env.py" evaluate \
    --config "$SCRIPT_DIR/default.yaml" \
    "$@" \
    2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
echo "Eval results: $EVAL_DIR/"
|
||||||
904
environments/benchmarks/terminalbench_2/terminalbench2_env.py
Normal file
904
environments/benchmarks/terminalbench_2/terminalbench2_env.py
Normal file
@@ -0,0 +1,904 @@
|
|||||||
|
"""
|
||||||
|
TerminalBench2Env -- Terminal-Bench 2.0 Evaluation Environment
|
||||||
|
|
||||||
|
Evaluates agentic LLMs on challenging terminal tasks from Terminal-Bench 2.0.
|
||||||
|
Each task provides a unique Docker environment (pre-built on Docker Hub), a natural
|
||||||
|
language instruction, and a test suite for verification. The agent uses terminal +
|
||||||
|
file tools to complete the task, then the test suite runs inside the same sandbox.
|
||||||
|
|
||||||
|
This is an eval-only environment (not a training environment). It is designed to
|
||||||
|
be run via the `evaluate` subcommand:
|
||||||
|
|
||||||
|
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \\
|
||||||
|
--env.dataset_name NousResearch/terminal-bench-2
|
||||||
|
|
||||||
|
The evaluate flow:
|
||||||
|
1. setup() -- Loads the TB2 dataset from HuggingFace
|
||||||
|
2. evaluate() -- Iterates over all tasks, running each through:
|
||||||
|
a. rollout_and_score_eval() -- Per-task agent loop + test verification
|
||||||
|
- Resolves Docker image (pre-built Hub image or Dockerfile fallback)
|
||||||
|
- Registers per-task Modal sandbox via register_task_env_overrides()
|
||||||
|
- Runs the HermesAgentLoop (terminal + file tools)
|
||||||
|
- Uploads test suite and runs test.sh in the same sandbox
|
||||||
|
- Returns binary pass/fail result
|
||||||
|
b. Aggregates per-task, per-category, and overall pass rates
|
||||||
|
c. Logs results via evaluate_log() and wandb
|
||||||
|
|
||||||
|
Key features:
|
||||||
|
- Per-task Modal sandboxes using pre-built Docker Hub images
|
||||||
|
- Binary reward: 1.0 if all tests pass, 0.0 otherwise
|
||||||
|
- Concurrency-controlled parallel evaluation via asyncio.Semaphore
|
||||||
|
- Per-task, per-category, and aggregate pass rate tracking
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
import tarfile
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
# Ensure repo root is on sys.path for imports
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from atroposlib.envs.base import EvalHandlingEnum
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
from tools.terminal_tool import (
|
||||||
|
register_task_env_overrides,
|
||||||
|
clear_task_env_overrides,
|
||||||
|
cleanup_vm,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Configuration
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TerminalBench2EvalConfig(HermesAgentEnvConfig):
    """
    Settings for the Terminal-Bench 2.0 evaluation environment.

    Adds the TB2-specific knobs on top of HermesAgentEnvConfig: which dataset
    to load, how long tests and whole tasks may run, whether to force image
    builds, and which tasks to include or skip.
    """

    # Dataset: HuggingFace id of the task collection.
    dataset_name: str = Field(
        description="HuggingFace dataset containing TB2 tasks.",
        default="NousResearch/terminal-bench-2",
    )

    # Test execution: budget (seconds) for the verification suite.
    test_timeout: int = Field(
        description="Timeout in seconds for running the test suite after agent completes.",
        default=180,
    )

    # Image strategy: build from the Dockerfile instead of using docker_image.
    force_build: bool = Field(
        description=(
            "If True, always build from Dockerfile (ignore docker_image). "
            "Useful for testing custom Dockerfiles."
        ),
        default=False,
    )

    # Task filtering: both values arrive as comma-separated strings from the CLI.
    task_filter: Optional[str] = Field(
        description=(
            "Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
            "If not set, all tasks are run."
        ),
        default=None,
    )
    skip_tasks: Optional[str] = Field(
        description="Comma-separated task names to skip on top of the default skip list.",
        default=None,
    )

    # Per-task wall-clock budget in seconds (1800s = 30 minutes).
    task_timeout: int = Field(
        description=(
            "Maximum wall-clock seconds per task (agent loop + verification). "
            "Tasks exceeding this are scored as FAIL. Default 30 minutes."
        ),
        default=1800,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Task names that cannot run properly on Modal; they are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = {
    # Needs KVM/hardware virtualization
    "qemu-alpine-ssh",
    "qemu-startup",
    # Password brute-force -- too slow for cloud sandbox timeouts
    "crack-7z-hash",
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tar extraction helper
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def _extract_base64_tar(b64_data: str, target_dir: Path):
    """Extract a base64-encoded tar.gz archive into target_dir.

    The archive is treated as untrusted input: members are not allowed to be
    written outside ``target_dir``.

    Args:
        b64_data: Base64 text of a gzip-compressed tarball. A falsy value
            (empty string, None) makes the call a no-op.
        target_dir: Directory that receives the archive contents.

    Raises:
        tarfile.TarError: If the archive cannot be read, or a member would be
            extracted outside ``target_dir``.
    """
    if not b64_data:
        return

    archive = io.BytesIO(base64.b64decode(b64_data))
    destination = str(target_dir)

    with tarfile.open(fileobj=archive, mode="r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: the "data" filter refuses
            # members that would land outside the destination, links that
            # escape it, and device nodes.
            tar.extractall(path=destination, filter="data")
            return

        # Older interpreters have no filters; at minimum refuse member names
        # that resolve outside the destination (e.g. "../x" or "/etc/x").
        root = os.path.realpath(destination)
        for member in tar.getmembers():
            resolved = os.path.realpath(os.path.join(root, member.name))
            if resolved != root and not resolved.startswith(root + os.sep):
                raise tarfile.TarError(
                    f"Refusing to extract {member.name!r} outside {destination}"
                )
        tar.extractall(path=destination)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Main Environment
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
|
"""
|
||||||
|
Terminal-Bench 2.0 evaluation environment (eval-only, no training).
|
||||||
|
|
||||||
|
Inherits from HermesAgentBaseEnv for:
|
||||||
|
- Terminal backend setup (os.environ["TERMINAL_ENV"])
|
||||||
|
- Tool resolution via _resolve_tools_for_group()
|
||||||
|
- Monkey patches for async-safe tool operation
|
||||||
|
- Wandb trajectory formatting
|
||||||
|
|
||||||
|
The evaluate flow (triggered by the `evaluate` subcommand of terminalbench2_env.py):
|
||||||
|
1. setup() -- Load dataset from HuggingFace
|
||||||
|
2. evaluate() -- Run all tasks through rollout_and_score_eval()
|
||||||
|
|
||||||
|
Each task in rollout_and_score_eval():
|
||||||
|
1. Resolve Docker image (pre-built Hub image or Dockerfile fallback)
|
||||||
|
2. Register per-task Modal sandbox override
|
||||||
|
3. Run HermesAgentLoop with terminal + file tools
|
||||||
|
4. Upload test suite and execute test.sh in the same sandbox
|
||||||
|
5. Check /logs/verifier/reward.txt for pass/fail
|
||||||
|
6. Clean up sandbox, overrides, and temp files
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "terminal-bench-2"
|
||||||
|
env_config_cls = TerminalBench2EvalConfig
|
||||||
|
|
||||||
|
@classmethod
def config_init(cls) -> Tuple[TerminalBench2EvalConfig, List[APIServerConfig]]:
    """
    Build the default configuration for Terminal-Bench 2.0 evaluation.

    The environment config is eval-only:
    - eval_handling=STOP_TRAIN so the eval flow runs cleanly
    - steps_per_eval=1 and total_steps=1 so eval triggers immediately
    - group_size=1 (one rollout per group, each task is expensive)

    It selects the Modal terminal backend (cloud-isolated sandbox per task)
    and pairs it with a single OpenRouter server entry using Claude.

    Returns:
        A ``(env_config, server_configs)`` tuple.
    """
    tb2_config = TerminalBench2EvalConfig(
        # Terminal + file tools only (the agent interacts via shell commands)
        enabled_toolsets=["terminal", "file"],
        disabled_toolsets=None,
        distribution=None,
        # Agent settings -- TB2 tasks are complex, need many turns
        max_agent_turns=60,
        max_token_length=16000,
        agent_temperature=0.6,
        system_prompt=None,
        # Modal backend for per-task cloud-isolated sandboxes;
        # 5 min per command (builds, pip install, etc.)
        terminal_backend="modal",
        terminal_timeout=300,
        # Test execution timeout (TB2 test scripts can install deps like pytest)
        test_timeout=180,
        # 89 tasks run in parallel, each needs a thread for tool calls
        tool_pool_size=128,
        # Eval-only Atropos settings:
        # - STOP_TRAIN: pauses training during eval (standard for eval envs)
        # - steps_per_eval=1, total_steps=1: eval triggers immediately
        # - group_size=1: one rollout per group (each task is expensive)
        eval_handling=EvalHandlingEnum.STOP_TRAIN,
        group_size=1,
        steps_per_eval=1,
        total_steps=1,
        tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
        use_wandb=True,
        wandb_name="terminal-bench-2",
        # Binary rewards may all be 0 or 1
        ensure_scores_are_not_same=False,
    )

    # OpenRouter with Claude -- API key loaded from .env
    openrouter_server = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name="anthropic/claude-sonnet-4",
        server_type="openai",
        api_key=os.getenv("OPENROUTER_API_KEY", ""),
        health_check=False,
    )

    return tb2_config, [openrouter_server]
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Setup -- load dataset
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def setup(self):
    """Load the Terminal-Bench 2.0 dataset and prepare per-run state.

    In order, this:
      1. Derives the sandbox lifetime from ``config.task_timeout`` and
         exports it through ``TERMINAL_LIFETIME_SECONDS``.
      2. Loads the ``train`` split of ``config.dataset_name``.
      3. Applies the optional ``task_filter`` allow-list, then the skip
         list (``MODAL_INCOMPATIBLE_TASKS`` on the modal backend plus any
         ``skip_tasks`` from the config).
      4. Builds the category index and resets the stored eval metrics.
      5. Opens a timestamped JSONL file in a ``logs`` directory next to
         this module; ``_save_result`` appends to it as tasks finish.
    """
    import datetime
    import threading

    from datasets import load_dataset

    # Auto-set terminal_lifetime to task_timeout + 120s so sandboxes
    # never get killed during an active task, but still get cleaned up
    # promptly after the task times out.
    lifetime = self.config.task_timeout + 120
    self.config.terminal_lifetime = lifetime
    os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
    print(f" Terminal lifetime auto-set to {lifetime}s (task_timeout + 120s)")

    print(f"Loading TB2 dataset from: {self.config.dataset_name}")
    ds = load_dataset(self.config.dataset_name, split="train")

    # Apply task filters (comma-separated strings from CLI)
    tasks = list(ds)
    if self.config.task_filter:
        allowed = {name.strip() for name in self.config.task_filter.split(",")}
        tasks = [t for t in tasks if t["task_name"] in allowed]
        print(f" Filtered to {len(tasks)} tasks: {sorted(allowed)}")

    # Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
    # plus any user-specified skip_tasks
    skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
    if self.config.skip_tasks:
        skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
    if skip:
        before = len(tasks)
        # Take the names from the already-filtered list so the reported
        # names always agree with the reported count.
        skipped_names = sorted({t["task_name"] for t in tasks if t["task_name"] in skip})
        tasks = [t for t in tasks if t["task_name"] not in skip]
        skipped = before - len(tasks)
        if skipped > 0:
            print(f" Skipped {skipped} incompatible tasks: {skipped_names}")

    self.all_eval_items = tasks
    self.iter = 0

    # Build category index for per-category metrics
    self.category_index: Dict[str, List[int]] = defaultdict(list)
    for i, task in enumerate(self.all_eval_items):
        self.category_index[task.get("category", "unknown")].append(i)

    # Reward tracking for wandb logging
    self.eval_metrics: List[Tuple[str, float]] = []

    # Streaming JSONL writer -- saves each task's full conversation
    # immediately on completion so data is preserved even on Ctrl+C.
    # Timestamped filename so each run produces a unique file.
    log_dir = os.path.join(os.path.dirname(__file__), "logs")
    os.makedirs(log_dir, exist_ok=True)
    run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
    # _save_result serialises with ensure_ascii=False, so the file must be
    # UTF-8 regardless of the platform's locale encoding; otherwise a
    # non-ASCII character in a conversation raises UnicodeEncodeError.
    self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
    self._streaming_lock = threading.Lock()
    print(f" Streaming results to: {self._streaming_path}")

    print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
    for cat, indices in sorted(self.category_index.items()):
        print(f" {cat}: {len(indices)} tasks")
|
||||||
|
|
||||||
|
def _save_result(self, result: Dict[str, Any]):
|
||||||
|
"""Write a single task result to the streaming JSONL file immediately."""
|
||||||
|
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
|
||||||
|
return
|
||||||
|
with self._streaming_lock:
|
||||||
|
self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
|
||||||
|
self._streaming_file.flush()
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Training pipeline stubs -- NOT used in eval-only mode
|
||||||
|
# =========================================================================
|
||||||
|
# These satisfy the abstract method requirements from HermesAgentBaseEnv.
|
||||||
|
# The evaluate subcommand calls setup() -> evaluate() directly, bypassing
|
||||||
|
# the training pipeline entirely.
|
||||||
|
|
||||||
|
async def get_next_item(self):
    """Return the next task, cycling through ``all_eval_items`` round-robin.

    Training-pipeline stub; it is not used in eval-only mode.
    """
    position = self.iter % len(self.all_eval_items)
    self.iter += 1
    return self.all_eval_items[position]
|
||||||
|
|
||||||
|
def format_prompt(self, item: Dict[str, Any]) -> str:
    """Use the task's ``instruction`` field verbatim as the user prompt."""
    instruction = item["instruction"]
    return instruction
|
||||||
|
|
||||||
|
async def compute_reward(self, item, result, ctx) -> float:
    """Return a constant placeholder reward of 0.0.

    Stub only: the actual verification happens in rollout_and_score_eval.
    """
    placeholder_reward = 0.0
    return placeholder_reward
|
||||||
|
|
||||||
|
async def collect_trajectories(self, item):
    """Return an empty ``(None, [])`` pair.

    Training-pipeline stub; it is not used in eval-only mode.
    """
    no_scored_data, no_backlog = None, []
    return no_scored_data, no_backlog
|
||||||
|
|
||||||
|
async def score(self, rollout_group_data):
    """Return ``None`` for any rollout group.

    Training-pipeline stub; it is not used in eval-only mode.
    """
    no_score = None
    return no_score
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Docker image resolution
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _resolve_task_image(
    self, item: Dict[str, Any], task_name: str
) -> Tuple[str, Optional[Path]]:
    """
    Resolve the Docker image for a task, with fallback to Dockerfile.

    Strategy (mirrors Harbor's approach):
    1. If force_build=True, always build from Dockerfile in environment_tar
    2. If docker_image is available, use the pre-built Docker Hub image (fast)
    3. Otherwise, extract Dockerfile from environment_tar and build (slow)

    Args:
        item: Task dict; reads the optional ``docker_image`` and
            ``environment_tar`` (base64 tar) entries.
        task_name: Used for log messages and the temp-dir prefix.

    Returns:
        (modal_image, temp_dir) -- modal_image is a Docker Hub name or a
        Dockerfile path, or "" when neither source is usable. temp_dir is
        set only when a Dockerfile path is returned; the caller must
        remove it once the task is done.

    Raises:
        Whatever ``_extract_base64_tar`` raises; the temp directory is
        removed before the exception propagates.
    """
    docker_image = item.get("docker_image", "")
    environment_tar = item.get("environment_tar", "")

    # Fast path: use pre-built Docker Hub image
    if docker_image and not self.config.force_build:
        logger.info("Task %s: using pre-built image %s", task_name, docker_image)
        return docker_image, None

    # Slow path: extract Dockerfile from environment_tar and build
    if environment_tar:
        task_dir = Path(tempfile.mkdtemp(prefix=f"tb2-{task_name}-"))
        try:
            _extract_base64_tar(environment_tar, task_dir)
        except Exception:
            # The caller never learns about task_dir on failure, so it
            # has to be removed here or it leaks.
            shutil.rmtree(task_dir, ignore_errors=True)
            raise

        dockerfile_path = task_dir / "Dockerfile"
        if dockerfile_path.exists():
            logger.info(
                "Task %s: building from Dockerfile (force_build=%s, docker_image=%s)",
                task_name, self.config.force_build, bool(docker_image),
            )
            return str(dockerfile_path), task_dir

        # No Dockerfile in the archive: every return below hands back
        # temp_dir=None, so nobody else will clean this directory up.
        shutil.rmtree(task_dir, ignore_errors=True)

    # Neither available -- fall back to Hub image if force_build was True
    if docker_image:
        logger.warning(
            "Task %s: force_build=True but no environment_tar, "
            "falling back to docker_image %s", task_name, docker_image,
        )
        return docker_image, None

    return "", None
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Per-task evaluation -- agent loop + test verification
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
    """
    Evaluate a single TB2 task: run the agent loop, then verify with tests.

    This is the core evaluation method. For each task it:
    1. Resolves the Docker image and registers the Modal sandbox override
    2. Runs HermesAgentLoop with the resolved tool schemas
    3. Runs the test suite in the agent's sandbox via ``_run_tests``
       (in a worker thread, because the sandbox calls block)
    4. Streams the result record to the JSONL log via ``_save_result``
    5. Cleans up the overrides, the sandbox and temp files

    Args:
        eval_item: A single TB2 task dict from the dataset

    Returns:
        Dict with 'passed' (bool), 'reward' (float), 'task_name' (str),
        'category' (str). Completed rollouts add 'turns_used',
        'finished_naturally' and 'messages'; failures add 'error'.
    """
    task_name = eval_item.get("task_name", "unknown")
    category = eval_item.get("category", "unknown")
    task_id = str(uuid.uuid4())
    task_dir = None  # Set if we extract a Dockerfile (needs cleanup)

    from tqdm import tqdm
    tqdm.write(f" [START] {task_name} (task_id={task_id[:8]})")
    task_start = time.time()

    try:
        # --- 1. Resolve Docker image ---
        modal_image, task_dir = self._resolve_task_image(eval_item, task_name)
        if not modal_image:
            logger.error("Task %s: no docker_image or environment_tar, skipping", task_name)
            out = {
                "passed": False, "reward": 0.0,
                "task_name": task_name, "category": category,
                "error": "no_image",
            }
            # Record the skip like every other outcome so the streamed
            # log accounts for all tasks.
            self._save_result(out)
            return out

        # --- 2. Register per-task Modal image override ---
        register_task_env_overrides(task_id, {"modal_image": modal_image})
        logger.info(
            "Task %s: registered image override for task_id %s",
            task_name, task_id[:8],
        )

        # --- 3. Resolve tools and build messages ---
        tools, valid_names = self._resolve_tools_for_group()

        messages: List[Dict[str, Any]] = []
        if self.config.system_prompt:
            messages.append({"role": "system", "content": self.config.system_prompt})
        messages.append({"role": "user", "content": self.format_prompt(eval_item)})

        # --- 4. Run agent loop ---
        agent = HermesAgentLoop(
            server=self.server,
            tool_schemas=tools,
            valid_tool_names=valid_names,
            max_turns=self.config.max_agent_turns,
            task_id=task_id,
            temperature=self.config.agent_temperature,
            max_tokens=self.config.max_token_length,
            extra_body=self.config.extra_body,
        )
        result = await agent.run(messages)

        # --- 5. Verify -- run test suite in the agent's sandbox ---
        # Skip verification if the agent produced no meaningful output
        only_system_and_user = all(
            msg.get("role") in ("system", "user") for msg in result.messages
        )
        if result.turns_used == 0 or only_system_and_user:
            logger.warning(
                "Task %s: agent produced no output (turns=%d). Reward=0.",
                task_name, result.turns_used,
            )
            reward = 0.0
        else:
            # Run tests in a thread so the blocking ctx.terminal() calls
            # don't freeze the entire event loop (which would stall all
            # other tasks, tqdm updates, and timeout timers).
            ctx = ToolContext(task_id)
            try:
                # We are inside a coroutine, so a loop is always running;
                # get_running_loop() is the non-deprecated way to reach it.
                loop = asyncio.get_running_loop()
                reward = await loop.run_in_executor(
                    None,  # default thread pool
                    self._run_tests, eval_item, ctx, task_name,
                )
            except Exception as e:
                logger.error("Task %s: test verification failed: %s", task_name, e)
                reward = 0.0
            finally:
                ctx.cleanup()

        passed = reward == 1.0
        status = "PASS" if passed else "FAIL"
        elapsed = time.time() - task_start
        tqdm.write(f" [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
        logger.info(
            "Task %s: reward=%.1f, turns=%d, finished=%s",
            task_name, reward, result.turns_used, result.finished_naturally,
        )

        out = {
            "passed": passed,
            "reward": reward,
            "task_name": task_name,
            "category": category,
            "turns_used": result.turns_used,
            "finished_naturally": result.finished_naturally,
            "messages": result.messages,
        }
        self._save_result(out)
        return out

    except Exception as e:
        elapsed = time.time() - task_start
        logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
        tqdm.write(f" [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
        out = {
            "passed": False, "reward": 0.0,
            "task_name": task_name, "category": category,
            "error": str(e),
        }
        self._save_result(out)
        return out

    finally:
        # --- Cleanup: clear overrides, sandbox, and temp files ---
        clear_task_env_overrides(task_id)
        try:
            cleanup_vm(task_id)
        except Exception as e:
            # Best-effort: a failed VM cleanup must not mask the result.
            logger.debug("VM cleanup for %s: %s", task_id[:8], e)
        if task_dir and task_dir.exists():
            shutil.rmtree(task_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def _run_tests(
    self, item: Dict[str, Any], ctx: ToolContext, task_name: str
) -> float:
    """
    Upload and execute the test suite in the agent's sandbox, then
    download the verifier output locally to read the reward.

    Follows Harbor's verification pattern:
    1. Upload tests/ directory into the sandbox
    2. Execute test.sh inside the sandbox
    3. Download /logs/verifier/ directory to a local temp dir
    4. Read reward.txt locally with native Python I/O

    Downloading locally avoids issues with the file_read tool on
    the Modal VM and matches how Harbor handles verification.

    TB2 test scripts (test.sh) typically:
    1. Install pytest via uv/pip
    2. Run pytest against the test files in /tests/
    3. Write results to /logs/verifier/reward.txt

    Args:
        item: The TB2 task dict (contains tests_tar, test_sh)
        ctx: ToolContext scoped to this task's sandbox
        task_name: For logging

    Returns:
        The number parsed from reward.txt (normally 1.0 or 0.0). When the
        file is missing, empty, not a finite number, or cannot be
        downloaded, 1.0 if test.sh exited with code 0 and 0.0 otherwise.
        Also 0.0 when the task has no test_sh.
    """
    import math

    tests_tar = item.get("tests_tar", "")
    test_sh = item.get("test_sh", "")

    if not test_sh:
        logger.warning("Task %s: no test_sh content, reward=0", task_name)
        return 0.0

    # Create required directories in the sandbox
    ctx.terminal("mkdir -p /tests /logs/verifier")

    # Upload test files into the sandbox (binary-safe via base64)
    if tests_tar:
        tests_temp = Path(tempfile.mkdtemp(prefix=f"tb2-tests-{task_name}-"))
        try:
            _extract_base64_tar(tests_tar, tests_temp)
            ctx.upload_dir(str(tests_temp), "/tests")
        except Exception as e:
            # Best-effort: test.sh still runs and decides the outcome.
            logger.warning("Task %s: failed to upload test files: %s", task_name, e)
        finally:
            shutil.rmtree(tests_temp, ignore_errors=True)

    # Write the test runner script (test.sh)
    ctx.write_file("/tests/test.sh", test_sh)
    ctx.terminal("chmod +x /tests/test.sh")

    # Execute the test suite
    logger.info(
        "Task %s: running test suite (timeout=%ds)",
        task_name, self.config.test_timeout,
    )
    test_result = ctx.terminal(
        "bash /tests/test.sh",
        timeout=self.config.test_timeout,
    )

    exit_code = test_result.get("exit_code", -1)
    output = test_result.get("output", "")

    # Single definition of the fallback used whenever reward.txt cannot
    # be trusted.
    exit_code_reward = 1.0 if exit_code == 0 else 0.0

    # Download the verifier output directory locally, then read reward.txt
    # with native Python I/O. This avoids issues with file_read on the
    # Modal VM and matches Harbor's verification pattern.
    reward = 0.0
    local_verifier_dir = Path(tempfile.mkdtemp(prefix=f"tb2-verifier-{task_name}-"))
    try:
        ctx.download_dir("/logs/verifier", str(local_verifier_dir))

        reward_file = local_verifier_dir / "reward.txt"
        if reward_file.exists() and reward_file.stat().st_size > 0:
            content = reward_file.read_text().strip()
            try:
                parsed = float(content)  # "1" -> 1.0, "0" -> 0.0
                # float() also accepts "nan"/"inf"; those are not rewards.
                if not math.isfinite(parsed):
                    raise ValueError(content)
                reward = parsed
            except (ValueError, TypeError):
                logger.warning(
                    "Task %s: reward.txt content unexpected (%r), "
                    "falling back to exit_code=%d",
                    task_name, content, exit_code,
                )
                reward = exit_code_reward
        else:
            # reward.txt not written -- fall back to exit code
            logger.warning(
                "Task %s: reward.txt not found after download, "
                "falling back to exit_code=%d",
                task_name, exit_code,
            )
            reward = exit_code_reward
    except Exception as e:
        logger.warning(
            "Task %s: failed to download verifier dir: %s, "
            "falling back to exit_code=%d",
            task_name, e, exit_code,
        )
        reward = exit_code_reward
    finally:
        shutil.rmtree(local_verifier_dir, ignore_errors=True)

    # Log test output for debugging failures. The caller counts only a
    # reward of exactly 1.0 as a pass, so log every other outcome.
    if reward != 1.0:
        output_preview = output[-500:] if output else "(no output)"
        logger.info(
            "Task %s: FAIL (exit_code=%d)\n%s",
            task_name, exit_code, output_preview,
        )

    return reward
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Evaluate -- main entry point for the eval subcommand
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||||
|
"""
|
||||||
|
Wrap rollout_and_score_eval with a per-task wall-clock timeout.
|
||||||
|
|
||||||
|
If the task exceeds task_timeout seconds, it's automatically scored
|
||||||
|
as FAIL. This prevents any single task from hanging indefinitely.
|
||||||
|
"""
|
||||||
|
task_name = item.get("task_name", "unknown")
|
||||||
|
category = item.get("category", "unknown")
|
||||||
|
try:
|
||||||
|
return await asyncio.wait_for(
|
||||||
|
self.rollout_and_score_eval(item),
|
||||||
|
timeout=self.config.task_timeout,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
from tqdm import tqdm
|
||||||
|
elapsed = self.config.task_timeout
|
||||||
|
tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
|
||||||
|
logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
|
||||||
|
out = {
|
||||||
|
"passed": False, "reward": 0.0,
|
||||||
|
"task_name": task_name, "category": category,
|
||||||
|
"error": f"timeout ({elapsed}s)",
|
||||||
|
}
|
||||||
|
self._save_result(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
async def evaluate(self, *args, **kwargs) -> None:
    """
    Run Terminal-Bench 2.0 evaluation over all tasks.

    This is the main entry point when invoked via:
        python environments/terminalbench2_env.py evaluate

    All tasks are started concurrently, each wrapped by
    ``_eval_with_timeout`` so hung tasks auto-fail, and results are
    consumed as they complete to keep a live pass rate on the tqdm bar.
    Root logging is rerouted through ``tqdm.write`` so the bar stays
    visible. Afterwards overall and per-category pass rates are printed,
    stored for ``wandb_log`` and passed to ``evaluate_log``.

    The streaming JSONL file is closed on every exit path (normal
    completion, interruption, or no results). On completion and on
    interruption all sandboxes are cleaned up; the tool thread pool is
    shut down on completion.
    """
    start_time = time.time()

    # Route all logging through tqdm.write() so the progress bar stays
    # pinned at the bottom while log lines scroll above it.
    from tqdm import tqdm

    class _TqdmHandler(logging.Handler):
        def emit(self, record):
            try:
                tqdm.write(self.format(record))
            except Exception:
                self.handleError(record)

    def _close_streaming_file() -> None:
        """Close the streaming JSONL file if it is open and report its path."""
        if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
            self._streaming_file.close()
            print(f" Live results saved to: {self._streaming_path}")

    handler = _TqdmHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s [%(name)s] %(levelname)s: %(message)s",
        datefmt="%H:%M:%S",
    ))
    root = logging.getLogger()
    root.handlers = [handler]  # Replace any existing handlers
    root.setLevel(logging.INFO)

    # Silence noisy third-party loggers that flood the output
    logging.getLogger("httpx").setLevel(logging.WARNING)  # Every HTTP request
    logging.getLogger("openai").setLevel(logging.WARNING)  # OpenAI client retries
    logging.getLogger("rex-deploy").setLevel(logging.WARNING)  # Swerex deployment
    logging.getLogger("rex_image_builder").setLevel(logging.WARNING)  # Image builds

    print(f"\n{'='*60}")
    print("Starting Terminal-Bench 2.0 Evaluation")
    print(f"{'='*60}")
    print(f" Dataset: {self.config.dataset_name}")
    print(f" Total tasks: {len(self.all_eval_items)}")
    print(f" Max agent turns: {self.config.max_agent_turns}")
    print(f" Task timeout: {self.config.task_timeout}s")
    print(f" Terminal backend: {self.config.terminal_backend}")
    print(f" Tool thread pool: {self.config.tool_pool_size}")
    print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
    print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
    print(f"{'='*60}\n")

    # Fire all tasks with wall-clock timeout, track live accuracy on the bar
    total_tasks = len(self.all_eval_items)
    eval_tasks = [
        asyncio.ensure_future(self._eval_with_timeout(item))
        for item in self.all_eval_items
    ]

    results = []
    passed_count = 0
    pbar = tqdm(total=total_tasks, desc="Evaluating TB2", dynamic_ncols=True)
    try:
        for coro in asyncio.as_completed(eval_tasks):
            result = await coro
            results.append(result)
            if result and result.get("passed"):
                passed_count += 1
            done = len(results)
            pct = (passed_count / done * 100) if done else 0
            pbar.set_postfix_str(f"pass={passed_count}/{done} ({pct:.1f}%)")
            pbar.update(1)
    except (KeyboardInterrupt, asyncio.CancelledError):
        pbar.close()
        print(f"\n\nInterrupted! Cleaning up {len(eval_tasks)} tasks...")
        # Cancel all pending tasks
        for task in eval_tasks:
            task.cancel()
        # Let cancellations propagate (finally blocks run cleanup_vm)
        await asyncio.gather(*eval_tasks, return_exceptions=True)
        # Every task has settled, so nothing writes to the log any more;
        # close it here or the handle leaks on this early return.
        _close_streaming_file()
        # Belt-and-suspenders: clean up any remaining sandboxes
        from tools.terminal_tool import cleanup_all_environments
        cleanup_all_environments()
        print("All sandboxes cleaned up.")
        return
    finally:
        pbar.close()

    end_time = time.time()

    # Filter out None results (shouldn't happen, but be safe)
    valid_results = [r for r in results if r is not None]

    if not valid_results:
        print("Warning: No valid evaluation results obtained")
        _close_streaming_file()
        return

    # ---- Compute metrics ----
    total = len(valid_results)
    passed = sum(1 for r in valid_results if r.get("passed"))
    overall_pass_rate = passed / total if total > 0 else 0.0

    # Per-category breakdown
    cat_results: Dict[str, List[Dict]] = defaultdict(list)
    for r in valid_results:
        cat_results[r.get("category", "unknown")].append(r)

    # Build metrics dict
    eval_metrics = {
        "eval/pass_rate": overall_pass_rate,
        "eval/total_tasks": total,
        "eval/passed_tasks": passed,
        "eval/evaluation_time_seconds": end_time - start_time,
    }

    # Per-category metrics, computed once and reused for the summary below
    cat_stats = []
    for category, cat_items in sorted(cat_results.items()):
        cat_passed = sum(1 for r in cat_items if r.get("passed"))
        cat_total = len(cat_items)
        cat_pass_rate = cat_passed / cat_total if cat_total > 0 else 0.0
        cat_stats.append((category, cat_passed, cat_total, cat_pass_rate))
        cat_key = category.replace(" ", "_").replace("-", "_").lower()
        eval_metrics[f"eval/pass_rate_{cat_key}"] = cat_pass_rate

    # Store metrics for wandb_log
    self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]

    # ---- Print summary ----
    print(f"\n{'='*60}")
    print("Terminal-Bench 2.0 Evaluation Results")
    print(f"{'='*60}")
    print(f"Overall Pass Rate: {overall_pass_rate:.4f} ({passed}/{total})")
    print(f"Evaluation Time: {end_time - start_time:.1f} seconds")

    print("\nCategory Breakdown:")
    for category, cat_passed, cat_total, cat_rate in cat_stats:
        print(f" {category}: {cat_rate:.1%} ({cat_passed}/{cat_total})")

    # Print individual task results
    print("\nTask Results:")
    for r in sorted(valid_results, key=lambda x: x.get("task_name", "")):
        status = "PASS" if r.get("passed") else "FAIL"
        turns = r.get("turns_used", "?")
        error = r.get("error", "")
        extra = f" (error: {error})" if error else ""
        print(f" [{status}] {r['task_name']} (turns={turns}){extra}")

    print(f"{'='*60}\n")

    # Build sample records for evaluate_log (includes full conversations)
    samples = [
        {
            "task_name": r.get("task_name"),
            "category": r.get("category"),
            "passed": r.get("passed"),
            "reward": r.get("reward"),
            "turns_used": r.get("turns_used"),
            "error": r.get("error"),
            "messages": r.get("messages"),
        }
        for r in valid_results
    ]

    # Log evaluation results
    try:
        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
            generation_parameters={
                "temperature": self.config.agent_temperature,
                "max_tokens": self.config.max_token_length,
                "max_agent_turns": self.config.max_agent_turns,
                "terminal_backend": self.config.terminal_backend,
            },
        )
    except Exception as e:
        # Logging is best-effort; the summary was already printed above.
        print(f"Error logging evaluation results: {e}")

    # Close streaming file
    _close_streaming_file()

    # Kill all remaining sandboxes. Timed-out tasks leave orphaned thread
    # pool workers still executing commands -- cleanup_all stops them.
    from tools.terminal_tool import cleanup_all_environments
    print("\nCleaning up all sandboxes...")
    cleanup_all_environments()

    # Shut down the tool thread pool so orphaned workers from timed-out
    # tasks are killed immediately instead of retrying against dead
    # sandboxes and spamming the console with TimeoutError warnings.
    from environments.agent_loop import _tool_executor
    _tool_executor.shutdown(wait=False, cancel_futures=True)
    print("Done.")
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Wandb logging
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """Merge the stored TB2 eval metrics into the wandb payload and log it.

    The stored ``eval_metrics`` pairs are copied into ``wandb_metrics``
    (a new dict when none is given), the stored list is reset, and the
    payload is passed to the parent class's ``wandb_log``.
    """
    payload = {} if wandb_metrics is None else wandb_metrics

    # eval_metrics is a list of (name, value) pairs, which update() accepts.
    payload.update(self.eval_metrics)
    self.eval_metrics = []

    await super().wandb_log(payload)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: when this file is executed directly (not imported),
# hand control to the environment class's cli() classmethod.
# NOTE(review): presumably cli() parses subcommands such as `evaluate`
# (see the invocation shown in evaluate()'s docstring) -- confirm against
# the base class.
if __name__ == "__main__":
    TerminalBench2EvalEnv.cli()
|
||||||
115
environments/benchmarks/yc_bench/README.md
Normal file
115
environments/benchmarks/yc_bench/README.md
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
# YC-Bench: Long-Horizon Agent Benchmark
|
||||||
|
|
||||||
|
[YC-Bench](https://github.com/collinear-ai/yc-bench) by [Collinear AI](https://collinear.ai/) is a deterministic, long-horizon benchmark that tests LLM agents' ability to act as a tech startup CEO. The agent manages a simulated company over 1-3 years, making compounding decisions about resource allocation, cash flow, task management, and prestige specialisation across 4 skill domains.
|
||||||
|
|
||||||
|
Unlike TerminalBench2 (which evaluates per-task coding ability with binary pass/fail), YC-Bench measures **long-term strategic coherence** — whether an agent can maintain consistent strategy, manage compounding consequences, and adapt plans over hundreds of turns.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install yc-bench (optional dependency)
|
||||||
|
pip install "hermes-agent[yc-bench]"
|
||||||
|
|
||||||
|
# Or install from source
|
||||||
|
git clone https://github.com/collinear-ai/yc-bench
|
||||||
|
cd yc-bench && pip install -e .
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
yc-bench --help
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From the repo root:
|
||||||
|
bash environments/benchmarks/yc_bench/run_eval.sh
|
||||||
|
|
||||||
|
# Or directly:
|
||||||
|
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||||
|
--config environments/benchmarks/yc_bench/default.yaml
|
||||||
|
|
||||||
|
# Override model:
|
||||||
|
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||||
|
--openai.model_name anthropic/claude-opus-4-20250514
|
||||||
|
|
||||||
|
# Quick single-preset test:
|
||||||
|
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||||
|
--env.presets '["fast_test"]' --env.seeds '[1]'
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
HermesAgentLoop (our agent)
|
||||||
|
-> terminal tool -> subprocess("yc-bench company status") -> JSON output
|
||||||
|
-> terminal tool -> subprocess("yc-bench task accept --task-id X") -> JSON
|
||||||
|
-> terminal tool -> subprocess("yc-bench sim resume") -> JSON (advance time)
|
||||||
|
-> ... (100-500 turns per run)
|
||||||
|
```
|
||||||
|
|
||||||
|
The environment initialises the simulation via `yc-bench sim init` (NOT `yc-bench run`, which would start yc-bench's own built-in agent loop). Our `HermesAgentLoop` then drives all interaction through CLI commands.
|
||||||
|
|
||||||
|
### Simulation Mechanics
|
||||||
|
|
||||||
|
- **4 skill domains**: research, inference, data_environment, training
|
||||||
|
- **Prestige system** (1.0-10.0): Gates access to higher-paying tasks
|
||||||
|
- **Employee management**: Junior/Mid/Senior with domain-specific skill rates
|
||||||
|
- **Throughput splitting**: `effective_rate = base_rate / N`, where N is the number of active tasks assigned to that employee
|
||||||
|
- **Financial pressure**: Monthly payroll, bankruptcy = game over
|
||||||
|
- **Deterministic**: SHA256-based RNG — same seed + preset = same world
|
||||||
|
|
||||||
|
### Difficulty Presets
|
||||||
|
|
||||||
|
| Preset | Employees | Tasks | Focus |
|
||||||
|
|-----------|-----------|-------|-------|
|
||||||
|
| tutorial | 3 | 50 | Basic loop mechanics |
|
||||||
|
| easy | 5 | 100 | Throughput awareness |
|
||||||
|
| **medium**| 5 | 150 | Prestige climbing + domain specialisation |
|
||||||
|
| **hard** | 7 | 200 | Precise ETA reasoning |
|
||||||
|
| nightmare | 8 | 300 | Sustained perfection under payroll pressure |
|
||||||
|
| fast_test | (varies) | (varies) | Quick validation (~50 turns) |
|
||||||
|
|
||||||
|
Default eval runs **fast_test + medium + hard** × 3 seeds = 9 runs.
|
||||||
|
|
||||||
|
### Scoring
|
||||||
|
|
||||||
|
```
|
||||||
|
composite = 0.5 × survival + 0.5 × normalised_funds
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Survival** (binary): Did the company avoid bankruptcy?
|
||||||
|
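- **Normalised funds** (0.0-1.0): Log-scale relative to the initial $250K capital, computed as `log1p(funds / initial) / log1p(100)` and capped at 1.0 (≈0.15 when funds equal the initial capital, 1.0 at 100×; 0.0 when funds are zero or negative)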
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Key fields in `default.yaml`:
|
||||||
|
|
||||||
|
| Field | Default | Description |
|
||||||
|
|-------|---------|-------------|
|
||||||
|
| `presets` | `["fast_test", "medium", "hard"]` | Which presets to evaluate |
|
||||||
|
| `seeds` | `[1, 2, 3]` | RNG seeds per preset |
|
||||||
|
| `max_agent_turns` | 200 | Max LLM calls per run |
|
||||||
|
| `run_timeout` | 3600 | Wall-clock timeout per run (seconds) |
|
||||||
|
| `survival_weight` | 0.5 | Weight of survival in composite score |
|
||||||
|
| `funds_weight` | 0.5 | Weight of normalised funds in composite |
|
||||||
|
| `horizon_years` | null | Override horizon (null = auto from preset) |
|
||||||
|
|
||||||
|
## Cost & Time Estimates
|
||||||
|
|
||||||
|
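Each run takes roughly 50-500 LLM turns depending on the preset, and is capped by `max_agent_turns` (200 by default, which is below the ~300 turns estimated for `hard`). Approximate costs per run at typical API rates: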
|
||||||
|
|
||||||
|
| Preset | Turns | Time | Est. Cost |
|
||||||
|
|--------|-------|------|-----------|
|
||||||
|
| fast_test | ~50 | 5-10 min | $1-5 |
|
||||||
|
| medium | ~200 | 20-40 min | $5-15 |
|
||||||
|
| hard | ~300 | 30-60 min | $10-25 |
|
||||||
|
|
||||||
|
Full default eval (9 runs): ~3-6 hours, $50-200 depending on model.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [collinear-ai/yc-bench](https://github.com/collinear-ai/yc-bench) — Official repository
|
||||||
|
- [Collinear AI](https://collinear.ai/) — Company behind yc-bench
|
||||||
|
- [TerminalBench2](../terminalbench_2/) — Per-task coding benchmark (complementary)
|
||||||
0
environments/benchmarks/yc_bench/__init__.py
Normal file
0
environments/benchmarks/yc_bench/__init__.py
Normal file
43
environments/benchmarks/yc_bench/default.yaml
Normal file
43
environments/benchmarks/yc_bench/default.yaml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# YC-Bench Evaluation -- Default Configuration
|
||||||
|
#
|
||||||
|
# Long-horizon agent benchmark: agent plays CEO of an AI startup over
|
||||||
|
# a simulated 1-3 year run, interacting via yc-bench CLI subcommands.
|
||||||
|
#
|
||||||
|
# Requires: pip install "hermes-agent[yc-bench]"
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/yc_bench/default.yaml
|
||||||
|
#
|
||||||
|
# # Override model:
|
||||||
|
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/yc_bench/default.yaml \
|
||||||
|
# --openai.model_name anthropic/claude-opus-4-20250514
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal"]
|
||||||
|
max_agent_turns: 200
|
||||||
|
max_token_length: 32000
|
||||||
|
agent_temperature: 0.0
|
||||||
|
terminal_backend: "local"
|
||||||
|
terminal_timeout: 60
|
||||||
|
presets: ["fast_test", "medium", "hard"]
|
||||||
|
seeds: [1, 2, 3]
|
||||||
|
run_timeout: 3600 # 60 min wall-clock per run, auto-FAIL if exceeded
|
||||||
|
survival_weight: 0.5 # weight of binary survival in composite score
|
||||||
|
funds_weight: 0.5 # weight of normalised final funds in composite score
|
||||||
|
db_dir: "/tmp/yc_bench_dbs"
|
||||||
|
company_name: "BenchCo"
|
||||||
|
start_date: "01/01/2025" # MM/DD/YYYY (yc-bench convention)
|
||||||
|
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||||
|
use_wandb: true
|
||||||
|
wandb_name: "yc-bench"
|
||||||
|
ensure_scores_are_not_same: false
|
||||||
|
data_dir_to_save_evals: "environments/benchmarks/evals/yc-bench"
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
model_name: "anthropic/claude-sonnet-4.6"
|
||||||
|
server_type: "openai"
|
||||||
|
health_check: false
|
||||||
|
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||||
34
environments/benchmarks/yc_bench/run_eval.sh
Executable file
34
environments/benchmarks/yc_bench/run_eval.sh
Executable file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/bin/bash

# YC-Bench Evaluation
#
# Runs the YC-Bench eval with the default config, mirroring all output to a
# timestamped log file under ./logs. Extra arguments are forwarded verbatim
# to yc_bench_env.py, so any config field can be overridden from the CLI.
#
# Requires: pip install "hermes-agent[yc-bench]"
#
# Run from repo root:
#   bash environments/benchmarks/yc_bench/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --openai.model_name anthropic/claude-opus-4-20250514
#
# Run a single preset:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --env.presets '["fast_test"]' --env.seeds '[1]'

set -euo pipefail

# Create the log directory and the results directory that default.yaml
# points `data_dir_to_save_evals` at (paths are relative to the repo root).
mkdir -p logs environments/benchmarks/evals/yc-bench
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"

echo "YC-Bench Evaluation"
echo "Log: $LOG_FILE"
echo ""

# `pipefail` (set above) makes a failing eval abort the script even though
# its output is piped through tee.
PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
    python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
    --config environments/benchmarks/yc_bench/default.yaml \
    "$@" \
    2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
|
||||||
847
environments/benchmarks/yc_bench/yc_bench_env.py
Normal file
847
environments/benchmarks/yc_bench/yc_bench_env.py
Normal file
@@ -0,0 +1,847 @@
|
|||||||
|
"""
|
||||||
|
YCBenchEvalEnv -- YC-Bench Long-Horizon Agent Benchmark Environment
|
||||||
|
|
||||||
|
Evaluates agentic LLMs on YC-Bench: a deterministic, long-horizon benchmark
|
||||||
|
where the agent acts as CEO of an AI startup over a simulated 1-3 year run.
|
||||||
|
The agent manages cash flow, employees, tasks, and prestige across 4 domains,
|
||||||
|
interacting exclusively via CLI subprocess calls against a SQLite-backed
|
||||||
|
discrete-event simulation.
|
||||||
|
|
||||||
|
Unlike TerminalBench2 (per-task binary pass/fail), YC-Bench measures sustained
|
||||||
|
multi-turn strategic coherence -- whether an agent can manage compounding
|
||||||
|
decisions over hundreds of turns without going bankrupt.
|
||||||
|
|
||||||
|
This is an eval-only environment. Run via:
|
||||||
|
|
||||||
|
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||||
|
--config environments/benchmarks/yc_bench/default.yaml
|
||||||
|
|
||||||
|
The evaluate flow:
|
||||||
|
1. setup() -- Verifies yc-bench installed, builds eval matrix (preset x seed)
|
||||||
|
2. evaluate() -- Iterates over all runs sequentially through:
|
||||||
|
a. rollout_and_score_eval() -- Per-run agent loop
|
||||||
|
- Initialises a fresh yc-bench simulation via `sim init` (NOT `run`)
|
||||||
|
- Runs HermesAgentLoop with terminal tool only
|
||||||
|
- Reads final SQLite DB to extract score
|
||||||
|
- Returns survival (0/1) + normalised funds score
|
||||||
|
b. Aggregates per-preset and overall metrics
|
||||||
|
c. Logs results via evaluate_log() and wandb
|
||||||
|
|
||||||
|
Key features:
|
||||||
|
- CLI-only interface: agent calls yc-bench subcommands via terminal tool
|
||||||
|
- Deterministic: same seed + preset = same world (SHA256-based RNG)
|
||||||
|
- Multi-dimensional scoring: survival + normalised final funds
|
||||||
|
- Per-preset difficulty breakdown in results
|
||||||
|
- Isolated SQLite DB per run (no cross-run state leakage)
|
||||||
|
|
||||||
|
Requires: pip install "hermes-agent[yc-bench]"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from atroposlib.envs.base import EvalHandlingEnum
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
|
||||||
|
from environments.agent_loop import HermesAgentLoop
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# System prompt
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
YC_BENCH_SYSTEM_PROMPT = """\
|
||||||
|
You are the autonomous CEO of an early-stage AI startup in a deterministic
|
||||||
|
business simulation. You manage the company exclusively through the `yc-bench`
|
||||||
|
CLI tool. Your primary goal is to **survive** until the simulation horizon ends
|
||||||
|
without going bankrupt, while **maximising final funds**.
|
||||||
|
|
||||||
|
## Simulation Mechanics
|
||||||
|
|
||||||
|
- **Funds**: You start with $250,000 seed capital. Revenue comes from completing
|
||||||
|
tasks. Rewards scale with your prestige: `base × (1 + scale × (prestige − 1))`.
|
||||||
|
- **Domains**: There are 4 skill domains: **research**, **inference**,
|
||||||
|
**data_environment**, and **training**. Each has its own prestige level
|
||||||
|
(1.0-10.0). Higher prestige unlocks better-paying tasks.
|
||||||
|
- **Employees**: You have employees (Junior/Mid/Senior) with domain-specific
|
||||||
|
skill rates. **Throughput splits**: `effective_rate = base_rate / N` where N
|
||||||
|
is the number of active tasks assigned to that employee. Focus beats breadth.
|
||||||
|
- **Payroll**: Deducted automatically on the first business day of each month.
|
||||||
|
Running out of funds = bankruptcy = game over.
|
||||||
|
- **Time**: The simulation runs on business days (Mon-Fri), 09:00-18:00.
|
||||||
|
Time only advances when you call `yc-bench sim resume`.
|
||||||
|
|
||||||
|
## Task Lifecycle
|
||||||
|
|
||||||
|
1. Browse market tasks with `market browse`
|
||||||
|
2. Accept a task with `task accept` (this sets its deadline)
|
||||||
|
3. Assign employees with `task assign`
|
||||||
|
4. Dispatch with `task dispatch` to start work
|
||||||
|
5. Call `sim resume` to advance time and let employees make progress
|
||||||
|
6. Tasks complete when all domain requirements are fulfilled
|
||||||
|
|
||||||
|
**Penalties for failure vary by difficulty preset.** Completing a task on time
|
||||||
|
earns full reward + prestige gain. Missing a deadline or cancelling a task
|
||||||
|
incurs prestige penalties -- cancelling is always more costly than letting a
|
||||||
|
task fail, so cancel only as a last resort.
|
||||||
|
|
||||||
|
## CLI Commands
|
||||||
|
|
||||||
|
### Observe
|
||||||
|
- `yc-bench company status` -- funds, prestige, runway
|
||||||
|
- `yc-bench employee list` -- skills, salary, active tasks
|
||||||
|
- `yc-bench market browse [--domain D] [--required-prestige-lte N]` -- available tasks
|
||||||
|
- `yc-bench task list [--status active|planned]` -- your tasks
|
||||||
|
- `yc-bench task inspect --task-id UUID` -- progress, deadline, assignments
|
||||||
|
- `yc-bench finance ledger [--category monthly_payroll|task_reward]` -- transaction history
|
||||||
|
- `yc-bench report monthly` -- monthly P&L
|
||||||
|
|
||||||
|
### Act
|
||||||
|
- `yc-bench task accept --task-id UUID` -- accept from market
|
||||||
|
- `yc-bench task assign --task-id UUID --employee-id UUID` -- assign employee
|
||||||
|
- `yc-bench task dispatch --task-id UUID` -- start work (needs >=1 assignment)
|
||||||
|
- `yc-bench task cancel --task-id UUID --reason "text"` -- cancel (prestige penalty)
|
||||||
|
- `yc-bench sim resume` -- advance simulation clock
|
||||||
|
|
||||||
|
### Memory (persists across context truncation)
|
||||||
|
- `yc-bench scratchpad read` -- read your persistent notes
|
||||||
|
- `yc-bench scratchpad write --content "text"` -- overwrite notes
|
||||||
|
- `yc-bench scratchpad append --content "text"` -- append to notes
|
||||||
|
- `yc-bench scratchpad clear` -- clear notes
|
||||||
|
|
||||||
|
## Strategy Guidelines
|
||||||
|
|
||||||
|
1. **Specialise in 2-3 domains** to climb the prestige ladder faster and unlock
|
||||||
|
high-reward tasks. Don't spread thin across all 4 domains early on.
|
||||||
|
2. **Focus employees** -- assigning one employee to many tasks halves their
|
||||||
|
throughput per additional task. Keep assignments concentrated.
|
||||||
|
3. **Use the scratchpad** to track your strategy, upcoming deadlines, and
|
||||||
|
employee assignments. This persists even if conversation context is truncated.
|
||||||
|
4. **Monitor runway** -- always know how many months of payroll you can cover.
|
||||||
|
Accept high-reward tasks before payroll dates.
|
||||||
|
5. **Don't over-accept** -- taking too many tasks and missing deadlines cascades
|
||||||
|
into prestige loss, locking you out of profitable contracts.
|
||||||
|
6. Use `finance ledger` and `report monthly` to track revenue trends.
|
||||||
|
|
||||||
|
## Your Turn
|
||||||
|
|
||||||
|
Each turn:
|
||||||
|
1. Call `yc-bench company status` and `yc-bench task list` to orient yourself.
|
||||||
|
2. Check for completed tasks and pending deadlines.
|
||||||
|
3. Browse market for profitable tasks within your prestige level.
|
||||||
|
4. Accept, assign, and dispatch tasks strategically.
|
||||||
|
5. Call `yc-bench sim resume` to advance time.
|
||||||
|
6. Repeat until the simulation ends.
|
||||||
|
|
||||||
|
Think step by step before acting."""
|
||||||
|
|
||||||
|
# Starting funds in cents ($250,000)
|
||||||
|
INITIAL_FUNDS_CENTS = 25_000_000
|
||||||
|
|
||||||
|
# Default horizon per preset (years)
|
||||||
|
_PRESET_HORIZONS = {
|
||||||
|
"tutorial": 1,
|
||||||
|
"easy": 1,
|
||||||
|
"medium": 1,
|
||||||
|
"hard": 1,
|
||||||
|
"nightmare": 1,
|
||||||
|
"fast_test": 1,
|
||||||
|
"default": 3,
|
||||||
|
"high_reward": 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Configuration
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class YCBenchEvalConfig(HermesAgentEnvConfig):
    """
    Settings for a YC-Bench evaluation run.

    Adds the benchmark-specific knobs -- which presets and seeds to run, the
    per-run timeout, the composite-score weights, and the simulation
    parameters -- on top of the fields inherited from HermesAgentEnvConfig.
    """

    # --- Eval matrix: every preset is paired with every seed ---------------
    presets: List[str] = Field(
        description="YC-Bench preset names to evaluate.",
        default=["fast_test", "medium", "hard"],
    )
    seeds: List[int] = Field(
        description="Random seeds -- each preset x seed = one run.",
        default=[1, 2, 3],
    )

    # --- Per-run limits -----------------------------------------------------
    run_timeout: int = Field(
        description="Maximum wall-clock seconds per run. Default 60 minutes.",
        default=3600,
    )

    # --- Composite score weights -------------------------------------------
    survival_weight: float = Field(
        description="Weight of survival (0/1) in composite score.",
        default=0.5,
    )
    funds_weight: float = Field(
        description="Weight of normalised final funds in composite score.",
        default=0.5,
    )

    # --- Simulation parameters ---------------------------------------------
    db_dir: str = Field(
        description="Directory for per-run SQLite databases.",
        default="/tmp/yc_bench_dbs",
    )
    horizon_years: Optional[int] = Field(
        description=(
            "Simulation horizon in years. If None (default), inferred from "
            "preset name (1 year for most, 3 for 'default')."
        ),
        default=None,
    )
    company_name: str = Field(
        description="Name of the simulated company.",
        default="BenchCo",
    )
    start_date: str = Field(
        description="Simulation start date in MM/DD/YYYY format (yc-bench convention).",
        default="01/01/2025",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Scoring helpers
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def _read_final_score(db_path: str) -> Dict[str, Any]:
    """
    Read final game state from a YC-Bench SQLite database.

    Args:
        db_path: Filesystem path of the per-run SQLite database.

    Returns:
        Dict with ``final_funds_cents`` (int), ``survived`` (bool) and
        ``terminal_reason`` (str). ``terminal_reason`` is one of:

        - the most recent 'bankruptcy' / 'horizon_end' event type,
        - "unknown" when no such event is recorded (or the events table
          does not exist),
        - "db_missing" when the file does not exist,
        - "no_company_row" when the companies table has no row or a NULL
          funds value,
        - "db_error: <message>" when reading the database fails.

        Every failure case reports zero funds and ``survived=False``. This
        function never raises for database problems.

    Note: yc-bench table names are plural -- 'companies' not 'company',
    'sim_events' not 'simulation_log'.
    """
    if not os.path.exists(db_path):
        logger.warning("DB not found at %s", db_path)
        return {
            "final_funds_cents": 0,
            "survived": False,
            "terminal_reason": "db_missing",
        }

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()

        # Read final funds from the 'companies' table
        cur.execute("SELECT funds_cents FROM companies LIMIT 1")
        row = cur.fetchone()
        if row is None or row[0] is None:
            # No company (or no funds value) means there is nothing to score.
            # Treating it as "0 funds, survived" would award survival credit
            # to a run whose simulation state was never created.
            return {
                "final_funds_cents": 0,
                "survived": False,
                "terminal_reason": "no_company_row",
            }
        funds = row[0]

        # Determine terminal reason from 'sim_events' table
        terminal_reason = "unknown"
        try:
            cur.execute(
                "SELECT event_type FROM sim_events "
                "WHERE event_type IN ('bankruptcy', 'horizon_end') "
                "ORDER BY scheduled_at DESC LIMIT 1"
            )
            event_row = cur.fetchone()
            if event_row:
                terminal_reason = event_row[0]
        except sqlite3.OperationalError:
            # Table may not exist if simulation didn't progress
            pass

        # Survival requires non-negative funds and no bankruptcy event; a run
        # without any terminal event still counts as survived.
        survived = funds >= 0 and terminal_reason != "bankruptcy"
        return {
            "final_funds_cents": funds,
            "survived": survived,
            "terminal_reason": terminal_reason,
        }

    except Exception as e:
        # Best-effort scoring: a broken DB is reported as a failed run
        # instead of aborting the whole evaluation.
        logger.error("Failed to read DB %s: %s", db_path, e)
        return {
            "final_funds_cents": 0,
            "survived": False,
            "terminal_reason": f"db_error: {e}",
        }
    finally:
        if conn:
            conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_composite_score(
    final_funds_cents: int,
    survived: bool,
    survival_weight: float = 0.5,
    funds_weight: float = 0.5,
    initial_funds_cents: int = INITIAL_FUNDS_CENTS,
) -> float:
    """
    Blend survival and final funds into a single score.

    Score = survival_weight * survival_score
          + funds_weight * normalised_funds_score

    The survival score is 1.0 for a surviving company and 0.0 otherwise.
    The funds score is log-scaled against the initial capital and capped
    at 1.0:
      - funds <= 0:        0.0
      - funds == initial:  ~0.15
      - funds == 10x:      ~0.52
      - funds >= 100x:     1.0
    """
    # Ratio of final to initial funds at which the funds score saturates.
    saturation_ratio = 100.0

    if final_funds_cents <= 0:
        funds_component = 0.0
    else:
        # max(..., 1) guards against division by zero for a zero initial capital.
        growth = final_funds_cents / max(initial_funds_cents, 1)
        scaled = math.log1p(growth) / math.log1p(saturation_ratio)
        funds_component = min(scaled, 1.0)

    survival_component = 1.0 if survived else 0.0
    return survival_weight * survival_component + funds_weight * funds_component
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Main Environment
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class YCBenchEvalEnv(HermesAgentBaseEnv):
|
||||||
|
"""
|
||||||
|
YC-Bench long-horizon agent benchmark environment (eval-only).
|
||||||
|
|
||||||
|
Each eval item is a (preset, seed) pair. The environment initialises the
|
||||||
|
simulation via ``yc-bench sim init`` (NOT ``yc-bench run`` which would start
|
||||||
|
a competing built-in agent loop). The HermesAgentLoop then drives the
|
||||||
|
interaction by calling individual yc-bench CLI commands via the terminal tool.
|
||||||
|
|
||||||
|
After the agent loop ends, the SQLite DB is read to extract the final score.
|
||||||
|
|
||||||
|
Scoring:
|
||||||
|
composite = 0.5 * survival + 0.5 * normalised_funds
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "yc-bench"
|
||||||
|
env_config_cls = YCBenchEvalConfig
|
||||||
|
|
||||||
|
@classmethod
def config_init(cls) -> Tuple[YCBenchEvalConfig, List[APIServerConfig]]:
    """Build the default environment config and API server list.

    Returns:
        A ``(env_config, server_configs)`` pair: the YC-Bench eval settings
        and a single OpenRouter-backed server whose API key is read from the
        ``OPENROUTER_API_KEY`` environment variable (empty string if unset).
    """
    # Agent loop / tooling settings.
    agent_settings = dict(
        enabled_toolsets=["terminal"],
        disabled_toolsets=None,
        distribution=None,
        max_agent_turns=200,
        max_token_length=32000,
        agent_temperature=0.0,
        system_prompt=YC_BENCH_SYSTEM_PROMPT,
        terminal_backend="local",
        terminal_timeout=60,
    )
    # YC-Bench specific settings.
    benchmark_settings = dict(
        presets=["fast_test", "medium", "hard"],
        seeds=[1, 2, 3],
        run_timeout=3600,
        survival_weight=0.5,
        funds_weight=0.5,
        db_dir="/tmp/yc_bench_dbs",
    )
    # Harness settings for a single eval-only step.
    harness_settings = dict(
        eval_handling=EvalHandlingEnum.STOP_TRAIN,
        group_size=1,
        steps_per_eval=1,
        total_steps=1,
        tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
        use_wandb=True,
        wandb_name="yc-bench",
        ensure_scores_are_not_same=False,
    )
    env_config = YCBenchEvalConfig(
        **agent_settings, **benchmark_settings, **harness_settings
    )

    openrouter_server = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name="anthropic/claude-sonnet-4.6",
        server_type="openai",
        api_key=os.getenv("OPENROUTER_API_KEY", ""),
        health_check=False,
    )

    return env_config, [openrouter_server]
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Setup
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def setup(self):
|
||||||
|
"""Verify yc-bench is installed and build the eval matrix."""
|
||||||
|
# Verify yc-bench CLI is available
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["yc-bench", "--help"], capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
raise FileNotFoundError
|
||||||
|
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||||
|
raise RuntimeError(
|
||||||
|
"yc-bench CLI not found. Install with:\n"
|
||||||
|
' pip install "hermes-agent[yc-bench]"\n'
|
||||||
|
"Or: git clone https://github.com/collinear-ai/yc-bench "
|
||||||
|
"&& cd yc-bench && pip install -e ."
|
||||||
|
)
|
||||||
|
print("yc-bench CLI verified.")
|
||||||
|
|
||||||
|
# Build eval matrix: preset x seed
|
||||||
|
self.all_eval_items = [
|
||||||
|
{"preset": preset, "seed": seed}
|
||||||
|
for preset in self.config.presets
|
||||||
|
for seed in self.config.seeds
|
||||||
|
]
|
||||||
|
self.iter = 0
|
||||||
|
|
||||||
|
os.makedirs(self.config.db_dir, exist_ok=True)
|
||||||
|
self.eval_metrics: List[Tuple[str, float]] = []
|
||||||
|
|
||||||
|
# Streaming JSONL log for crash-safe result persistence
|
||||||
|
log_dir = os.path.join(os.path.dirname(__file__), "logs")
|
||||||
|
os.makedirs(log_dir, exist_ok=True)
|
||||||
|
run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
|
||||||
|
self._streaming_file = open(self._streaming_path, "w")
|
||||||
|
self._streaming_lock = threading.Lock()
|
||||||
|
|
||||||
|
print(f"\nYC-Bench eval matrix: {len(self.all_eval_items)} runs")
|
||||||
|
for item in self.all_eval_items:
|
||||||
|
print(f" preset={item['preset']!r} seed={item['seed']}")
|
||||||
|
print(f"Streaming results to: {self._streaming_path}\n")
|
||||||
|
|
||||||
|
def _save_result(self, result: Dict[str, Any]):
    """Append one run result to the streaming JSONL file and flush it.

    Does nothing when the streaming file was never opened or is already
    closed. Values that JSON cannot encode natively are stringified.
    """
    if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
        # Serialise writers so concurrent runs cannot interleave lines.
        with self._streaming_lock:
            record = json.dumps(result, ensure_ascii=False, default=str)
            self._streaming_file.write(record + "\n")
            # Flush immediately so results survive a crash.
            self._streaming_file.flush()
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Training pipeline stubs (eval-only -- not used)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def get_next_item(self):
    """Return the next (preset, seed) item, cycling through the eval matrix."""
    # Wrap around so the cursor can keep growing past the end of the list.
    position = self.iter % len(self.all_eval_items)
    upcoming = self.all_eval_items[position]
    self.iter += 1
    return upcoming
|
||||||
|
|
||||||
|
def format_prompt(self, item: Dict[str, Any]) -> str:
    """Build the opening user message for one (preset, seed) run.

    Args:
        item: Eval item with ``"preset"`` and ``"seed"`` keys.

    Returns:
        Text announcing the initialised simulation and the configured
        company name, followed by the first commands to run and the loop
        the agent should repeat.
    """
    preset, seed = item["preset"], item["seed"]

    announcement = (
        f"A new YC-Bench simulation has been initialized "
        f"(preset='{preset}', seed={seed}).\n"
        f"Your company '{self.config.company_name}' is ready.\n\n"
    )
    first_commands = "".join(
        [
            "Begin by calling:\n",
            "1. `yc-bench company status` -- see your starting funds and prestige\n",
            "2. `yc-bench employee list` -- see your team and their skills\n",
            "3. `yc-bench market browse --required-prestige-lte 1` -- find tasks ",
            "you can take\n\n",
        ]
    )
    loop_instructions = (
        "Then accept 2-3 tasks, assign employees, dispatch them, and call "
        "`yc-bench sim resume` to advance time. Repeat this loop until the "
        "simulation ends (horizon reached or bankruptcy)."
    )
    return announcement + first_commands + loop_instructions
|
||||||
|
|
||||||
|
async def compute_reward(self, item, result, ctx) -> float:
    """Training-pipeline stub: always report a zero reward."""
    zero_reward = 0.0
    return zero_reward
|
||||||
|
|
||||||
|
async def collect_trajectories(self, item):
    """Training-pipeline stub: return ``(None, [])`` without collecting anything."""
    scored_group, backlog = None, []
    return scored_group, backlog
|
||||||
|
|
||||||
|
async def score(self, rollout_group_data):
    """Training-pipeline stub: produce no scored data (always ``None``)."""
    nothing_scored = None
    return nothing_scored
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Per-run evaluation
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
    """
    Evaluate a single (preset, seed) run.

    1. Sets DATABASE_URL and YC_BENCH_EXPERIMENT env vars
    2. Initialises the simulation via ``yc-bench sim init`` (NOT ``run``)
    3. Runs HermesAgentLoop with terminal tool
    4. Reads SQLite DB to compute final score
    5. Returns result dict with survival, funds, and composite score

    Args:
        eval_item: Mapping with at least the keys ``"preset"`` and ``"seed"``.

    Returns:
        A result dict that has also been passed to ``self._save_result``.
        Any exception raised inside the run is caught, logged, and turned
        into a failure record (``survived=False``, ``composite_score=0.0``,
        ``error`` set) rather than propagated. Failure records carry the
        same ``db_path`` and ``finished_naturally`` keys as success records
        so saved results share one schema and a failed run can be traced
        back to its database file.
    """
    preset = eval_item["preset"]
    seed = eval_item["seed"]
    run_id = str(uuid.uuid4())[:8]
    run_key = f"{preset}_seed{seed}_{run_id}"

    from tqdm import tqdm

    tqdm.write(f" [START] preset={preset!r} seed={seed} (run_id={run_id})")
    run_start = time.time()

    # Isolated DB per run -- prevents cross-run state leakage
    db_path = os.path.join(self.config.db_dir, f"yc_bench_{run_key}.db")
    os.environ["DATABASE_URL"] = f"sqlite:///{db_path}"
    os.environ["YC_BENCH_EXPERIMENT"] = preset

    # Determine horizon: explicit config override > preset lookup > default 1
    horizon = self.config.horizon_years or _PRESET_HORIZONS.get(preset, 1)

    try:
        # ----------------------------------------------------------
        # Step 1: Initialise the simulation via CLI
        # IMPORTANT: We use `sim init`, NOT `yc-bench run`.
        # `yc-bench run` starts yc-bench's own LLM agent loop (via
        # LiteLLM), which would compete with our HermesAgentLoop.
        # `sim init` just sets up the world and returns.
        # ----------------------------------------------------------
        init_cmd = [
            "yc-bench", "sim", "init",
            "--seed", str(seed),
            "--start-date", self.config.start_date,
            "--company-name", self.config.company_name,
            "--horizon-years", str(horizon),
        ]
        init_result = subprocess.run(
            init_cmd, capture_output=True, text=True, timeout=30,
        )
        if init_result.returncode != 0:
            # Prefer stderr; fall back to stdout when stderr is empty.
            error_msg = (init_result.stderr or init_result.stdout).strip()
            raise RuntimeError(f"yc-bench sim init failed: {error_msg}")

        tqdm.write(f" Simulation initialized (horizon={horizon}yr)")

        # ----------------------------------------------------------
        # Step 2: Run the HermesAgentLoop
        # ----------------------------------------------------------
        tools, valid_names = self._resolve_tools_for_group()

        messages: List[Dict[str, Any]] = [
            {"role": "system", "content": YC_BENCH_SYSTEM_PROMPT},
            {"role": "user", "content": self.format_prompt(eval_item)},
        ]

        agent = HermesAgentLoop(
            server=self.server,
            tool_schemas=tools,
            valid_tool_names=valid_names,
            max_turns=self.config.max_agent_turns,
            task_id=run_id,
            temperature=self.config.agent_temperature,
            max_tokens=self.config.max_token_length,
            extra_body=self.config.extra_body,
        )
        result = await agent.run(messages)

        # ----------------------------------------------------------
        # Step 3: Read final score from the simulation DB
        # ----------------------------------------------------------
        score_data = _read_final_score(db_path)
        final_funds = score_data["final_funds_cents"]
        survived = score_data["survived"]
        terminal_reason = score_data["terminal_reason"]

        composite = _compute_composite_score(
            final_funds_cents=final_funds,
            survived=survived,
            survival_weight=self.config.survival_weight,
            funds_weight=self.config.funds_weight,
        )

        elapsed = time.time() - run_start
        status = "SURVIVED" if survived else "BANKRUPT"
        # Funds are stored in cents; keep the minus sign ahead of the "$".
        if final_funds >= 0:
            funds_str = f"${final_funds / 100:,.0f}"
        else:
            funds_str = f"-${abs(final_funds) / 100:,.0f}"

        tqdm.write(
            f" [{status}] preset={preset!r} seed={seed} "
            f"funds={funds_str} score={composite:.3f} "
            f"turns={result.turns_used} ({elapsed:.0f}s)"
        )

        out = {
            "preset": preset,
            "seed": seed,
            "survived": survived,
            "final_funds_cents": final_funds,
            "final_funds_usd": final_funds / 100,
            "terminal_reason": terminal_reason,
            "composite_score": composite,
            "turns_used": result.turns_used,
            "finished_naturally": result.finished_naturally,
            "elapsed_seconds": elapsed,
            "db_path": db_path,
            "messages": result.messages,
        }
        self._save_result(out)
        return out

    except Exception as e:
        elapsed = time.time() - run_start
        logger.error("Run %s failed: %s", run_key, e, exc_info=True)
        tqdm.write(
            f" [ERROR] preset={preset!r} seed={seed}: {e} ({elapsed:.0f}s)"
        )
        out = {
            "preset": preset,
            "seed": seed,
            "survived": False,
            "final_funds_cents": 0,
            "final_funds_usd": 0.0,
            "terminal_reason": f"error: {e}",
            "composite_score": 0.0,
            "turns_used": 0,
            "finished_naturally": False,
            "error": str(e),
            "elapsed_seconds": elapsed,
            # Kept so a failed run can still be inspected post-mortem.
            "db_path": db_path,
        }
        self._save_result(out)
        return out
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Evaluate
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def _run_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||||
|
"""Wrap a single rollout with a wall-clock timeout."""
|
||||||
|
preset = item["preset"]
|
||||||
|
seed = item["seed"]
|
||||||
|
try:
|
||||||
|
return await asyncio.wait_for(
|
||||||
|
self.rollout_and_score_eval(item),
|
||||||
|
timeout=self.config.run_timeout,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
from tqdm import tqdm
|
||||||
|
tqdm.write(
|
||||||
|
f" [TIMEOUT] preset={preset!r} seed={seed} "
|
||||||
|
f"(exceeded {self.config.run_timeout}s)"
|
||||||
|
)
|
||||||
|
out = {
|
||||||
|
"preset": preset,
|
||||||
|
"seed": seed,
|
||||||
|
"survived": False,
|
||||||
|
"final_funds_cents": 0,
|
||||||
|
"final_funds_usd": 0.0,
|
||||||
|
"terminal_reason": f"timeout ({self.config.run_timeout}s)",
|
||||||
|
"composite_score": 0.0,
|
||||||
|
"turns_used": 0,
|
||||||
|
"error": "timeout",
|
||||||
|
}
|
||||||
|
self._save_result(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
async def evaluate(self, *args, **kwargs) -> None:
    """
    Run YC-Bench evaluation over all (preset, seed) combinations.

    Runs sequentially -- each run is 100-500 turns, parallelising would
    be prohibitively expensive and cause env var conflicts.

    Side effects: replaces the root logger's handlers with a
    tqdm-compatible handler, prints a config summary and a results
    summary, stores the aggregated metrics on ``self.eval_metrics``,
    forwards them to ``self.evaluate_log``, and finally closes the
    streaming results file and tears down terminal environments and the
    tool executor on a best-effort basis.

    In the results summary, runs that carry an ``error`` key (exceptions
    and timeouts) are labelled ``ERROR`` instead of ``BANKRUPT``, and
    negative funds are rendered as ``-$1,234`` to match the per-run line
    printed during the rollout.
    """
    start_time = time.time()
    from tqdm import tqdm

    # --- tqdm-compatible logging handler (TB2 pattern) ---
    class _TqdmHandler(logging.Handler):
        def emit(self, record):
            try:
                tqdm.write(self.format(record))
            except Exception:
                self.handleError(record)

    root = logging.getLogger()
    handler = _TqdmHandler()
    handler.setFormatter(
        logging.Formatter("%(levelname)s %(name)s: %(message)s")
    )
    root.handlers = [handler]
    for noisy in ("httpx", "openai"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    # --- Print config summary ---
    print(f"\n{'='*60}")
    print("Starting YC-Bench Evaluation")
    print(f"{'='*60}")
    print(f" Presets: {self.config.presets}")
    print(f" Seeds: {self.config.seeds}")
    print(f" Total runs: {len(self.all_eval_items)}")
    print(f" Max turns/run: {self.config.max_agent_turns}")
    print(f" Run timeout: {self.config.run_timeout}s")
    print(f"{'='*60}\n")

    results = []
    pbar = tqdm(
        total=len(self.all_eval_items), desc="YC-Bench", dynamic_ncols=True
    )

    try:
        for item in self.all_eval_items:
            result = await self._run_with_timeout(item)
            results.append(result)
            survived_count = sum(1 for r in results if r.get("survived"))
            pbar.set_postfix_str(
                f"survived={survived_count}/{len(results)}"
            )
            pbar.update(1)

    except (KeyboardInterrupt, asyncio.CancelledError):
        tqdm.write("\n[INTERRUPTED] Stopping evaluation...")
        pbar.close()
        try:
            from tools.terminal_tool import cleanup_all_environments

            cleanup_all_environments()
        except Exception:
            # Best-effort teardown: never let cleanup mask the interrupt.
            logger.debug("Terminal environment cleanup failed", exc_info=True)
        if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
            self._streaming_file.close()
        return

    pbar.close()
    end_time = time.time()

    # --- Compute metrics ---
    valid = [r for r in results if r is not None]
    if not valid:
        print("Warning: No valid results.")
        return

    total = len(valid)  # non-zero: guarded by the early return above
    survived_total = sum(1 for r in valid if r.get("survived"))
    survival_rate = survived_total / total
    avg_score = sum(r.get("composite_score", 0) for r in valid) / total

    preset_results: Dict[str, List[Dict]] = defaultdict(list)
    for r in valid:
        preset_results[r["preset"]].append(r)

    eval_metrics = {
        "eval/survival_rate": survival_rate,
        "eval/avg_composite_score": avg_score,
        "eval/total_runs": total,
        "eval/survived_runs": survived_total,
        "eval/evaluation_time_seconds": end_time - start_time,
    }

    # Per-preset stats are computed once and reused for both the metrics
    # dict and the printed breakdown. Every group holds at least one run
    # because groups are only created by the append above.
    preset_stats = []
    for preset, items in sorted(preset_results.items()):
        pt = len(items)
        ps = sum(1 for r in items if r.get("survived"))
        pa = sum(r.get("composite_score", 0) for r in items) / pt
        preset_stats.append((preset, items, ps, pt, pa))

        key = preset.replace("-", "_")
        eval_metrics[f"eval/survival_rate_{key}"] = ps / pt
        eval_metrics[f"eval/avg_score_{key}"] = pa

    self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]

    # --- Print summary ---
    print(f"\n{'='*60}")
    print("YC-Bench Evaluation Results")
    print(f"{'='*60}")
    print(
        f"Overall survival rate: {survival_rate:.1%} "
        f"({survived_total}/{total})"
    )
    print(f"Average composite score: {avg_score:.4f}")
    print(f"Evaluation time: {end_time - start_time:.1f}s")

    print("\nPer-preset breakdown:")
    for preset, items, ps, pt, pa in preset_stats:
        print(f" {preset}: {ps}/{pt} survived avg_score={pa:.4f}")
        for r in items:
            # A crashed or timed-out run is not a bankruptcy.
            if r.get("error"):
                status = "ERROR"
            elif r.get("survived"):
                status = "SURVIVED"
            else:
                status = "BANKRUPT"
            funds = r.get("final_funds_usd", 0)
            if funds >= 0:
                funds_str = f"${funds:,.0f}"
            else:
                funds_str = f"-${abs(funds):,.0f}"
            print(
                f" seed={r['seed']} [{status}] "
                f"{funds_str} "
                f"score={r.get('composite_score', 0):.3f}"
            )

    print(f"{'='*60}\n")

    # --- Log results ---
    # Conversation transcripts are dropped from the logged samples.
    samples = [
        {k: v for k, v in r.items() if k != "messages"} for r in valid
    ]

    try:
        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
            generation_parameters={
                "temperature": self.config.agent_temperature,
                "max_tokens": self.config.max_token_length,
                "max_agent_turns": self.config.max_agent_turns,
            },
        )
    except Exception as e:
        print(f"Error logging results: {e}")

    # --- Cleanup (TB2 pattern) ---
    if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
        self._streaming_file.close()
        print(f"Results saved to: {self._streaming_path}")

    try:
        from tools.terminal_tool import cleanup_all_environments

        cleanup_all_environments()
    except Exception:
        # Best-effort teardown; failures here must not fail the evaluation.
        logger.debug("Terminal environment cleanup failed", exc_info=True)

    try:
        from environments.agent_loop import _tool_executor

        _tool_executor.shutdown(wait=False, cancel_futures=True)
    except Exception:
        logger.debug("Tool executor shutdown failed", exc_info=True)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Wandb logging
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """Log YC-Bench-specific metrics to wandb.

    Copies every ``(key, value)`` pair buffered on ``self.eval_metrics``
    into the metrics dict (creating one when the caller passed ``None``),
    empties the buffer, and delegates to the parent implementation.
    """
    payload = {} if wandb_metrics is None else wandb_metrics
    # eval_metrics is a list of (key, value) pairs; later pairs win.
    payload.update(self.eval_metrics)
    self.eval_metrics = []
    await super().wandb_log(payload)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: when this module is executed directly, hand control to
# the environment's command-line interface.
if __name__ == "__main__":
    YCBenchEvalEnv.cli()
|
||||||
672
environments/hermes_base_env.py
Normal file
672
environments/hermes_base_env.py
Normal file
@@ -0,0 +1,672 @@
|
|||||||
|
"""
|
||||||
|
HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
|
||||||
|
|
||||||
|
Provides the Atropos integration plumbing that all hermes-agent environments share:
|
||||||
|
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
|
||||||
|
- Per-group toolset/distribution resolution
|
||||||
|
- Agent loop orchestration via HermesAgentLoop
|
||||||
|
- ToolContext creation for reward functions
|
||||||
|
- ScoredDataGroup construction from ManagedServer state
|
||||||
|
|
||||||
|
Subclasses only need to implement:
|
||||||
|
setup() -- Load dataset, initialize state
|
||||||
|
get_next_item() -- Return the next item from the dataset
|
||||||
|
format_prompt() -- Convert a dataset item into the user message
|
||||||
|
compute_reward() -- Score the rollout (has full ToolContext access)
|
||||||
|
evaluate() -- Periodic evaluation
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
from abc import abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
|
# Ensure the hermes-agent repo root is on sys.path so that imports like
|
||||||
|
# `from model_tools import ...` and `from environments.X import ...` work
|
||||||
|
# regardless of where the script is invoked from.
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
# Load API keys from hermes-agent/.env so all environments can access them
|
||||||
|
_env_path = _repo_root / ".env"
|
||||||
|
if _env_path.exists():
|
||||||
|
load_dotenv(dotenv_path=_env_path)
|
||||||
|
|
||||||
|
# Apply monkey patches for async-safe tool operation inside Atropos's event loop.
|
||||||
|
# This patches SwerexModalEnvironment to use a background thread instead of
|
||||||
|
# asyncio.run(), which would deadlock inside Atropos. Safe for normal CLI too.
|
||||||
|
from environments.patches import apply_patches
|
||||||
|
apply_patches()
|
||||||
|
|
||||||
|
from atroposlib.envs.base import (
|
||||||
|
BaseEnv,
|
||||||
|
BaseEnvConfig,
|
||||||
|
ScoredDataGroup,
|
||||||
|
ScoredDataItem,
|
||||||
|
)
|
||||||
|
from atroposlib.envs.server_handling.server_manager import (
|
||||||
|
APIServerConfig,
|
||||||
|
ServerBaseline,
|
||||||
|
ServerManager,
|
||||||
|
)
|
||||||
|
from atroposlib.type_definitions import Item
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
|
||||||
|
# Import hermes-agent toolset infrastructure
|
||||||
|
from model_tools import get_tool_definitions
|
||||||
|
from toolset_distributions import sample_toolsets_from_distribution
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HermesAgentEnvConfig(BaseEnvConfig):
    """
    Settings shared by every hermes-agent Atropos environment.

    Adds agent-specific fields on top of BaseEnvConfig: toolset selection,
    agent-loop limits, terminal backend options, dataset location, tool
    thread-pool sizing, tool-call parsing, and provider passthrough options.
    """

    # ------------------------------------------------------------------
    # Toolset selection: choose enabled_toolsets OR distribution, not both.
    # ------------------------------------------------------------------
    enabled_toolsets: Optional[List[str]] = Field(
        default=None,
        description=(
            "Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
            "If None and distribution is also None, all available toolsets are enabled."
        ),
    )
    disabled_toolsets: Optional[List[str]] = Field(
        default=None,
        description=(
            "Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution."
        ),
    )
    distribution: Optional[str] = Field(
        default=None,
        description=(
            "Name of a toolset distribution from toolset_distributions.py "
            "(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
            "Mutually exclusive with enabled_toolsets."
        ),
    )

    # ------------------------------------------------------------------
    # Agent loop
    # ------------------------------------------------------------------
    max_agent_turns: int = Field(
        default=30,
        description=(
            "Maximum number of LLM calls (tool-calling iterations) per rollout."
        ),
    )
    system_prompt: Optional[str] = Field(
        default=None,
        description=(
            "System prompt for the agent. Tools are handled via the tools= parameter, "
            "not embedded in the prompt text."
        ),
    )
    agent_temperature: float = Field(
        default=1.0,
        description=(
            "Sampling temperature for agent generation during rollouts."
        ),
    )

    # ------------------------------------------------------------------
    # Terminal backend
    # ------------------------------------------------------------------
    terminal_backend: str = Field(
        default="local",
        description=(
            "Terminal backend: 'local', 'docker', 'modal', 'daytona', 'ssh', 'singularity'. "
            "Modal or Daytona recommended for production RL (cloud isolation per rollout)."
        ),
    )
    terminal_timeout: int = Field(
        default=120,
        description=(
            "Per-command timeout in seconds for terminal tool calls. "
            "Commands exceeding this are killed. Increase for tasks with long-running "
            "commands (compilation, pip install, etc.)."
        ),
    )
    terminal_lifetime: int = Field(
        default=3600,
        description=(
            "Sandbox inactivity lifetime in seconds. The cleanup thread kills "
            "sandboxes that have been idle longer than this. Must be longer than "
            "the longest gap between tool calls (e.g., waiting for LLM response)."
        ),
    )

    # ------------------------------------------------------------------
    # Dataset
    # ------------------------------------------------------------------
    dataset_name: Optional[str] = Field(
        default=None,
        description=(
            "HuggingFace dataset name. Optional if tasks are defined inline."
        ),
    )
    dataset_split: str = Field(
        default="train",
        description=("Dataset split to use."),
    )
    prompt_field: str = Field(
        default="prompt",
        description=("Which field in the dataset contains the prompt."),
    )

    # ------------------------------------------------------------------
    # Tool-execution thread pool
    # ------------------------------------------------------------------
    tool_pool_size: int = Field(
        default=128,
        description=(
            "Thread pool size for tool execution. Each concurrent task needs a "
            "thread for tool calls. Must be large enough for parallel evaluation. "
            "Too small = thread pool starvation."
        ),
    )

    # ------------------------------------------------------------------
    # Phase 2: tool call parsing
    # ------------------------------------------------------------------
    tool_call_parser: str = Field(
        default="hermes",
        description=(
            "Tool call parser name for Phase 2 (VLLM server type). "
            "Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
            "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc."
        ),
    )

    # ------------------------------------------------------------------
    # Provider-specific passthrough
    #
    # Forwarded as extra_body to the OpenAI client's
    # chat.completions.create() call; handy for OpenRouter provider
    # preferences, transforms, route settings, and similar. Example YAML:
    #
    #   extra_body:
    #     provider:
    #       ignore: ["DeepInfra", "Fireworks"]
    #       order: ["Together"]
    #     transforms: ["middle-out"]
    # ------------------------------------------------------------------
    extra_body: Optional[Dict[str, Any]] = Field(
        default=None,
        description=(
            "Extra body parameters passed to the OpenAI client's "
            "chat.completions.create(). Used for OpenRouter provider preferences, "
            "transforms, and other provider-specific settings."
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class HermesAgentBaseEnv(BaseEnv):
|
||||||
|
"""
|
||||||
|
Abstract base environment for hermes-agent Atropos integration.
|
||||||
|
|
||||||
|
Handles two modes of operation:
|
||||||
|
- Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
|
||||||
|
The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
|
||||||
|
and reasoning extraction natively. DummyManagedServer provides placeholder
|
||||||
|
tokens. Good for SFT data gen, verifier testing, evaluation.
|
||||||
|
|
||||||
|
- Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
|
||||||
|
via /generate. Client-side tool call parser reconstructs structured tool_calls
|
||||||
|
from raw output. Full RL training capability.
|
||||||
|
|
||||||
|
Subclasses must implement:
|
||||||
|
setup() -- Load dataset, initialize state
|
||||||
|
get_next_item() -- Return the next item to roll out
|
||||||
|
format_prompt() -- Convert a dataset item into the user message string
|
||||||
|
compute_reward() -- Score the rollout using ToolContext
|
||||||
|
evaluate() -- Periodic evaluation
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: Optional[str] = "hermes-agent"
|
||||||
|
env_config_cls = HermesAgentEnvConfig
|
||||||
|
|
||||||
|
def __init__(
    self,
    config: HermesAgentEnvConfig,
    server_configs: Union[ServerBaseline, List[APIServerConfig]],
    slurm=False,
    testing=False,
):
    """Initialise the base environment and push terminal settings to env vars.

    After the parent constructor runs, the terminal options from ``config``
    are exported as process environment variables, the agent loop's tool
    thread pool is resized to ``config.tool_pool_size``, and the per-group
    tool cache and tool-error buffer are created empty.
    """
    super().__init__(config, server_configs, slurm, testing)

    # Export terminal settings so hermes tools read them from the
    # environment; config fields take the place of shell-level variables.
    # NOTE(review): SOURCE indentation is lost -- only the TERMINAL_ENV
    # assignment is assumed to sit under the backend check; confirm.
    backend = config.terminal_backend
    timeout = config.terminal_timeout
    lifetime = config.terminal_lifetime
    if backend:
        os.environ["TERMINAL_ENV"] = backend
    os.environ["TERMINAL_TIMEOUT"] = str(timeout)
    os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
    print(
        f"🖥️ Terminal: backend={backend}, "
        f"timeout={timeout}s, lifetime={lifetime}s"
    )

    # The tool pool needs one thread per concurrently running task
    # (e.g., 89 parallel TB2 eval tasks each need a thread for tool calls).
    from environments.agent_loop import resize_tool_pool

    resize_tool_pool(config.tool_pool_size)

    # Tools resolved for the current group; filled in by collect_trajectories.
    self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None

    # Tool errors accumulated for the next wandb_log call.
    self._tool_error_buffer: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Toolset resolution (per-group)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
    """
    Work out which tools a group of rollouts may use.

    When ``config.distribution`` is set, toolsets are sampled from that
    distribution; otherwise ``config.enabled_toolsets`` is used as-is, with
    ``None`` meaning "all available" (a warning is logged in that case).
    ``config.disabled_toolsets`` is passed along as a filter.

    Returns:
        (tool_schemas, valid_tool_names) tuple
    """
    cfg = self.config

    if not cfg.distribution:
        selected = cfg.enabled_toolsets  # None means "all available"
        if selected is None:
            logger.warning(
                "enabled_toolsets is None -- loading ALL tools including messaging. "
                "Set explicit enabled_toolsets for RL training."
            )
    else:
        selected = sample_toolsets_from_distribution(cfg.distribution)
        logger.info("Sampled toolsets from '%s': %s", cfg.distribution, selected)

    schemas = get_tool_definitions(
        enabled_toolsets=selected,
        disabled_toolsets=cfg.disabled_toolsets,
        quiet_mode=True,
    )

    # An empty or missing schema list yields an empty name set.
    names = set()
    if schemas:
        names = {schema["function"]["name"] for schema in schemas}
    logger.info("Resolved %d tools for group: %s", len(names), sorted(names))
    return schemas, names
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Server mode detection
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _use_managed_server(self) -> bool:
|
||||||
|
"""
|
||||||
|
Determine if we should use ManagedServer (Phase 2) or direct server (Phase 1).
|
||||||
|
|
||||||
|
Phase 2 (ManagedServer) is used when the server type is 'vllm' or 'sglang',
|
||||||
|
which go through the /generate endpoint for exact token tracking.
|
||||||
|
|
||||||
|
Phase 1 (direct server) is used for 'openai' server type, which uses
|
||||||
|
/v1/chat/completions with native tool call parsing.
|
||||||
|
"""
|
||||||
|
if not self.server.servers:
|
||||||
|
return False
|
||||||
|
|
||||||
|
server = self.server.servers[0]
|
||||||
|
# If the server is an OpenAI server (not VLLM/SGLang), use direct mode
|
||||||
|
from atroposlib.envs.server_handling.openai_server import OpenAIServer
|
||||||
|
return not isinstance(server, OpenAIServer)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Core Atropos integration
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def collect_trajectories(
    self, item: Item
) -> Tuple[
    Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
    List[Item],
]:
    """
    Resolve the group's toolsets once, then run the standard collection.

    The resolved tools are stored on ``self._current_group_tools`` before
    delegating to the parent ``collect_trajectories``, so the per-rollout
    ``collect_trajectory`` calls made for this group all see the same tools.
    """
    # One resolution per group, shared by every rollout in it.
    resolved = self._resolve_tools_for_group()
    self._current_group_tools = resolved

    # The parent implementation drives the individual rollouts.
    group_result = await super().collect_trajectories(item)
    return group_result
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Wandb rollout display -- format trajectories nicely
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
|
||||||
|
"""
|
||||||
|
Format a conversation's messages into a readable trajectory string
|
||||||
|
for wandb rollout tables. Shows tool calls, tool results, and reasoning
|
||||||
|
in a structured way instead of raw token decoding.
|
||||||
|
"""
|
||||||
|
parts = []
|
||||||
|
for msg in messages:
|
||||||
|
role = msg.get("role", "unknown")
|
||||||
|
content = msg.get("content", "")
|
||||||
|
|
||||||
|
if role == "system":
|
||||||
|
parts.append(f"[SYSTEM]\n{content}")
|
||||||
|
|
||||||
|
elif role == "user":
|
||||||
|
parts.append(f"[USER]\n{content}")
|
||||||
|
|
||||||
|
elif role == "assistant":
|
||||||
|
# Show reasoning if present
|
||||||
|
reasoning = msg.get("reasoning_content", "")
|
||||||
|
if reasoning:
|
||||||
|
# Truncate long reasoning for display
|
||||||
|
if len(reasoning) > 300:
|
||||||
|
reasoning = reasoning[:300] + "..."
|
||||||
|
parts.append(f"[ASSISTANT thinking]\n{reasoning}")
|
||||||
|
|
||||||
|
# Show content
|
||||||
|
if content:
|
||||||
|
parts.append(f"[ASSISTANT]\n{content}")
|
||||||
|
|
||||||
|
# Show tool calls
|
||||||
|
tool_calls = msg.get("tool_calls", [])
|
||||||
|
for tc in tool_calls:
|
||||||
|
func = tc.get("function", {})
|
||||||
|
name = func.get("name", "?")
|
||||||
|
args = func.get("arguments", "{}")
|
||||||
|
# Truncate long arguments for display
|
||||||
|
if len(args) > 200:
|
||||||
|
args = args[:200] + "..."
|
||||||
|
parts.append(f"[TOOL CALL] {name}({args})")
|
||||||
|
|
||||||
|
elif role == "tool":
|
||||||
|
tool_id = msg.get("tool_call_id", "")
|
||||||
|
result = content
|
||||||
|
# Truncate long tool results for display
|
||||||
|
if len(result) > 500:
|
||||||
|
result = result[:500] + "..."
|
||||||
|
parts.append(f"[TOOL RESULT] {result}")
|
||||||
|
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
async def add_rollouts_for_wandb(
    self,
    scored_data,
    item=None,
):
    """
    Record a group of rollouts for wandb as (text, score) pairs.

    Uses the structured message formatter when messages are available for a
    rollout, falls back to decoding its tokens, and otherwise records
    "(no data)". At most ``num_rollouts_per_group_for_logging`` rollouts are
    kept per group (``-1`` means ``group_size``), and the oldest stored
    group is dropped once more than ``num_rollouts_to_keep`` are held.
    """
    cfg = self.config
    limit = cfg.num_rollouts_per_group_for_logging
    if limit == -1:
        limit = cfg.group_size

    def _render(idx):
        # Prefer the rich message view, then raw tokens, then a placeholder.
        convo = None
        if scored_data.get("messages") and idx < len(scored_data["messages"]):
            convo = scored_data["messages"][idx]
        if convo:
            return self._format_trajectory_for_display(convo)
        if scored_data.get("tokens") and idx < len(scored_data["tokens"]):
            return self.tokenizer.decode(scored_data["tokens"][idx])
        return "(no data)"

    count = min(limit, len(scored_data.get("scores", [])))
    rows = []
    for idx in range(count):
        rollout_score = scored_data["scores"][idx]
        rows.append((_render(idx), rollout_score))

    history = self.rollouts_for_wandb
    history.append(rows)
    if len(history) > cfg.num_rollouts_to_keep:
        history.pop(0)
|
||||||
|
|
||||||
|
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """
    Add tool-error statistics to the metrics and forward them to the parent logger.

    Always sets "train/tool_errors_count". When errors were buffered, also
    sets "train/tool_error_details" to a newline-joined summary, prints each
    summary to stdout, and clears the buffer.
    """
    metrics = {} if wandb_metrics is None else wandb_metrics

    buffered = self._tool_error_buffer
    if not buffered:
        metrics["train/tool_errors_count"] = 0
    else:
        metrics["train/tool_errors_count"] = len(buffered)

        # Details go out as one string (tables can crash wandb on tmp cleanup)
        summaries = [
            f"[turn {e['turn']}] {e['tool']}({e['args'][:80]}) -> {e['error'][:150]}"
            for e in buffered
        ]
        metrics["train/tool_error_details"] = "\n".join(summaries)

        # Echo to stdout so errors are visible immediately
        for line in summaries:
            print(f" Tool Error: {line}")

        self._tool_error_buffer = []

    await super().wandb_log(metrics)
|
||||||
|
|
||||||
|
async def collect_trajectory(
    self, item: Item
) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
    """
    Run a single rollout: agent loop + reward computation.

    This is called group_size times in parallel by collect_trajectories().
    Each call gets its own task_id for terminal/browser session isolation.

    Args:
        item: The dataset item to roll out; turned into the user message
            via format_prompt().

    Returns:
        A ``(scored_item, [])`` tuple. ``scored_item`` is a dict with
        "tokens", "masks", "scores" and "messages" (plus "advantages" /
        "ref_logprobs" placeholders when the final node carries logprobs).
    """
    task_id = str(uuid.uuid4())

    # Get group-level tools (resolved once in collect_trajectories)
    if self._current_group_tools is None:
        # Fallback: resolve per-trajectory if called outside collect_trajectories
        tools, valid_names = self._resolve_tools_for_group()
    else:
        tools, valid_names = self._current_group_tools

    # Build initial messages
    messages: List[Dict[str, Any]] = []
    if self.config.system_prompt:
        messages.append({"role": "system", "content": self.config.system_prompt})
    messages.append({"role": "user", "content": self.format_prompt(item)})

    def make_agent(server):
        # Single place where the agent loop is configured; only the server differs
        # between the managed (Phase 2) and direct (Phase 1) paths.
        return HermesAgentLoop(
            server=server,
            tool_schemas=tools,
            valid_tool_names=valid_names,
            max_turns=self.config.max_agent_turns,
            task_id=task_id,
            temperature=self.config.agent_temperature,
            max_tokens=self.config.max_token_length,
            extra_body=self.config.extra_body,
        )

    # Run the agent loop
    result: AgentResult
    if self._use_managed_server():
        # Phase 2: ManagedServer with parser -- exact tokens + logprobs
        # Load the tool call parser from registry based on config
        from environments.tool_call_parsers import get_parser

        try:
            tc_parser = get_parser(self.config.tool_call_parser)
        except KeyError:
            logger.warning(
                "Tool call parser '%s' not found, falling back to 'hermes'",
                self.config.tool_call_parser,
            )
            tc_parser = get_parser("hermes")

        try:
            async with self.server.managed_server(
                tokenizer=self.tokenizer,
                tool_call_parser=tc_parser,
            ) as managed:
                result = await make_agent(managed).run(messages)
        except NotImplementedError:
            # DummyManagedServer not allowed -- fall back to Phase 1
            logger.warning(
                "ManagedServer not available (OpenAI server?). "
                "Falling back to direct server mode."
            )
            result = await make_agent(self.server).run(messages)
    else:
        # Phase 1: OpenAI server -- native tool_calls, placeholder tokens
        result = await make_agent(self.server).run(messages)

    # Skip reward computation if the agent loop produced no meaningful work
    # (e.g., API call failed on turn 1). No point spinning up a Modal sandbox
    # just to verify files that were never created.
    only_system_and_user = all(
        msg.get("role") in ("system", "user") for msg in result.messages
    )
    if result.turns_used == 0 or only_system_and_user:
        logger.warning(
            "Agent loop produced no output (turns=%d, msgs=%d). Skipping reward.",
            result.turns_used, len(result.messages),
        )
        reward = 0.0
    else:
        # Compute reward using ToolContext (gives verifier full tool access)
        ctx = ToolContext(task_id)
        try:
            reward = await self.compute_reward(item, result, ctx)
        except Exception as e:
            # Best-effort: a broken verifier scores 0.0 instead of killing the
            # rollout, but keep the traceback so the failure can be diagnosed.
            logger.exception("compute_reward failed: %s", e)
            reward = 0.0
        finally:
            ctx.cleanup()

    # Track tool errors for wandb logging
    if result.tool_errors:
        self._tool_error_buffer.extend(
            {
                "turn": err.turn,
                "tool": err.tool_name,
                "args": err.arguments[:150],
                "error": err.error[:300],
                "result": err.tool_result[:300],
            }
            for err in result.tool_errors
        )

    # Build ScoredDataItem from ManagedServer state
    # Phase 2: real tokens/masks/logprobs from SequenceNodes
    # Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
    nodes = (result.managed_state or {}).get("nodes", [])

    if nodes:
        # Phase 2 (or DummyManagedServer): use actual node data
        node = nodes[-1]  # Final sequence node = full trajectory
        scored_item: Dict[str, Any] = {
            "tokens": node.tokens,
            "masks": node.masked_tokens,
            "scores": reward,
        }

        # Include logprobs if available (Phase 2)
        if hasattr(node, "logprobs") and node.logprobs:
            scored_item["advantages"] = None  # Computed by trainer
            scored_item["ref_logprobs"] = None
    else:
        # Phase 1 with no managed state: create placeholder tokens
        # so the data pipeline doesn't break. These are NOT suitable
        # for training but allow process mode (SFT data gen) to work.
        # Tokenize the full conversation to get approximate tokens.
        full_text = "\n".join(
            msg.get("content", "") for msg in result.messages if msg.get("content")
        )
        if self.tokenizer:
            tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
        else:
            tokens = list(range(min(len(full_text) // 4, 128)))

        # Mask first token as prompt. With no tokens at all the mask must be
        # empty too, otherwise tokens and masks would differ in length.
        masks = ([-100] + tokens[1:]) if tokens else []

        scored_item = {
            "tokens": tokens,
            "masks": masks,
            "scores": reward,
        }

    # Always include messages for wandb rollout display and data logging
    scored_item["messages"] = result.messages

    return scored_item, []
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Abstract methods -- subclasses must implement
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def setup(self):
|
||||||
|
"""
|
||||||
|
Load dataset, initialize state.
|
||||||
|
|
||||||
|
Called once when the environment starts. Typical implementation:
|
||||||
|
self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
|
||||||
|
self.iter = 0
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
async def get_next_item(self) -> Item:
    """
    Produce the next dataset item to roll out (subclass hook).

    The base env's main loop calls this to obtain work for its workers;
    implementations should cycle through the dataset.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
def format_prompt(self, item: Item) -> str:
    """
    Turn a dataset item into the user message given to the agent (subclass hook).

    Args:
        item: Dataset item (dict, tuple, etc.)

    Returns:
        The prompt string to send to the agent

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
async def compute_reward(
    self, item: Item, result: AgentResult, ctx: ToolContext
) -> float:
    """
    Score one rollout (subclass hook).

    The verifier receives everything it needs:
    - item: the original dataset item (ground truth, test commands, etc.)
    - result: AgentResult with full messages, turn count, reasoning, etc.
    - ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
      browser, vision...) scoped to this rollout's sandbox. Nothing
      is off-limits.

    Args:
        item: The dataset item that was rolled out
        result: The agent's rollout result
        ctx: ToolContext with full tool access for verification

    Returns:
        Reward float (typically 0.0 to 1.0, but any float is valid)

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
async def evaluate(self, *args, **kwargs):
    """
    Run the periodic evaluation pass (subclass hook).

    Called every steps_per_eval steps. A typical implementation runs the
    agent on a held-out eval set and logs metrics via wandb/evaluate_log.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
|
||||||
0
environments/hermes_swe_env/__init__.py
Normal file
0
environments/hermes_swe_env/__init__.py
Normal file
34
environments/hermes_swe_env/default.yaml
Normal file
34
environments/hermes_swe_env/default.yaml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# SWE Environment -- Default Configuration
|
||||||
|
#
|
||||||
|
# SWE-bench style tasks with Modal sandboxes for cloud isolation.
|
||||||
|
# Uses terminal + file + web toolsets.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||||
|
# --config environments/hermes_swe_env/default.yaml
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal", "file", "web"]
|
||||||
|
max_agent_turns: 30
|
||||||
|
max_token_length: 4096
|
||||||
|
group_size: 4
|
||||||
|
terminal_backend: "modal"
|
||||||
|
tool_call_parser: "hermes"
|
||||||
|
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||||
|
dataset_name: "bigcode/humanevalpack"
|
||||||
|
dataset_split: "test"
|
||||||
|
prompt_field: "prompt"
|
||||||
|
steps_per_eval: 50
|
||||||
|
total_steps: 500
|
||||||
|
use_wandb: true
|
||||||
|
wandb_name: "hermes-swe"
|
||||||
|
system_prompt: >
|
||||||
|
You are a skilled software engineer. You have access to a terminal,
|
||||||
|
file tools, and web search. Use these tools to complete the coding task.
|
||||||
|
Write clean, working code and verify it runs correctly before finishing.
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "http://localhost:8000/v1"
|
||||||
|
model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||||
|
server_type: "openai"
|
||||||
|
api_key: ""
|
||||||
229
environments/hermes_swe_env/hermes_swe_env.py
Normal file
229
environments/hermes_swe_env/hermes_swe_env.py
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
"""
|
||||||
|
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
|
||||||
|
|
||||||
|
A concrete environment for software engineering tasks where the model writes code
|
||||||
|
and the reward function runs tests to verify correctness. Uses Modal terminal
|
||||||
|
backend for cloud-isolated sandboxes per rollout.
|
||||||
|
|
||||||
|
The reward function uses ToolContext.terminal() to run test commands in the same
|
||||||
|
Modal sandbox the model used during its agentic loop. All filesystem state from
|
||||||
|
the model's tool calls is preserved for verification.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Phase 1: OpenAI server type
|
||||||
|
vllm serve YourModel --tool-parser hermes
|
||||||
|
run-api
|
||||||
|
python environments/hermes_swe_env/hermes_swe_env.py serve \\
|
||||||
|
--openai.base_url http://localhost:8000/v1 \\
|
||||||
|
--openai.model_name YourModel \\
|
||||||
|
--openai.server_type openai \\
|
||||||
|
--env.dataset_name bigcode/humanevalpack \\
|
||||||
|
--env.terminal_backend modal
|
||||||
|
|
||||||
|
# Phase 2: VLLM server type (full RL training)
|
||||||
|
python environments/hermes_swe_env/hermes_swe_env.py serve \\
|
||||||
|
--openai.base_url http://localhost:8000/v1 \\
|
||||||
|
--openai.model_name YourModel \\
|
||||||
|
--openai.server_type vllm \\
|
||||||
|
--env.tool_call_parser hermes \\
|
||||||
|
--env.terminal_backend modal
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
# Ensure repo root is on sys.path for imports
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
from atroposlib.envs.base import ScoredDataGroup
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
from atroposlib.type_definitions import Item
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HermesSweEnvConfig(HermesAgentEnvConfig):
    """Config with defaults for SWE-bench style tasks."""

    # No new fields: everything is inherited from HermesAgentEnvConfig, and the
    # SWE-specific default values are supplied by HermesSweEnv.config_init().
|
||||||
|
|
||||||
|
|
||||||
|
class HermesSweEnv(HermesAgentBaseEnv):
    """
    SWE-bench style environment using Modal terminal backend.

    The model gets a coding task, uses terminal + file + web tools to solve it,
    and the reward function runs tests in the same Modal sandbox to verify.

    Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
    and customize format_prompt() and compute_reward() as needed.
    """

    name = "hermes-swe"
    env_config_cls = HermesSweEnvConfig

    @classmethod
    def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
        """
        Default configuration for the SWE environment.

        Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.

        Returns:
            The environment config and a one-element list of API server configs.
        """
        env_config = HermesSweEnvConfig(
            # Toolsets: terminal for running code, file for reading/writing, web for docs
            enabled_toolsets=["terminal", "file", "web"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings -- SWE tasks need more turns
            max_agent_turns=30,
            max_token_length=4096,
            agent_temperature=1.0,
            system_prompt=(
                "You are a skilled software engineer. You have access to a terminal, "
                "file tools, and web search. Use these tools to complete the coding task. "
                "Write clean, working code and verify it runs correctly before finishing."
            ),
            # Modal backend for cloud-isolated sandboxes
            terminal_backend="modal",
            # Dataset -- override via CLI for your specific SWE dataset
            dataset_name="bigcode/humanevalpack",
            dataset_split="test",
            prompt_field="prompt",
            # Atropos settings
            group_size=4,
            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
            tool_call_parser="hermes",
            steps_per_eval=50,
            total_steps=500,
            use_wandb=True,
            wandb_name="hermes-swe",
        )

        server_configs = [
            APIServerConfig(
                base_url="http://localhost:8000/v1",
                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
                server_type="openai",  # Phase 1; switch to "vllm" for Phase 2
                api_key="",
            )
        ]

        return env_config, server_configs

    @staticmethod
    def _get_test_code(item: Dict[str, Any]) -> Any:
        """
        Return the first non-empty test payload among 'test', 'test_code', 'tests'.

        Falls through on None/empty values as well as on missing keys, so a
        dataset row with ``test=None`` but a populated ``test_code`` still
        yields its tests. Returns "" when none of the fields has content.
        """
        return item.get("test") or item.get("test_code") or item.get("tests") or ""

    async def setup(self):
        """Load the SWE dataset and reset iteration/reward state."""
        if self.config.dataset_name:
            self.dataset = load_dataset(
                self.config.dataset_name, split=self.config.dataset_split
            )
        else:
            # Placeholder if no dataset specified
            self.dataset = []
        self.iter = 0
        self.reward_buffer: List[float] = []

    async def get_next_item(self) -> Dict[str, Any]:
        """
        Cycle through the SWE dataset.

        Raises:
            ValueError: If the dataset is empty / was never loaded.
        """
        if not self.dataset:
            raise ValueError("No dataset loaded. Set dataset_name in config.")
        item = self.dataset[self.iter % len(self.dataset)]
        self.iter += 1
        return item

    def format_prompt(self, item: Dict[str, Any]) -> str:
        """
        Format the SWE task prompt.

        Override this in subclasses for different dataset formats.
        Default reads the field named by config.prompt_field and, when the
        item carries test information, appends it under "Tests to pass:".
        """
        prompt = item.get(self.config.prompt_field, "")

        # If the dataset has test information, include it in the prompt
        test_info = self._get_test_code(item)
        if test_info:
            prompt += f"\n\nTests to pass:\n{test_info}"

        return prompt

    async def compute_reward(
        self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Score by running tests in the model's Modal sandbox.

        Default implementation:
        - If the dataset item has a 'test', 'test_code' or 'tests' field, run it
        - Check exit code: 0 = pass (reward 1.0)
        - Partial credit (0.1) if recently-modified Python files exist
        - Otherwise 0.0

        Every returned reward is also appended to self.reward_buffer.

        Override this in subclasses for more sophisticated reward logic.
        """
        import shlex

        # Find the test command from the dataset item
        test_code = self._get_test_code(item)

        if test_code:
            # Run the test in the model's sandbox. The test source is arbitrary
            # Python (quotes, $, backticks, newlines), so it must be shell-quoted
            # rather than pasted between double quotes.
            test_result = ctx.terminal(
                f"cd /workspace && python3 -c {shlex.quote(str(test_code))}",
                timeout=60,
            )

            if test_result["exit_code"] == 0:
                self.reward_buffer.append(1.0)
                return 1.0

        # Partial credit: check if the model created any Python files
        # NOTE(review): nothing in this class creates /tmp/.start_marker; if it
        # is absent, find's error is discarded and no partial credit is given --
        # confirm the marker is written elsewhere.
        file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
        if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
            self.reward_buffer.append(0.1)
            return 0.1

        self.reward_buffer.append(0.0)
        return 0.0

    async def evaluate(self, *args, **kwargs):
        """
        Run evaluation on a held-out set.

        This default only logs a placeholder metric; override for
        dataset-specific evaluation logic.
        """
        start_time = time.time()
        end_time = time.time()

        eval_metrics = {"eval/placeholder": 0.0}
        await self.evaluate_log(
            metrics=eval_metrics,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """
        Log SWE-specific metrics.

        Adds the average reward and pass rate (fraction of rewards equal to
        1.0) over the buffered rewards, clears the buffer, then delegates to
        the parent wandb_log.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
                self.reward_buffer
            )
            wandb_metrics["train/pass_rate"] = sum(
                1 for r in self.reward_buffer if r == 1.0
            ) / len(self.reward_buffer)
            self.reward_buffer = []

        await super().wandb_log(wandb_metrics)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
HermesSweEnv.cli()
|
||||||
188
environments/patches.py
Normal file
188
environments/patches.py
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
"""
|
||||||
|
Monkey patches for making hermes-agent tools work inside async frameworks (Atropos).
|
||||||
|
|
||||||
|
Problem:
|
||||||
|
Some tools use asyncio.run() internally (e.g., mini-swe-agent's Modal backend,
|
||||||
|
web_extract). This crashes when called from inside Atropos's event loop because
|
||||||
|
asyncio.run() can't be nested.
|
||||||
|
|
||||||
|
Solution:
|
||||||
|
Replace the problematic methods with versions that use a dedicated background
|
||||||
|
thread with its own event loop. The calling code sees the same sync interface --
|
||||||
|
call a function, get a result -- but internally the async work happens on a
|
||||||
|
separate thread that doesn't conflict with Atropos's loop.
|
||||||
|
|
||||||
|
These patches are safe for normal CLI use too: when there's no running event
|
||||||
|
loop, the behavior is identical (the background thread approach works regardless).
|
||||||
|
|
||||||
|
What gets patched:
|
||||||
|
- SwerexModalEnvironment.__init__ -- creates Modal deployment on a background thread
|
||||||
|
- SwerexModalEnvironment.execute -- runs commands on the same background thread
|
||||||
|
- SwerexModalEnvironment.stop -- stops deployment on the background thread
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
Call apply_patches() once at import time (done automatically by hermes_base_env.py).
|
||||||
|
This is idempotent -- calling it multiple times is safe.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_patches_applied = False
|
||||||
|
|
||||||
|
|
||||||
|
class _AsyncWorker:
    """
    A dedicated background thread with its own event loop.

    Allows sync code to submit async coroutines and block for results,
    even when called from inside another running event loop. Used to
    bridge sync tool interfaces with async backends (Modal, SWE-ReX).
    """

    def __init__(self):
        self._loop: asyncio.AbstractEventLoop | None = None
        self._thread: threading.Thread | None = None
        self._started = threading.Event()

    def start(self):
        """
        Start the background event loop thread.

        Raises:
            RuntimeError: If the loop has not come up within 30 seconds.
        """
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        if not self._started.wait(timeout=30):
            raise RuntimeError("AsyncWorker loop failed to start within 30 seconds")

    def _run_loop(self):
        """Background thread entry point -- runs the loop until stop(), then closes it."""
        loop = asyncio.new_event_loop()
        self._loop = loop
        asyncio.set_event_loop(loop)
        self._started.set()
        try:
            loop.run_forever()
        finally:
            # Release the loop's selector/file descriptors once it is stopped;
            # run_coroutine() then rejects further work via is_closed().
            loop.close()

    def run_coroutine(self, coro, timeout=600):
        """
        Submit a coroutine to the background loop and block until it completes.

        Safe to call from any thread, including threads that already have
        a running event loop.

        Args:
            coro: Coroutine object to run on the worker's loop.
            timeout: Seconds to wait for the result.

        Returns:
            Whatever the coroutine returns.

        Raises:
            RuntimeError: If the worker loop was never started or is closed.
            TimeoutError: If no result arrives in time; the submitted work is
                cancelled rather than left running in the background.
        """
        loop = self._loop
        if loop is None or loop.is_closed():
            # The coroutine will never be scheduled; close it so Python does
            # not emit a "coroutine ... was never awaited" warning.
            close = getattr(coro, "close", None)
            if close is not None:
                close()
            raise RuntimeError("AsyncWorker loop is not running")

        future = asyncio.run_coroutine_threadsafe(coro, loop)
        try:
            return future.result(timeout=timeout)
        finally:
            # Only reachable un-done on timeout/interrupt: stop the orphaned task.
            if not future.done():
                future.cancel()

    def stop(self):
        """Stop the background event loop and join the thread."""
        loop, thread = self._loop, self._thread
        if loop is not None and loop.is_running():
            loop.call_soon_threadsafe(loop.stop)
        if thread is not None:
            thread.join(timeout=10)
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_swerex_modal():
    """
    Monkey patch SwerexModalEnvironment to use a background thread event loop
    instead of asyncio.run(). This makes it safe to call from inside Atropos's
    async event loop.

    Replaces ``__init__``, ``execute`` and ``stop`` on the class. The patched
    methods keep the same call interface; the difference is that the async
    work runs on a per-instance _AsyncWorker thread.

    Does nothing (beyond a debug log) when mini-swe-agent or swe-rex cannot
    be imported.
    """
    try:
        from minisweagent.environments.extra.swerex_modal import (
            SwerexModalEnvironment,
            SwerexModalEnvironmentConfig,
        )
        from swerex.deployment.modal import ModalDeployment
        from swerex.runtime.abstract import Command as RexCommand
    except ImportError:
        # mini-swe-agent or swe-rex not installed -- nothing to patch
        logger.debug("mini-swe-agent Modal backend not available, skipping patch")
        return

    def _patched_init(self, **kwargs):
        """Patched __init__: creates Modal deployment on a background thread."""
        self.config = SwerexModalEnvironmentConfig(**kwargs)

        # Start a dedicated event loop thread for all Modal async operations
        self._worker = _AsyncWorker()
        self._worker.start()

        # Create AND start the deployment entirely on the worker's loop/thread
        # so all gRPC channels and async state are bound to that loop
        async def _create_and_start():
            deployment = ModalDeployment(
                image=self.config.image,
                startup_timeout=self.config.startup_timeout,
                runtime_timeout=self.config.runtime_timeout,
                deployment_timeout=self.config.deployment_timeout,
                install_pipx=self.config.install_pipx,
                modal_sandbox_kwargs=self.config.modal_sandbox_kwargs,
            )
            await deployment.start()
            return deployment

        try:
            self.deployment = self._worker.run_coroutine(_create_and_start())
        except BaseException:
            # Construction failed, so nobody will ever call stop() on this
            # instance -- shut the worker thread down here, then re-raise.
            self._worker.stop()
            raise

    def _patched_execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
        """
        Patched execute: runs commands on the background thread's loop.

        Returns:
            Dict with "output" (the command's stdout, streams merged) and
            "returncode" (its exit code).
        """
        async def _do_execute():
            return await self.deployment.runtime.execute(
                RexCommand(
                    command=command,
                    shell=True,
                    check=False,
                    cwd=cwd or self.config.cwd,
                    timeout=timeout or self.config.timeout,
                    merge_output_streams=True,
                    env=self.config.env if self.config.env else None,
                )
            )

        output = self._worker.run_coroutine(_do_execute())
        return {
            "output": output.stdout,
            "returncode": output.exit_code,
        }

    def _patched_stop(self):
        """Patched stop: stops deployment on the background thread, then stops the thread."""
        try:
            self._worker.run_coroutine(
                asyncio.wait_for(self.deployment.stop(), timeout=10),
                timeout=15,
            )
        except Exception as exc:
            # Teardown stays best-effort (never raises), but leave a trace so a
            # sandbox that failed to stop can be found in the logs.
            logger.debug("Ignoring error while stopping Modal deployment: %r", exc)
        finally:
            self._worker.stop()

    # Apply the patches
    SwerexModalEnvironment.__init__ = _patched_init
    SwerexModalEnvironment.execute = _patched_execute
    SwerexModalEnvironment.stop = _patched_stop

    logger.debug("Patched SwerexModalEnvironment for async-safe operation")
|
||||||
|
|
||||||
|
|
||||||
|
def apply_patches():
    """
    Install every monkey patch required for Atropos compatibility.

    Idempotent: a module-level flag records that the patches have been
    installed, so repeated calls return without doing anything.
    """
    global _patches_applied

    if not _patches_applied:
        _patch_swerex_modal()
        _patches_applied = True
|
||||||
0
environments/terminal_test_env/__init__.py
Normal file
0
environments/terminal_test_env/__init__.py
Normal file
34
environments/terminal_test_env/default.yaml
Normal file
34
environments/terminal_test_env/default.yaml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Terminal Test Environment -- Default Configuration
|
||||||
|
#
|
||||||
|
# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
|
||||||
|
# Uses Modal terminal backend and OpenRouter (Claude) for inference.
|
||||||
|
# API keys loaded from ~/hermes-agent/.env
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# run-api
|
||||||
|
# python environments/terminal_test_env/terminal_test_env.py serve \
|
||||||
|
# --config environments/terminal_test_env/default.yaml
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal", "file"]
|
||||||
|
max_agent_turns: 10
|
||||||
|
max_token_length: 2048
|
||||||
|
group_size: 3
|
||||||
|
total_steps: 3
|
||||||
|
steps_per_eval: 3
|
||||||
|
terminal_backend: "modal"
|
||||||
|
tool_call_parser: "hermes"
|
||||||
|
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||||
|
ensure_scores_are_not_same: false
|
||||||
|
use_wandb: false
|
||||||
|
system_prompt: >
|
||||||
|
You are a helpful assistant with access to a terminal and file tools.
|
||||||
|
Complete the user's request by using the available tools.
|
||||||
|
Be precise and follow instructions exactly.
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
model_name: "anthropic/claude-opus-4.6"
|
||||||
|
server_type: "openai"
|
||||||
|
health_check: false
|
||||||
|
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||||
292
environments/terminal_test_env/terminal_test_env.py
Normal file
292
environments/terminal_test_env/terminal_test_env.py
Normal file
@@ -0,0 +1,292 @@
|
|||||||
|
"""
|
||||||
|
TerminalTestEnv -- Simple Test Environment for Validating the Stack
|
||||||
|
|
||||||
|
A self-contained environment with inline tasks (no external dataset needed).
|
||||||
|
Each task asks the model to create a file at a known path with specific content.
|
||||||
|
The reward verifier cats the file and checks if the content matches.
|
||||||
|
|
||||||
|
Enables only terminal + file toolsets. Uses Modal terminal backend with
|
||||||
|
OpenRouter (Claude) by default.
|
||||||
|
|
||||||
|
Training tasks (3):
|
||||||
|
1. Create ~/greeting.txt with "Hello from Hermes Agent"
|
||||||
|
2. Create ~/count.txt with numbers 1-5, one per line
|
||||||
|
3. Create ~/answer.txt with the result of 123 + 456
|
||||||
|
|
||||||
|
Eval task (1):
|
||||||
|
1. Create ~/result.txt with the result of 6 * 7
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Start Atropos API server
|
||||||
|
run-api
|
||||||
|
|
||||||
|
# Run environment (uses OpenRouter + Modal by default)
|
||||||
|
python environments/terminal_test_env/terminal_test_env.py serve
|
||||||
|
|
||||||
|
# Process mode (no run-api needed, saves to JSONL)
|
||||||
|
python environments/terminal_test_env/terminal_test_env.py process \\
|
||||||
|
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
# Ensure repo root is on sys.path for imports
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from atroposlib.envs.base import ScoredDataGroup
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
from atroposlib.type_definitions import Item
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Inline task definitions -- no external dataset needed
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Training tasks. Every entry carries the prompt given to the model, the path
# that is read back afterwards, and the content that file is expected to hold.
TRAIN_TASKS = [
    dict(
        prompt="Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
        verify_path="~/greeting.txt",
        expected_content="Hello from Hermes Agent",
    ),
    dict(
        prompt="Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
        verify_path="~/count.txt",
        expected_content="1\n2\n3\n4\n5",
    ),
    dict(
        prompt="Create a file at ~/answer.txt containing the result of 123 + 456",
        verify_path="~/answer.txt",
        expected_content="579",
    ),
]
|
||||||
|
|
||||||
|
# Evaluation tasks, using the same three keys as the training tasks.
EVAL_TASKS = [
    dict(
        prompt="Create a file at ~/result.txt containing the result of 6 * 7",
        verify_path="~/result.txt",
        expected_content="42",
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
class TerminalTestEnvConfig(HermesAgentEnvConfig):
    """Configuration type for the terminal test environment.

    Declares no fields of its own: everything is inherited from
    ``HermesAgentEnvConfig``. The test-specific default values are supplied
    where the config is instantiated (``config_init`` on the environment).
    """
|
||||||
|
|
||||||
|
|
||||||
|
class TerminalTestEnv(HermesAgentBaseEnv):
    """
    Simple test environment with inline file-creation tasks.

    All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
    The reward function runs `cat ~/X.txt` through the rollout's ToolContext and
    compares the output with the expected string, so one verifier serves every task.

    This environment is designed to validate the full stack end-to-end:
    - Agent loop executes tool calls (terminal/file)
    - ToolContext provides terminal access to the reward function
    - Reward function verifies file content via cat
    - Scored data flows through the Atropos pipeline
    """

    name = "terminal-test"
    env_config_cls = TerminalTestEnvConfig

    @classmethod
    def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
        """
        Build the default configuration for the terminal test environment.

        Uses the Modal terminal backend and an OpenRouter-hosted Claude model.
        The API key is read from the OPENROUTER_API_KEY environment variable
        (empty string when unset).

        Returns:
            Tuple of (environment config, list with a single API server config).
        """
        env_config = TerminalTestEnvConfig(
            # Terminal + file tools only
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings
            max_agent_turns=10,  # Simple tasks, don't need many turns
            max_token_length=16000,
            agent_temperature=1.0,
            system_prompt=(
                "You are a helpful assistant with access to a terminal and file tools. "
                "Complete the user's request by using the available tools. "
                "Be precise and follow instructions exactly."
            ),
            # Modal terminal backend for cloud-isolated sandboxes per rollout
            terminal_backend="modal",
            # Atropos settings
            group_size=3,  # 3 rollouts per group
            tokenizer_name="NousResearch/q-30b-t-h45-e1",
            tool_call_parser="hermes",
            steps_per_eval=3,  # Eval after all 3 steps
            total_steps=3,  # 3 groups total (1 group per step)
            use_wandb=True,
            wandb_name="terminal-test",
            ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
            # No external dataset
            dataset_name=None,
        )

        # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
        server_configs = [
            APIServerConfig(
                base_url="https://openrouter.ai/api/v1",
                model_name="anthropic/claude-opus-4.6",
                server_type="openai",
                api_key=os.getenv("OPENROUTER_API_KEY", ""),
                health_check=False,  # OpenRouter doesn't have a /health endpoint
            )
        ]

        return env_config, server_configs

    async def setup(self):
        """Copy the inline task lists onto the instance and reset counters."""
        self.train_tasks = list(TRAIN_TASKS)
        self.eval_tasks = list(EVAL_TASKS)
        self.iter = 0
        # Rewards collected since the last wandb_log() call
        self.reward_buffer: List[float] = []

    async def get_next_item(self) -> Dict[str, str]:
        """Return the next training task, cycling through the list round-robin."""
        item = self.train_tasks[self.iter % len(self.train_tasks)]
        self.iter += 1
        return item

    def format_prompt(self, item: Dict[str, str]) -> str:
        """Return the user prompt, which is stored verbatim in the task item."""
        return item["prompt"]

    @staticmethod
    def _score_content(actual: str, expected: str) -> float:
        """Grade file content: 1.0 exact match, 0.5 if expected is contained, else 0.0."""
        if actual == expected:
            return 1.0
        if expected in actual:
            return 0.5
        return 0.0

    async def compute_reward(
        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Verify a rollout by cat-ing the task's file and comparing its content.

        The same verifier is used for every task, since each one writes a file
        at a known path. Every computed reward is also appended to
        ``self.reward_buffer`` for later logging.

        Args:
            item: Task dict with "verify_path" and "expected_content".
            result: Agent result for the rollout (not inspected here).
            ctx: Tool context used to run the `cat` command.

        Returns:
            1.0 = exact match (after stripping surrounding whitespace)
            0.5 = expected content is present but has extra stuff
            0.0 = file doesn't exist / can't be read, or content doesn't match
        """
        # NOTE(review): the path is interpolated unquoted; this presumably relies on
        # a shell expanding the leading "~" -- confirm against ToolContext.terminal.
        verify_result = ctx.terminal(f"cat {item['verify_path']}")

        if verify_result["exit_code"] != 0:
            # File doesn't exist or can't be read
            reward = 0.0
        else:
            actual = verify_result.get("output", "").strip()
            expected = item["expected_content"].strip()
            reward = self._score_content(actual, expected)

        self.reward_buffer.append(reward)
        return reward

    async def evaluate(self, *args, **kwargs):
        """
        Collect one single-turn completion per eval task and log the samples.

        This deliberately does not run the full agent loop (that path is
        exercised by training), so no tools are executed and nothing is scored.
        Each logged sample holds the prompt, the raw response text (or an
        "ERROR: ..." marker if the request failed) and the expected content.
        The only metric reported is "eval/num_samples".
        """
        start_time = time.time()
        total = len(self.eval_tasks)
        samples = []

        for eval_item in self.eval_tasks:
            try:
                # For eval, we do a simple single-turn completion (not full agent loop)
                # to keep eval fast. The agent loop is tested via training.
                completion = await self.server.chat_completion(
                    messages=[
                        {"role": "system", "content": self.config.system_prompt or ""},
                        {"role": "user", "content": eval_item["prompt"]},
                    ],
                    n=1,
                    max_tokens=self.config.max_token_length,
                    temperature=0.0,
                    split="eval",
                )
                response_content = (
                    completion.choices[0].message.content if completion.choices else ""
                )
            except Exception as e:
                # Best-effort: a failed request is recorded as a sample, not raised
                logger.error("Eval failed for item: %s", e)
                response_content = f"ERROR: {e}"

            samples.append(
                {
                    "prompt": eval_item["prompt"],
                    "response": response_content,
                    "expected": eval_item["expected_content"],
                }
            )

        end_time = time.time()

        eval_metrics = {
            "eval/num_samples": total,
        }

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """
        Add reward statistics to the metrics dict and forward it to the base class.

        When rewards have been buffered since the previous call, the average
        reward, exact-match rate, partial-match rate and rollout count are added
        under "train/..." keys and the buffer is cleared.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            total = len(self.reward_buffer)
            correct = sum(1 for r in self.reward_buffer if r == 1.0)
            partial = sum(1 for r in self.reward_buffer if r == 0.5)

            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
            wandb_metrics["train/accuracy"] = correct / total
            wandb_metrics["train/partial_match_rate"] = partial / total
            wandb_metrics["train/total_rollouts"] = total
            self.reward_buffer = []

        await super().wandb_log(wandb_metrics)


if __name__ == "__main__":
    TerminalTestEnv.cli()
|
||||||
120
environments/tool_call_parsers/__init__.py
Normal file
120
environments/tool_call_parsers/__init__.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
"""
|
||||||
|
Tool Call Parser Registry
|
||||||
|
|
||||||
|
Client-side parsers that extract structured tool_calls from raw model output text.
|
||||||
|
Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
|
||||||
|
raw text without tool call parsing.
|
||||||
|
|
||||||
|
Each parser is a standalone reimplementation of the corresponding VLLM parser's
|
||||||
|
non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
|
||||||
|
(re, json, uuid) and openai types.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from environments.tool_call_parsers import get_parser
|
||||||
|
|
||||||
|
parser = get_parser("hermes")
|
||||||
|
content, tool_calls = parser.parse(raw_model_output)
|
||||||
|
# content = text with tool call markup stripped
|
||||||
|
# tool_calls = list of ChatCompletionMessageToolCall objects, or None
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Dict, List, Optional, Tuple, Type
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Type alias for parser return value
|
||||||
|
ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
|
||||||
|
|
||||||
|
|
||||||
|
class ToolCallParser(ABC):
    """
    Abstract interface shared by all tool call parsers.

    A concrete subclass understands one model family's raw output format and
    turns it into structured tool_calls.
    """

    @abstractmethod
    def parse(self, text: str) -> ParseResult:
        """
        Extract tool calls from raw model output.

        Args:
            text: Raw decoded text from the model's completion

        Returns:
            A ``(content, tool_calls)`` pair:
                - content: the text with tool call markup removed (what goes in
                  the message 'content' field), or None when the whole output
                  consisted of tool calls
                - tool_calls: a list of ChatCompletionMessageToolCall objects,
                  or None when no tool calls were found
        """
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
# Global parser registry: name -> parser class
|
||||||
|
PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def register_parser(name: str):
    """
    Build a class decorator that stores the decorated parser in PARSER_REGISTRY.

    The class is returned unchanged, so decorators can be stacked to register
    one class under several names. Registering a name twice overwrites the
    earlier entry.

    Usage:
        @register_parser("hermes")
        class HermesToolCallParser(ToolCallParser):
            ...
    """

    def _register(parser_cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
        PARSER_REGISTRY[name] = parser_cls
        return parser_cls

    return _register
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(name: str) -> ToolCallParser:
    """
    Look up a registered parser class and return a fresh instance of it.

    Args:
        name: Parser name (e.g., "hermes", "mistral", "llama3_json")

    Returns:
        Instantiated parser

    Raises:
        KeyError: If parser name is not found in registry; the message lists
            the names that are registered.
    """
    parser_cls = PARSER_REGISTRY.get(name)
    if parser_cls is None:
        available = sorted(PARSER_REGISTRY.keys())
        raise KeyError(
            f"Tool call parser '{name}' not found. Available parsers: {available}"
        )
    return parser_cls()
|
||||||
|
|
||||||
|
|
||||||
|
def list_parsers() -> List[str]:
    """Return the names of all registered parsers in sorted order."""
    return sorted(PARSER_REGISTRY)
|
||||||
|
|
||||||
|
|
||||||
|
# Import all parser modules to trigger registration via @register_parser decorators
|
||||||
|
# Each module registers itself when imported
|
||||||
|
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401
|
||||||
|
from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401
|
||||||
72
environments/tool_call_parsers/deepseek_v3_1_parser.py
Normal file
72
environments/tool_call_parsers/deepseek_v3_1_parser.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""
|
||||||
|
DeepSeek V3.1 tool call parser.
|
||||||
|
|
||||||
|
Similar to V3 but with a slightly different format:
|
||||||
|
<|tool▁call▁begin|>function_name<|tool▁sep|>arguments<|tool▁call▁end|>
|
||||||
|
|
||||||
|
Note: V3 has type+name before the separator, V3.1 has name before and args after.
|
||||||
|
|
||||||
|
Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("deepseek_v3_1")
@register_parser("deepseek_v31")
class DeepSeekV31ToolCallParser(ToolCallParser):
    """
    Parser for DeepSeek V3.1 tool calls.

    Each call has the form
        <｜tool▁call▁begin｜>function_name<｜tool▁sep｜>arguments<｜tool▁call▁end｜>
    i.e. the function name comes before the separator and the arguments after
    it (no type field, no json code block wrapper).

    The special tokens contain a fullwidth vertical bar (U+FF5C) and a
    lower-one-eighth block (U+2581), not an ASCII "|" or "_".
    NOTE(review): confirm the token spelling against the model's tokenizer.
    """

    # Tokens are spelled with explicit escapes so the non-ASCII characters cannot
    # be silently normalized to ASCII look-alikes by editors or tooling.
    START_TOKEN = "<\uff5ctool\u2581calls\u2581begin\uff5c>"
    _CALL_BEGIN = "<\uff5ctool\u2581call\u2581begin\uff5c>"
    _CALL_SEP = "<\uff5ctool\u2581sep\uff5c>"
    _CALL_END = "<\uff5ctool\u2581call\u2581end\uff5c>"

    # Regex captures: function_name, function_arguments.
    # The literal tokens go through re.escape so that no character in them
    # (in particular a "|") can ever be interpreted as regex syntax.
    PATTERN = re.compile(
        re.escape(_CALL_BEGIN)
        + r"(?P<function_name>.*?)"
        + re.escape(_CALL_SEP)
        + r"(?P<function_arguments>.*?)"
        + re.escape(_CALL_END),
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract DeepSeek V3.1 tool calls from raw model output.

        Args:
            text: Raw decoded completion text.

        Returns:
            ``(content, tool_calls)``. When no tool call section or no complete
            call is found, or if building the calls fails, the input text is
            returned unchanged with ``None``. Otherwise content is the stripped
            text preceding the section start token (``None`` if empty) and
            tool_calls holds one entry per matched call, with the argument text
            passed through as-is.
        """
        if self.START_TOKEN not in text:
            return text, None

        try:
            matches = self.PATTERN.findall(text)
            if not matches:
                return text, None

            tool_calls: List[ChatCompletionMessageToolCall] = [
                ChatCompletionMessageToolCall(
                    id=f"call_{uuid.uuid4().hex[:8]}",
                    type="function",
                    function=Function(
                        name=func_name.strip(),
                        arguments=func_args.strip(),
                    ),
                )
                for func_name, func_args in matches
            ]

            # Content is everything before the tool calls section
            content = text[: text.find(self.START_TOKEN)].strip()
            return content if content else None, tool_calls

        except Exception:
            # Best-effort parser: on any failure hand the raw text back untouched
            return text, None
|
||||||
76
environments/tool_call_parsers/deepseek_v3_parser.py
Normal file
76
environments/tool_call_parsers/deepseek_v3_parser.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""
|
||||||
|
DeepSeek V3 tool call parser.
|
||||||
|
|
||||||
|
Format uses special unicode tokens:
|
||||||
|
<|tool▁calls▁begin|>
|
||||||
|
<|tool▁call▁begin|>type<|tool▁sep|>function_name
|
||||||
|
```json
|
||||||
|
{"arg": "value"}
|
||||||
|
```
|
||||||
|
<|tool▁call▁end|>
|
||||||
|
<|tool▁calls▁end|>
|
||||||
|
|
||||||
|
Based on VLLM's DeepSeekV3ToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("deepseek_v3")
class DeepSeekV3ToolCallParser(ToolCallParser):
    """
    Parser for DeepSeek V3 tool calls.

    Expected layout of one call inside the tool calls section:
        <｜tool▁call▁begin｜>type<｜tool▁sep｜>function_name
        ```json
        {"arg": "value"}
        ```<｜tool▁call▁end｜>

    The special tokens contain a fullwidth vertical bar (U+FF5C) and a
    lower-one-eighth block (U+2581), not an ASCII "|" or "_".
    NOTE(review): confirm the token spelling against the model's tokenizer.
    """

    # Tokens are spelled with explicit escapes so the non-ASCII characters cannot
    # be silently normalized to ASCII look-alikes by editors or tooling.
    START_TOKEN = "<\uff5ctool\u2581calls\u2581begin\uff5c>"
    _CALL_BEGIN = "<\uff5ctool\u2581call\u2581begin\uff5c>"
    _CALL_SEP = "<\uff5ctool\u2581sep\uff5c>"
    _CALL_END = "<\uff5ctool\u2581call\u2581end\uff5c>"

    # Regex captures: type, function_name, function_arguments.
    # - Literal tokens go through re.escape so nothing in them is read as regex
    #   syntax.
    # - Groups are non-greedy: with re.DOTALL a greedy ".*" would run across
    #   call boundaries and collapse several tool calls into a single match.
    PATTERN = re.compile(
        re.escape(_CALL_BEGIN)
        + r"(?P<type>.*?)"
        + re.escape(_CALL_SEP)
        + r"(?P<function_name>.*?)\n```json\n(?P<function_arguments>.*?)\n```"
        + re.escape(_CALL_END),
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract DeepSeek V3 tool calls from raw model output.

        Args:
            text: Raw decoded completion text.

        Returns:
            ``(content, tool_calls)``. When no tool call section or no complete
            call is found, or if building the calls fails, the input text is
            returned unchanged with ``None``. Otherwise content is the stripped
            text preceding the section start token (``None`` if empty) and
            tool_calls holds one entry per matched call. The captured type is
            ignored; every call is emitted with type "function".
        """
        if self.START_TOKEN not in text:
            return text, None

        try:
            matches = self.PATTERN.findall(text)
            if not matches:
                return text, None

            tool_calls: List[ChatCompletionMessageToolCall] = [
                ChatCompletionMessageToolCall(
                    id=f"call_{uuid.uuid4().hex[:8]}",
                    type="function",
                    function=Function(
                        name=func_name.strip(),
                        arguments=func_args.strip(),
                    ),
                )
                for _tc_type, func_name, func_args in matches
            ]

            # Content is everything before the tool calls section
            content = text[: text.find(self.START_TOKEN)].strip()
            return content if content else None, tool_calls

        except Exception:
            # Best-effort parser: on any failure hand the raw text back untouched
            return text, None
|
||||||
109
environments/tool_call_parsers/glm45_parser.py
Normal file
109
environments/tool_call_parsers/glm45_parser.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
"""
|
||||||
|
GLM 4.5 (GLM-4-MoE) tool call parser.
|
||||||
|
|
||||||
|
Format uses custom arg_key/arg_value tags rather than standard JSON:
|
||||||
|
<tool_call>function_name
|
||||||
|
<arg_key>param1</arg_key><arg_value>value1</arg_value>
|
||||||
|
<arg_key>param2</arg_key><arg_value>value2</arg_value>
|
||||||
|
</tool_call>
|
||||||
|
|
||||||
|
Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
|
||||||
|
|
||||||
|
Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
def _deserialize_value(value: str) -> Any:
|
||||||
|
"""
|
||||||
|
Try to deserialize a string value to its native Python type.
|
||||||
|
Attempts json.loads, then ast.literal_eval, then returns raw string.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return json.loads(value)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
return ast.literal_eval(value)
|
||||||
|
except (ValueError, SyntaxError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("glm45")
class Glm45ToolCallParser(ToolCallParser):
    """
    Parser for GLM 4.5 (GLM-4-MoE) tool calls.

    A call is a <tool_call>...</tool_call> block whose first line is the
    function name, followed by <arg_key>/<arg_value> pairs rather than a JSON
    arguments object.
    """

    START_TOKEN = "<tool_call>"

    # One complete <tool_call> block
    FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
    # Group 1: function name (rest of the opening line); group 2: argument markup
    FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
    # Group 1: argument key; group 2: argument value
    FUNC_ARG_REGEX = re.compile(
        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract GLM tool calls from raw model output.

        Returns ``(text, None)`` when nothing parseable is found or any error
        occurs. Otherwise returns the stripped text before the first
        <tool_call> (``None`` if empty) together with the parsed calls, whose
        arguments are serialized to a JSON object string.
        """
        if self.START_TOKEN not in text:
            return text, None

        try:
            call_blocks = self.FUNC_CALL_REGEX.findall(text)
            if not call_blocks:
                return text, None

            tool_calls: List[ChatCompletionMessageToolCall] = []
            for call_block in call_blocks:
                detail = self.FUNC_DETAIL_REGEX.search(call_block)
                if detail is None:
                    # Block does not have the expected name/arguments layout
                    continue

                func_name = detail.group(1).strip()
                raw_args = detail.group(2)

                # The argument group may be empty or absent; treat both as "no args"
                key_value_pairs = self.FUNC_ARG_REGEX.findall(raw_args) if raw_args else []
                arguments: Dict[str, Any] = {
                    raw_key.strip(): _deserialize_value(raw_value.strip())
                    for raw_key, raw_value in key_value_pairs
                }

                call = ChatCompletionMessageToolCall(
                    id=f"call_{uuid.uuid4().hex[:8]}",
                    type="function",
                    function=Function(
                        name=func_name,
                        arguments=json.dumps(arguments, ensure_ascii=False),
                    ),
                )
                tool_calls.append(call)

            if not tool_calls:
                return text, None

            leading_text = text[: text.find(self.START_TOKEN)].strip()
            return leading_text if leading_text else None, tool_calls

        except Exception:
            # Best-effort parser: on any failure hand the raw text back untouched
            return text, None
|
||||||
35
environments/tool_call_parsers/glm47_parser.py
Normal file
35
environments/tool_call_parsers/glm47_parser.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""
|
||||||
|
GLM 4.7 tool call parser.
|
||||||
|
|
||||||
|
Same as GLM 4.5 but with slightly different regex patterns.
|
||||||
|
The tool_call tags may wrap differently and arg parsing handles
|
||||||
|
newlines between key/value pairs.
|
||||||
|
|
||||||
|
Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, register_parser
|
||||||
|
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("glm47")
class Glm47ToolCallParser(Glm45ToolCallParser):
    """
    Parser for GLM 4.7 tool calls.

    Reuses the GLM 4.5 parsing logic and only swaps in two regex patterns,
    which are set per instance in ``__init__``.
    """

    def __init__(self):
        super().__init__()

        # Detail pattern: group 1 is the function name, group 2 the optional
        # run of argument markup starting at the first <arg_key>. Unlike the
        # 4.5 pattern, no newline is required after the name.
        detail_pattern = r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>"

        # Argument pattern: between </arg_key> and <arg_value> accept any mix of
        # whitespace and the literal two-character sequence backslash + "n".
        arg_pattern = r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>"

        self.FUNC_DETAIL_REGEX = re.compile(detail_pattern, re.DOTALL)
        self.FUNC_ARG_REGEX = re.compile(arg_pattern, re.DOTALL)
|
||||||
73
environments/tool_call_parsers/hermes_parser.py
Normal file
73
environments/tool_call_parsers/hermes_parser.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
"""
|
||||||
|
Hermes tool call parser.
|
||||||
|
|
||||||
|
Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
|
||||||
|
Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
    """
    Parser for Hermes-format tool calls.

    Recognizes <tool_call>...</tool_call> blocks wrapping a JSON object with
    "name" and "arguments". A final <tool_call> with no closing tag (truncated
    generation) is also accepted.
    """

    # First alternative: properly closed block (group 1).
    # Second alternative: unclosed block running to end of text (group 2).
    PATTERN = re.compile(
        r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract Hermes tool calls from raw model output.

        Returns ``(text, None)`` when no tool call is found or when anything
        fails (for example one payload is not valid JSON or lacks "name").
        Otherwise returns the stripped text before the first <tool_call>
        (``None`` if empty) and the list of parsed calls, with arguments
        re-serialized as JSON (defaulting to an empty object).
        """
        if "<tool_call>" not in text:
            return text, None

        try:
            found = self.PATTERN.findall(text)
            if not found:
                return text, None

            tool_calls: List[ChatCompletionMessageToolCall] = []
            for closed_body, unclosed_body in found:
                # Exactly one of the two groups is populated for a given match
                payload = closed_body or unclosed_body
                if not payload.strip():
                    continue

                call_spec = json.loads(payload)
                arguments_json = json.dumps(
                    call_spec.get("arguments", {}), ensure_ascii=False
                )
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=f"call_{uuid.uuid4().hex[:8]}",
                        type="function",
                        function=Function(
                            name=call_spec["name"],
                            arguments=arguments_json,
                        ),
                    )
                )

            if not tool_calls:
                return text, None

            # Content is everything before the first <tool_call> tag
            leading_text = text[: text.find("<tool_call>")].strip()
            return leading_text if leading_text else None, tool_calls

        except Exception:
            # Best-effort parser: on any failure hand the raw text back untouched
            return text, None
|
||||||
93
environments/tool_call_parsers/kimi_k2_parser.py
Normal file
93
environments/tool_call_parsers/kimi_k2_parser.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""
|
||||||
|
Kimi K2 tool call parser.
|
||||||
|
|
||||||
|
Format:
|
||||||
|
<|tool_calls_section_begin|>
|
||||||
|
<|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
|
||||||
|
<|tool_calls_section_end|>
|
||||||
|
|
||||||
|
The function_id format is typically "functions.func_name:index" or "func_name:index".
|
||||||
|
|
||||||
|
Based on VLLM's KimiK2ToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("kimi_k2")
class KimiK2ToolCallParser(ToolCallParser):
    """
    Parser for Kimi K2 tool calls.

    Individual calls are delimited by tool_call begin/end tokens inside a
    section begin/end wrapper. The function name is recovered from the call ID:
    the part after the last dot and before the first colon.
    """

    # Support both singular and plural variants
    START_TOKENS = [
        "<|tool_calls_section_begin|>",
        "<|tool_call_section_begin|>",
    ]

    # Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments
    PATTERN = re.compile(
        r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
        r"<\|tool_call_argument_begin\|>\s*"
        r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
        r"<\|tool_call_end\|>",
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract Kimi K2 tool calls from raw model output.

        Returns ``(text, None)`` when no section start token or no complete call
        is present, or when anything fails. Otherwise returns the stripped text
        before the earliest section start token (``None`` if empty) and the
        parsed calls, each keeping the model's original call ID.
        """
        # Check for any variant of the start token
        if not any(token in text for token in self.START_TOKENS):
            return text, None

        try:
            matches = self.PATTERN.findall(text)
            if not matches:
                return text, None

            tool_calls: List[ChatCompletionMessageToolCall] = []
            for call_id, raw_arguments in matches:
                # "functions.get_weather:0" -> "get_weather"
                before_colon = call_id.split(":")[0]
                function_name = before_colon.split(".")[-1]

                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=call_id,  # Preserve the original ID format
                        type="function",
                        function=Function(
                            name=function_name,
                            arguments=raw_arguments.strip(),
                        ),
                    )
                )

            if not tool_calls:
                return text, None

            # Content is everything before the tool calls section, i.e. before
            # whichever start-token variant appears first
            token_positions = [text.find(token) for token in self.START_TOKENS]
            section_start = min(
                (pos for pos in token_positions if pos >= 0), default=len(text)
            )

            leading_text = text[:section_start].strip()
            return leading_text if leading_text else None, tool_calls

        except Exception:
            # Best-effort parser: on any failure hand the raw text back untouched
            return text, None
|
||||||
96
environments/tool_call_parsers/llama_parser.py
Normal file
96
environments/tool_call_parsers/llama_parser.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
"""
|
||||||
|
Llama 3.x / 4 tool call parser.
|
||||||
|
|
||||||
|
Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
|
||||||
|
May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
|
||||||
|
by content or semicolons.
|
||||||
|
|
||||||
|
Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("llama3_json")
@register_parser("llama4_json")
class LlamaToolCallParser(ToolCallParser):
    """
    Parser for Llama 3.x and 4 JSON-format tool calls.

    Scans the text for JSON objects that carry a "name" key plus either an
    "arguments" or a "parameters" key. Each candidate is decoded with
    json.JSONDecoder.raw_decode, so the objects may be embedded in free text
    and may follow one another with or without separators.
    """

    BOT_TOKEN = "<|python_tag|>"

    # Every "{" is a candidate start of a JSON object
    JSON_START = re.compile(r"\{")

    def parse(self, text: str) -> ParseResult:
        """
        Split ``text`` into leading content and JSON tool calls.

        Args:
            text: Raw model output.

        Returns:
            ``(text, None)`` when no tool call is found or parsing fails;
            otherwise ``(content_or_None, tool_calls)`` where content is the
            stripped text before the first tool call (or before BOT_TOKEN
            when that token precedes it).
        """
        # Quick check: need either the bot token or a JSON brace
        if self.BOT_TOKEN not in text and "{" not in text:
            return text, None

        try:
            decoder = json.JSONDecoder()
            tool_calls: List[ChatCompletionMessageToolCall] = []
            end_index = 0  # exclusive end offset of the last decoded object
            first_call_start = -1  # offset of the first object accepted as a tool call

            for match in self.JSON_START.finditer(text):
                start = match.start()
                # Skip braces nested inside an already decoded object. The
                # comparison is strict: a brace located exactly at end_index
                # starts a new, adjacent object and must not be skipped.
                if start < end_index:
                    continue

                try:
                    # Decoding in place (idx argument) avoids copying the
                    # tail of the text for every brace; the returned end
                    # offset is absolute. On failure end_index is untouched.
                    obj, end_index = decoder.raw_decode(text, start)

                    # Must have "name" and either "arguments" or "parameters"
                    name = obj.get("name")
                    args = obj.get("arguments", obj.get("parameters"))
                    if not name or args is None:
                        continue

                    # Normalize arguments to a JSON string
                    if not isinstance(args, str):
                        args = json.dumps(args, ensure_ascii=False)

                    tool_calls.append(
                        ChatCompletionMessageToolCall(
                            id=f"call_{uuid.uuid4().hex[:8]}",
                            type="function",
                            function=Function(name=name, arguments=args),
                        )
                    )
                    if first_call_start < 0:
                        first_call_start = start
                except (json.JSONDecodeError, KeyError, ValueError):
                    continue

            if not tool_calls:
                return text, None

            # Content ends where the first real tool call starts, not at the
            # first brace in the text (which may belong to ordinary prose).
            content_end = first_call_start
            bot_index = text.find(self.BOT_TOKEN)
            if 0 <= bot_index < content_end:
                content_end = bot_index

            # Whitespace-only content is reported as None, like the other parsers
            content = text[:content_end].strip() or None
            return content, tool_calls

        except Exception:
            # Best effort: any unexpected failure falls back to plain text
            return text, None
|
||||||
69
environments/tool_call_parsers/longcat_parser.py
Normal file
69
environments/tool_call_parsers/longcat_parser.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""
|
||||||
|
Longcat Flash Chat tool call parser.
|
||||||
|
|
||||||
|
Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
|
||||||
|
Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("longcat")
class LongcatToolCallParser(ToolCallParser):
    """
    Tool call parser for Longcat Flash Chat.

    Each call is a JSON object wrapped in <longcat_tool_call> tags; a final
    call whose closing tag is missing is accepted as well.
    """

    # Group 1: body of a closed tag pair; group 2: body after an unclosed opening tag
    PATTERN = re.compile(
        r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Split ``text`` into leading content and Longcat tool calls.

        Returns ``(text, None)`` when no usable tool call is present or when
        any call body fails to parse; otherwise ``(content_or_None,
        tool_calls)`` with content being the stripped text before the first
        opening tag.
        """
        if "<longcat_tool_call>" not in text:
            return text, None

        try:
            tool_calls: List[ChatCompletionMessageToolCall] = []
            for closed_body, open_body in self.PATTERN.findall(text):
                raw_json = closed_body or open_body
                if not raw_json.strip():
                    continue

                tc_data = json.loads(raw_json)
                call_name = tc_data["name"]
                call_arguments = json.dumps(
                    tc_data.get("arguments", {}), ensure_ascii=False
                )
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=f"call_{uuid.uuid4().hex[:8]}",
                        type="function",
                        function=Function(name=call_name, arguments=call_arguments),
                    )
                )

            if not tool_calls:
                return text, None

            first_tag = text.find("<longcat_tool_call>")
            leading = text[:first_tag].strip()
            return (leading or None), tool_calls

        except Exception:
            return text, None
|
||||||
130
environments/tool_call_parsers/mistral_parser.py
Normal file
130
environments/tool_call_parsers/mistral_parser.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""
|
||||||
|
Mistral tool call parser.
|
||||||
|
|
||||||
|
Supports two formats depending on tokenizer version:
|
||||||
|
- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
|
||||||
|
- v11+: content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
|
||||||
|
|
||||||
|
Based on VLLM's MistralToolParser.extract_tool_calls()
|
||||||
|
The [TOOL_CALLS] token is the bot_token used by Mistral models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_mistral_id() -> str:
|
||||||
|
"""Mistral tool call IDs are 9-char alphanumeric strings."""
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
|
return "".join(random.choices(string.ascii_letters + string.digits, k=9))
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("mistral")
class MistralToolCallParser(ToolCallParser):
    """
    Parser for Mistral-format tool calls.

    Detects the format from the text that follows the first [TOOL_CALLS]
    token: a leading '[' or '{' means the pre-v11 JSON form, anything else is
    treated as the v11+ ``tool_name{args}`` form.
    """

    # The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
    BOT_TOKEN = "[TOOL_CALLS]"

    # Kept so existing references to this attribute keep working. parse() no
    # longer uses it: the non-greedy "\{.*?\}" stops at the first "}", so it
    # can never capture an object whose "arguments" value is itself an object.
    TOOL_CALL_REGEX = re.compile(r"\[?\s*(\{.*?\})\s*\]?", re.DOTALL)

    @staticmethod
    def _build_tool_call(tc: Any) -> Optional[ChatCompletionMessageToolCall]:
        """Turn one decoded pre-v11 entry into a tool call, or None if it is unusable."""
        if not isinstance(tc, dict):
            return None
        name = tc.get("name")
        if not isinstance(name, str) or not name:
            return None

        args = tc.get("arguments")
        if args is None:
            args = {}
        # Function.arguments must be a string; serialize anything else
        if not isinstance(args, str):
            args = json.dumps(args, ensure_ascii=False)

        return ChatCompletionMessageToolCall(
            id=_generate_mistral_id(),
            type="function",
            function=Function(name=name, arguments=args),
        )

    @staticmethod
    def _decode_pre_v11(raw: str) -> list:
        """
        Decode the pre-v11 payload into a list of candidate entries.

        The leading JSON document is decoded first and any trailing text is
        ignored. If that fails (for example a truncated array), every
        top-level JSON object that can still be decoded is collected.
        """
        decoder = json.JSONDecoder()
        try:
            parsed, _ = decoder.raw_decode(raw)
        except json.JSONDecodeError:
            parsed = []
            pos = raw.find("{")
            while pos != -1:
                try:
                    obj, end = decoder.raw_decode(raw, pos)
                except json.JSONDecodeError:
                    pos = raw.find("{", pos + 1)
                    continue
                parsed.append(obj)
                # Resume after this object so nested braces are not revisited
                pos = raw.find("{", end)

        if isinstance(parsed, dict):
            return [parsed]
        return parsed if isinstance(parsed, list) else []

    def parse(self, text: str) -> ParseResult:
        """
        Split ``text`` into leading content and Mistral tool calls.

        Args:
            text: Raw model output.

        Returns:
            ``(text, None)`` when BOT_TOKEN is absent, no tool call could be
            extracted, or parsing failed; otherwise ``(content_or_None,
            tool_calls)`` where content is the stripped text before the first
            BOT_TOKEN.
        """
        if self.BOT_TOKEN not in text:
            return text, None

        try:
            head, *raw_tool_calls = text.split(self.BOT_TOKEN)
            content = head.strip()

            # Detect format: a '[' or '{' right after the token means pre-v11
            first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
            is_pre_v11 = first_raw.startswith(("[", "{"))

            tool_calls: List[ChatCompletionMessageToolCall] = []

            if is_pre_v11:
                # Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
                for entry in self._decode_pre_v11(first_raw):
                    call = self._build_tool_call(entry)
                    if call is not None:
                        tool_calls.append(call)
            else:
                # v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
                for raw in raw_tool_calls:
                    raw = raw.strip()
                    brace_idx = raw.find("{")
                    if brace_idx < 0:
                        continue

                    tool_name = raw[:brace_idx].strip()
                    if not tool_name:
                        # A segment starting with '{' has no name to call
                        continue

                    tool_calls.append(
                        ChatCompletionMessageToolCall(
                            id=_generate_mistral_id(),
                            type="function",
                            function=Function(name=tool_name, arguments=raw[brace_idx:]),
                        )
                    )

            if not tool_calls:
                return text, None

            return (content or None), tool_calls

        except Exception:
            # Best effort: any unexpected failure falls back to plain text
            return text, None
|
||||||
163
environments/tool_call_parsers/qwen3_coder_parser.py
Normal file
163
environments/tool_call_parsers/qwen3_coder_parser.py
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
"""
|
||||||
|
Qwen3-Coder tool call parser.
|
||||||
|
|
||||||
|
Format uses XML-style nested tags:
|
||||||
|
<tool_call>
|
||||||
|
<function=function_name>
|
||||||
|
<parameter=param_name>value</parameter>
|
||||||
|
<parameter=param_name2>value2</parameter>
|
||||||
|
</function>
|
||||||
|
</tool_call>
|
||||||
|
|
||||||
|
Parameters are extracted from <parameter=name>value</parameter> tags and
|
||||||
|
type-converted using the schema if available, otherwise treated as strings.
|
||||||
|
|
||||||
|
Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from openai.types.chat.chat_completion_message_tool_call import (
|
||||||
|
ChatCompletionMessageToolCall,
|
||||||
|
Function,
|
||||||
|
)
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||||
|
|
||||||
|
|
||||||
|
def _try_convert_value(value: str) -> Any:
|
||||||
|
"""
|
||||||
|
Try to convert a parameter value string to a native Python type.
|
||||||
|
Handles null, numbers, booleans, JSON objects/arrays, and falls back to string.
|
||||||
|
"""
|
||||||
|
stripped = value.strip()
|
||||||
|
|
||||||
|
# Handle null
|
||||||
|
if stripped.lower() == "null":
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Try JSON first (handles objects, arrays, strings, numbers, booleans)
|
||||||
|
try:
|
||||||
|
return json.loads(stripped)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Try Python literal eval (handles tuples, etc.)
|
||||||
|
try:
|
||||||
|
return ast.literal_eval(stripped)
|
||||||
|
except (ValueError, SyntaxError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Return as string
|
||||||
|
return stripped
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("qwen3_coder")
class Qwen3CoderToolCallParser(ToolCallParser):
    """
    Parser for Qwen3-Coder XML-format tool calls.

    Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
    """

    START_TOKEN = "<tool_call>"
    FUNCTION_PREFIX = "<function="

    # Find complete tool_call blocks (or unclosed at end)
    TOOL_CALL_REGEX = re.compile(
        r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
    )

    # Find function blocks within a tool_call
    FUNCTION_REGEX = re.compile(
        r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
    )

    # Find parameter blocks within a function
    PARAMETER_REGEX = re.compile(
        r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
        re.DOTALL,
    )

    def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
        """
        Parse the text following ``<function=`` into a tool call.

        Args:
            function_str: ``name>...parameters...`` as captured by FUNCTION_REGEX.

        Returns:
            The tool call, or None when the name delimiter ``>`` is missing
            or the parameters cannot be serialized to JSON.
        """
        try:
            # Extract function name: everything before the first '>'
            gt_idx = function_str.index(">")
            func_name = function_str[:gt_idx].strip()
            params_str = function_str[gt_idx + 1:]

            # Extract parameters
            param_dict: Dict[str, Any] = {}
            for match_text in self.PARAMETER_REGEX.findall(params_str):
                param_name, sep, param_value = match_text.partition(">")
                if not sep:
                    continue

                # Drop at most one newline at each end, as emitted around the value
                if param_value.startswith("\n"):
                    param_value = param_value[1:]
                if param_value.endswith("\n"):
                    param_value = param_value[:-1]

                param_dict[param_name.strip()] = _try_convert_value(param_value)

            return ChatCompletionMessageToolCall(
                id=f"call_{uuid.uuid4().hex[:24]}",
                type="function",
                function=Function(
                    name=func_name,
                    arguments=json.dumps(param_dict, ensure_ascii=False),
                ),
            )
        except (ValueError, IndexError, TypeError):
            # TypeError: a converted value json.dumps cannot encode. Only this
            # call is dropped; the remaining calls are still returned.
            return None

    def parse(self, text: str) -> ParseResult:
        """
        Split ``text`` into leading content and Qwen3-Coder tool calls.

        Args:
            text: Raw model output.

        Returns:
            ``(text, None)`` when no function block could be parsed or
            parsing failed; otherwise ``(content_or_None, tool_calls)`` where
            content is the stripped text before the first ``<tool_call>`` tag
            (or before the first ``<function=`` when that tag is absent).
        """
        if self.FUNCTION_PREFIX not in text:
            return text, None

        try:
            # Find all tool_call blocks
            tc_matches = self.TOOL_CALL_REGEX.findall(text)
            raw_blocks = [closed or unclosed for closed, unclosed in tc_matches]

            # Fallback: if no tool_call tags, try the whole text
            if not raw_blocks:
                raw_blocks = [text]

            # Find function blocks within each tool_call
            function_strs: List[str] = []
            for block in raw_blocks:
                func_matches = self.FUNCTION_REGEX.findall(block)
                function_strs.extend(closed or unclosed for closed, unclosed in func_matches)

            if not function_strs:
                return text, None

            # Parse each function call, dropping the ones that fail
            parsed_calls = (self._parse_function_call(fs) for fs in function_strs)
            tool_calls: List[ChatCompletionMessageToolCall] = [
                tc for tc in parsed_calls if tc is not None
            ]

            if not tool_calls:
                return text, None

            # Content before tool calls
            first_tc = text.find(self.START_TOKEN)
            if first_tc < 0:
                first_tc = text.find(self.FUNCTION_PREFIX)

            # Whitespace-only content is reported as None, like the other parsers
            content = (text[:first_tc].strip() or None) if first_tc > 0 else None

            return content, tool_calls

        except Exception:
            # Best effort: any unexpected failure falls back to plain text
            return text, None
|
||||||
19
environments/tool_call_parsers/qwen_parser.py
Normal file
19
environments/tool_call_parsers/qwen_parser.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""
|
||||||
|
Qwen 2.5 tool call parser.
|
||||||
|
|
||||||
|
Uses the same <tool_call> format as Hermes.
|
||||||
|
Registered as a separate parser name for clarity when using --tool-parser=qwen.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from environments.tool_call_parsers import register_parser
|
||||||
|
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
|
||||||
|
|
||||||
|
|
||||||
|
@register_parser("qwen")
class QwenToolCallParser(HermesToolCallParser):
    """
    Tool call parser for Qwen 2.5.

    Qwen 2.5 uses the same <tool_call>{"name": ..., "arguments": ...}</tool_call>
    format as Hermes, so nothing is overridden: this subclass exists only to
    register the Hermes behaviour under the "qwen" parser name.
    """
|
||||||
474
environments/tool_context.py
Normal file
474
environments/tool_context.py
Normal file
@@ -0,0 +1,474 @@
|
|||||||
|
"""
|
||||||
|
ToolContext -- Unrestricted Tool Access for Reward Functions
|
||||||
|
|
||||||
|
A per-rollout handle that gives reward/verification functions direct access to
|
||||||
|
ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
|
||||||
|
the terminal/browser session is the SAME one the model used during its rollout --
|
||||||
|
all state (files, processes, browser tabs) is preserved.
|
||||||
|
|
||||||
|
The verifier author decides which tools to use. Nothing is hardcoded or gated.
|
||||||
|
|
||||||
|
Example usage in a compute_reward():
|
||||||
|
async def compute_reward(self, item, result, ctx):
|
||||||
|
# Run tests in the model's terminal sandbox
|
||||||
|
test = ctx.terminal("pytest -v")
|
||||||
|
if test["exit_code"] == 0:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
# Check if a file was created
|
||||||
|
content = ctx.read_file("/workspace/solution.py")
|
||||||
|
if content.get("content"):
|
||||||
|
return 0.5
|
||||||
|
|
||||||
|
return 0.0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
from model_tools import handle_function_call
|
||||||
|
from tools.terminal_tool import cleanup_vm
|
||||||
|
from tools.browser_tool import cleanup_browser
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Thread pool for running sync tool calls that internally use asyncio.run()
|
||||||
|
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_tool_in_thread(tool_name: str, arguments: Dict[str, Any], task_id: str) -> str:
    """
    Run a tool call so that backends using asyncio.run() internally
    (modal, docker, daytona) get a clean event loop.

    If the caller is inside a running event loop, handle_function_call() is
    executed in a disposable worker thread and this function blocks for the
    result. Otherwise the tool is called directly.

    Args:
        tool_name: Name of the tool to invoke.
        arguments: Arguments passed to the tool.
        task_id: Rollout task ID the call is scoped to.

    Returns:
        Whatever handle_function_call() returns.

    Raises:
        concurrent.futures.TimeoutError: The worker thread did not finish
            within 300 seconds. The thread cannot be cancelled and keeps
            running in the background.
        Exception: Anything raised by handle_function_call() is propagated.
    """
    # Only the loop probe is guarded. Wrapping the tool call as well would
    # treat a RuntimeError raised by the tool itself as "no running loop"
    # and execute the tool a second time on the event-loop thread.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop -- safe to call directly
        return handle_function_call(tool_name, arguments, task_id)

    # We're in an async context -- need to run in a thread
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(handle_function_call, tool_name, arguments, task_id)
        return future.result(timeout=300)
    finally:
        # wait=False: a `with` block would join the worker on exit and so
        # keep blocking after the timeout above has already fired.
        pool.shutdown(wait=False)
|
||||||
|
|
||||||
|
|
||||||
|
class ToolContext:
|
||||||
|
"""
|
||||||
|
Open-ended access to all hermes-agent tools for a specific rollout.
|
||||||
|
|
||||||
|
Passed to compute_reward() so verifiers can use any tool they need:
|
||||||
|
terminal commands, file reads/writes, web searches, browser automation, etc.
|
||||||
|
All calls share the rollout's task_id for session isolation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, task_id: str):
|
||||||
|
self.task_id = task_id
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Terminal tools
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
    """
    Execute a shell command in the terminal session of this rollout.

    Args:
        command: Shell command to execute
        timeout: Command timeout in seconds

    Returns:
        The tool's JSON response decoded to a dict (documented keys:
        'exit_code' and 'output'). If the response is not valid JSON, a dict
        with exit_code -1 and the raw response as 'output'.
    """
    logger.debug(
        "ToolContext.terminal [%s backend] task=%s: %s",
        os.environ.get("TERMINAL_ENV", "local"),
        self.task_id[:8],
        command[:100],
    )

    # Go through the thread helper so modal/docker/daytona backends'
    # asyncio.run() doesn't deadlock
    payload = {"command": command, "timeout": timeout}
    raw = _run_tool_in_thread("terminal", payload, self.task_id)

    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        decoded = {"exit_code": -1, "output": raw}
    return decoded
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# File tools
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def read_file(self, path: str) -> Dict[str, Any]:
    """
    Read a file through the "read_file" tool, scoped to this rollout.

    Args:
        path: File path to read

    Returns:
        The tool's JSON response decoded to a dict, or {"error": <raw
        response>} when the response is not valid JSON
    """
    raw = handle_function_call("read_file", {"path": path}, task_id=self.task_id)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        decoded = {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
def write_file(self, path: str, content: str) -> Dict[str, Any]:
    """
    Write a TEXT file through the "write_file" tool, scoped to this rollout.

    The tool uses a shell heredoc under the hood, so this is only safe for
    text content. For binary files (images, compiled artifacts, etc.), use
    upload_file() instead.

    Args:
        path: File path to write
        content: Text content to write

    Returns:
        The tool's JSON response decoded to a dict, or {"error": <raw
        response>} when the response is not valid JSON
    """
    request = {"path": path, "content": content}
    raw = handle_function_call("write_file", request, task_id=self.task_id)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        decoded = {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
def upload_file(self, local_path: str, remote_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Upload a local file to the rollout's sandbox (binary-safe).
|
||||||
|
|
||||||
|
Unlike write_file() which passes content through a shell heredoc (text-only),
|
||||||
|
this method base64-encodes the file and decodes it inside the sandbox.
|
||||||
|
Safe for any file type: binaries, images, archives, etc.
|
||||||
|
|
||||||
|
For large files (>1MB), the content is split into chunks to avoid
|
||||||
|
hitting shell command-length limits.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
local_path: Path to a local file on the host
|
||||||
|
remote_path: Destination path inside the sandbox
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with 'exit_code' and 'output'
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
from pathlib import Path as _Path
|
||||||
|
|
||||||
|
local = _Path(local_path)
|
||||||
|
if not local.exists():
|
||||||
|
return {"exit_code": -1, "output": f"Local file not found: {local_path}"}
|
||||||
|
|
||||||
|
raw = local.read_bytes()
|
||||||
|
b64 = base64.b64encode(raw).decode("ascii")
|
||||||
|
|
||||||
|
# Ensure parent directory exists in the sandbox
|
||||||
|
parent = str(_Path(remote_path).parent)
|
||||||
|
if parent not in (".", "/"):
|
||||||
|
self.terminal(f"mkdir -p {parent}", timeout=10)
|
||||||
|
|
||||||
|
# For small files, single command is fine
|
||||||
|
chunk_size = 60_000 # ~60KB per chunk (well within shell limits)
|
||||||
|
if len(b64) <= chunk_size:
|
||||||
|
result = self.terminal(
|
||||||
|
f"printf '%s' '{b64}' | base64 -d > {remote_path}",
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# For larger files, write base64 in chunks then decode
|
||||||
|
tmp_b64 = "/tmp/_hermes_upload.b64"
|
||||||
|
self.terminal(f": > {tmp_b64}", timeout=5) # truncate
|
||||||
|
for i in range(0, len(b64), chunk_size):
|
||||||
|
chunk = b64[i : i + chunk_size]
|
||||||
|
self.terminal(f"printf '%s' '{chunk}' >> {tmp_b64}", timeout=15)
|
||||||
|
result = self.terminal(
|
||||||
|
f"base64 -d {tmp_b64} > {remote_path} && rm -f {tmp_b64}",
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def upload_dir(self, local_dir: str, remote_dir: str) -> List[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Upload an entire local directory to the rollout's sandbox (binary-safe).
|
||||||
|
|
||||||
|
Recursively uploads all files, preserving directory structure.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
local_dir: Path to a local directory on the host
|
||||||
|
remote_dir: Destination directory inside the sandbox
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of results, one per file uploaded
|
||||||
|
"""
|
||||||
|
from pathlib import Path as _Path
|
||||||
|
|
||||||
|
local = _Path(local_dir)
|
||||||
|
if not local.exists() or not local.is_dir():
|
||||||
|
return [{"exit_code": -1, "output": f"Local directory not found: {local_dir}"}]
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for file_path in sorted(local.rglob("*")):
|
||||||
|
if file_path.is_file():
|
||||||
|
relative = file_path.relative_to(local)
|
||||||
|
target = f"{remote_dir}/{relative}"
|
||||||
|
results.append(self.upload_file(str(file_path), target))
|
||||||
|
return results
|
||||||
|
|
||||||
|
def download_file(self, remote_path: str, local_path: str) -> Dict[str, Any]:
    """
    Download a file from the rollout's sandbox to the host (binary-safe).

    The inverse of upload_file(). Base64-encodes the file inside the sandbox,
    reads the encoded data through the terminal, and decodes it locally.
    Safe for any file type.

    The remote path is shell-quoted, so it is taken literally: spaces and
    shell metacharacters are safe, and no glob/tilde/variable expansion
    happens.

    Args:
        remote_path: Path to the file inside the sandbox
        local_path: Destination path on the host

    Returns:
        Dict with 'success' (bool) and 'bytes' (int) or 'error' (str)
    """
    import base64
    import shlex
    from pathlib import Path as _Path

    # Base64-encode the file inside the sandbox and capture output
    result = self.terminal(
        f"base64 {shlex.quote(remote_path)} 2>/dev/null",
        timeout=30,
    )

    if result.get("exit_code", -1) != 0:
        return {
            "success": False,
            "error": f"Failed to read remote file: {result.get('output', '')}",
        }

    b64_data = result.get("output", "").strip()
    if not b64_data:
        return {"success": False, "error": f"Remote file is empty or missing: {remote_path}"}

    try:
        raw = base64.b64decode(b64_data)
    except ValueError as e:
        # binascii.Error (bad padding etc.) is a ValueError subclass
        return {"success": False, "error": f"Base64 decode failed: {e}"}

    # Write to local host filesystem, creating parent directories as needed
    local = _Path(local_path)
    local.parent.mkdir(parents=True, exist_ok=True)
    local.write_bytes(raw)

    return {"success": True, "bytes": len(raw)}
|
||||||
|
|
||||||
|
def download_dir(self, remote_dir: str, local_dir: str) -> List[Dict[str, Any]]:
    """
    Download a directory from the rollout's sandbox to the host (binary-safe).

    Lists all regular files below the remote directory with ``find``, then
    downloads each one with download_file(), preserving directory structure.

    The remote directory is shell-quoted, so it is taken literally: spaces
    and shell metacharacters are safe, and no glob/tilde/variable expansion
    happens.

    Args:
        remote_dir: Path to the directory inside the sandbox
        local_dir: Destination directory on the host

    Returns:
        List of download_file() results, one per file, or a single-element
        list with success False when listing fails or finds nothing
    """
    import shlex
    from pathlib import Path as _Path

    # List files in the remote directory
    ls_result = self.terminal(
        f"find {shlex.quote(remote_dir)} -type f 2>/dev/null",
        timeout=15,
    )

    if ls_result.get("exit_code", -1) != 0:
        return [{"success": False, "error": f"Failed to list remote dir: {remote_dir}"}]

    file_list = ls_result.get("output", "").strip()
    if not file_list:
        return [{"success": False, "error": f"Remote directory is empty or missing: {remote_dir}"}]

    results = []
    for remote_file in file_list.splitlines():
        remote_file = remote_file.strip()
        if not remote_file:
            continue

        # Compute the relative path to preserve directory structure
        if remote_file.startswith(remote_dir):
            relative = remote_file[len(remote_dir):].lstrip("/")
        else:
            # Listing did not echo the prefix as given; keep just the file name
            relative = _Path(remote_file).name

        local_file = str(_Path(local_dir) / relative)
        results.append(self.download_file(remote_file, local_file))

    return results
|
||||||
|
|
||||||
|
def search(self, query: str, path: str = ".") -> Dict[str, Any]:
    """
    Run a text search over the rollout's filesystem.

    Dispatches the ``search_files`` tool for this rollout's ``task_id``.

    Args:
        query: Pattern to look for
        path: Directory to search in

    Returns:
        The tool's decoded JSON result, or ``{"error": <raw output>}`` when
        the tool output is not valid JSON
    """
    tool_args = {"pattern": query, "path": path}
    raw = handle_function_call("search_files", tool_args, task_id=self.task_id)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller unchanged.
        return {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Web tools
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def web_search(self, query: str) -> Dict[str, Any]:
    """
    Run a web search through the ``web_search`` tool.

    Args:
        query: Search query

    Returns:
        The tool's decoded JSON result, or ``{"error": <raw output>}`` when
        the tool output is not valid JSON
    """
    raw = handle_function_call("web_search", {"query": query})
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller unchanged.
        return {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
def web_extract(self, urls: List[str]) -> Dict[str, Any]:
    """
    Fetch page content for a list of URLs through the ``web_extract`` tool.

    Args:
        urls: List of URLs to extract content from

    Returns:
        The tool's decoded JSON result, or ``{"error": <raw output>}`` when
        the tool output is not valid JSON
    """
    raw = handle_function_call("web_extract", {"urls": urls})
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller unchanged.
        return {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Browser tools
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def browser_navigate(self, url: str) -> Dict[str, Any]:
    """
    Point the rollout's browser session at a URL.

    Dispatches the ``browser_navigate`` tool for this rollout's ``task_id``.

    Args:
        url: URL to navigate to

    Returns:
        The tool's decoded JSON result, or ``{"error": <raw output>}`` when
        the tool output is not valid JSON
    """
    raw = handle_function_call("browser_navigate", {"url": url}, task_id=self.task_id)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller unchanged.
        return {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
def browser_snapshot(self) -> Dict[str, Any]:
    """
    Capture a snapshot of the page currently open in the rollout's browser.

    Dispatches the ``browser_snapshot`` tool (no arguments) for this
    rollout's ``task_id``.

    Returns:
        The tool's decoded JSON result, or ``{"error": <raw output>}`` when
        the tool output is not valid JSON
    """
    raw = handle_function_call("browser_snapshot", {}, task_id=self.task_id)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller unchanged.
        return {"error": raw}
    return decoded
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Generic tool access
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
    """
    Invoke an arbitrary hermes-agent tool by name.

    Generic escape hatch for tools that have no dedicated convenience
    wrapper on this class. The call is forwarded to ``_run_tool_in_thread``
    together with this rollout's ``task_id``.

    Args:
        tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
        arguments: Dict of arguments for the tool

    Returns:
        Raw JSON string result from the tool
    """
    raw_result = _run_tool_in_thread(tool_name, arguments, self.task_id)
    return raw_result
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Cleanup
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def cleanup(self):
    """
    Release the resources held by this rollout: background processes, the
    terminal VM, and the browser session.

    Every step is best-effort. A failure in one step is logged at debug
    level and does not prevent the remaining steps from running.

    Per the original author's note, the base environment is expected to call
    this after compute_reward() completes, so callers generally don't need
    to invoke it themselves.
    """
    task_id = self.task_id

    # Safety net: terminate background processes still registered for this rollout.
    try:
        from tools.process_registry import process_registry
        n_killed = process_registry.kill_all(task_id=task_id)
        if n_killed:
            logger.debug("Process cleanup for task %s: killed %d process(es)", task_id, n_killed)
    except Exception as exc:
        logger.debug("Process cleanup for task %s: %s", task_id, exc)

    try:
        cleanup_vm(task_id)
    except Exception as exc:
        logger.debug("VM cleanup for task %s: %s", task_id, exc)

    # Set HERMES_QUIET for the duration of browser cleanup so it does not
    # spam the console, then put the variable back exactly as it was.
    saved_quiet = os.environ.get("HERMES_QUIET")
    os.environ["HERMES_QUIET"] = "1"
    try:
        cleanup_browser(task_id)
    except Exception as exc:
        logger.debug("Browser cleanup for task %s: %s", task_id, exc)
    finally:
        if saved_quiet is not None:
            os.environ["HERMES_QUIET"] = saved_quiet
        else:
            os.environ.pop("HERMES_QUIET", None)
|
||||||
643
environments/web_research_env.py
Normal file
643
environments/web_research_env.py
Normal file
@@ -0,0 +1,643 @@
|
|||||||
|
"""
|
||||||
|
WebResearchEnv — RL Environment for Multi-Step Web Research
|
||||||
|
============================================================
|
||||||
|
|
||||||
|
Trains models to do accurate, efficient, multi-source web research.
|
||||||
|
|
||||||
|
Reward signals:
|
||||||
|
- Answer correctness (LLM judge, 0.0–1.0)
|
||||||
|
- Source diversity (used ≥2 distinct domains)
|
||||||
|
- Efficiency (penalizes excessive tool calls)
|
||||||
|
- Tool usage (bonus for actually using web tools)
|
||||||
|
|
||||||
|
Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
|
||||||
|
HuggingFace: google/frames-benchmark
|
||||||
|
Fallback: built-in sample questions (no HF token needed)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Phase 1 (OpenAI-compatible server)
|
||||||
|
python environments/web_research_env.py serve \\
|
||||||
|
--openai.base_url http://localhost:8000/v1 \\
|
||||||
|
--openai.model_name YourModel \\
|
||||||
|
--openai.server_type openai
|
||||||
|
|
||||||
|
# Process mode (offline data generation)
|
||||||
|
python environments/web_research_env.py process \\
|
||||||
|
--env.data_path_to_save_groups data/web_research.jsonl
|
||||||
|
|
||||||
|
# Standalone eval
|
||||||
|
python environments/web_research_env.py evaluate \\
|
||||||
|
--openai.base_url http://localhost:8000/v1 \\
|
||||||
|
--openai.model_name YourModel
|
||||||
|
|
||||||
|
Built by: github.com/jackx707
|
||||||
|
Inspired by: GroceryMind — production Hermes agent doing live web research
|
||||||
|
across German grocery stores (firecrawl + hermes-agent)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
# Ensure hermes-agent root is on path
|
||||||
|
_repo_root = Path(__file__).resolve().parent.parent
|
||||||
|
if str(_repo_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Optional HuggingFace datasets import
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
try:
|
||||||
|
from datasets import load_dataset
|
||||||
|
HF_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
HF_AVAILABLE = False
|
||||||
|
|
||||||
|
from atroposlib.envs.base import ScoredDataGroup
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
from atroposlib.type_definitions import Item
|
||||||
|
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
from environments.agent_loop import AgentResult
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fallback sample dataset (used when HuggingFace is unavailable)
|
||||||
|
# Multi-hop questions requiring real web search to answer.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Each entry is a dict with the keys "question", "answer" (reference text
# handed to the judge), "difficulty" and "hops". Several reference answers
# describe time-sensitive facts and ask for verification instead of giving a
# fixed value.
# NOTE(review): setup() shuffles this list; treat its order as unspecified.
SAMPLE_QUESTIONS = [
    {
        "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
        "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
        "answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What programming language was used to write the original version of the web framework used by Instagram?",
        "answer": "Django, which Instagram was built on, is written in Python.",
        "difficulty": "easy",
        "hops": 2,
    },
    {
        "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
        "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
        "difficulty": "hard",
        "hops": 3,
    },
    {
        "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
        "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "How many employees does the parent company of Instagram have?",
        "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
        "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
        "difficulty": "hard",
        "hops": 2,
    },
    {
        "question": "Which company acquired the startup founded by the creator of Oculus VR?",
        "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What is the market cap of the company that owns the most popular search engine in Russia?",
        "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
        "difficulty": "hard",
        "hops": 2,
    },
    {
        "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
        "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
        "difficulty": "hard",
        "hops": 2,
    },
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class WebResearchEnvConfig(HermesAgentEnvConfig):
    """Configuration for the web research RL environment.

    Adds reward weights, efficiency thresholds, evaluation-split settings and
    the dataset name on top of the fields inherited from
    ``HermesAgentEnvConfig``.
    """

    # Reward weights
    # The three weights multiply their respective signals; the diversity
    # bonus is added on top and the total is clamped to [0, 1] in
    # WebResearchEnv.compute_reward.
    correctness_weight: float = Field(
        default=0.6,
        description="Weight for answer correctness in reward (LLM judge score).",
    )
    tool_usage_weight: float = Field(
        default=0.2,
        description="Weight for tool usage signal (did the model actually use web tools?).",
    )
    efficiency_weight: float = Field(
        default=0.2,
        description="Weight for efficiency signal (penalizes excessive tool calls).",
    )
    diversity_bonus: float = Field(
        default=0.1,
        description="Bonus reward for citing ≥2 distinct domains.",
    )

    # Efficiency thresholds
    # Up to efficient_max_calls the efficiency signal is 1.0; beyond it the
    # signal drops per extra call, with a steeper slope past heavy_penalty_calls.
    efficient_max_calls: int = Field(
        default=5,
        description="Maximum tool calls before efficiency penalty begins.",
    )
    heavy_penalty_calls: int = Field(
        default=10,
        description="Tool call count where efficiency penalty steepens.",
    )

    # Eval
    # WebResearchEnv.setup holds out max(eval_size, eval_split_ratio * dataset size)
    # items; WebResearchEnv.evaluate then scores at most eval_size of them.
    eval_size: int = Field(
        default=20,
        description="Number of held-out items for evaluation.",
    )
    eval_split_ratio: float = Field(
        default=0.1,
        description="Fraction of dataset to hold out for evaluation (0.0–1.0).",
    )

    # Dataset
    # Loaded with split="test"; rows are read through their "Prompt" and
    # "Answer" columns in WebResearchEnv.setup.
    dataset_name: str = Field(
        default="google/frames-benchmark",
        description="HuggingFace dataset name for research questions.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Environment
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class WebResearchEnv(HermesAgentBaseEnv):
|
||||||
|
"""
|
||||||
|
RL environment for training multi-step web research skills.
|
||||||
|
|
||||||
|
The model is given a factual question requiring 2-3 hops of web research
|
||||||
|
and must use web_search / web_extract tools to find and synthesize the answer.
|
||||||
|
|
||||||
|
Reward is multi-signal:
|
||||||
|
60% — answer correctness (LLM judge)
|
||||||
|
20% — tool usage (did the model actually search the web?)
|
||||||
|
20% — efficiency (penalizes >5 tool calls)
|
||||||
|
|
||||||
|
Bonus +0.1 for source diversity (≥2 distinct domains cited).
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "web-research"
|
||||||
|
env_config_cls = WebResearchEnvConfig
|
||||||
|
|
||||||
|
# Default toolsets for this environment — web + file for saving notes
|
||||||
|
default_toolsets = ["web", "file"]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]:
    """Build the default environment config and API server list.

    Returns:
        A ``(env_config, server_configs)`` pair: a ``WebResearchEnvConfig``
        with the web and file toolsets enabled, and a one-element list holding
        an OpenRouter-backed ``APIServerConfig`` whose API key is read from the
        ``OPENROUTER_API_KEY`` environment variable (empty string when unset).
    """
    research_prompt = (
        "You are a highly capable research agent. When asked a factual question, "
        "always use web_search to find current, accurate information before answering. "
        "Cite at least 2 sources. Be concise and accurate."
    )
    env_config = WebResearchEnvConfig(
        enabled_toolsets=["web", "file"],
        max_agent_turns=15,
        agent_temperature=1.0,
        system_prompt=research_prompt,
        group_size=4,
        total_steps=1000,
        steps_per_eval=100,
        use_wandb=True,
        wandb_name="web-research",
    )

    openrouter_server = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name="anthropic/claude-sonnet-4.5",
        server_type="openai",
        api_key=os.getenv("OPENROUTER_API_KEY", ""),
        health_check=False,
    )

    return env_config, [openrouter_server]
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """Initialise the base environment, then create empty dataset and metric state.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.
    """
    super().__init__(*args, **kwargs)

    # Dataset state: training items, held-out eval items, and the cursor
    # used by get_next_item.
    self._index: int = 0
    self._items: list[dict] = []
    self._eval_items: list[dict] = []

    # Per-rollout reward components, accumulated until wandb_log flushes them.
    self._reward_buffer: list[float] = []
    self._correctness_buffer: list[float] = []
    self._tool_usage_buffer: list[float] = []
    self._efficiency_buffer: list[float] = []
    self._diversity_buffer: list[float] = []
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 1. Setup — load dataset
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def setup(self) -> None:
    """Load the FRAMES benchmark or fall back to built-in samples.

    When the ``datasets`` package is importable, the dataset named by
    ``config.dataset_name`` is loaded (split ``"test"``), shuffled, and split
    into a held-out eval slice and a training slice. The eval slice holds
    ``max(config.eval_size, config.eval_split_ratio * len(dataset))`` items,
    capped so that at least one training item always remains.

    If the package is missing, loading raises, or the dataset is empty, the
    built-in ``SAMPLE_QUESTIONS`` are used instead with an 80/20 split.
    """
    if HF_AVAILABLE:
        try:
            logger.info("Loading FRAMES benchmark from HuggingFace...")
            ds = load_dataset(self.config.dataset_name, split="test")
            items = [
                {
                    "question": row["Prompt"],
                    "answer": row["Answer"],
                    "difficulty": row.get("reasoning_types", "unknown"),
                    "hops": 2,
                }
                for row in ds
            ]
            if not items:
                # Handled just below: triggers the built-in fallback instead
                # of leaving the environment with nothing to train on.
                raise ValueError("dataset contains no rows")

            # Hold out for eval, but never the entire dataset: a small
            # dataset would otherwise leave zero training items.
            eval_size = max(
                self.config.eval_size,
                int(len(items) * self.config.eval_split_ratio),
            )
            eval_size = min(eval_size, len(items) - 1)

            random.shuffle(items)
            self._eval_items = items[:eval_size]
            self._items = items[eval_size:]
            logger.info(
                f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
                f"from FRAMES benchmark."
            )
            return
        except Exception as e:
            logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")

    # Fallback: shuffle a copy so the module-level SAMPLE_QUESTIONS list is
    # not reordered as a side effect of setting up one environment.
    samples = list(SAMPLE_QUESTIONS)
    random.shuffle(samples)
    split = max(1, len(samples) * 8 // 10)
    self._items = samples[:split]
    self._eval_items = samples[split:]
    logger.info(
        f"Using built-in sample dataset: {len(self._items)} train / "
        f"{len(self._eval_items)} eval items."
    )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 2. get_next_item — return the next question
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def get_next_item(self) -> dict:
    """Hand out training items in order, wrapping around at the end.

    Returns:
        The item at the current cursor position; the cursor is then advanced.

    Raises:
        RuntimeError: If no training items have been loaded.
    """
    pool = self._items
    if not pool:
        raise RuntimeError("Dataset is empty. Did you call setup()?")
    position = self._index % len(pool)
    self._index += 1
    return pool[position]
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 3. format_prompt — build the user-facing prompt
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def format_prompt(self, item: dict) -> str:
    """Turn a dataset item into the task prompt shown to the model.

    Args:
        item: Dataset entry; only its ``"question"`` value is used.

    Returns:
        An instruction paragraph, the question, and a bullet list of
        requirements, separated by blank lines.
    """
    instructions = (
        "Research the following question thoroughly using web search. "
        "You MUST search the web to find current, accurate information — "
        "do not rely solely on your training data."
    )
    question_line = f"Question: {item['question']}"
    requirements = "\n".join(
        [
            "Requirements:",
            "- Use web_search and/or web_extract tools to find information",
            "- Search at least 2 different sources",
            "- Provide a concise, accurate answer (2-4 sentences)",
            "- Cite the sources you used",
        ]
    )
    return "\n\n".join([instructions, question_line, requirements])
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 4. compute_reward — multi-signal scoring
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def compute_reward(
    self,
    item: dict,
    result: AgentResult,
    ctx: ToolContext,
) -> float:
    """
    Multi-signal reward function:

        correctness_weight * correctness  — LLM judge comparing answer to ground truth
        tool_usage_weight  * tool_used    — binary: did the model use web tools?
        efficiency_weight  * efficiency   — penalizes wasteful tool usage
        + diversity_bonus                 — source diversity (≥2 distinct domains)

    The combined value is clamped to [0, 1]. Each component is also appended
    to the per-signal buffers that ``wandb_log`` later averages.

    Args:
        item: Dataset entry providing ``"question"`` and ``"answer"``.
        result: Outcome of the agent rollout.
        ctx: Tool context for the rollout (not used by this reward).

    Returns:
        The clamped reward as a float.
    """
    # NOTE(review): assumes AgentResult exposes final_response, turns_used and
    # tool_calls whose items carry .tool_name — confirm against
    # environments/agent_loop.py.
    final_response: str = result.final_response or ""
    recorded_calls = getattr(result, "tool_calls", None) or []
    tools_used: list[str] = [tc.tool_name for tc in recorded_calls]
    # turns_used takes precedence; the tool-call list is only a fallback.
    tool_call_count: int = result.turns_used or len(tools_used)

    cfg = self.config

    # ---- Signal 1: Answer correctness (LLM judge) ----------------
    correctness = await self._llm_judge(
        question=item["question"],
        expected=item["answer"],
        model_answer=final_response,
    )

    # ---- Signal 2: Web tool usage --------------------------------
    web_tools = {"web_search", "web_extract", "search", "firecrawl"}
    tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0

    # ---- Signal 3: Efficiency ------------------------------------
    # Both penalty branches are floored at 0.0 so that a wide gap between the
    # two configured thresholds cannot yield (and log) a negative efficiency.
    extra_calls = tool_call_count - cfg.efficient_max_calls
    if extra_calls <= 0:
        efficiency = 1.0
    elif tool_call_count <= cfg.heavy_penalty_calls:
        efficiency = max(0.0, 1.0 - extra_calls * 0.08)
    else:
        efficiency = max(0.0, 1.0 - extra_calls * 0.12)

    # ---- Bonus: Source diversity ---------------------------------
    domains = self._extract_domains(final_response)
    diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0

    # ---- Combine ------------------------------------------------
    reward = (
        cfg.correctness_weight * correctness
        + cfg.tool_usage_weight * tool_used
        + cfg.efficiency_weight * efficiency
        + diversity
    )
    reward = min(1.0, max(0.0, reward))  # clamp to [0, 1]

    # Track for wandb
    self._reward_buffer.append(reward)
    self._correctness_buffer.append(correctness)
    self._tool_usage_buffer.append(tool_used)
    self._efficiency_buffer.append(efficiency)
    self._diversity_buffer.append(diversity)

    logger.debug(
        f"Reward breakdown — correctness={correctness:.2f}, "
        f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
        f"diversity={diversity:.1f} → total={reward:.3f}"
    )

    return reward
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 5. evaluate — run on held-out eval split
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def evaluate(self, *args, **kwargs) -> None:
    """Score the model on the held-out questions and log the outcome.

    For each of the first ``config.eval_size`` held-out items, one
    ``chat_completion`` request is sent (system prompt plus formatted
    question, temperature 0.0) and the reply is graded by ``_llm_judge``.
    An item that raises is logged and recorded with correctness 0.0 so the
    run continues. Mean correctness, the item count and the per-item samples
    are passed to ``evaluate_log``.

    Positional and keyword arguments are accepted but not used.
    """
    import time

    held_out = self._eval_items
    if not held_out:
        logger.warning("No eval items available.")
        return

    limit = min(self.config.eval_size, len(held_out))
    eval_items = held_out[:limit]

    logger.info(f"Running eval on {len(eval_items)} questions...")
    start_time = time.time()
    samples = []

    for item in eval_items:
        try:
            prompt = self.format_prompt(item)
            chat = [
                {"role": "system", "content": self.config.system_prompt or ""},
                {"role": "user", "content": prompt},
            ]
            completion = await self.server.chat_completion(
                messages=chat,
                n=1,
                max_tokens=self.config.max_token_length,
                temperature=0.0,
                split="eval",
            )

            if completion.choices:
                response_content = completion.choices[0].message.content
            else:
                response_content = ""

            # Grade the reply against the reference answer.
            correctness = await self._llm_judge(
                question=item["question"],
                expected=item["answer"],
                model_answer=response_content,
            )

            record = {
                "prompt": item["question"],
                "response": response_content,
                "expected": item["answer"],
                "correctness": correctness,
            }
            samples.append(record)

        except Exception as e:
            # Keep going: one failing item must not abort the whole eval run.
            logger.error(f"Eval error on item: {e}")
            samples.append({
                "prompt": item["question"],
                "response": f"ERROR: {e}",
                "expected": item["answer"],
                "correctness": 0.0,
            })

    end_time = time.time()

    # Aggregate
    correctness_scores = [sample["correctness"] for sample in samples]
    if correctness_scores:
        mean_correctness = sum(correctness_scores) / len(correctness_scores)
    else:
        mean_correctness = 0.0
    eval_metrics = {
        "eval/mean_correctness": mean_correctness,
        "eval/n_items": len(samples),
    }

    await self.evaluate_log(
        metrics=eval_metrics,
        samples=samples,
        start_time=start_time,
        end_time=end_time,
    )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 6. wandb_log — custom metrics
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
    """Add the averaged reward components to the metrics and forward them.

    When rollouts have been recorded since the last call, the per-signal
    means, the rollout count, and two rate metrics are written into
    ``wandb_metrics`` and every buffer is emptied. The resulting dict is
    then passed to the parent class's ``wandb_log``.

    Args:
        wandb_metrics: Metrics dict to extend; a new dict is used when None.
    """
    metrics = {} if wandb_metrics is None else wandb_metrics

    if self._reward_buffer:
        count = len(self._reward_buffer)
        averaged = (
            ("train/mean_reward", self._reward_buffer),
            ("train/mean_correctness", self._correctness_buffer),
            ("train/mean_tool_usage", self._tool_usage_buffer),
            ("train/mean_efficiency", self._efficiency_buffer),
            ("train/mean_diversity", self._diversity_buffer),
        )
        for key, buffer in averaged:
            metrics[key] = sum(buffer) / count
        metrics["train/total_rollouts"] = count

        # Rates: share of rollouts judged >= 0.7, and share that used a web tool.
        n_correct = len([c for c in self._correctness_buffer if c >= 0.7])
        n_with_tools = len([t for t in self._tool_usage_buffer if t > 0])
        metrics["train/correct_rate"] = n_correct / count
        metrics["train/tool_usage_rate"] = n_with_tools / count

        # Start the next logging window from scratch.
        for _, buffer in averaged:
            buffer.clear()

    await super().wandb_log(metrics)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Private helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _llm_judge(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
expected: str,
|
||||||
|
model_answer: str,
|
||||||
|
) -> float:
|
||||||
|
"""
|
||||||
|
Use the server's LLM to judge answer correctness.
|
||||||
|
Falls back to keyword heuristic if LLM call fails.
|
||||||
|
"""
|
||||||
|
if not model_answer or not model_answer.strip():
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
judge_prompt = (
|
||||||
|
"You are an impartial judge evaluating the quality of an AI research answer.\n\n"
|
||||||
|
f"Question: {question}\n\n"
|
||||||
|
f"Reference answer: {expected}\n\n"
|
||||||
|
f"Model answer: {model_answer}\n\n"
|
||||||
|
"Score the model answer on a scale from 0.0 to 1.0 where:\n"
|
||||||
|
" 1.0 = fully correct and complete\n"
|
||||||
|
" 0.7 = mostly correct with minor gaps\n"
|
||||||
|
" 0.4 = partially correct\n"
|
||||||
|
" 0.1 = mentions relevant topic but wrong or very incomplete\n"
|
||||||
|
" 0.0 = completely wrong or no answer\n\n"
|
||||||
|
"Consider: factual accuracy, completeness, and relevance.\n"
|
||||||
|
'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}'
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await self.server.chat_completion(
|
||||||
|
messages=[{"role": "user", "content": judge_prompt}],
|
||||||
|
n=1,
|
||||||
|
max_tokens=150,
|
||||||
|
temperature=0.0,
|
||||||
|
split="eval",
|
||||||
|
)
|
||||||
|
text = response.choices[0].message.content if response.choices else ""
|
||||||
|
parsed = self._parse_judge_json(text)
|
||||||
|
if parsed is not None:
|
||||||
|
return float(parsed)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"LLM judge failed: {e}. Using heuristic.")
|
||||||
|
|
||||||
|
return self._heuristic_score(expected, model_answer)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_judge_json(text: str) -> Optional[float]:
|
||||||
|
"""Extract the score float from LLM judge JSON response."""
|
||||||
|
try:
|
||||||
|
clean = re.sub(r"```(?:json)?|```", "", text).strip()
|
||||||
|
data = json.loads(clean)
|
||||||
|
score = float(data.get("score", -1))
|
||||||
|
if 0.0 <= score <= 1.0:
|
||||||
|
return score
|
||||||
|
except Exception:
|
||||||
|
match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
|
||||||
|
if match:
|
||||||
|
score = float(match.group(1))
|
||||||
|
if 0.0 <= score <= 1.0:
|
||||||
|
return score
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _heuristic_score(expected: str, model_answer: str) -> float:
|
||||||
|
"""Lightweight keyword overlap score as fallback."""
|
||||||
|
stopwords = {
|
||||||
|
"the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
|
||||||
|
"at", "to", "for", "with", "and", "or", "but", "it", "its",
|
||||||
|
"this", "that", "as", "by", "from", "be", "has", "have", "had",
|
||||||
|
}
|
||||||
|
|
||||||
|
def tokenize(text: str) -> set:
|
||||||
|
tokens = re.findall(r'\b\w+\b', text.lower())
|
||||||
|
return {t for t in tokens if t not in stopwords and len(t) > 2}
|
||||||
|
|
||||||
|
expected_tokens = tokenize(expected)
|
||||||
|
answer_tokens = tokenize(model_answer)
|
||||||
|
|
||||||
|
if not expected_tokens:
|
||||||
|
return 0.5
|
||||||
|
|
||||||
|
overlap = len(expected_tokens & answer_tokens)
|
||||||
|
union = len(expected_tokens | answer_tokens)
|
||||||
|
|
||||||
|
jaccard = overlap / union if union > 0 else 0.0
|
||||||
|
recall = overlap / len(expected_tokens)
|
||||||
|
return min(1.0, 0.4 * jaccard + 0.6 * recall)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_domains(text: str) -> set:
|
||||||
|
"""Extract unique domains from URLs cited in the response."""
|
||||||
|
urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
|
||||||
|
domains = set()
|
||||||
|
for url in urls:
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc.lower().lstrip("www.")
|
||||||
|
if domain:
|
||||||
|
domains.add(domain)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return domains
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Only when this file is executed directly (not imported), hand control to
# WebResearchEnv.cli().
# NOTE(review): cli() is not defined in the visible part of this file; it is
# presumably the command-line entry point inherited from the environment base
# class — confirm.
if __name__ == "__main__":
    WebResearchEnv.cli()
|
||||||
@@ -1,70 +0,0 @@
|
|||||||
---
|
|
||||||
name: example-skill
|
|
||||||
description: An example skill demonstrating the skill file format and structure
|
|
||||||
---
|
|
||||||
|
|
||||||
# Example Skill
|
|
||||||
|
|
||||||
This is an example skill file that demonstrates how to create skills for the Hermes Agent.
|
|
||||||
|
|
||||||
## Skill File Format
|
|
||||||
|
|
||||||
Skills are markdown files with YAML frontmatter at the top:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: your-skill-name
|
|
||||||
description: A brief one-line description of what this skill does
|
|
||||||
---
|
|
||||||
```
|
|
||||||
|
|
||||||
The frontmatter fields:
|
|
||||||
- **name**: The identifier used to reference this skill (lowercase, hyphens for spaces)
|
|
||||||
- **description**: A brief description shown when listing skills (keep under 200 chars)
|
|
||||||
|
|
||||||
## Writing Effective Skills
|
|
||||||
|
|
||||||
### 1. Be Specific and Actionable
|
|
||||||
|
|
||||||
Good skills provide clear, actionable instructions:
|
|
||||||
|
|
||||||
```
|
|
||||||
When reviewing code:
|
|
||||||
1. Check for security vulnerabilities first
|
|
||||||
2. Verify error handling is comprehensive
|
|
||||||
3. Ensure tests cover edge cases
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Include Examples
|
|
||||||
|
|
||||||
Show concrete examples of what you want:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Good: Descriptive variable names
|
|
||||||
user_authentication_token = get_token()
|
|
||||||
|
|
||||||
# Bad: Cryptic abbreviations
|
|
||||||
uat = gt()
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Define When to Use
|
|
||||||
|
|
||||||
Help the agent understand when this skill applies:
|
|
||||||
|
|
||||||
> Use this skill when: reviewing pull requests, auditing security, or checking code quality.
|
|
||||||
|
|
||||||
## Skill Categories
|
|
||||||
|
|
||||||
Consider organizing skills by purpose:
|
|
||||||
|
|
||||||
- **Conventions**: Coding standards, API patterns, naming rules
|
|
||||||
- **Workflows**: Step-by-step processes for deployments, reviews, releases
|
|
||||||
- **Knowledge**: Domain-specific information, system architecture, gotchas
|
|
||||||
- **Templates**: Boilerplate for common tasks, response formats
|
|
||||||
|
|
||||||
## Tips
|
|
||||||
|
|
||||||
1. Keep the description concise - it's shown in the skills list
|
|
||||||
2. Use headers to organize longer skills
|
|
||||||
3. Include code examples where helpful
|
|
||||||
4. Reference other skills if they're related
|
|
||||||
35
gateway/__init__.py
Normal file
35
gateway/__init__.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""
|
||||||
|
Hermes Gateway - Multi-platform messaging integration.
|
||||||
|
|
||||||
|
This module provides a unified gateway for connecting the Hermes agent
|
||||||
|
to various messaging platforms (Telegram, Discord, WhatsApp) with:
|
||||||
|
- Session management (persistent conversations with reset policies)
|
||||||
|
- Dynamic context injection (agent knows where messages come from)
|
||||||
|
- Delivery routing (cron job outputs to appropriate channels)
|
||||||
|
- Platform-specific toolsets (different capabilities per platform)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .config import GatewayConfig, PlatformConfig, HomeChannel, load_gateway_config
|
||||||
|
from .session import (
|
||||||
|
SessionContext,
|
||||||
|
SessionStore,
|
||||||
|
SessionResetPolicy,
|
||||||
|
build_session_context_prompt,
|
||||||
|
)
|
||||||
|
from .delivery import DeliveryRouter, DeliveryTarget
|
||||||
|
|
||||||
|
# Public API of the gateway package: exactly the names imported above from
# the .config, .session and .delivery submodules, grouped by origin.
__all__ = [
    # Config
    "GatewayConfig",
    "PlatformConfig",
    "HomeChannel",
    "load_gateway_config",
    # Session
    "SessionContext",
    "SessionStore",
    "SessionResetPolicy",
    "build_session_context_prompt",
    # Delivery
    "DeliveryRouter",
    "DeliveryTarget",
]
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user