mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 07:51:45 +08:00
Compare commits
2043 Commits
test
...
fix/discor
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c8582fc4a2 | ||
|
|
0351e4fa90 | ||
|
|
1b2d6c424c | ||
|
|
28c35d045d | ||
|
|
1f6a1f0028 | ||
|
|
d7029489d6 | ||
|
|
12afccd9ca | ||
|
|
81f76111b0 | ||
|
|
96dac22194 | ||
|
|
2d36819503 | ||
|
|
8e20a7e035 | ||
|
|
4920c5940f | ||
|
|
3744118311 | ||
|
|
5ada0b95e9 | ||
|
|
19eaf5d956 | ||
|
|
365d175100 | ||
|
|
c3ca68d25b | ||
|
|
eaa9ceeb43 | ||
|
|
949fac192f | ||
|
|
4b96d10bc3 | ||
|
|
c16870277c | ||
|
|
247e3c1470 | ||
|
|
2af4af6390 | ||
|
|
749e9977a0 | ||
|
|
1c61ab6bd9 | ||
|
|
e9f1a8e39b | ||
|
|
b6a51c955e | ||
|
|
634c1f6752 | ||
|
|
6ebb816e56 | ||
|
|
37862f74fa | ||
|
|
67546746d4 | ||
|
|
d44b6b7f1b | ||
|
|
3576f44a57 | ||
|
|
4768ea624d | ||
|
|
e3f9894caf | ||
|
|
19c8ad3d3d | ||
|
|
bd3b0c712b | ||
|
|
46176c8029 | ||
|
|
b798062501 | ||
|
|
63e88326a8 | ||
|
|
474301adc6 | ||
|
|
285300528b | ||
|
|
673f132151 | ||
|
|
8d0a96a8bf | ||
|
|
cfa87e77a9 | ||
|
|
60e38e82ec | ||
|
|
ce430fed4c | ||
|
|
6794e79bb4 | ||
|
|
181077b785 | ||
|
|
63635744bf | ||
|
|
2158c44efd | ||
|
|
e6cf1c94a8 | ||
|
|
d998cac319 | ||
|
|
6c84e26e70 | ||
|
|
f4d61c168b | ||
|
|
8feb9e4656 | ||
|
|
25a1f1867f | ||
|
|
5e5c92663d | ||
|
|
942950f5b9 | ||
|
|
d3687d3e81 | ||
|
|
43b8ecd172 | ||
|
|
606f57a3ab | ||
|
|
23b9d88a76 | ||
|
|
c0b88018eb | ||
|
|
fc4080c58a | ||
|
|
91b9495b04 | ||
|
|
c2769dffe0 | ||
|
|
71e35311f5 | ||
|
|
97990e7ad5 | ||
|
|
73f39a7761 | ||
|
|
1ecfe68675 | ||
|
|
447594be28 | ||
|
|
9d1483c7e6 | ||
|
|
8e07f9ca56 | ||
|
|
57be18c026 | ||
|
|
99369b926c | ||
|
|
2633272ea9 | ||
|
|
2ba219fa4b | ||
|
|
9a423c3487 | ||
|
|
5479bb0e0c | ||
|
|
c51e7b4af7 | ||
|
|
7d2c786acc | ||
|
|
b72f522e30 | ||
|
|
352980311b | ||
|
|
b411b979cb | ||
|
|
ac739e485f | ||
|
|
8758e2e8d7 | ||
|
|
17e87478d2 | ||
|
|
a5359e61e7 | ||
|
|
25b0ae7979 | ||
|
|
dfe72b9d97 | ||
|
|
780ddd102b | ||
|
|
8cdbbcaaa2 | ||
|
|
a2f0d14f29 | ||
|
|
2219695d92 | ||
|
|
d23e9a9bed | ||
|
|
add945e53c | ||
|
|
c1ac32737d | ||
|
|
14b049d658 | ||
|
|
002c459981 | ||
|
|
ce660a4413 | ||
|
|
ee579af566 | ||
|
|
caa944e752 | ||
|
|
00110fb3c3 | ||
|
|
3543b755af | ||
|
|
51185354dd | ||
|
|
9e845a6e53 | ||
|
|
00a0c56598 | ||
|
|
30da22e1c1 | ||
|
|
e7d3f1f3ba | ||
|
|
c1da1fdcd5 | ||
|
|
f7c5d8a749 | ||
|
|
9cf7e2f0af | ||
|
|
dd7921d514 | ||
|
|
eb4f0348e1 | ||
|
|
38b4fd3737 | ||
|
|
36dd7a3e8d | ||
|
|
dd698f6d5d | ||
|
|
06a7d19f98 | ||
|
|
3801532bd3 | ||
|
|
aaacab7de7 | ||
|
|
4298c6fd9a | ||
|
|
c30505dddd | ||
|
|
70e24d77a1 | ||
|
|
fa3db2671a | ||
|
|
6fd9f2a0c5 | ||
|
|
1f72ce71b7 | ||
|
|
102a255575 | ||
|
|
5beb681c70 | ||
|
|
c9a9db318e | ||
|
|
01e62c067b | ||
|
|
ceb970c559 | ||
|
|
6894358fe1 | ||
|
|
3f0f4a04a9 | ||
|
|
c564e1c3dc | ||
|
|
210d5ade1e | ||
|
|
33ebedc76d | ||
|
|
5b80654198 | ||
|
|
25e53f3c1a | ||
|
|
103f7b1ebc | ||
|
|
a56937735e | ||
|
|
7148534401 | ||
|
|
4e91b0240b | ||
|
|
5e92a4ce5a | ||
|
|
471c663fdf | ||
|
|
64d333204b | ||
|
|
c44af43840 | ||
|
|
4511322f56 | ||
|
|
934fc9df22 | ||
|
|
5847c180c6 | ||
|
|
93a0c0cddd | ||
|
|
23e8fdd167 | ||
|
|
3268b98779 | ||
|
|
20f381cfb6 | ||
|
|
77bfa252b9 | ||
|
|
f24c00a5bf | ||
|
|
463239ed85 | ||
|
|
60cce9ca6d | ||
|
|
2d57946ee9 | ||
|
|
5f32fd8b6d | ||
|
|
3ea039684e | ||
|
|
63f0ec96ec | ||
|
|
1cacaccca6 | ||
|
|
773f3c1137 | ||
|
|
0cc784068d | ||
|
|
f1b4d0b280 | ||
|
|
5254d0bba1 | ||
|
|
21c20aeaa5 | ||
|
|
dc095f8491 | ||
|
|
621fd80b1e | ||
|
|
2b8fd9a8e3 | ||
|
|
fef710aca8 | ||
|
|
4ae1334287 | ||
|
|
db3e3aa6c5 | ||
|
|
633488e0c0 | ||
|
|
0de200cf4d | ||
|
|
f6fdb18fe6 | ||
|
|
b177b4abad | ||
|
|
232ba441d7 | ||
|
|
34e120bcbb | ||
|
|
779f8df6a6 | ||
|
|
62abb453d3 | ||
|
|
735a6e7651 | ||
|
|
e5ddca1c8b | ||
|
|
214827a594 | ||
|
|
fd0e1aac72 | ||
|
|
678e0bd9cc | ||
|
|
8ccd14a0d4 | ||
|
|
6c611c852e | ||
|
|
f882dabf19 | ||
|
|
973aa9b549 | ||
|
|
2316b8dc98 | ||
|
|
259208bfe4 | ||
|
|
47c5c97654 | ||
|
|
b117bbc125 | ||
|
|
df9020dfa3 | ||
|
|
c6fb7f6463 | ||
|
|
672dc1666f | ||
|
|
5b11570517 | ||
|
|
ff87a566c4 | ||
|
|
9e3752df36 | ||
|
|
15bf0b4af2 | ||
|
|
28b3764d1e | ||
|
|
62f1c2b622 | ||
|
|
71cff92eb7 | ||
|
|
1337c9efd8 | ||
|
|
747612fb3e | ||
|
|
84d99f7754 | ||
|
|
4524cddc72 | ||
|
|
f4e8772de4 | ||
|
|
39fe9e8533 | ||
|
|
d5b64ebdb3 | ||
|
|
f8ceadbad0 | ||
|
|
c36136084a | ||
|
|
4a93cfd889 | ||
|
|
f46b35e3d1 | ||
|
|
e6417cb7bc | ||
|
|
08081e5969 | ||
|
|
30120f05a6 | ||
|
|
6f85283553 | ||
|
|
9a177d6f4b | ||
|
|
6761021fb4 | ||
|
|
00c5e77724 | ||
|
|
69045711c1 | ||
|
|
9938d27e27 | ||
|
|
d36b3d498d | ||
|
|
0c182211a1 | ||
|
|
f4c012873c | ||
|
|
8ac5baf2d8 | ||
|
|
c54db79edc | ||
|
|
2119b68799 | ||
|
|
fd687d0967 | ||
|
|
12bc86d9c9 | ||
|
|
9e0f86cd3b | ||
|
|
883f6c81a2 | ||
|
|
b89177668e | ||
|
|
9f51de7261 | ||
|
|
a05a4afa53 | ||
|
|
db9e512424 | ||
|
|
8ce66a01ee | ||
|
|
f9a61a0d9e | ||
|
|
ba9f82946d | ||
|
|
0614969f7b | ||
|
|
f6ff6639e8 | ||
|
|
861869cb48 | ||
|
|
23bc642c82 | ||
|
|
9c322f7f59 | ||
|
|
b14a07315b | ||
|
|
4f4e2671ac | ||
|
|
ff3473a37c | ||
|
|
cb7690b2b5 | ||
|
|
95939a1b51 | ||
|
|
85ef09e520 | ||
|
|
6b1adb7eb1 | ||
|
|
db362dbd4c | ||
|
|
282df107a5 | ||
|
|
9f6bccd76a | ||
|
|
168a8e2e35 | ||
|
|
a86b487349 | ||
|
|
53d1043a50 | ||
|
|
6c24d76533 | ||
|
|
30b73bdf34 | ||
|
|
31db8c28a4 | ||
|
|
f549981293 | ||
|
|
2a6dbb25b2 | ||
|
|
0fd0eb93e8 | ||
|
|
88a48037d1 | ||
|
|
dc11b86e4b | ||
|
|
26bedf973b | ||
|
|
fc5443d854 | ||
|
|
799114ac8b | ||
|
|
70ea13eb40 | ||
|
|
0bc5aba5d0 | ||
|
|
f8a3e37f54 | ||
|
|
3229e434b8 | ||
|
|
24f61d006a | ||
|
|
c050c2d552 | ||
|
|
81cd367aec | ||
|
|
e099117a3b | ||
|
|
2536ff328b | ||
|
|
f3a074339d | ||
|
|
ea053e8afd | ||
|
|
e052c74727 | ||
|
|
a6dc73fa07 | ||
|
|
c3ea620796 | ||
|
|
7b140b31e6 | ||
|
|
fa89b65230 | ||
|
|
ed0c7194ed | ||
|
|
dc44e183e6 | ||
|
|
79c81b2244 | ||
|
|
e266530c7d | ||
|
|
879b7d3fbf | ||
|
|
9f36483bf4 | ||
|
|
7be314c456 | ||
|
|
9001b34146 | ||
|
|
861202b56c | ||
|
|
9d63dcc3f9 | ||
|
|
df5c61b37c | ||
|
|
b2bdaecf9b | ||
|
|
3fab72f1e1 | ||
|
|
e1824ef8a6 | ||
|
|
f3a38c90fc | ||
|
|
a748257bf5 | ||
|
|
8fb618234f | ||
|
|
5a2fcaab39 | ||
|
|
c207a6b302 | ||
|
|
7dc9281f05 | ||
|
|
2d18b077e1 | ||
|
|
eb8226daab | ||
|
|
60710bc8f8 | ||
|
|
7f485f588e | ||
|
|
f8e4233e67 | ||
|
|
eff0d23dd9 | ||
|
|
f10e26f731 | ||
|
|
1114841a2c | ||
|
|
5319bb6ac4 | ||
|
|
80a243efe6 | ||
|
|
c1d1699a64 | ||
|
|
889c3e2877 | ||
|
|
895fe5a5d3 | ||
|
|
21ad98b74c | ||
|
|
3325e51e53 | ||
|
|
588d4c293c | ||
|
|
88951215d3 | ||
|
|
4422637e7a | ||
|
|
6d8286f396 | ||
|
|
94af51f621 | ||
|
|
e5dc569daa | ||
|
|
14738e0872 | ||
|
|
d2e2d6e2a2 | ||
|
|
ee73b6bf27 | ||
|
|
429c44e377 | ||
|
|
1441525016 | ||
|
|
2054ffdaeb | ||
|
|
0d23ad7a15 | ||
|
|
9ec3a7a21b | ||
|
|
577b477a78 | ||
|
|
fbdce27b9a | ||
|
|
a50550fdb4 | ||
|
|
fbd752b92b | ||
|
|
6d2cfc24e9 | ||
|
|
e5186a0bad | ||
|
|
c6cc92295c | ||
|
|
b26d60c2ab | ||
|
|
a3b6e3c1ca | ||
|
|
f43c078f9e | ||
|
|
681f1068ea | ||
|
|
5e6c2ccbc9 | ||
|
|
6c0bf2824e | ||
|
|
f8b30d1035 | ||
|
|
8f3d7dfcc0 | ||
|
|
8d5563b3f6 | ||
|
|
05770520af | ||
|
|
43d25af964 | ||
|
|
66f8c2d5e8 | ||
|
|
906e25f299 | ||
|
|
707f3ff41f | ||
|
|
d1a1a09a70 | ||
|
|
eb8316ea69 | ||
|
|
02c307b004 | ||
|
|
917adcbaf4 | ||
|
|
19f4f8970a | ||
|
|
95c0bee7f8 | ||
|
|
8602e61fca | ||
|
|
2046a4c08c | ||
|
|
c1cca65168 | ||
|
|
67e80def53 | ||
|
|
63309065b6 | ||
|
|
71cffbfa4f | ||
|
|
9633ddd8d8 | ||
|
|
344adc72a1 | ||
|
|
fa72f4ff55 | ||
|
|
914bb12035 | ||
|
|
483a0b5233 | ||
|
|
04e151714f | ||
|
|
2ff03ebafe | ||
|
|
d2869de477 | ||
|
|
8d61ebe183 | ||
|
|
7b10881b9e | ||
|
|
a0f0f4fe52 | ||
|
|
3198cc8fd9 | ||
|
|
fb3c163612 | ||
|
|
6fa197f973 | ||
|
|
00a0f18544 | ||
|
|
523a1b6faf | ||
|
|
dd6a5732e7 | ||
|
|
767b5463f9 | ||
|
|
acc669645f | ||
|
|
42c778b5eb | ||
|
|
f764c7135d | ||
|
|
b646440ca0 | ||
|
|
92c14ec4b0 | ||
|
|
eb34c0b09a | ||
|
|
7a24168080 | ||
|
|
cc0a453476 | ||
|
|
35748a2fb0 | ||
|
|
1ad5e0ed15 | ||
|
|
49f3f0fc62 | ||
|
|
e3126aeb40 | ||
|
|
41162e0aca | ||
|
|
69cb373864 | ||
|
|
eb052b1b42 | ||
|
|
b8f8d3ef9e | ||
|
|
c433c89d7d | ||
|
|
fa2c825e2f | ||
|
|
5b47b87c42 | ||
|
|
a21f518c0b | ||
|
|
44abe852fb | ||
|
|
c797314fcf | ||
|
|
0ff1b4ade2 | ||
|
|
d646442692 | ||
|
|
0a8985acf9 | ||
|
|
2c84979d77 | ||
|
|
3260413cc7 | ||
|
|
238a431545 | ||
|
|
79ed0effdd | ||
|
|
9722bd8be0 | ||
|
|
c925d2ee76 | ||
|
|
34c324ff59 | ||
|
|
86ddaaee9c | ||
|
|
0d56b79685 | ||
|
|
3431f73c96 | ||
|
|
fbf47e9ff6 | ||
|
|
dcb84a8d30 | ||
|
|
095815d520 | ||
|
|
62e75cd158 | ||
|
|
815e83952e | ||
|
|
e21a13488b | ||
|
|
1b10c3711d | ||
|
|
f078cb4038 | ||
|
|
6205f061fe | ||
|
|
c477f660da | ||
|
|
d3e09df01a | ||
|
|
db51cfa60e | ||
|
|
536be3e0f6 | ||
|
|
ddfbc22b7c | ||
|
|
4e3b14dc69 | ||
|
|
a3905ef289 | ||
|
|
e50323f730 | ||
|
|
75bd5a582b | ||
|
|
2bb2312ea2 | ||
|
|
c0c358d051 | ||
|
|
cc974904f8 | ||
|
|
cbe4c23efa | ||
|
|
f6cf4ca826 | ||
|
|
d80da5ddd8 | ||
|
|
8aab13d12d | ||
|
|
39a77431e2 | ||
|
|
eb79dda04b | ||
|
|
eec04d180a | ||
|
|
8b57a3cb7e | ||
|
|
c3dc4448bf | ||
|
|
0a89933f9b | ||
|
|
bcf4513cb3 | ||
|
|
9d58cafec9 | ||
|
|
d0e3b39e69 | ||
|
|
ecc3dd7c63 | ||
|
|
6e51729c4c | ||
|
|
ddfd6e0c59 | ||
|
|
a78249230c | ||
|
|
fc893f98f4 | ||
|
|
a8838a7ae5 | ||
|
|
b859dfab16 | ||
|
|
143cc68946 | ||
|
|
46db7aeffd | ||
|
|
404123aea7 | ||
|
|
b00c5949fc | ||
|
|
3a1b35ed92 | ||
|
|
7d4b4e95f1 | ||
|
|
a15fa85248 | ||
|
|
fd4f229eab | ||
|
|
179d9e1a22 | ||
|
|
d7425343ee | ||
|
|
dad865e920 | ||
|
|
32b033c11c | ||
|
|
bfd9c97705 | ||
|
|
a69bd55b5a | ||
|
|
c23928d089 | ||
|
|
37b01ab964 | ||
|
|
ea5b89825a | ||
|
|
ec32e9a540 | ||
|
|
1a6fbef8a9 | ||
|
|
1a857123b3 | ||
|
|
02752c83b4 | ||
|
|
a48ebc68f4 | ||
|
|
b42ee3050e | ||
|
|
5c9a84219d | ||
|
|
50d6659392 | ||
|
|
9525db913f | ||
|
|
3126c60885 | ||
|
|
cac238c2a3 | ||
|
|
7e52e8eb54 | ||
|
|
96c250e538 | ||
|
|
ce56b45514 | ||
|
|
1182aeea00 | ||
|
|
cf3dceafe1 | ||
|
|
b5a7e807d0 | ||
|
|
c2c37ef158 | ||
|
|
2f8dbe4e77 | ||
|
|
95d49401ee | ||
|
|
26f8b790c9 | ||
|
|
7901d863dd | ||
|
|
e9a7441c9b | ||
|
|
41f22de20f | ||
|
|
b91cac7b4b | ||
|
|
29312a23d9 | ||
|
|
0bb7ed1d95 | ||
|
|
f279bb004f | ||
|
|
cbbba87099 | ||
|
|
6036793f60 | ||
|
|
f685741481 | ||
|
|
115dd17b3c | ||
|
|
486cb772b8 | ||
|
|
11e6775f98 | ||
|
|
52ba940c9b | ||
|
|
9492f42aa7 | ||
|
|
5c479eedf1 | ||
|
|
4aa94ae7cc | ||
|
|
728fa66ef0 | ||
|
|
1e23d14568 | ||
|
|
1117a21065 | ||
|
|
936040d8f7 | ||
|
|
74d7964688 | ||
|
|
d87a1615ce | ||
|
|
1869e88169 | ||
|
|
6f1889b0fa | ||
|
|
4250a7eb90 | ||
|
|
f5cf1f8a45 | ||
|
|
375ce8a881 | ||
|
|
9283877204 | ||
|
|
29176f302e | ||
|
|
25481d4286 | ||
|
|
2fe853bcc9 | ||
|
|
2166292157 | ||
|
|
163fa4a9d1 | ||
|
|
a628c607f0 | ||
|
|
08208323f2 | ||
|
|
358dab52ce | ||
|
|
806b79b589 | ||
|
|
c2a7921f3b | ||
|
|
a20d373945 | ||
|
|
21422dba44 | ||
|
|
b59da08730 | ||
|
|
329f83ff2d | ||
|
|
af8791a49d | ||
|
|
7c3cb9bb31 | ||
|
|
a154a13811 | ||
|
|
253d54a9e1 | ||
|
|
22990ed378 | ||
|
|
206e56cc5e | ||
|
|
984f00e0b0 | ||
|
|
607689095e | ||
|
|
437ec17125 | ||
|
|
2bf6b7ad1a | ||
|
|
899cb52e7a | ||
|
|
529729831c | ||
|
|
938e887b4c | ||
|
|
57e98fe6c9 | ||
|
|
07d70a0345 | ||
|
|
cf78349911 | ||
|
|
76efb0153a | ||
|
|
6733a9a538 | ||
|
|
58475261c4 | ||
|
|
cda5910ab0 | ||
|
|
bfb82b5cee | ||
|
|
c8bfb1db8f | ||
|
|
ebd4f2c6a8 | ||
|
|
b74facd119 | ||
|
|
07927f6bf2 | ||
|
|
11b577671b | ||
|
|
153ccbfd61 | ||
|
|
e8c9bcea2b | ||
|
|
7aea893b5a | ||
|
|
938edc6466 | ||
|
|
b8b45bfb77 | ||
|
|
d425901bae | ||
|
|
bcefc2a475 | ||
|
|
9667c71df8 | ||
|
|
808d81f921 | ||
|
|
9f676d1394 | ||
|
|
02a819b16e | ||
|
|
4644f71faf | ||
|
|
9a7ed81b4b | ||
|
|
646b4ec533 | ||
|
|
c92507e53d | ||
|
|
4b53ecb1c7 | ||
|
|
61531396a0 | ||
|
|
6235fdde75 | ||
|
|
8f8dd83443 | ||
|
|
06a5cc484c | ||
|
|
0157253145 | ||
|
|
76a654f949 | ||
|
|
0a88b133c2 | ||
|
|
98b55360a9 | ||
|
|
ccfbf42844 | ||
|
|
c097e56142 | ||
|
|
ef3f3f9c08 | ||
|
|
5d0d5b191c | ||
|
|
1a5f31d631 | ||
|
|
34c8a5fe8b | ||
|
|
bb3f5ed32a | ||
|
|
f562d97f13 | ||
|
|
31afb31108 | ||
|
|
8a3e7e15c6 | ||
|
|
d24bcad90b | ||
|
|
6ceae61a56 | ||
|
|
638136e353 | ||
|
|
8de14c5624 | ||
|
|
2a1f92ef4a | ||
|
|
15911d70c0 | ||
|
|
3dc148ab6f | ||
|
|
9dfa81ab4b | ||
|
|
e5b8e06037 | ||
|
|
a282322845 | ||
|
|
475dd58a8e | ||
|
|
28ffa8e693 | ||
|
|
e53dfd88bb | ||
|
|
93c3a1a9c9 | ||
|
|
064c66df8c | ||
|
|
22479b053c | ||
|
|
a1c4431479 | ||
|
|
3bc933586a | ||
|
|
0219abfeed | ||
|
|
e976879cf2 | ||
|
|
319e6615c3 | ||
|
|
7f7282c78d | ||
|
|
809abd60bf | ||
|
|
aaaba78126 | ||
|
|
4068f20ce9 | ||
|
|
cd4e995d54 | ||
|
|
d51243b6d3 | ||
|
|
df07baedfe | ||
|
|
38aa47ad6c | ||
|
|
978e1356c0 | ||
|
|
39f3c0aeb0 | ||
|
|
7086fde37e | ||
|
|
4cb553c765 | ||
|
|
987410fff3 | ||
|
|
4a8cd6f856 | ||
|
|
d7adfe8f61 | ||
|
|
def7b84a12 | ||
|
|
8121aef83c | ||
|
|
1bb8ed4495 | ||
|
|
5e12442b4b | ||
|
|
fefc709b2c | ||
|
|
45d3e83ad1 | ||
|
|
0aed9bfde1 | ||
|
|
ae2a5e5743 | ||
|
|
f896bb5d8c | ||
|
|
cd6e5e44e4 | ||
|
|
47e49da77c | ||
|
|
e004c094ea | ||
|
|
5c54128475 | ||
|
|
42cf66ae39 | ||
|
|
73ea5102dc | ||
|
|
d53035ad82 | ||
|
|
5a4348d046 | ||
|
|
400b8d92b7 | ||
|
|
6b211bf008 | ||
|
|
68fdc62d8f | ||
|
|
bb7cdc6d44 | ||
|
|
7e637d3b6a | ||
|
|
2a62514d17 | ||
|
|
e9c3317158 | ||
|
|
1e3607150c | ||
|
|
c7fc39bde0 | ||
|
|
e782b92bca | ||
|
|
a370ab8391 | ||
|
|
2eb778119d | ||
|
|
92e9809c86 | ||
|
|
364cb956c1 | ||
|
|
8d182ec733 | ||
|
|
323ca70846 | ||
|
|
a37fc05171 | ||
|
|
1956b9d97a | ||
|
|
9cb9d1a47a | ||
|
|
2192b17670 | ||
|
|
7febdf7208 | ||
|
|
ec2c6dff70 | ||
|
|
65356003e3 | ||
|
|
a7e5f19528 | ||
|
|
9302690e1b | ||
|
|
a29801286f | ||
|
|
29ef69c703 | ||
|
|
0aa31cd3cb | ||
|
|
013cc4d2fc | ||
|
|
07f09ecd83 | ||
|
|
8805e705a7 | ||
|
|
2d35016b94 | ||
|
|
8cddcfa0d8 | ||
|
|
3c813535a7 | ||
|
|
0712639441 | ||
|
|
4f427167ac | ||
|
|
44bf859c3b | ||
|
|
d987ff54a1 | ||
|
|
a0b0dbe6b2 | ||
|
|
8fa96debc9 | ||
|
|
a8409a161f | ||
|
|
452593319b | ||
|
|
73ba4987d5 | ||
|
|
41fa4fbaa5 | ||
|
|
11825ccefa | ||
|
|
91101065bb | ||
|
|
01bec40724 | ||
|
|
9b58b9bced | ||
|
|
b66c8b409c | ||
|
|
09b1de5f71 | ||
|
|
3667138d05 | ||
|
|
66c0b719de | ||
|
|
a182d12778 | ||
|
|
d905e612aa | ||
|
|
fa7a18f42a | ||
|
|
82113f1f1e | ||
|
|
047b118299 | ||
|
|
01d3b31479 | ||
|
|
a5ffa1278c | ||
|
|
b7d58320a8 | ||
|
|
605ba4adea | ||
|
|
24a0c08d58 | ||
|
|
b4a100dfc0 | ||
|
|
4a8f23eddf | ||
|
|
a54405e339 | ||
|
|
efb780c754 | ||
|
|
c64efa9260 | ||
|
|
43cb35cb21 | ||
|
|
db496180db | ||
|
|
c69adfbb17 | ||
|
|
683c8b24d4 | ||
|
|
d2dee43825 | ||
|
|
59b53f0a23 | ||
|
|
d198a647e2 | ||
|
|
0f53275169 | ||
|
|
366de72a38 | ||
|
|
13f5459670 | ||
|
|
93333387d6 | ||
|
|
1f9e7cd659 | ||
|
|
09fc64c6b6 | ||
|
|
84147f4d81 | ||
|
|
ee4b20b55b | ||
|
|
ed27b826c5 | ||
|
|
b03aefaf20 | ||
|
|
d7f4db53f5 | ||
|
|
2c97bf3936 | ||
|
|
1dfa544250 | ||
|
|
eac5f8f40f | ||
|
|
184aa5b2b3 | ||
|
|
bdcf247efe | ||
|
|
b16d7f2da6 | ||
|
|
9423fda5cb | ||
|
|
4d873f77c1 | ||
|
|
09336a6710 | ||
|
|
9149c34a26 | ||
|
|
c837ef949d | ||
|
|
1d4a23fa6c | ||
|
|
a71736ea73 | ||
|
|
a82ce60294 | ||
|
|
69090d6da1 | ||
|
|
322ffbed61 | ||
|
|
fe9da5280f | ||
|
|
4864a5684a | ||
|
|
f1510ec33e | ||
|
|
4523cc09cf | ||
|
|
f524aed23e | ||
|
|
925f378baa | ||
|
|
fe29594716 | ||
|
|
7721518591 | ||
|
|
6e303def12 | ||
|
|
ad1fbd88b2 | ||
|
|
b8067ac27e | ||
|
|
bd2606a576 | ||
|
|
f5324f9aa5 | ||
|
|
de2b881886 | ||
|
|
0d6b25274c | ||
|
|
fbfdde496b | ||
|
|
ae1c11c5a5 | ||
|
|
5abee4fb23 | ||
|
|
331af8df23 | ||
|
|
3a2fd1a5c9 | ||
|
|
2e1aa1b424 | ||
|
|
aead9c8ead | ||
|
|
93230af7bd | ||
|
|
21ff0d39ad | ||
|
|
4b619c9672 | ||
|
|
c5321298ce | ||
|
|
359352b947 | ||
|
|
a9241f3e3e | ||
|
|
ea0a263434 | ||
|
|
3be6e8a5f2 | ||
|
|
2b244762e1 | ||
|
|
a169a656b4 | ||
|
|
a9fdd8dc3c | ||
|
|
8eb9eed074 | ||
|
|
909e048ad4 | ||
|
|
5eb62ef423 | ||
|
|
58dbd81f03 | ||
|
|
a35c37a2f9 | ||
|
|
1518734e59 | ||
|
|
67b9470207 | ||
|
|
586fe5d62d | ||
|
|
2d80ef7872 | ||
|
|
b76cae94d4 | ||
|
|
23270d41b9 | ||
|
|
24479625a2 | ||
|
|
d41a214c1a | ||
|
|
d502952bac | ||
|
|
ac53bf1d71 | ||
|
|
145c57fc01 | ||
|
|
2dddfce08c | ||
|
|
03a4f184e6 | ||
|
|
be2e259596 | ||
|
|
05bc8b19fe | ||
|
|
cb6b70bbfb | ||
|
|
a458b535c9 | ||
|
|
b53d5dad67 | ||
|
|
ad7a16dca6 | ||
|
|
6e851a1f6a | ||
|
|
c1171fe666 | ||
|
|
2210068f5b | ||
|
|
d6ab35c1a3 | ||
|
|
5fc751e543 | ||
|
|
cea78c5e27 | ||
|
|
53be6afe92 | ||
|
|
d04b9f4dc5 | ||
|
|
d94519c5ba | ||
|
|
4c54c2709c | ||
|
|
c90ba029ce | ||
|
|
5489c66cdf | ||
|
|
960c1521f3 | ||
|
|
149516f365 | ||
|
|
87349b9bc1 | ||
|
|
87cc5287a8 | ||
|
|
c047c03e82 | ||
|
|
0cb639d472 | ||
|
|
792be0e8e3 | ||
|
|
c1228e9a4a | ||
|
|
6782249df9 | ||
|
|
b4af03aea8 | ||
|
|
74c214e957 | ||
|
|
0229e6b407 | ||
|
|
c358af7861 | ||
|
|
8eefbef91c | ||
|
|
e590caf8d8 | ||
|
|
46b95ee694 | ||
|
|
0fdeffe6c4 | ||
|
|
cc4ead999a | ||
|
|
60cba55d82 | ||
|
|
1caee06b22 | ||
|
|
a6eaf0f41f | ||
|
|
fadad820dd | ||
|
|
e8b19b5826 | ||
|
|
9ea2209a43 | ||
|
|
87af622df4 | ||
|
|
2c21c4b897 | ||
|
|
771969f747 | ||
|
|
e9742e202f | ||
|
|
a2ea85924a | ||
|
|
8318a519e6 | ||
|
|
8ef3c815e7 | ||
|
|
de07aa7c40 | ||
|
|
928bb16da1 | ||
|
|
441f498d6f | ||
|
|
a630ca15de | ||
|
|
52e3580cd4 | ||
|
|
694a3ebdd5 | ||
|
|
2a062e2f45 | ||
|
|
49ec1c9e8f | ||
|
|
4bd579f915 | ||
|
|
e4adb67ed8 | ||
|
|
ff09cad879 | ||
|
|
580e6ba2ff | ||
|
|
ca23875575 | ||
|
|
d6d5a43d3a | ||
|
|
d723208b1b | ||
|
|
b0a5fe8974 | ||
|
|
899dfdcfb9 | ||
|
|
8f0b07ed29 | ||
|
|
f16f2912cf | ||
|
|
af748539f8 | ||
|
|
695c017411 | ||
|
|
5e6c7bc205 | ||
|
|
e8cec55fad | ||
|
|
67fc6bc4e9 | ||
|
|
cbca0225f6 | ||
|
|
36ac91c902 | ||
|
|
a2902fbad5 | ||
|
|
d03de749a1 | ||
|
|
c3dec1dcda | ||
|
|
4945240fc3 | ||
|
|
1db8609ac9 | ||
|
|
f6bc620d39 | ||
|
|
b4b46d1b67 | ||
|
|
c1775de56f | ||
|
|
de6750ed23 | ||
|
|
c0ffd6b704 | ||
|
|
8b9de366f2 | ||
|
|
60d3f79c72 | ||
|
|
6f3a673aba | ||
|
|
ab6a6338c4 | ||
|
|
1ec8c1fcaa | ||
|
|
739eb6702e | ||
|
|
1aa7badb3c | ||
|
|
ee4008431a | ||
|
|
88f8bcde38 | ||
|
|
2285615010 | ||
|
|
805ce8177b | ||
|
|
bdce33e239 | ||
|
|
9be8d88ccc | ||
|
|
6ab3ebf195 | ||
|
|
0a628c1aef | ||
|
|
36328a996f | ||
|
|
4bc32dc0f1 | ||
|
|
4de5e017f1 | ||
|
|
3e352f8a0d | ||
|
|
28ae5db9b0 | ||
|
|
d5811c887a | ||
|
|
975fd86dc4 | ||
|
|
0ff7fe3ee2 | ||
|
|
b9d55d5719 | ||
|
|
ab7dc22984 | ||
|
|
bf8350ac18 | ||
|
|
a5c6348d41 | ||
|
|
320f881e0b | ||
|
|
0d96f1991c | ||
|
|
172a38c344 | ||
|
|
8bc0d4f77d | ||
|
|
8eabdefa8a | ||
|
|
f658af45c2 | ||
|
|
5212644861 | ||
|
|
1151f84351 | ||
|
|
9abd6bf342 | ||
|
|
d2c7ef6b41 | ||
|
|
4e3a8a0637 | ||
|
|
a34102049b | ||
|
|
ef5d811aba | ||
|
|
2d44ed1c5b | ||
|
|
fa2e72ae9c | ||
|
|
5bfc4ed53b | ||
|
|
520aec20e0 | ||
|
|
64bec1d060 | ||
|
|
ac58309dbd | ||
|
|
94023e6a85 | ||
|
|
5eaf4a3f32 | ||
|
|
a5a5d82a21 | ||
|
|
34e8d088c2 | ||
|
|
b78b605ba9 | ||
|
|
c3cf88b202 | ||
|
|
58b756f04c | ||
|
|
34f8ac2d85 | ||
|
|
1a10eb8cd9 | ||
|
|
59705b80cd | ||
|
|
46a7d6aeb2 | ||
|
|
c754135965 | ||
|
|
c6b75baad0 | ||
|
|
a7ad6f6d28 | ||
|
|
1a2141d04d | ||
|
|
ff3f3169b2 | ||
|
|
f4580b6010 | ||
|
|
d82fcef91b | ||
|
|
7b63a787b3 | ||
|
|
069570d103 | ||
|
|
0dafdcab86 | ||
|
|
654e16187e | ||
|
|
732c66b0f3 | ||
|
|
1f0944de21 | ||
|
|
912efe11b5 | ||
|
|
4684aaffdc | ||
|
|
f1a1b58319 | ||
|
|
c21d77ca08 | ||
|
|
d6c710706f | ||
|
|
a6d3becd6a | ||
|
|
3b67606c42 | ||
|
|
f8240143b6 | ||
|
|
0ce190be0d | ||
|
|
a2d0d07109 | ||
|
|
aedb773f0d | ||
|
|
aaf8f2d2d2 | ||
|
|
12f4800631 | ||
|
|
57b48a81ca | ||
|
|
7af33accf1 | ||
|
|
3214c05e82 | ||
|
|
4608a7fe4e | ||
|
|
af67ea8800 | ||
|
|
37c3dcf551 | ||
|
|
6a49fbb7da | ||
|
|
eb0b01de7b | ||
|
|
5b1528519c | ||
|
|
52f92eb689 | ||
|
|
7f9dd60c15 | ||
|
|
77da3bbc95 | ||
|
|
bb489a3903 | ||
|
|
167eb824cb | ||
|
|
efb64aee5a | ||
|
|
3045e29232 | ||
|
|
5d7d76025a | ||
|
|
e6c829384e | ||
|
|
5c658a416c | ||
|
|
a130aa8165 | ||
|
|
35d57ed752 | ||
|
|
1404f846a7 | ||
|
|
5785bd3272 | ||
|
|
cf9482984e | ||
|
|
67275641f8 | ||
|
|
3ffaac00dd | ||
|
|
816a3ef6f1 | ||
|
|
a8bf414f4a | ||
|
|
3b312d45c5 | ||
|
|
fcd899f888 | ||
|
|
315f3ea429 | ||
|
|
7241e8784a | ||
|
|
b7d6eae64c | ||
|
|
b3765c28d0 | ||
|
|
4cfb66bac2 | ||
|
|
0c4cff352a | ||
|
|
503269b85a | ||
|
|
161436cfdd | ||
|
|
24f549a692 | ||
|
|
7a8778ac73 | ||
|
|
763c6d104d | ||
|
|
4d7d9d9715 | ||
|
|
a9c35f9175 | ||
|
|
37752ff1ac | ||
|
|
31b84213e4 | ||
|
|
2036c22f88 | ||
|
|
7185a66b96 | ||
|
|
2394e18729 | ||
|
|
99f7582175 | ||
|
|
93c5997290 | ||
|
|
2d1a1c1c47 | ||
|
|
71e81728ac | ||
|
|
ebe60646db | ||
|
|
f996d7950b | ||
|
|
ae4a674c84 | ||
|
|
169615abc8 | ||
|
|
7c30ac2141 | ||
|
|
192501528f | ||
|
|
5ae0b731d0 | ||
|
|
d9f373654b | ||
|
|
0efbb137e8 | ||
|
|
cf63b2471f | ||
|
|
d8df91dfa8 | ||
|
|
f88343a6da | ||
|
|
491605cfea | ||
|
|
3aded1d4e5 | ||
|
|
4f0402ed3a | ||
|
|
ecac6321c4 | ||
|
|
20c6573e0a | ||
|
|
97b1c76b14 | ||
|
|
24a37032fa | ||
|
|
c0520223fd | ||
|
|
1f1caa836a | ||
|
|
b3ea7714f5 | ||
|
|
a7f9721785 | ||
|
|
a5461e07bf | ||
|
|
2e73a9e893 | ||
|
|
26bb56b775 | ||
|
|
95b1130485 | ||
|
|
3fb8938cd3 | ||
|
|
7791174ced | ||
|
|
c5e8166c8b | ||
|
|
2b88568653 | ||
|
|
34b4fe495e | ||
|
|
4fdd6c0dac | ||
|
|
60b6abefd9 | ||
|
|
4d53b7ccaa | ||
|
|
0c3253a485 | ||
|
|
d0f84c0964 | ||
|
|
ceefe36756 | ||
|
|
67421ed74f | ||
|
|
081079da62 | ||
|
|
e2fe1373f3 | ||
|
|
7891050e06 | ||
|
|
e28dc13cd5 | ||
|
|
9eee529a7f | ||
|
|
333e4abe30 | ||
|
|
cd77c7100c | ||
|
|
cf810c2950 | ||
|
|
a23bcb81ce | ||
|
|
d07d867718 | ||
|
|
666f2dd486 | ||
|
|
34792dd907 | ||
|
|
7ad6fc8a40 | ||
|
|
f824c10429 | ||
|
|
132e5ec179 | ||
|
|
66d3e6a0c2 | ||
|
|
4a09ae2985 | ||
|
|
8c734f2f27 | ||
|
|
245d174359 | ||
|
|
77f47768dd | ||
|
|
7b1f40dd00 | ||
|
|
90fa9e54ca | ||
|
|
9d3a44e0e8 | ||
|
|
932d596466 | ||
|
|
d518f40e8b | ||
|
|
f016cfca46 | ||
|
|
b8120df860 | ||
|
|
0df7df52f3 | ||
|
|
bfa27d0a68 | ||
|
|
5a20c486e3 | ||
|
|
78e19ebc95 | ||
|
|
b383cafc44 | ||
|
|
b10ff83566 | ||
|
|
daa1f542f9 | ||
|
|
d507f593d0 | ||
|
|
f210510276 | ||
|
|
19b6f81ee7 | ||
|
|
76545ab365 | ||
|
|
b8c3bc7841 | ||
|
|
a680367568 | ||
|
|
dfd37a4b31 | ||
|
|
5ee9b67d9b | ||
|
|
542faf225f | ||
|
|
5684c68121 | ||
|
|
4be783446a | ||
|
|
8d719b180a | ||
|
|
bf048c8aec | ||
|
|
c5a9d1ef9d | ||
|
|
c7b6f423c7 | ||
|
|
6d34207167 | ||
|
|
fcde9be10d | ||
|
|
3830bbda41 | ||
|
|
4447e7d71a | ||
|
|
7bccd904c7 | ||
|
|
313d522b61 | ||
|
|
9ee4fe41fe | ||
|
|
39ee3512cb | ||
|
|
42673556af | ||
|
|
faab73ad58 | ||
|
|
7e36468511 | ||
|
|
86eed141af | ||
|
|
c6df39955c | ||
|
|
9ba5d399e5 | ||
|
|
19459b7623 | ||
|
|
306d92a9d7 | ||
|
|
5baae0df88 | ||
|
|
24f6a193e7 | ||
|
|
8c0f8baf32 | ||
|
|
d80c30cc92 | ||
|
|
e64d646bad | ||
|
|
b84f9e410c | ||
|
|
ee5daba061 | ||
|
|
23e84de830 | ||
|
|
48e0dc8791 | ||
|
|
b0b19fdeb1 | ||
|
|
fb0f579b16 | ||
|
|
5a711f32b1 | ||
|
|
8c26a057a3 | ||
|
|
ae4644f495 | ||
|
|
4d34427cc7 | ||
|
|
70cffa4d3b | ||
|
|
ee7d8c56c7 | ||
|
|
41877183bc | ||
|
|
451a007fb1 | ||
|
|
0a82396718 | ||
|
|
5da55ea1e3 | ||
|
|
40bc7216e1 | ||
|
|
5cdcb9e26f | ||
|
|
ce7e7fef30 | ||
|
|
064c009deb | ||
|
|
86caa8539c | ||
|
|
caab1cf453 | ||
|
|
55c70f3508 | ||
|
|
d29249b8fa | ||
|
|
f668e9fc75 | ||
|
|
74fe1e2254 | ||
|
|
348936752a | ||
|
|
69a36a3361 | ||
|
|
8712dd6d1c | ||
|
|
55a21fe37b | ||
|
|
f55f625277 | ||
|
|
9dac85b069 | ||
|
|
99bd69baa8 | ||
|
|
a62a137a4f | ||
|
|
82b18e8ac2 | ||
|
|
0111c9848d | ||
|
|
ab9cadfeee | ||
|
|
8bf28e1441 | ||
|
|
ce28f847ce | ||
|
|
5609117882 | ||
|
|
b4fbb6fe10 | ||
|
|
82d7e9429e | ||
|
|
e2821effb5 | ||
|
|
9742f11fda | ||
|
|
53b4b7651a | ||
|
|
388dd4789c | ||
|
|
fdebca4573 | ||
|
|
479dfc096a | ||
|
|
3c6c11b7c9 | ||
|
|
bc091eb7ef | ||
|
|
a857321463 | ||
|
|
f75b1d21b4 | ||
|
|
33cfe1515d | ||
|
|
94053d75a6 | ||
|
|
2a68099675 | ||
|
|
3b43f7267a | ||
|
|
6cd3bc6640 | ||
|
|
211b55815e | ||
|
|
8ae4a6f824 | ||
|
|
b98301677a | ||
|
|
f2fdde5ba4 | ||
|
|
4f56e31dc7 | ||
|
|
6d3804770c | ||
|
|
ab0f4126cf | ||
|
|
1755a9e38a | ||
|
|
585f8528b2 | ||
|
|
75f523f5c0 | ||
|
|
68fbae5692 | ||
|
|
80f1dd8d37 | ||
|
|
b52b37ae64 | ||
|
|
566aeaeefa | ||
|
|
7a0544ab57 | ||
|
|
d63b363cde | ||
|
|
c05c60665e | ||
|
|
b4873a5de7 | ||
|
|
913f8ce0a5 | ||
|
|
453e0677d6 | ||
|
|
4a63737227 | ||
|
|
3e93db16bd | ||
|
|
f863a42351 | ||
|
|
dc55f493be | ||
|
|
936fda3f9e | ||
|
|
ecb8148a9f | ||
|
|
2dbbedc05a | ||
|
|
c30967806c | ||
|
|
145f719d30 | ||
|
|
32dbd31b9a | ||
|
|
b89eb29174 | ||
|
|
3670089a42 | ||
|
|
3982fcf095 | ||
|
|
8481fdcf08 | ||
|
|
39299e2de4 | ||
|
|
efec4fcaab | ||
|
|
5ce2c47d60 | ||
|
|
f6f3d1de9b | ||
|
|
ec0fe3242a | ||
|
|
f2e24faaca | ||
|
|
8c80b96318 | ||
|
|
2387465dcc | ||
|
|
32636ecf8a | ||
|
|
6055adbe1b | ||
|
|
ffd2f8dc50 | ||
|
|
e93b4d1dcd | ||
|
|
014a5b712d | ||
|
|
2317d115cd | ||
|
|
8253b54be9 | ||
|
|
5c867fd79f | ||
|
|
a44e041acf | ||
|
|
e9f05b3524 | ||
|
|
e2a834578d | ||
|
|
ffc752a79e | ||
|
|
399562a7d1 | ||
|
|
fec8a0da72 | ||
|
|
9f4542b3db | ||
|
|
363633e2ba | ||
|
|
a41ba57a7a | ||
|
|
884c8ea70a | ||
|
|
c886333d32 | ||
|
|
55b173dd03 | ||
|
|
9079a27814 | ||
|
|
d7d10b14cd | ||
|
|
81986022b7 | ||
|
|
dcba291d45 | ||
|
|
48e65631f6 | ||
|
|
a6499b6107 | ||
|
|
14a11d24b4 | ||
|
|
74a36b0729 | ||
|
|
efc7a7b957 | ||
|
|
4f1464b3af | ||
|
|
3a41079fac | ||
|
|
5279540bb4 | ||
|
|
577da79a47 | ||
|
|
1faa9648d3 | ||
|
|
ad57bf1e4b | ||
|
|
d5efb82c7c | ||
|
|
36214d14db | ||
|
|
ea2f7ef2f6 | ||
|
|
435530018b | ||
|
|
df61054a84 | ||
|
|
690b8bb563 | ||
|
|
c43451a50b | ||
|
|
1e312c6582 | ||
|
|
e36c8cd49a | ||
|
|
16cb6d1a6e | ||
|
|
21d61bdd71 | ||
|
|
ad9c26afb8 | ||
|
|
71c0cd00e5 | ||
|
|
83f99d8203 | ||
|
|
6b37d38dee | ||
|
|
938499ddfb | ||
|
|
d92266d7c0 | ||
|
|
a352b5c193 | ||
|
|
82f7483999 | ||
|
|
56dc9277d7 | ||
|
|
d50e9bcef7 | ||
|
|
c4e520fd6e | ||
|
|
30ff395924 | ||
|
|
f55025952d | ||
|
|
1bc45ee8fe | ||
|
|
19016497ef | ||
|
|
d578d06f59 | ||
|
|
e25ad79d5d | ||
|
|
f2624a1426 | ||
|
|
15561ec425 | ||
|
|
93d93fdea4 | ||
|
|
87f4e4cb9b | ||
|
|
82cb1752d9 | ||
|
|
ada3713e77 | ||
|
|
7d79ce92ac | ||
|
|
1708dcd2b2 | ||
|
|
5702eba93b | ||
|
|
a1767fd69c | ||
|
|
b4b426c69d | ||
|
|
2465674fda | ||
|
|
2eca0d4af1 | ||
|
|
11a7c6b112 | ||
|
|
50ea8adf46 | ||
|
|
ca33372595 | ||
|
|
7d47e3b776 | ||
|
|
fe15a2c65c | ||
|
|
d400fb8b23 | ||
|
|
3221818b6e | ||
|
|
2af2f148ab | ||
|
|
d19109742e | ||
|
|
078e2e4b19 | ||
|
|
9aa2999388 | ||
|
|
d0d9897e81 | ||
|
|
9306a1e06a | ||
|
|
141b12bd39 | ||
|
|
ae3deff8d4 | ||
|
|
41adca4e77 | ||
|
|
8e901b31c1 | ||
|
|
11a5a64729 | ||
|
|
0dba3027c1 | ||
|
|
405c7e08be | ||
|
|
cb36930f1d | ||
|
|
90e6fa2612 | ||
|
|
fd22ae5fcb | ||
|
|
e1baab90f7 | ||
|
|
4fcfa329ba | ||
|
|
b336980229 | ||
|
|
7128f95621 | ||
|
|
ffc6d767ec | ||
|
|
44a2d0c01f | ||
|
|
3e2ed18ad0 | ||
|
|
db58cfb13d | ||
|
|
3220bb8aaa | ||
|
|
ff3a479156 | ||
|
|
6f4941616d | ||
|
|
bd3025d669 | ||
|
|
4c72329412 | ||
|
|
8311e8984b | ||
|
|
093acd72dd | ||
|
|
e9ab711b66 | ||
|
|
b2a9f6beaa | ||
|
|
d3504f84af | ||
|
|
34badeb19c | ||
|
|
f93b48226c | ||
|
|
4805be0119 | ||
|
|
a3ca71fe26 | ||
|
|
70a0a5ff4a | ||
|
|
021f62cb0c | ||
|
|
ba214e43c8 | ||
|
|
520a26c48f | ||
|
|
a787a0d60b | ||
|
|
8d2d8cc728 | ||
|
|
4ae61b0886 | ||
|
|
79871c2083 | ||
|
|
7796ac1411 | ||
|
|
c45aeb45b1 | ||
|
|
ee7fde6531 | ||
|
|
0ea6c34325 | ||
|
|
3db3d60368 | ||
|
|
bfd08d5648 | ||
|
|
7f9777a0b0 | ||
|
|
87a16ad2e5 | ||
|
|
f90a627f9a | ||
|
|
152e0800e6 | ||
|
|
d8f10fa515 | ||
|
|
e86f391cac | ||
|
|
e39de2e752 | ||
|
|
1538be45de | ||
|
|
95e3f4b001 | ||
|
|
b7821b6dc1 | ||
|
|
556a132f2d | ||
|
|
fafb9c23bf | ||
|
|
1754bdf1e8 | ||
|
|
fa3d7b3d03 | ||
|
|
73f2998d48 | ||
|
|
6a51fd23df | ||
|
|
ffec21236d | ||
|
|
db0521ce0e | ||
|
|
a1c25046a9 | ||
|
|
de0af4df66 | ||
|
|
0e1723ef74 | ||
|
|
aefc330b8f | ||
|
|
f967471758 | ||
|
|
4f5ffb8909 | ||
|
|
54909b0282 | ||
|
|
f084538cb9 | ||
|
|
535b46f813 | ||
|
|
4766b3cdb9 | ||
|
|
354af6ccee | ||
|
|
c9afbbac0b | ||
|
|
83fa442c1b | ||
|
|
1900e5238b | ||
|
|
ddae1aa2e9 | ||
|
|
16274d5a82 | ||
|
|
5749f5809c | ||
|
|
d10108f8ca | ||
|
|
8b520f9848 | ||
|
|
4cc431afab | ||
|
|
a718aed1be | ||
|
|
5f29e7b63c | ||
|
|
245c766512 | ||
|
|
f08ad94d4d | ||
|
|
cdf5375b9a | ||
|
|
bdf4758510 | ||
|
|
84e45b5c40 | ||
|
|
daedec6957 | ||
|
|
de59d91add | ||
|
|
68cc81a74d | ||
|
|
3ead3401e0 | ||
|
|
eec31b0089 | ||
|
|
7df14227a9 | ||
|
|
60effcfc44 | ||
|
|
63f5e14c69 | ||
|
|
64ff8f065b | ||
|
|
468b7fdbad | ||
|
|
14b0ad95c6 | ||
|
|
221e4228ec | ||
|
|
dd9d3f89b9 | ||
|
|
b0cce17da6 | ||
|
|
c6b3b8c847 | ||
|
|
2ba87a10b0 | ||
|
|
5fa3e24b76 | ||
|
|
ac6d747fa6 | ||
|
|
ee541c84f1 | ||
|
|
6053236158 | ||
|
|
11615014a4 | ||
|
|
3588396263 | ||
|
|
11a2ecb936 | ||
|
|
151e8d896c | ||
|
|
593c549bc4 | ||
|
|
aa2ecaef29 | ||
|
|
0eb0bec74c | ||
|
|
3c252ae44b | ||
|
|
6789084ec0 | ||
|
|
b603b6e1c9 | ||
|
|
3c13feed4c | ||
|
|
7652afb8de | ||
|
|
7862e7010c | ||
|
|
4faf2a6cf4 | ||
|
|
8c48bb080f | ||
|
|
6d2481ee5c | ||
|
|
ca5525bcd7 | ||
|
|
56b53bff6e | ||
|
|
fd335a4e26 | ||
|
|
c4ea996612 | ||
|
|
39bfd226b8 | ||
|
|
234b67f5fd | ||
|
|
e27e3a4f8a | ||
|
|
7a11ff95a9 | ||
|
|
33ab5cec82 | ||
|
|
1cb2311bad | ||
|
|
25c65bc99e | ||
|
|
afb680b50d | ||
|
|
c574a4d086 | ||
|
|
bd8b20b933 | ||
|
|
866fd9476b | ||
|
|
d2ec5aaacf | ||
|
|
e265006fd6 | ||
|
|
b1bf11b0fe | ||
|
|
6bf3aad62e | ||
|
|
3a840a130c | ||
|
|
14396e3fe7 | ||
|
|
1ad930cbd0 | ||
|
|
7a0b37712f | ||
|
|
e2b8740fcf | ||
|
|
45d132d098 | ||
|
|
719f2eef32 | ||
|
|
698b35933e | ||
|
|
0512ada793 | ||
|
|
47289ba6f1 | ||
|
|
5e5e0efc60 | ||
|
|
7b38afc179 | ||
|
|
e5893075f9 | ||
|
|
5e598a588f | ||
|
|
c2d8d17285 | ||
|
|
8bc2de4ab6 | ||
|
|
75a92a3f82 | ||
|
|
72963e9ccb | ||
|
|
92da8e7e62 | ||
|
|
c84d5ce738 | ||
|
|
dda9f3e734 | ||
|
|
834e25a662 | ||
|
|
196a13f3dc | ||
|
|
440d33eec4 | ||
|
|
11f5c1ecf0 | ||
|
|
3b745633e4 | ||
|
|
900d48714a | ||
|
|
3fdf03390e | ||
|
|
25fb9aafcb | ||
|
|
54147474d3 | ||
|
|
4d6f380bd1 | ||
|
|
93f5fd80b8 | ||
|
|
177be32b7f | ||
|
|
30efc263ff | ||
|
|
ed0e860abb | ||
|
|
41d8a80226 | ||
|
|
4ec386cc72 | ||
|
|
dd69f16c3e | ||
|
|
1db5598294 | ||
|
|
23d0b7af6a | ||
|
|
a7c2b9e280 | ||
|
|
70dfec9638 | ||
|
|
95b0610f36 | ||
|
|
500f0eab4a | ||
|
|
86b1db0598 | ||
|
|
5a79e423fe | ||
|
|
7f7643cf63 | ||
|
|
bf52468a91 | ||
|
|
b4688f10d4 | ||
|
|
31a5cd185a | ||
|
|
7166647ca1 | ||
|
|
f7300a858e | ||
|
|
e87859e82c | ||
|
|
de101a8202 | ||
|
|
7f1f4c2248 | ||
|
|
c33f8d381b | ||
|
|
3f58e47c63 | ||
|
|
b7f8a17c24 | ||
|
|
6cbb8f3a0c | ||
|
|
ec97f9ad1a | ||
|
|
10085041cf | ||
|
|
7b23dbfe68 | ||
|
|
8e0c48e6d2 | ||
|
|
b759602483 | ||
|
|
2205b22409 | ||
|
|
1ddf8c26f5 | ||
|
|
9769e07cd5 | ||
|
|
08250a53a1 | ||
|
|
ff6d62802d | ||
|
|
46506769f1 | ||
|
|
4ea29978fc | ||
|
|
dfd50ceccd | ||
|
|
6366177118 | ||
|
|
2390728cc3 | ||
|
|
b32c642af3 | ||
|
|
c36b256de5 | ||
|
|
0afe1b707d | ||
|
|
f213620c8b | ||
|
|
35655298e6 | ||
|
|
1e463a8e39 | ||
|
|
de5a88bd97 | ||
|
|
0862fa96fd | ||
|
|
924570c5be | ||
|
|
4d8689c10c | ||
|
|
1d7ce5e063 | ||
|
|
72d3425eef | ||
|
|
b7f099beed | ||
|
|
912ef50165 | ||
|
|
4a9086b848 | ||
|
|
50cb4d5fc7 | ||
|
|
2bc9508b7c | ||
|
|
337cd574c8 | ||
|
|
9fb027915e | ||
|
|
2b821c3a14 | ||
|
|
0d113fab1a | ||
|
|
19f28a633a | ||
|
|
2c817ce4a5 | ||
|
|
66a5bc64db | ||
|
|
7f423508e4 | ||
|
|
306c6706a6 | ||
|
|
64be67e062 | ||
|
|
0c0a2eb0a2 | ||
|
|
de0829cec3 | ||
|
|
20177660bb | ||
|
|
609fc6d080 | ||
|
|
518826e70c | ||
|
|
13992a58da | ||
|
|
0d2ac1c07f | ||
|
|
fb7df099e0 | ||
|
|
f14ff3e041 | ||
|
|
07fcb94bc0 | ||
|
|
66d9983d46 | ||
|
|
4f3cb98e5e | ||
|
|
8c1f5efcab | ||
|
|
c92bdd8785 | ||
|
|
e09ef6b8bc | ||
|
|
f7677ed275 | ||
|
|
e5f719a33b | ||
|
|
79bd65034c | ||
|
|
fbb1923fad | ||
|
|
bf75c450b7 | ||
|
|
b2172c4b2e | ||
|
|
69ccd76679 | ||
|
|
8b54bb4d89 | ||
|
|
2595d81733 | ||
|
|
f9e05218ca | ||
|
|
2ddda5da89 | ||
|
|
dc80f0b222 | ||
|
|
5007a122b2 | ||
|
|
43f2321225 | ||
|
|
1362f92f2e | ||
|
|
445d2646a9 | ||
|
|
ae8d25faca | ||
|
|
9061c03b6d | ||
|
|
8174f5a988 | ||
|
|
03f7b551be | ||
|
|
80ad6572a3 | ||
|
|
c77f3da0ce | ||
|
|
c104647450 | ||
|
|
547ba73b82 | ||
|
|
3526fa27fd | ||
|
|
9eabdb64ff | ||
|
|
6f543eac9f | ||
|
|
64eca85876 | ||
|
|
152271851f | ||
|
|
0909be3aa8 | ||
|
|
274e623b50 | ||
|
|
2972f982e4 | ||
|
|
df8a62d018 | ||
|
|
fec5d59fb3 | ||
|
|
7285e44064 | ||
|
|
2ff54ae6b3 | ||
|
|
f74ac0fc3a | ||
|
|
26a6da27fa | ||
|
|
19abbfff96 | ||
|
|
8aa531c7fa | ||
|
|
21cf339a85 | ||
|
|
588cdacd49 | ||
|
|
0cce536fb2 | ||
|
|
b281ecd50a | ||
|
|
b267e34092 | ||
|
|
58fce0a37b | ||
|
|
f0458ebdb8 | ||
|
|
0a231c0783 | ||
|
|
7c1f90045e | ||
|
|
a5ea272936 | ||
|
|
715825eac3 | ||
|
|
1a97e82000 | ||
|
|
70d1abf81b | ||
|
|
1fd0fcddb2 | ||
|
|
ab4bbf2fb2 | ||
|
|
669e4d0297 | ||
|
|
f92875bc3e | ||
|
|
7f36259f88 | ||
|
|
2c28d9f560 | ||
|
|
c21b071e77 | ||
|
|
de197bd7cb | ||
|
|
bf9dd83c10 | ||
|
|
760fb2ca0e | ||
|
|
a8ccaca8ea | ||
|
|
32070e6bc0 | ||
|
|
f02f647237 | ||
|
|
96043a8f7e | ||
|
|
0bb8d8faf5 | ||
|
|
f5c09a3aba | ||
|
|
3227cc65d1 | ||
|
|
90ca2ae16b | ||
|
|
25e260bb3a | ||
|
|
feea8332d6 | ||
|
|
ffbdd7fcce | ||
|
|
b699cf8c48 | ||
|
|
2efd9bbac4 | ||
|
|
0ac3af8776 | ||
|
|
fed9f06c4e | ||
|
|
240f33a06f | ||
|
|
254aafb265 | ||
|
|
8bd82119be | ||
|
|
9a148bb9a3 | ||
|
|
7a4241e406 | ||
|
|
cb92fbe749 | ||
|
|
1d04074464 | ||
|
|
c4096b4731 | ||
|
|
178658bf9f | ||
|
|
d372eb1f0e | ||
|
|
ebe25fefd6 | ||
|
|
688ccf05cb | ||
|
|
9dc5615b9d | ||
|
|
696e2316a8 | ||
|
|
f2891b70d0 | ||
|
|
dcf370cb6e | ||
|
|
1b8eb85eeb | ||
|
|
cf3236ed27 | ||
|
|
6c86c7c4a9 | ||
|
|
9cc2cf3241 | ||
|
|
9eb4a4a481 | ||
|
|
8463b7ea59 | ||
|
|
faa185e37c | ||
|
|
53b3177ca5 | ||
|
|
76badfed63 | ||
|
|
3c1e31de3e | ||
|
|
d2c932d3ac | ||
|
|
5a569eb1b6 | ||
|
|
e5bd25c73f | ||
|
|
eb88474dd8 | ||
|
|
9fc0ca0a72 | ||
|
|
95b6bd5df6 | ||
|
|
f1311ad3de | ||
|
|
0310170869 | ||
|
|
b6d7e222c1 | ||
|
|
e71d9a89d2 | ||
|
|
74c662b63a | ||
|
|
91bdb9eb2d | ||
|
|
47f16505d2 | ||
|
|
e63986b534 | ||
|
|
cbde8548f4 | ||
|
|
7a3656aea2 | ||
|
|
3ba8b15f13 | ||
|
|
7727a792f2 | ||
|
|
ce175d7372 | ||
|
|
609b19b630 | ||
|
|
e3cb957a10 | ||
|
|
55a0178490 | ||
|
|
9a858b8d67 | ||
|
|
cbff32585d | ||
|
|
8fc28c34ce | ||
|
|
d72b9eadec | ||
|
|
3c5bf5b9d8 | ||
|
|
5a07e26405 | ||
|
|
cd66546e24 | ||
|
|
21a59a4a7c | ||
|
|
b5dbf8e43d | ||
|
|
54e50b8a6e | ||
|
|
b35dbb0420 | ||
|
|
63f6afd75b | ||
|
|
3e311a0092 | ||
|
|
69d3d3c15a | ||
|
|
33bc1a3b58 | ||
|
|
740dd928f7 | ||
|
|
757d012ab5 | ||
|
|
f64a87209d | ||
|
|
41df8ee4f5 | ||
|
|
6877d5f3b5 | ||
|
|
6d74d424d3 | ||
|
|
9ec4f7504b | ||
|
|
9166d56f17 | ||
|
|
80b90dd0d9 | ||
|
|
91907789af | ||
|
|
6845852e82 | ||
|
|
99af12af3f | ||
|
|
fd76ff60ac | ||
|
|
cc6bea8b90 | ||
|
|
c1d9e9a285 | ||
|
|
681141a526 | ||
|
|
c100541f07 | ||
|
|
d64f62c2ef | ||
|
|
e049441d93 | ||
|
|
a30b2f34eb | ||
|
|
2bf96ad244 | ||
|
|
a183827128 | ||
|
|
54dd1b3038 | ||
|
|
75d251b81a | ||
|
|
7a6d4666a2 | ||
|
|
d802db4de0 | ||
|
|
b103bb4c8b | ||
|
|
a9d16c40c7 | ||
|
|
98e3a26b2a | ||
|
|
f209a92b7e | ||
|
|
0edfc7fa49 | ||
|
|
cefe038a87 | ||
|
|
0858ee2f27 | ||
|
|
4d1f2ea522 | ||
|
|
6447a6020c | ||
|
|
b3bf21db56 | ||
|
|
674a6f96d3 | ||
|
|
79f8831738 | ||
|
|
224c900532 | ||
|
|
4f9f5f70e3 | ||
|
|
38db6e9366 | ||
|
|
d18c753b3c | ||
|
|
8fedbf87d9 | ||
|
|
d8a369e194 | ||
|
|
90af34bc83 | ||
|
|
c7857dc1d4 | ||
|
|
08e4dc2563 | ||
|
|
92447141d9 | ||
|
|
e0ed44388f | ||
|
|
16d0aa7b4d | ||
|
|
6037b6a5ab | ||
|
|
e1604b2b4a | ||
|
|
db23f51bc6 | ||
|
|
3c6750f37b | ||
|
|
df2ec585f1 | ||
|
|
250b2ca01a | ||
|
|
c2d5f7bf26 | ||
|
|
e223b4ac09 | ||
|
|
f072801f38 | ||
|
|
ededaaa874 | ||
|
|
b1f55e3ee5 | ||
|
|
51b95236f9 | ||
|
|
9123cfb5dd | ||
|
|
9018e9dd70 | ||
|
|
08ff1c1aa8 | ||
|
|
6134939882 | ||
|
|
7cb6427dea | ||
|
|
79b62497d1 | ||
|
|
0729ef7353 | ||
|
|
8f6788474b | ||
|
|
5c2926102b | ||
|
|
bff37075f6 | ||
|
|
c98ee98525 | ||
|
|
ecb430effe | ||
|
|
7ee7221af1 | ||
|
|
748fd3db88 | ||
|
|
cbff1b818c | ||
|
|
a885d2f240 | ||
|
|
b6247b71b5 | ||
|
|
3555c6173d | ||
|
|
3976962621 | ||
|
|
a54a27595b | ||
|
|
7283b9f6cf | ||
|
|
3dfc0a9679 | ||
|
|
6903c4605c | ||
|
|
5b3f708fcb | ||
|
|
b33ed9176f | ||
|
|
c48817f69b | ||
|
|
70dd3a16dc | ||
|
|
9a19fe1f50 | ||
|
|
3961f8e7a4 | ||
|
|
fc37b17b1f | ||
|
|
630bd3d789 | ||
|
|
5c4c0c0cba | ||
|
|
24c241d29b | ||
|
|
a3d760ff12 | ||
|
|
77a3dda59d | ||
|
|
f6daceb449 | ||
|
|
cfef34f7a6 | ||
|
|
c007b9e5bd | ||
|
|
b9f3518b33 | ||
|
|
ba07d9d5e3 | ||
|
|
90e5211128 | ||
|
|
c0d412a736 | ||
|
|
f9eb5edb96 | ||
|
|
ba8b80a163 | ||
|
|
3b90fa5c9b | ||
|
|
273b367f05 | ||
|
|
783acd712d | ||
|
|
748f0b2b5f | ||
|
|
9350e26e68 | ||
|
|
997f793af1 | ||
|
|
4d5f29c74c | ||
|
|
d070b8698d | ||
|
|
057d3e1810 | ||
|
|
d49af633f0 | ||
|
|
3191a9ba11 | ||
|
|
53e13fe1f1 | ||
|
|
59cb0cecb2 | ||
|
|
1c6846c4c2 | ||
|
|
b88e441a07 | ||
|
|
4f57d7116d | ||
|
|
422607df7c | ||
|
|
3f4b494c61 | ||
|
|
109dffb242 | ||
|
|
0e8ee051c6 | ||
|
|
5c545e67f3 | ||
|
|
2daf5e4296 | ||
|
|
d0c8dd78c2 | ||
|
|
21c3e9973a | ||
|
|
8e4d013154 | ||
|
|
37fb01b17d | ||
|
|
ac0a70b369 | ||
|
|
a4bc6f73d7 | ||
|
|
56ee8a5cc6 | ||
|
|
440c244cac | ||
|
|
655303f2f1 | ||
|
|
14e59706b7 | ||
|
|
d59e93d5e9 | ||
|
|
9e85408c7b | ||
|
|
225ae32e7a | ||
|
|
50ef18644b | ||
|
|
41608beb35 | ||
|
|
d9a8e421a4 | ||
|
|
d7cef744ec | ||
|
|
54cbf30c14 | ||
|
|
dfa3c6265c | ||
|
|
a7f52911e1 | ||
|
|
1e31614572 | ||
|
|
3b615b0f7a | ||
|
|
e184f5ab3a | ||
|
|
d0f82e6dcc | ||
|
|
49e1f9ea89 | ||
|
|
6731230d73 | ||
|
|
ec59d71e60 | ||
|
|
bdac541d1e | ||
|
|
061fa70907 | ||
|
|
48b5cfd085 | ||
|
|
a7609c97be | ||
|
|
c33feb6dc9 | ||
|
|
2c7deb41f6 | ||
|
|
8117d0adab | ||
|
|
01a3a6ab0d | ||
|
|
45a8098d3a | ||
|
|
60812ae041 | ||
|
|
635bec06cb | ||
|
|
0f58dfdea4 | ||
|
|
dd5fe334f3 | ||
|
|
e0c9d495ef | ||
|
|
2f34e6fd30 | ||
|
|
69aa35a51c | ||
|
|
5404a8fcd8 | ||
|
|
eb49936a60 | ||
|
|
ff9ea6c4b1 | ||
|
|
586b0a7047 | ||
|
|
84718d183a | ||
|
|
3099a2f53c | ||
|
|
ed010752dd | ||
|
|
f5be6177b2 | ||
|
|
89c6f24d48 | ||
|
|
f23856df8e | ||
|
|
1b7bc299f3 | ||
|
|
a291cc99cf | ||
|
|
389ac5e017 | ||
|
|
fc792a4be9 | ||
|
|
07501bef14 | ||
|
|
137ce05324 | ||
|
|
ada0b4f131 | ||
|
|
abe925e212 | ||
|
|
8fb44608bf | ||
|
|
153cd5bb44 | ||
|
|
669545f551 | ||
|
|
cfe2f3fe15 | ||
|
|
140d609e0c | ||
|
|
a32ad1a656 | ||
|
|
62ba69a29d | ||
|
|
9b0f2a16ca | ||
|
|
85e629e915 | ||
|
|
999a28062d | ||
|
|
ba3fea24f1 | ||
|
|
6b4a8d0b17 | ||
|
|
5ec75e38b9 | ||
|
|
ad042fdd68 | ||
|
|
35ad3146a8 | ||
|
|
e8343f2d87 | ||
|
|
1b1307d0d1 | ||
|
|
7a11be9f3f | ||
|
|
192ce958c3 | ||
|
|
c441681dc2 | ||
|
|
dd70d57b9b | ||
|
|
f12ea1bc02 | ||
|
|
fa76a331b0 | ||
|
|
d999d9876d | ||
|
|
578a5fb6a9 | ||
|
|
a8809bbd3e | ||
|
|
a478e44585 | ||
|
|
c0494b3558 | ||
|
|
7f1cd014f2 | ||
|
|
07b615e96e | ||
|
|
ab387a6120 | ||
|
|
ac79725923 | ||
|
|
8dd38318fc | ||
|
|
533c064269 | ||
|
|
5c3105b437 | ||
|
|
3c0d0dba49 | ||
|
|
12bbca95ec | ||
|
|
f6574978de | ||
|
|
8380895ae3 | ||
|
|
f018999da9 | ||
|
|
51a6b7d2b5 | ||
|
|
9bfe185a2e | ||
|
|
beeb7896e0 | ||
|
|
212460289b | ||
|
|
221fb17c5e | ||
|
|
488deb04a4 | ||
|
|
9d9eea9ac9 | ||
|
|
e7f0ffbf5d | ||
|
|
a09b018bd5 | ||
|
|
7eac4ee9fe | ||
|
|
17a5efb416 | ||
|
|
3e634aa7e4 | ||
|
|
5d3398aa8a | ||
|
|
76d929e177 | ||
|
|
be91af7551 | ||
|
|
c9011fc7e1 | ||
|
|
ff776b57bf | ||
|
|
3ee788dacc | ||
|
|
fef504f038 | ||
|
|
bbb5776763 | ||
|
|
e87bee9ccd | ||
|
|
69a338610a | ||
|
|
aa6394e94f | ||
|
|
ef409c6a24 | ||
|
|
da4167560f | ||
|
|
3488576bd8 | ||
|
|
619c72e566 | ||
|
|
a3ba41fce2 | ||
|
|
c935a604f8 | ||
|
|
e114f09f70 | ||
|
|
9b4d9452ba | ||
|
|
bbeed5b5d1 | ||
|
|
971ed2bbdf | ||
|
|
affc4e9a8f | ||
|
|
3db83b6824 | ||
|
|
9c8d707530 | ||
|
|
8f5f99c22a | ||
|
|
32254d3010 | ||
|
|
20f2875472 | ||
|
|
c360da4f35 | ||
|
|
bc76a032ba | ||
|
|
8e986584f4 | ||
|
|
4b68d30b0e | ||
|
|
b292192467 | ||
|
|
f172f7d4aa | ||
|
|
8e8b6be690 | ||
|
|
e8c6135a91 | ||
|
|
771cf41fea | ||
|
|
7ea17bb957 | ||
|
|
f8846f85a1 | ||
|
|
4c05ef0ba8 | ||
|
|
5438b64e32 | ||
|
|
248acf715e | ||
|
|
54ca0997ee | ||
|
|
b78076cac7 | ||
|
|
ba19d530ad | ||
|
|
47555602d7 | ||
|
|
6eb76c7c1a | ||
|
|
b32cc4b09d | ||
|
|
6e3dbb8d8b | ||
|
|
b66c093316 | ||
|
|
13d360030f | ||
|
|
66daebe88f | ||
|
|
4071ba29da | ||
|
|
21f9e2df40 | ||
|
|
80d326310e | ||
|
|
53fc705b13 | ||
|
|
d5af53888a | ||
|
|
a7a37249f7 | ||
|
|
6af6ff2a0a | ||
|
|
30ca282594 | ||
|
|
ab7293bed6 | ||
|
|
1614c15bb1 | ||
|
|
f813959750 | ||
|
|
f957ec2267 | ||
|
|
92e3074c10 | ||
|
|
0c618482c4 | ||
|
|
2d8f6c46f1 | ||
|
|
0fbc0475f3 | ||
|
|
c27787f09f | ||
|
|
d90fcd4e2b | ||
|
|
69fd0ca9aa | ||
|
|
e5e77381f0 | ||
|
|
066514e2a9 | ||
|
|
045a1737f8 |
23
.cursorrules
23
.cursorrules
@@ -1,23 +0,0 @@
|
||||
Hermes-Agent is an agent harness for LLMs.
|
||||
|
||||
When building, the tool functionality is in the tools/ directory, where each specific tool (or, in some cases, a group of tools built for the same execution category or API) is placed in its own script.
|
||||
|
||||
Each tool is then consolidated in the model_tools.py file in the repo root.
|
||||
|
||||
There is also a way to consolidate sets of tools in toolsets.py for the agent to use.
|
||||
|
||||
The primary agent runner code is in run_agent, but other runners could be developed using the tools and framework.
|
||||
|
||||
Always ensure consistency between the tools, model_tools.py, and toolsets.py when changing any of them; otherwise they could become desynced in a way that is detrimental to functionality.
|
||||
|
||||
The expected pathway for using API keys is to set up and place them in a .env file in the repo root.
|
||||
|
||||
Test scripts will be placed in tests/
|
||||
|
||||
The run_agent loop is set up to:
|
||||
- Process the enabled toolsets to provide to the model,
|
||||
- Pipe in a prompt or problem from the input to the agent,
|
||||
- Loop the LLM each time it calls a tool, until the model decides no more tools are needed and provides a natural language response,
|
||||
- Return that response.
|
||||
|
||||
There are additional caveats for logging, where we restructure the "tools" as a system prompt so that they are stored in a format that can be used and handled properly later.
|
||||
302
.env.example
302
.env.example
@@ -1,49 +1,301 @@
|
||||
# Hermes Agent Environment Configuration
|
||||
# Copy this file to .env and fill in your API keys
|
||||
# Get API keys from the URLs listed below
|
||||
|
||||
# =============================================================================
|
||||
# REQUIRED API KEYS
|
||||
# LLM PROVIDER (OpenRouter)
|
||||
# =============================================================================
|
||||
# OpenRouter provides access to many models through one API
|
||||
# All LLM calls go through OpenRouter - no direct provider keys needed
|
||||
# Get your key at: https://openrouter.ai/keys
|
||||
OPENROUTER_API_KEY=
|
||||
|
||||
# Anthropic API Key - Main agent model
|
||||
# Get at: https://console.anthropic.com/
|
||||
ANTHROPIC_API_KEY=
|
||||
# Default model to use (OpenRouter format: provider/model)
|
||||
# Examples: anthropic/claude-opus-4.6, openai/gpt-4o, google/gemini-3-flash-preview, zhipuai/glm-4-plus
|
||||
LLM_MODEL=anthropic/claude-opus-4.6
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (z.ai / GLM)
|
||||
# =============================================================================
|
||||
# z.ai provides access to ZhipuAI GLM models (GLM-4-Plus, etc.)
|
||||
# Get your key at: https://z.ai or https://open.bigmodel.cn
|
||||
GLM_API_KEY=
|
||||
# GLM_BASE_URL=https://api.z.ai/api/paas/v4 # Override default base URL
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (Kimi / Moonshot)
|
||||
# =============================================================================
|
||||
# Kimi Code provides access to Moonshot AI coding models (kimi-k2.5, etc.)
|
||||
# Get your key at: https://platform.kimi.ai (Kimi Code console)
|
||||
# Keys prefixed sk-kimi- use the Kimi Code API (api.kimi.com) by default.
|
||||
# Legacy keys from platform.moonshot.ai need KIMI_BASE_URL override below.
|
||||
KIMI_API_KEY=
|
||||
# KIMI_BASE_URL=https://api.kimi.com/coding/v1 # Default for sk-kimi- keys
|
||||
# KIMI_BASE_URL=https://api.moonshot.ai/v1 # For legacy Moonshot keys
|
||||
# KIMI_BASE_URL=https://api.moonshot.cn/v1 # For Moonshot China keys
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (MiniMax)
|
||||
# =============================================================================
|
||||
# MiniMax provides access to MiniMax models (global endpoint)
|
||||
# Get your key at: https://www.minimax.io
|
||||
MINIMAX_API_KEY=
|
||||
# MINIMAX_BASE_URL=https://api.minimax.io/v1 # Override default base URL
|
||||
|
||||
# MiniMax China endpoint (for users in mainland China)
|
||||
MINIMAX_CN_API_KEY=
|
||||
# MINIMAX_CN_BASE_URL=https://api.minimaxi.com/v1 # Override default base URL
|
||||
|
||||
# =============================================================================
|
||||
# TOOL API KEYS
|
||||
# =============================================================================
|
||||
|
||||
# Firecrawl API Key - Web search, extract, and crawl
|
||||
# Get at: https://firecrawl.dev/
|
||||
FIRECRAWL_API_KEY=
|
||||
|
||||
# Nous Research API Key - Vision analysis and multi-model reasoning
|
||||
# Get at: https://inference-api.nousresearch.com/
|
||||
NOUS_API_KEY=
|
||||
|
||||
# Morph API Key - Terminal/command execution tools
|
||||
# Get at: https://morph.so/
|
||||
MORPH_API_KEY=
|
||||
|
||||
# FAL.ai API Key - Image generation
|
||||
# Get at: https://fal.ai/
|
||||
FAL_KEY=
|
||||
|
||||
# =============================================================================
|
||||
# OPTIONAL API KEYS
|
||||
# =============================================================================
|
||||
|
||||
# OpenAI API Key - Optional, for enhanced Hecate features
|
||||
# Get at: https://platform.openai.com/
|
||||
OPENAI_API_KEY=
|
||||
# Honcho - Cross-session AI-native user modeling (optional)
|
||||
# Builds a persistent understanding of the user across sessions and tools.
|
||||
# Get at: https://app.honcho.dev
|
||||
# Also requires ~/.honcho/config.json with enabled=true (see README).
|
||||
HONCHO_API_KEY=
|
||||
|
||||
# =============================================================================
|
||||
# OPTIONAL CONFIGURATION
|
||||
# TERMINAL TOOL CONFIGURATION (mini-swe-agent backend)
|
||||
# =============================================================================
|
||||
# Backend type: "local", "singularity", "docker", "modal", or "ssh"
|
||||
# Terminal backend is configured in ~/.hermes/config.yaml (terminal.backend).
|
||||
# Use 'hermes setup' or 'hermes config set terminal.backend docker' to change.
|
||||
# Supported: local, docker, singularity, modal, ssh
|
||||
#
|
||||
# Only override here if you need to force a backend without touching config.yaml:
|
||||
# TERMINAL_ENV=local
|
||||
|
||||
# Terminal Tool Settings
|
||||
HECATE_VM_LIFETIME_SECONDS=300
|
||||
HECATE_DEFAULT_SNAPSHOT_ID=snapshot_p5294qxt
|
||||
# Container images (for singularity/docker/modal backends)
|
||||
# TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
||||
# TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
|
||||
TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
||||
|
||||
# Debug Logging (set to "true" to enable, logs saved to ./logs/)
|
||||
|
||||
# Working directory for terminal commands
|
||||
# For local backend: "." means current directory (resolved automatically)
|
||||
# For remote backends (ssh/docker/modal/singularity): use an absolute path
|
||||
# INSIDE the target environment, or leave unset for the backend's default
|
||||
# (/root for modal, / for docker, ~ for ssh). Do NOT use a host-local path.
|
||||
# Usually managed by config.yaml (terminal.cwd) — uncomment to override
|
||||
# TERMINAL_CWD=.
|
||||
|
||||
# Default command timeout in seconds
|
||||
TERMINAL_TIMEOUT=60
|
||||
|
||||
# Cleanup inactive environments after this many seconds
|
||||
TERMINAL_LIFETIME_SECONDS=300
|
||||
|
||||
# =============================================================================
|
||||
# SSH REMOTE EXECUTION (for TERMINAL_ENV=ssh)
|
||||
# =============================================================================
|
||||
# Run terminal commands on a remote server via SSH.
|
||||
# Agent code stays on your machine, commands execute remotely.
|
||||
#
|
||||
# SECURITY BENEFITS:
|
||||
# - Agent cannot read your .env file (API keys protected)
|
||||
# - Agent cannot modify its own code
|
||||
# - Remote server acts as isolated sandbox
|
||||
# - Can safely configure passwordless sudo on remote
|
||||
#
|
||||
# TERMINAL_SSH_HOST=192.168.1.100
|
||||
# TERMINAL_SSH_USER=agent
|
||||
# TERMINAL_SSH_PORT=22
|
||||
# TERMINAL_SSH_KEY=~/.ssh/id_rsa
|
||||
|
||||
# =============================================================================
|
||||
# SUDO SUPPORT (works with ALL terminal backends)
|
||||
# =============================================================================
|
||||
# If set, enables sudo commands by piping password via `sudo -S`.
|
||||
# Works with: local, docker, singularity, modal, and ssh backends.
|
||||
#
|
||||
# SECURITY WARNING: Password stored in plaintext. Only use on trusted machines.
|
||||
#
|
||||
# ALTERNATIVES:
|
||||
# - For SSH backend: Configure passwordless sudo on the remote server
|
||||
# - For containers: Run as root inside the container (no sudo needed)
|
||||
# - For local: Configure /etc/sudoers for specific commands
|
||||
# - For CLI: Leave unset - you'll be prompted interactively with 45s timeout
|
||||
#
|
||||
# SUDO_PASSWORD=your_password_here
|
||||
|
||||
# =============================================================================
|
||||
# MODAL CLOUD BACKEND (Optional - for TERMINAL_ENV=modal)
|
||||
# =============================================================================
|
||||
# Modal uses CLI authentication, not environment variables.
|
||||
# Run: pip install modal && modal setup
|
||||
# This will authenticate via browser and store credentials locally.
|
||||
# No API key needed in .env - Modal handles auth automatically.
|
||||
|
||||
# =============================================================================
|
||||
# BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
|
||||
# =============================================================================
|
||||
# Browser automation requires Browserbase cloud service for remote browser execution.
|
||||
# This allows the agent to navigate websites, fill forms, and extract information.
|
||||
#
|
||||
# STEALTH MODES:
|
||||
# - Basic Stealth: ALWAYS active (random fingerprints, auto CAPTCHA solving)
|
||||
# - Advanced Stealth: Requires BROWSERBASE_ADVANCED_STEALTH=true (Scale Plan only)
|
||||
|
||||
# Browserbase API Key - Cloud browser execution
|
||||
# Get at: https://browserbase.com/
|
||||
BROWSERBASE_API_KEY=
|
||||
|
||||
# Browserbase Project ID - From your Browserbase dashboard
|
||||
BROWSERBASE_PROJECT_ID=
|
||||
|
||||
# Enable residential proxies for better CAPTCHA solving (default: true)
|
||||
# Routes traffic through residential IPs, significantly improves success rate
|
||||
BROWSERBASE_PROXIES=true
|
||||
|
||||
# Enable advanced stealth mode (default: false, requires Scale Plan)
|
||||
# Uses custom Chromium build to avoid bot detection altogether
|
||||
BROWSERBASE_ADVANCED_STEALTH=false
|
||||
|
||||
# Browser session timeout in seconds (default: 300)
|
||||
# Sessions are cleaned up after this duration of inactivity
|
||||
BROWSER_SESSION_TIMEOUT=300
|
||||
|
||||
# Browser inactivity timeout - auto-cleanup inactive sessions (default: 120 = 2 min)
|
||||
# Browser sessions are automatically closed after this period of no activity
|
||||
BROWSER_INACTIVITY_TIMEOUT=120
|
||||
|
||||
# =============================================================================
|
||||
# SESSION LOGGING
|
||||
# =============================================================================
|
||||
# Session trajectories are automatically saved to logs/ directory
|
||||
# Format: logs/session_YYYYMMDD_HHMMSS_UUID.json
|
||||
# Contains full conversation history in trajectory format for debugging/replay
|
||||
|
||||
# =============================================================================
|
||||
# VOICE TRANSCRIPTION & OPENAI TTS
|
||||
# =============================================================================
|
||||
# Required for voice message transcription (Whisper) and OpenAI TTS voices.
|
||||
# Uses OpenAI's API directly (not via OpenRouter).
|
||||
# Named VOICE_TOOLS_OPENAI_KEY to avoid interference with OpenRouter.
|
||||
# Get at: https://platform.openai.com/api-keys
|
||||
VOICE_TOOLS_OPENAI_KEY=
|
||||
|
||||
# =============================================================================
|
||||
# SLACK INTEGRATION
|
||||
# =============================================================================
|
||||
# Slack Bot Token - From Slack App settings (OAuth & Permissions)
|
||||
# Get at: https://api.slack.com/apps
|
||||
# SLACK_BOT_TOKEN=xoxb-...
|
||||
|
||||
# Slack App Token - For Socket Mode (App-Level Tokens in Slack App settings)
|
||||
# SLACK_APP_TOKEN=xapp-...
|
||||
|
||||
# Slack allowed users (comma-separated Slack user IDs)
|
||||
# SLACK_ALLOWED_USERS=
|
||||
|
||||
# WhatsApp (built-in Baileys bridge — run `hermes whatsapp` to pair)
|
||||
# WHATSAPP_ENABLED=false
|
||||
# WHATSAPP_ALLOWED_USERS=15551234567
|
||||
|
||||
# Email (IMAP/SMTP — send and receive emails as Hermes)
|
||||
# For Gmail: enable 2FA → create App Password at https://myaccount.google.com/apppasswords
|
||||
# EMAIL_ADDRESS=hermes@gmail.com
|
||||
# EMAIL_PASSWORD=xxxx xxxx xxxx xxxx
|
||||
# EMAIL_IMAP_HOST=imap.gmail.com
|
||||
# EMAIL_IMAP_PORT=993
|
||||
# EMAIL_SMTP_HOST=smtp.gmail.com
|
||||
# EMAIL_SMTP_PORT=587
|
||||
# EMAIL_POLL_INTERVAL=15
|
||||
# EMAIL_ALLOWED_USERS=your@email.com
|
||||
# EMAIL_HOME_ADDRESS=your@email.com
|
||||
|
||||
# Gateway-wide: allow ALL users without an allowlist (default: false = deny)
|
||||
# Only set to true if you intentionally want open access.
|
||||
# GATEWAY_ALLOW_ALL_USERS=false
|
||||
|
||||
# =============================================================================
|
||||
# RESPONSE PACING
|
||||
# =============================================================================
|
||||
# Human-like delays between message chunks on messaging platforms.
|
||||
# Makes the bot feel less robotic.
|
||||
# HERMES_HUMAN_DELAY_MODE=off # off | natural | custom
|
||||
# HERMES_HUMAN_DELAY_MIN_MS=800 # Min delay in ms (custom mode)
|
||||
# HERMES_HUMAN_DELAY_MAX_MS=2500 # Max delay in ms (custom mode)
|
||||
|
||||
# =============================================================================
|
||||
# DEBUG OPTIONS
|
||||
# =============================================================================
|
||||
WEB_TOOLS_DEBUG=false
|
||||
VISION_TOOLS_DEBUG=false
|
||||
MOA_TOOLS_DEBUG=false
|
||||
IMAGE_TOOLS_DEBUG=false
|
||||
|
||||
# =============================================================================
|
||||
# CONTEXT COMPRESSION (Auto-shrinks long conversations)
|
||||
# =============================================================================
|
||||
# When conversation approaches model's context limit, middle turns are
|
||||
# automatically summarized to free up space.
|
||||
#
|
||||
# Context compression is configured in ~/.hermes/config.yaml under compression:
|
||||
# CONTEXT_COMPRESSION_ENABLED=true # Enable auto-compression (default: true)
|
||||
# CONTEXT_COMPRESSION_THRESHOLD=0.85 # Compress at 85% of context limit
|
||||
# Model is set via compression.summary_model in config.yaml (default: google/gemini-3-flash-preview)
|
||||
|
||||
# =============================================================================
|
||||
# RL TRAINING (Tinker + Atropos)
|
||||
# =============================================================================
|
||||
# Run reinforcement learning training on language models using the Tinker API.
|
||||
# Requires the rl-server to be running (from tinker-atropos package).
|
||||
|
||||
# Tinker API Key - RL training service
|
||||
# Get at: https://tinker-console.thinkingmachines.ai/keys
|
||||
TINKER_API_KEY=
|
||||
|
||||
# Weights & Biases API Key - Experiment tracking and metrics
|
||||
# Get at: https://wandb.ai/authorize
|
||||
WANDB_API_KEY=
|
||||
|
||||
# RL API Server URL (default: http://localhost:8080)
|
||||
# Change if running the rl-server on a different host/port
|
||||
# RL_API_URL=http://localhost:8080
|
||||
|
||||
# =============================================================================
|
||||
# SKILLS HUB (GitHub integration for skill search/install/publish)
|
||||
# =============================================================================
|
||||
|
||||
# GitHub Personal Access Token — for higher API rate limits on skill search/install
|
||||
# Get at: https://github.com/settings/tokens (Fine-grained recommended)
|
||||
# GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
||||
|
||||
# GitHub App credentials (optional — for bot identity on PRs)
|
||||
# GITHUB_APP_ID=
|
||||
# GITHUB_APP_PRIVATE_KEY_PATH=
|
||||
# GITHUB_APP_INSTALLATION_ID=
|
||||
|
||||
# Groq API key (free tier — used for Whisper STT in voice mode)
|
||||
# GROQ_API_KEY=
|
||||
|
||||
# =============================================================================
|
||||
# STT PROVIDER SELECTION
|
||||
# =============================================================================
|
||||
# Default STT provider is "local" (faster-whisper) — runs on your machine, no API key needed.
|
||||
# Install with: pip install faster-whisper
|
||||
# Model downloads automatically on first use (~150 MB for "base").
|
||||
# To use cloud providers instead, set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY above.
|
||||
# Provider priority: local > groq > openai
|
||||
# Configure in config.yaml: stt.provider: local | groq | openai
|
||||
|
||||
# =============================================================================
|
||||
# STT ADVANCED OVERRIDES (optional)
|
||||
# =============================================================================
|
||||
# Override default STT models per provider (normally set via stt.model in config.yaml)
|
||||
# STT_GROQ_MODEL=whisper-large-v3-turbo
|
||||
# STT_OPENAI_MODEL=whisper-1
|
||||
|
||||
# Override STT provider endpoints (for proxies or self-hosted instances)
|
||||
# GROQ_BASE_URL=https://api.groq.com/openai/v1
|
||||
# STT_OPENAI_BASE_URL=https://api.openai.com/v1
|
||||
|
||||
144
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
144
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
name: "🐛 Bug Report"
|
||||
description: Report a bug — something that's broken, crashes, or behaves incorrectly.
|
||||
title: "[Bug]: "
|
||||
labels: ["bug"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for reporting a bug! Please fill out the sections below so we can reproduce and fix it quickly.
|
||||
|
||||
**Before submitting**, please:
|
||||
- [ ] Search [existing issues](https://github.com/NousResearch/hermes-agent/issues) to avoid duplicates
|
||||
- [ ] Update to the latest version (`hermes update`) and confirm the bug still exists
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Bug Description
|
||||
description: A clear description of what's broken. Include error messages, tracebacks, or screenshots if relevant.
|
||||
placeholder: |
|
||||
What happened? What did you expect to happen instead?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Steps to Reproduce
|
||||
description: Minimal steps to trigger the bug. The more specific, the faster we can fix it.
|
||||
placeholder: |
|
||||
1. Run `hermes chat`
|
||||
2. Send the message "..."
|
||||
3. Agent calls tool X
|
||||
4. Error appears: ...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: expected
|
||||
attributes:
|
||||
label: Expected Behavior
|
||||
description: What should have happened instead?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: actual
|
||||
attributes:
|
||||
label: Actual Behavior
|
||||
description: What actually happened? Include full error output if available.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: component
|
||||
attributes:
|
||||
label: Affected Component
|
||||
description: Which part of Hermes is affected?
|
||||
multiple: true
|
||||
options:
|
||||
- CLI (interactive chat)
|
||||
- Gateway (Telegram/Discord/Slack/WhatsApp)
|
||||
- Setup / Installation
|
||||
- Tools (terminal, file ops, web, code execution, etc.)
|
||||
- Skills (skill loading, skill hub, skill guard)
|
||||
- Agent Core (conversation loop, context compression, memory)
|
||||
- Configuration (config.yaml, .env, hermes setup)
|
||||
- Other
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: platform
|
||||
attributes:
|
||||
label: Messaging Platform (if gateway-related)
|
||||
description: Which platform adapter is affected?
|
||||
multiple: true
|
||||
options:
|
||||
- N/A (CLI only)
|
||||
- Telegram
|
||||
- Discord
|
||||
- Slack
|
||||
- WhatsApp
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: Operating System
|
||||
description: e.g. Ubuntu 24.04, macOS 15.2, Windows 11
|
||||
placeholder: Ubuntu 24.04
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: python-version
|
||||
attributes:
|
||||
label: Python Version
|
||||
description: Output of `python --version`
|
||||
placeholder: "3.11.9"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: hermes-version
|
||||
attributes:
|
||||
label: Hermes Version
|
||||
description: Output of `hermes version`
|
||||
placeholder: "2.1.0"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant Logs / Traceback
|
||||
description: Paste any error output, traceback, or log messages. This will be auto-formatted as code.
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
id: root-cause
|
||||
attributes:
|
||||
label: Root Cause Analysis (optional)
|
||||
description: |
|
||||
If you've dug into the code and identified the root cause, share it here.
|
||||
Include file paths, line numbers, and code snippets if possible. This massively speeds up fixes.
|
||||
placeholder: |
|
||||
The bug is in `gateway/run.py` line 949. `len(history)` counts session_meta entries
|
||||
but `agent_messages` was built from filtered history...
|
||||
|
||||
- type: textarea
|
||||
id: proposed-fix
|
||||
attributes:
|
||||
label: Proposed Fix (optional)
|
||||
description: If you have a fix in mind (or a PR ready), describe it here.
|
||||
placeholder: |
|
||||
Replace `.get()` with `.pop()` on line 289 of `gateway/platforms/base.py`
|
||||
to actually clear the pending message after retrieval.
|
||||
|
||||
- type: checkboxes
|
||||
id: pr-ready
|
||||
attributes:
|
||||
label: Are you willing to submit a PR for this?
|
||||
options:
|
||||
- label: I'd like to fix this myself and submit a PR
|
||||
11
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
11
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: 💬 Nous Research Discord
|
||||
url: https://discord.gg/NousResearch
|
||||
about: For quick questions, showcasing projects, sharing skills, and community chat.
|
||||
- name: 📖 Documentation
|
||||
url: https://github.com/NousResearch/hermes-agent/blob/main/README.md
|
||||
about: Check the README and docs before opening an issue.
|
||||
- name: 🤝 Contributing Guide
|
||||
url: https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md
|
||||
about: Read this before submitting a PR.
|
||||
73
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
73
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
@@ -0,0 +1,73 @@
|
||||
name: "✨ Feature Request"
|
||||
description: Suggest a new feature or improvement.
|
||||
title: "[Feature]: "
|
||||
labels: ["enhancement"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for the suggestion! Before submitting, please consider:
|
||||
|
||||
- **Is this a new skill?** Most capabilities should be [skills, not tools](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#should-it-be-a-skill-or-a-tool). If it's a specialized integration (crypto, NFT, niche SaaS), it belongs on the Skills Hub, not bundled.
|
||||
- **Search [existing issues](https://github.com/NousResearch/hermes-agent/issues)** — someone may have already proposed this.
|
||||
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Problem or Use Case
|
||||
description: What problem does this solve? What are you trying to do that you can't today?
|
||||
placeholder: |
|
||||
I'm trying to use Hermes with [provider/platform/workflow] but currently
|
||||
there's no way to...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: solution
|
||||
attributes:
|
||||
label: Proposed Solution
|
||||
description: How do you think this should work? Be as specific as you can — CLI flags, config options, UI behavior.
|
||||
placeholder: |
|
||||
Add a `--foo` flag to `hermes chat` that enables...
|
||||
Or: Add a config key `bar.baz` that controls...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: alternatives
|
||||
attributes:
|
||||
label: Alternatives Considered
|
||||
description: What other approaches did you consider? Why is the proposed solution better?
|
||||
|
||||
- type: dropdown
|
||||
id: type
|
||||
attributes:
|
||||
label: Feature Type
|
||||
options:
|
||||
- New tool
|
||||
- New bundled skill
|
||||
- CLI improvement
|
||||
- Gateway / messaging improvement
|
||||
- Configuration option
|
||||
- Performance / reliability
|
||||
- Developer experience (tests, docs, CI)
|
||||
- Other
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: scope
|
||||
attributes:
|
||||
label: Scope
|
||||
description: How big is this change?
|
||||
options:
|
||||
- Small (single file, < 50 lines)
|
||||
- Medium (few files, < 300 lines)
|
||||
- Large (new module or significant refactor)
|
||||
|
||||
- type: checkboxes
|
||||
id: pr-ready
|
||||
attributes:
|
||||
label: Contribution
|
||||
options:
|
||||
- label: I'd like to implement this myself and submit a PR
|
||||
100
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
Normal file
100
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
Normal file
@@ -0,0 +1,100 @@
|
||||
name: "🔧 Setup / Installation Help"
|
||||
description: Having trouble installing or configuring Hermes? Ask here.
|
||||
title: "[Setup]: "
|
||||
labels: ["setup"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Sorry you're having trouble! Please fill out the details below so we can help.
|
||||
|
||||
**Quick checks first:**
|
||||
- Run `hermes doctor` and include the output below
|
||||
- Try `hermes update` to get the latest version
|
||||
- Check the [README troubleshooting section](https://github.com/NousResearch/hermes-agent#troubleshooting)
|
||||
- For general questions, consider the [Nous Research Discord](https://discord.gg/NousResearch) for faster help
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: What's Going Wrong?
|
||||
description: Describe what you're trying to do and where it fails.
|
||||
placeholder: |
|
||||
I ran `hermes setup` and selected Nous Portal, but when I try to
|
||||
start the gateway I get...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: steps
|
||||
attributes:
|
||||
label: Steps Taken
|
||||
description: What did you do? Include the exact commands you ran.
|
||||
placeholder: |
|
||||
1. Ran the install script: `curl -fsSL ... | bash`
|
||||
2. Ran `hermes setup` and chose "Quick setup"
|
||||
3. Selected OpenRouter, entered API key
|
||||
4. Ran `hermes chat` and got error...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: install-method
|
||||
attributes:
|
||||
label: Installation Method
|
||||
options:
|
||||
- Install script (curl | bash)
|
||||
- Manual clone + pip/uv install
|
||||
- PowerShell installer (Windows)
|
||||
- Docker
|
||||
- Other
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: Operating System
|
||||
placeholder: Ubuntu 24.04 / macOS 15.2 / Windows 11
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: python-version
|
||||
attributes:
|
||||
label: Python Version
|
||||
description: Output of `python --version` (or `python3 --version`)
|
||||
placeholder: "3.11.9"
|
||||
|
||||
- type: input
|
||||
id: hermes-version
|
||||
attributes:
|
||||
label: Hermes Version
|
||||
description: Output of `hermes version` (if install got that far)
|
||||
placeholder: "2.1.0"
|
||||
|
||||
- type: textarea
|
||||
id: doctor-output
|
||||
attributes:
|
||||
label: Output of `hermes doctor`
|
||||
description: Run `hermes doctor` and paste the full output. This will be auto-formatted.
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
id: error-output
|
||||
attributes:
|
||||
label: Full Error Output
|
||||
description: Paste the complete error message or traceback. This will be auto-formatted.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: tried
|
||||
attributes:
|
||||
label: What I've Already Tried
|
||||
description: List any fixes or workarounds you've already attempted.
|
||||
placeholder: |
|
||||
- Ran `hermes update`
|
||||
- Tried reinstalling with `pip install -e ".[all]"`
|
||||
- Checked that OPENROUTER_API_KEY is set in ~/.hermes/.env
|
||||
75
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
75
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
## What does this PR do?
|
||||
|
||||
<!-- Describe the change clearly. What problem does it solve? Why is this approach the right one? -->
|
||||
|
||||
|
||||
|
||||
## Related Issue
|
||||
|
||||
<!-- Link the issue this PR addresses. If no issue exists, consider creating one first. -->
|
||||
|
||||
Fixes #
|
||||
|
||||
## Type of Change
|
||||
|
||||
<!-- Check the one that applies. -->
|
||||
|
||||
- [ ] 🐛 Bug fix (non-breaking change that fixes an issue)
|
||||
- [ ] ✨ New feature (non-breaking change that adds functionality)
|
||||
- [ ] 🔒 Security fix
|
||||
- [ ] 📝 Documentation update
|
||||
- [ ] ✅ Tests (adding or improving test coverage)
|
||||
- [ ] ♻️ Refactor (no behavior change)
|
||||
- [ ] 🎯 New skill (bundled or hub)
|
||||
|
||||
## Changes Made
|
||||
|
||||
<!-- List the specific changes. Include file paths for code changes. -->
|
||||
|
||||
-
|
||||
|
||||
## How to Test
|
||||
|
||||
<!-- Steps to verify this change works. For bugs: reproduction steps + proof that the fix works. -->
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
## Checklist
|
||||
|
||||
<!-- Complete these before requesting review. -->
|
||||
|
||||
### Code
|
||||
|
||||
- [ ] I've read the [Contributing Guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md)
|
||||
- [ ] My commit messages follow [Conventional Commits](https://www.conventionalcommits.org/) (`fix(scope):`, `feat(scope):`, etc.)
|
||||
- [ ] I searched for [existing PRs](https://github.com/NousResearch/hermes-agent/pulls) to make sure this isn't a duplicate
|
||||
- [ ] My PR contains **only** changes related to this fix/feature (no unrelated commits)
|
||||
- [ ] I've run `pytest tests/ -q` and all tests pass
|
||||
- [ ] I've added tests for my changes (required for bug fixes, strongly encouraged for features)
|
||||
- [ ] I've tested on my platform: <!-- e.g. Ubuntu 24.04, macOS 15.2, Windows 11 -->
|
||||
|
||||
### Documentation & Housekeeping
|
||||
|
||||
<!-- Check all that apply. It's OK to check "N/A" if a category doesn't apply to your change. -->
|
||||
|
||||
- [ ] I've updated relevant documentation (README, `docs/`, docstrings) — or N/A
|
||||
- [ ] I've updated `cli-config.yaml.example` if I added/changed config keys — or N/A
|
||||
- [ ] I've updated `CONTRIBUTING.md` or `AGENTS.md` if I changed architecture or workflows — or N/A
|
||||
- [ ] I've considered cross-platform impact (Windows, macOS) per the [compatibility guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#cross-platform-compatibility) — or N/A
|
||||
- [ ] I've updated tool descriptions/schemas if I changed tool behavior — or N/A
|
||||
|
||||
## For New Skills
|
||||
|
||||
<!-- Only fill this out if you're adding a skill. Delete this section otherwise. -->
|
||||
|
||||
- [ ] This skill is **broadly useful** to most users (if bundled) — see [Contributing Guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#should-the-skill-be-bundled)
|
||||
- [ ] SKILL.md follows the [standard format](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#skillmd-format) (frontmatter, trigger conditions, steps, pitfalls)
|
||||
- [ ] No external dependencies that aren't already available (prefer stdlib, curl, existing Hermes tools)
|
||||
- [ ] I've tested the skill end-to-end: `hermes --toolsets skills -q "Use the X skill to do Y"`
|
||||
|
||||
## Screenshots / Logs
|
||||
|
||||
<!-- If applicable, add screenshots or log output showing the fix/feature in action. -->
|
||||
|
||||
60
.github/workflows/deploy-site.yml
vendored
Normal file
60
.github/workflows/deploy-site.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
name: Deploy Site
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'website/**'
|
||||
- 'landingpage/**'
|
||||
- '.github/workflows/deploy-site.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: pages
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deploy.outputs.page_url }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: website/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
working-directory: website
|
||||
|
||||
- name: Build Docusaurus
|
||||
run: npm run build
|
||||
working-directory: website
|
||||
|
||||
- name: Stage deployment
|
||||
run: |
|
||||
mkdir -p _site/docs
|
||||
# Landing page at root
|
||||
cp -r landingpage/* _site/
|
||||
# Docusaurus at /docs/
|
||||
cp -r website/build/* _site/docs/
|
||||
# CNAME so GitHub Pages keeps the custom domain between deploys
|
||||
echo "hermes-agent.nousresearch.com" > _site/CNAME
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
path: _site
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deploy
|
||||
uses: actions/deploy-pages@v4
|
||||
39
.github/workflows/docs-site-checks.yml
vendored
Normal file
39
.github/workflows/docs-site-checks.yml
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
name: Docs Site Checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'website/**'
|
||||
- '.github/workflows/docs-site-checks.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
docs-site-checks:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: website/package-lock.json
|
||||
|
||||
- name: Install website dependencies
|
||||
run: npm ci
|
||||
working-directory: website
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install ascii-guard
|
||||
run: python -m pip install ascii-guard
|
||||
|
||||
- name: Lint docs diagrams
|
||||
run: npm run lint:diagrams
|
||||
working-directory: website
|
||||
|
||||
- name: Build Docusaurus
|
||||
run: npm run build
|
||||
working-directory: website
|
||||
42
.github/workflows/tests.yml
vendored
Normal file
42
.github/workflows/tests.yml
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
name: Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
# Cancel in-progress runs for the same PR/branch
|
||||
concurrency:
|
||||
group: tests-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
|
||||
- name: Set up Python 3.11
|
||||
run: uv python install 3.11
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv venv .venv --python 3.11
|
||||
source .venv/bin/activate
|
||||
uv pip install -e ".[all,dev]"
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pytest tests/ -q --ignore=tests/integration --tb=short -n auto
|
||||
env:
|
||||
# Ensure tests don't accidentally call real APIs
|
||||
OPENROUTER_API_KEY: ""
|
||||
OPENAI_API_KEY: ""
|
||||
NOUS_API_KEY: ""
|
||||
78
.gitignore
vendored
78
.gitignore
vendored
@@ -1,23 +1,55 @@
|
||||
/venv/
|
||||
/_pycache/
|
||||
hecate/
|
||||
hecate-lib/
|
||||
*.pyc*
|
||||
__pycache__/
|
||||
.venv/
|
||||
.vscode/
|
||||
.env
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.development
|
||||
.env.test
|
||||
export*
|
||||
__pycache__/model_tools.cpython-310.pyc
|
||||
__pycache__/web_tools.cpython-310.pyc
|
||||
logs/
|
||||
data/
|
||||
.pytest_cache/
|
||||
tmp/
|
||||
temp_vision_images/
|
||||
/venv/
|
||||
/_pycache/
|
||||
*.pyc*
|
||||
__pycache__/
|
||||
.venv/
|
||||
.vscode/
|
||||
.env
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.development
|
||||
.env.test
|
||||
export*
|
||||
__pycache__/model_tools.cpython-310.pyc
|
||||
__pycache__/web_tools.cpython-310.pyc
|
||||
logs/
|
||||
data/
|
||||
.pytest_cache/
|
||||
tmp/
|
||||
temp_vision_images/
|
||||
hermes-*/*
|
||||
examples/
|
||||
tests/quick_test_dataset.jsonl
|
||||
tests/sample_dataset.jsonl
|
||||
run_datagen_kimik2-thinking.sh
|
||||
run_datagen_megascience_glm4-6.sh
|
||||
run_datagen_sonnet.sh
|
||||
source-data/*
|
||||
run_datagen_megascience_glm4-6.sh
|
||||
data/*
|
||||
node_modules/
|
||||
browser-use/
|
||||
agent-browser/
|
||||
# Private keys
|
||||
*.ppk
|
||||
*.pem
|
||||
privvy*
|
||||
images/
|
||||
__pycache__/
|
||||
hermes_agent.egg-info/
|
||||
wandb/
|
||||
testlogs
|
||||
|
||||
# CLI config (may contain sensitive SSH paths)
|
||||
cli-config.yaml
|
||||
|
||||
# Skills Hub state (lives in ~/.hermes/skills/.hub/ at runtime, but just in case)
|
||||
skills/.hub/
|
||||
ignored/
|
||||
.worktrees/
|
||||
environments/benchmarks/evals/
|
||||
|
||||
# Release script temp files
|
||||
.release_notes.md
|
||||
|
||||
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[submodule "mini-swe-agent"]
|
||||
path = mini-swe-agent
|
||||
url = https://github.com/SWE-agent/mini-swe-agent
|
||||
[submodule "tinker-atropos"]
|
||||
path = tinker-atropos
|
||||
url = https://github.com/nousresearch/tinker-atropos
|
||||
291
.plans/openai-api-server.md
Normal file
291
.plans/openai-api-server.md
Normal file
@@ -0,0 +1,291 @@
|
||||
# OpenAI-Compatible API Server for Hermes Agent
|
||||
|
||||
## Motivation
|
||||
|
||||
Every major chat frontend (Open WebUI 126k★, LobeChat 73k★, LibreChat 34k★,
|
||||
AnythingLLM 56k★, NextChat 87k★, ChatBox 39k★, Jan 26k★, HF Chat-UI 8k★,
|
||||
big-AGI 7k★) connects to backends via the OpenAI-compatible REST API with
|
||||
SSE streaming. By exposing this endpoint, hermes-agent becomes instantly
|
||||
usable as a backend for all of them — no custom adapters needed.
|
||||
|
||||
## What It Enables
|
||||
|
||||
```
|
||||
┌──────────────────┐
|
||||
│ Open WebUI │──┐
|
||||
│ LobeChat │ │ POST /v1/chat/completions
|
||||
│ LibreChat │ ├──► Authorization: Bearer <key> ┌─────────────────┐
|
||||
│ AnythingLLM │ │ {"messages": [...]} │ hermes-agent │
|
||||
│ NextChat │ │ │ gateway │
|
||||
│ Any OAI client │──┘ ◄── SSE streaming response │ (API server) │
|
||||
└──────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
A user would:
|
||||
1. Set `API_SERVER_ENABLED=true` in `~/.hermes/.env`
|
||||
2. Run `hermes gateway` (API server starts alongside Telegram/Discord/etc.)
|
||||
3. Point Open WebUI (or any frontend) at `http://localhost:8642/v1`
|
||||
4. Chat with hermes-agent through any OpenAI-compatible UI
|
||||
|
||||
## Endpoints
|
||||
|
||||
| Method | Path | Purpose |
|
||||
|--------|------|---------|
|
||||
| POST | `/v1/chat/completions` | Chat with the agent (streaming + non-streaming) |
|
||||
| GET | `/v1/models` | List available "models" (returns hermes-agent as a model) |
|
||||
| GET | `/health` | Health check |
|
||||
|
||||
## Architecture
|
||||
|
||||
### Option A: Gateway Platform Adapter (recommended)
|
||||
|
||||
Create `gateway/platforms/api_server.py` as a new platform adapter that
|
||||
extends `BasePlatformAdapter`. This is the cleanest approach because:
|
||||
|
||||
- Reuses all gateway infrastructure (session management, auth, context building)
|
||||
- Runs in the same async loop as other adapters
|
||||
- Gets message handling, interrupt support, and session persistence for free
|
||||
- Follows the established pattern (like Telegram, Discord, etc.)
|
||||
- Uses `aiohttp.web` (already a dependency) for the HTTP server
|
||||
|
||||
The adapter would start an `aiohttp.web.Application` server in `connect()`
|
||||
and route incoming HTTP requests through the standard `handle_message()` pipeline.
|
||||
|
||||
### Option B: Standalone Component
|
||||
|
||||
A separate HTTP server class in `gateway/api_server.py` that creates its own
|
||||
AIAgent instances directly. Simpler but duplicates session/auth logic.
|
||||
|
||||
**Recommendation: Option A** — fits the existing architecture, less code to
|
||||
maintain, gets all gateway features for free.
|
||||
|
||||
## Request/Response Format
|
||||
|
||||
### Chat Completions (non-streaming)
|
||||
|
||||
```
|
||||
POST /v1/chat/completions
|
||||
Authorization: Bearer hermes-api-key-here
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "hermes-agent",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "What files are in the current directory?"}
|
||||
],
|
||||
"stream": false,
|
||||
"temperature": 0.7
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"id": "chatcmpl-abc123",
|
||||
"object": "chat.completion",
|
||||
"created": 1710000000,
|
||||
"model": "hermes-agent",
|
||||
"choices": [{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "Here are the files in the current directory:\n..."
|
||||
},
|
||||
"finish_reason": "stop"
|
||||
}],
|
||||
"usage": {
|
||||
"prompt_tokens": 50,
|
||||
"completion_tokens": 200,
|
||||
"total_tokens": 250
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Chat Completions (streaming)
|
||||
|
||||
Same request with `"stream": true`. Response is SSE:
|
||||
|
||||
```
|
||||
data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Here "},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"are "},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}
|
||||
|
||||
data: [DONE]
|
||||
```
|
||||
|
||||
### Models List
|
||||
|
||||
```
|
||||
GET /v1/models
|
||||
Authorization: Bearer hermes-api-key-here
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"object": "list",
|
||||
"data": [{
|
||||
"id": "hermes-agent",
|
||||
"object": "model",
|
||||
"created": 1710000000,
|
||||
"owned_by": "hermes-agent"
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
## Key Design Decisions
|
||||
|
||||
### 1. Session Management
|
||||
|
||||
The OpenAI API is stateless — each request includes the full conversation.
|
||||
But hermes-agent sessions have persistent state (memory, skills, tool context).
|
||||
|
||||
**Approach: Hybrid**
|
||||
- Default: Stateless. Each request is independent. The `messages` array IS
|
||||
the conversation. No session persistence between requests.
|
||||
- Opt-in persistent sessions via `X-Session-ID` header. When provided, the
|
||||
server maintains session state across requests (conversation history,
|
||||
memory context, tool state). This enables richer agent behavior.
|
||||
- The session ID also enables interrupt support — a subsequent request with
|
||||
the same session ID while one is running triggers an interrupt.
|
||||
|
||||
### 2. Streaming
|
||||
|
||||
The agent's `run_conversation()` is synchronous and returns the full response.
|
||||
For real SSE streaming, we need to emit chunks as they're generated.
|
||||
|
||||
**Phase 1 (MVP):** Run agent in a thread, return the complete response as
|
||||
a single SSE chunk + `[DONE]`. This works with all frontends — they just see
|
||||
the complete response arrive at once, in a single chunk. Not true streaming but functional.
|
||||
|
||||
**Phase 2:** Add a response callback to AIAgent that emits text chunks as the
|
||||
LLM generates them. The API server captures these via a queue and streams them
|
||||
as SSE events. This gives real token-by-token streaming.
|
||||
|
||||
**Phase 3:** Stream tool execution progress too — emit tool call/result events
|
||||
as the agent works, giving frontends visibility into what the agent is doing.
|
||||
|
||||
### 3. Tool Transparency
|
||||
|
||||
Two modes:
|
||||
- **Opaque (default):** Frontends see only the final response. Tool calls
|
||||
happen server-side and are invisible. Best for general-purpose UIs.
|
||||
- **Transparent (opt-in via header):** Tool calls are emitted as OpenAI-format
|
||||
tool_call/tool_result messages in the stream. Useful for agent-aware frontends.
|
||||
|
||||
### 4. Authentication
|
||||
|
||||
- Bearer token via `Authorization: Bearer <key>` header
|
||||
- Token configured via `API_SERVER_KEY` env var
|
||||
- Optional: allow unauthenticated local-only access (127.0.0.1 bind)
|
||||
- Follows the same pattern as other platform adapters
|
||||
|
||||
### 5. Model Mapping
|
||||
|
||||
Frontends send `"model": "hermes-agent"` (or any other model name). The actual LLM model
|
||||
used is configured server-side in config.yaml. The API server maps any
|
||||
requested model name to the configured hermes-agent model.
|
||||
|
||||
Optionally, allow model passthrough: if the frontend sends
|
||||
`"model": "anthropic/claude-sonnet-4"`, the agent uses that model. Controlled
|
||||
by a config flag.
|
||||
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
# In config.yaml
|
||||
api_server:
|
||||
enabled: true
|
||||
port: 8642
|
||||
host: "127.0.0.1" # localhost only by default
|
||||
key: "your-secret-key" # or via API_SERVER_KEY env var
|
||||
allow_model_override: false # let clients choose the model
|
||||
max_concurrent: 5 # max simultaneous requests
|
||||
```
|
||||
|
||||
Environment variables:
|
||||
```bash
|
||||
API_SERVER_ENABLED=true
|
||||
API_SERVER_PORT=8642
|
||||
API_SERVER_HOST=127.0.0.1
|
||||
API_SERVER_KEY=your-secret-key
|
||||
```
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: MVP (non-streaming) — PR
|
||||
|
||||
1. `gateway/platforms/api_server.py` — new adapter
|
||||
- aiohttp.web server with endpoints:
|
||||
- `POST /v1/chat/completions` — Chat Completions API (universal compat)
|
||||
- `POST /v1/responses` — Responses API (server-side state, tool preservation)
|
||||
- `GET /v1/models` — list available models
|
||||
- `GET /health` — health check
|
||||
- Bearer token auth middleware
|
||||
- Non-streaming responses (run agent, return full result)
|
||||
- Chat Completions: stateless, messages array is the conversation
|
||||
- Responses API: server-side conversation storage via previous_response_id
|
||||
- Store full internal conversation (including tool calls) keyed by response ID
|
||||
- On subsequent requests, reconstruct full context from stored chain
|
||||
- Frontend system prompt layered on top of hermes-agent's core prompt
|
||||
|
||||
2. `gateway/config.py` — add `Platform.API_SERVER` enum + config
|
||||
|
||||
3. `gateway/run.py` — register adapter in `_create_adapter()`
|
||||
|
||||
4. Tests in `tests/gateway/test_api_server.py`
|
||||
|
||||
### Phase 2: SSE Streaming
|
||||
|
||||
1. Add response streaming to both endpoints
|
||||
- Chat Completions: `choices[0].delta.content` SSE format
|
||||
- Responses API: semantic events (response.output_text.delta, etc.)
|
||||
- Run agent in thread, collect output via callback queue
|
||||
- Handle client disconnect (cancel agent)
|
||||
|
||||
2. Add `stream_callback` parameter to `AIAgent.run_conversation()`
|
||||
|
||||
### Phase 3: Enhanced Features
|
||||
|
||||
1. Tool call transparency mode (opt-in)
|
||||
2. Model passthrough/override
|
||||
3. Concurrent request limiting
|
||||
4. Usage tracking / rate limiting
|
||||
5. CORS headers for browser-based frontends
|
||||
6. GET /v1/responses/{id} — retrieve stored response
|
||||
7. DELETE /v1/responses/{id} — delete stored response
|
||||
|
||||
## Files Changed
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `gateway/platforms/api_server.py` | NEW — main adapter (~300 lines) |
|
||||
| `gateway/config.py` | Add Platform.API_SERVER + config (~20 lines) |
|
||||
| `gateway/run.py` | Register adapter in _create_adapter() (~10 lines) |
|
||||
| `tests/gateway/test_api_server.py` | NEW — tests (~200 lines) |
|
||||
| `cli-config.yaml.example` | Add api_server section |
|
||||
| `README.md` | Mention API server in platform list |
|
||||
|
||||
## Compatibility Matrix
|
||||
|
||||
Once implemented, hermes-agent works as a drop-in backend for:
|
||||
|
||||
| Frontend | Stars | How to Connect |
|
||||
|----------|-------|---------------|
|
||||
| Open WebUI | 126k | Settings → Connections → Add OpenAI API, URL: `http://localhost:8642/v1` |
|
||||
| NextChat | 87k | BASE_URL env var |
|
||||
| LobeChat | 73k | Custom provider endpoint |
|
||||
| AnythingLLM | 56k | LLM Provider → Generic OpenAI |
|
||||
| Oobabooga | 42k | N/A — it is itself a backend, not a frontend (listed for completeness) |
|
||||
| ChatBox | 39k | API Host setting |
|
||||
| LibreChat | 34k | librechat.yaml custom endpoint |
|
||||
| Chatbot UI | 29k | Custom API endpoint |
|
||||
| Jan | 26k | Remote model config |
|
||||
| AionUI | 18k | Custom API endpoint |
|
||||
| HF Chat-UI | 8k | OPENAI_BASE_URL env var |
|
||||
| big-AGI | 7k | Custom endpoint |
|
||||
705
.plans/streaming-support.md
Normal file
705
.plans/streaming-support.md
Normal file
@@ -0,0 +1,705 @@
|
||||
# Streaming LLM Response Support for Hermes Agent
|
||||
|
||||
## Overview
|
||||
|
||||
Add token-by-token streaming of LLM responses across all platforms. When enabled,
|
||||
users see the response typing out live instead of waiting for the full generation.
|
||||
Streaming is opt-in via config, defaults to off, and all existing non-streaming
|
||||
code paths remain intact as the default.
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Feature-flagged**: `streaming.enabled: true` in config.yaml. Off by default.
|
||||
When off, all existing code paths are unchanged — zero risk to current behavior.
|
||||
2. **Callback-based**: A simple `stream_callback(text_delta: str)` function injected
|
||||
into AIAgent. The agent doesn't know or care what the consumer does with tokens.
|
||||
3. **Graceful degradation**: If the provider doesn't support streaming, or streaming
|
||||
fails for any reason, silently fall back to the non-streaming path.
|
||||
4. **Platform-agnostic core**: The streaming mechanism in AIAgent works the same
|
||||
regardless of whether the consumer is CLI, Telegram, Discord, or the API server.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
stream_callback(delta)
|
||||
│
|
||||
┌─────────────┐ ┌─────────────▼──────────────┐
|
||||
│ LLM API │ │ queue.Queue() │
|
||||
│ (stream) │───►│ thread-safe bridge between │
|
||||
│ │ │ agent thread & consumer │
|
||||
└─────────────┘ └─────────────┬──────────────┘
|
||||
│
|
||||
┌──────────────┼──────────────┐
|
||||
│ │ │
|
||||
┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐
|
||||
│ CLI │ │ Gateway │ │ API Server│
|
||||
│ print to │ │ edit msg │ │ SSE event │
|
||||
│ terminal │ │ on Tg/Dc │ │ to client │
|
||||
└───────────┘ └───────────┘ └───────────┘
|
||||
```
|
||||
|
||||
The agent runs in a thread. The callback puts tokens into a thread-safe queue.
|
||||
Each consumer reads the queue in its own context (async task, main thread, etc.).
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### config.yaml
|
||||
|
||||
```yaml
|
||||
streaming:
|
||||
enabled: false # Master switch. Default off.
|
||||
# Per-platform overrides (optional):
|
||||
# cli: true # Override for CLI only
|
||||
# telegram: true # Override for Telegram only
|
||||
# discord: false # Keep Discord non-streaming
|
||||
# api_server: true # Override for API server
|
||||
```
|
||||
|
||||
### Environment variables
|
||||
|
||||
```
|
||||
HERMES_STREAMING_ENABLED=true # Master switch via env
|
||||
```
|
||||
|
||||
### How the flag is read
|
||||
|
||||
- **CLI**: `load_cli_config()` reads `streaming.enabled`, sets env var. AIAgent
|
||||
checks at init time.
|
||||
- **Gateway**: `_run_agent()` reads config, decides whether to pass
|
||||
`stream_callback` to the AIAgent constructor.
|
||||
- **API server**: For Chat Completions `stream=true` requests, always uses streaming
|
||||
regardless of config (the client is explicitly requesting it). For non-stream
|
||||
requests, uses config.
|
||||
|
||||
### Precedence
|
||||
|
||||
1. API server: client's `stream` field overrides everything
|
||||
2. Per-platform config override (e.g., `streaming.telegram: true`)
|
||||
3. Master `streaming.enabled` flag
|
||||
4. Default: off
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Core streaming infrastructure in AIAgent
|
||||
|
||||
**File: run_agent.py**
|
||||
|
||||
#### 1a. Add stream_callback parameter to __init__ (~5 lines)
|
||||
|
||||
```python
|
||||
def __init__(self, ..., stream_callback: callable = None, ...):
|
||||
self.stream_callback = stream_callback
|
||||
```
|
||||
|
||||
No other init changes. The callback is optional — when None, everything
|
||||
works exactly as before.
|
||||
|
||||
#### 1b. Add _run_streaming_chat_completion() method (~65 lines)
|
||||
|
||||
New method for Chat Completions API streaming:
|
||||
|
||||
```python
|
||||
def _run_streaming_chat_completion(self, api_kwargs: dict):
|
||||
"""Stream a chat completion, emitting text tokens via stream_callback.
|
||||
|
||||
Returns a fake response object compatible with the non-streaming code path.
|
||||
Falls back to non-streaming on any error.
|
||||
"""
|
||||
stream_kwargs = dict(api_kwargs)
|
||||
stream_kwargs["stream"] = True
|
||||
stream_kwargs["stream_options"] = {"include_usage": True}
|
||||
|
||||
accumulated_content = []
|
||||
accumulated_tool_calls = {} # index -> {id, name, arguments}
|
||||
final_usage = None
|
||||
|
||||
try:
|
||||
stream = self.client.chat.completions.create(**stream_kwargs)
|
||||
|
||||
for chunk in stream:
|
||||
if not chunk.choices:
|
||||
# Usage-only chunk (final)
|
||||
if chunk.usage:
|
||||
final_usage = chunk.usage
|
||||
continue
|
||||
|
||||
delta = chunk.choices[0].delta
|
||||
|
||||
# Text content — emit via callback
|
||||
if delta.content:
|
||||
accumulated_content.append(delta.content)
|
||||
if self.stream_callback:
|
||||
try:
|
||||
self.stream_callback(delta.content)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Tool call deltas — accumulate silently
|
||||
if delta.tool_calls:
|
||||
for tc_delta in delta.tool_calls:
|
||||
idx = tc_delta.index
|
||||
if idx not in accumulated_tool_calls:
|
||||
accumulated_tool_calls[idx] = {
|
||||
"id": tc_delta.id or "",
|
||||
"name": "", "arguments": ""
|
||||
}
|
||||
if tc_delta.function:
|
||||
if tc_delta.function.name:
|
||||
accumulated_tool_calls[idx]["name"] = tc_delta.function.name
|
||||
if tc_delta.function.arguments:
|
||||
accumulated_tool_calls[idx]["arguments"] += tc_delta.function.arguments
|
||||
|
||||
# Build fake response compatible with existing code
|
||||
tool_calls = []
|
||||
for idx in sorted(accumulated_tool_calls):
|
||||
tc = accumulated_tool_calls[idx]
|
||||
if tc["name"]:
|
||||
tool_calls.append(SimpleNamespace(
|
||||
id=tc["id"], type="function",
|
||||
function=SimpleNamespace(name=tc["name"], arguments=tc["arguments"]),
|
||||
))
|
||||
|
||||
return SimpleNamespace(
|
||||
choices=[SimpleNamespace(
|
||||
message=SimpleNamespace(
|
||||
content="".join(accumulated_content) or "",
|
||||
tool_calls=tool_calls or None,
|
||||
role="assistant",
|
||||
),
|
||||
finish_reason="tool_calls" if tool_calls else "stop",
|
||||
)],
|
||||
usage=final_usage,
|
||||
model=self.model,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.debug("Streaming failed, falling back to non-streaming: %s", e)
|
||||
return self.client.chat.completions.create(**api_kwargs)
|
||||
```
|
||||
|
||||
#### 1c. Modify _run_codex_stream() for Responses API (~10 lines)
|
||||
|
||||
The method already iterates the stream. Add callback emission:
|
||||
|
||||
```python
|
||||
def _run_codex_stream(self, api_kwargs: dict):
|
||||
with self.client.responses.stream(**api_kwargs) as stream:
|
||||
for event in stream:
|
||||
# Emit text deltas if streaming callback is set
|
||||
if self.stream_callback and hasattr(event, 'type'):
|
||||
if event.type == 'response.output_text.delta':
|
||||
try:
|
||||
self.stream_callback(event.delta)
|
||||
except Exception:
|
||||
pass
|
||||
return stream.get_final_response()
|
||||
```
|
||||
|
||||
#### 1d. Modify _interruptible_api_call() (~5 lines)
|
||||
|
||||
Add the streaming branch:
|
||||
|
||||
```python
|
||||
def _call():
|
||||
try:
|
||||
if self.api_mode == "codex_responses":
|
||||
result["response"] = self._run_codex_stream(api_kwargs)
|
||||
elif self.stream_callback is not None:
|
||||
result["response"] = self._run_streaming_chat_completion(api_kwargs)
|
||||
else:
|
||||
result["response"] = self.client.chat.completions.create(**api_kwargs)
|
||||
except Exception as e:
|
||||
result["error"] = e
|
||||
```
|
||||
|
||||
#### 1e. Signal end-of-stream to consumers (~5 lines)
|
||||
|
||||
After the API call returns, signal the callback that streaming is done
|
||||
so consumers can finalize (remove cursor, close SSE, etc.):
|
||||
|
||||
```python
|
||||
# In run_conversation(), after _interruptible_api_call returns:
|
||||
if self.stream_callback:
|
||||
try:
|
||||
self.stream_callback(None) # None = end of stream signal
|
||||
except Exception:
|
||||
pass
|
||||
```
|
||||
|
||||
Consumers check: `if delta is None: finalize()`
|
||||
|
||||
**Tests for Phase 1:** (~150 lines)
|
||||
- Test _run_streaming_chat_completion with mocked stream
|
||||
- Test fallback to non-streaming on error
|
||||
- Test tool_call accumulation during streaming
|
||||
- Test stream_callback receives correct deltas
|
||||
- Test None signal at end of stream
|
||||
- Test streaming disabled when callback is None
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Gateway consumers (Telegram, Discord, etc.)
|
||||
|
||||
**File: gateway/run.py**
|
||||
|
||||
#### 2a. Read streaming config (~15 lines)
|
||||
|
||||
In `_run_agent()`, before creating the AIAgent:
|
||||
|
||||
```python
|
||||
# Read streaming config
|
||||
_streaming_enabled = False
|
||||
try:
|
||||
# Check per-platform override first
|
||||
platform_key = source.platform.value if source.platform else ""
|
||||
_stream_cfg = {} # loaded from config.yaml streaming section
|
||||
if _stream_cfg.get(platform_key) is not None:
|
||||
_streaming_enabled = bool(_stream_cfg[platform_key])
|
||||
else:
|
||||
_streaming_enabled = bool(_stream_cfg.get("enabled", False))
|
||||
except Exception:
|
||||
pass
|
||||
# Env var override
|
||||
if os.getenv("HERMES_STREAMING_ENABLED", "").lower() in ("true", "1", "yes"):
|
||||
_streaming_enabled = True
|
||||
```
|
||||
|
||||
#### 2b. Set up queue + callback (~15 lines)
|
||||
|
||||
```python
|
||||
_stream_q = None
|
||||
_stream_done = None
|
||||
_stream_msg_id = [None] # mutable ref for the async task
|
||||
|
||||
if _streaming_enabled:
|
||||
import queue as _q
|
||||
_stream_q = _q.Queue()
|
||||
_stream_done = threading.Event()
|
||||
|
||||
def _on_token(delta):
|
||||
if delta is None:
|
||||
_stream_done.set()
|
||||
else:
|
||||
_stream_q.put(delta)
|
||||
```
|
||||
|
||||
Pass `stream_callback=_on_token` to the AIAgent constructor.
|
||||
|
||||
#### 2c. Telegram/Discord stream preview task (~50 lines)
|
||||
|
||||
```python
|
||||
async def stream_preview():
|
||||
"""Progressively edit a message with streaming tokens."""
|
||||
if not _stream_q:
|
||||
return
|
||||
adapter = self.adapters.get(source.platform)
|
||||
if not adapter:
|
||||
return
|
||||
|
||||
accumulated = []
|
||||
token_count = 0
|
||||
last_edit = 0.0
|
||||
MIN_TOKENS = 20 # Don't show until enough context
|
||||
EDIT_INTERVAL = 1.5 # Respect Telegram rate limits
|
||||
|
||||
try:
|
||||
while not _stream_done.is_set():
|
||||
try:
|
||||
chunk = _stream_q.get(timeout=0.1)
|
||||
accumulated.append(chunk)
|
||||
token_count += 1
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
now = time.monotonic()
|
||||
if token_count >= MIN_TOKENS and (now - last_edit) >= EDIT_INTERVAL:
|
||||
preview = "".join(accumulated) + " ▌"
|
||||
if _stream_msg_id[0] is None:
|
||||
r = await adapter.send(
|
||||
chat_id=source.chat_id,
|
||||
content=preview,
|
||||
metadata=_thread_metadata,
|
||||
)
|
||||
if r.success and r.message_id:
|
||||
_stream_msg_id[0] = r.message_id
|
||||
else:
|
||||
await adapter.edit_message(
|
||||
chat_id=source.chat_id,
|
||||
message_id=_stream_msg_id[0],
|
||||
content=preview,
|
||||
)
|
||||
last_edit = now
|
||||
|
||||
# Drain remaining tokens
|
||||
while not _stream_q.empty():
|
||||
accumulated.append(_stream_q.get_nowait())
|
||||
|
||||
# Final edit — remove cursor, show complete text
|
||||
if _stream_msg_id[0] and accumulated:
|
||||
await adapter.edit_message(
|
||||
chat_id=source.chat_id,
|
||||
message_id=_stream_msg_id[0],
|
||||
content="".join(accumulated),
|
||||
)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
# Clean up on cancel
|
||||
if _stream_msg_id[0] and accumulated:
|
||||
try:
|
||||
await adapter.edit_message(
|
||||
chat_id=source.chat_id,
|
||||
message_id=_stream_msg_id[0],
|
||||
content="".join(accumulated),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.debug("stream_preview error: %s", e)
|
||||
```
|
||||
|
||||
#### 2d. Skip final send if already streamed (~10 lines)
|
||||
|
||||
In `_process_message_background()` (base.py), after getting the response,
|
||||
if streaming was active and `_stream_msg_id[0]` is set, the final response
|
||||
was already delivered via progressive edits. Skip the normal `self.send()`
|
||||
call to avoid duplicating the message.
|
||||
|
||||
This is the most delicate integration point — we need to communicate from
|
||||
the gateway's `_run_agent` back to the base adapter's response sender that
|
||||
the response was already delivered. Options:
|
||||
|
||||
- **Option A**: Return a special marker in the result dict:
|
||||
`result["_streamed_msg_id"] = _stream_msg_id[0]`
|
||||
The base adapter checks this and skips `send()`.
|
||||
|
||||
- **Option B**: Edit the already-sent message with the final response
|
||||
(which may differ slightly from accumulated tokens due to think-block
|
||||
stripping, etc.) and don't send a new one.
|
||||
|
||||
- **Option C**: The stream preview task handles the FULL final response
|
||||
(including any post-processing), and the handler returns None to skip
|
||||
the normal send path.
|
||||
|
||||
Recommended: **Option A** — cleanest separation. The result dict already
|
||||
carries metadata; adding one more field is low-risk.
|
||||
|
||||
**Platform-specific considerations:**
|
||||
|
||||
| Platform | Edit support | Rate limits | Streaming approach |
|
||||
|----------|-------------|-------------|-------------------|
|
||||
| Telegram | ✅ edit_message_text | ~20 edits/min | Edit every 1.5s |
|
||||
| Discord | ✅ message.edit | 5 edits/5s per message | Edit every 1.2s |
|
||||
| Slack | ✅ chat.update | Tier 3 (~50/min) | Edit every 1.5s |
|
||||
| WhatsApp | ❌ no edit support | N/A | Skip streaming, use normal path |
|
||||
| HomeAssistant | ❌ no edit | N/A | Skip streaming |
|
||||
| API Server | ✅ SSE native | No limit | Real SSE events |
|
||||
|
||||
WhatsApp and HomeAssistant fall back to non-streaming automatically because
|
||||
they don't support message editing.
|
||||
|
||||
**Tests for Phase 2:** (~100 lines)
|
||||
- Test stream_preview sends/edits correctly
|
||||
- Test skip-final-send when streaming delivered
|
||||
- Test WhatsApp/HA graceful fallback
|
||||
- Test streaming disabled per-platform config
|
||||
- Test thread_id metadata forwarded in stream messages
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: CLI streaming
|
||||
|
||||
**File: cli.py**
|
||||
|
||||
#### 3a. Set up callback in the CLI chat loop (~20 lines)
|
||||
|
||||
In `_chat_once()` or wherever the agent is invoked:
|
||||
|
||||
```python
|
||||
if streaming_enabled:
|
||||
_stream_q = queue.Queue()
|
||||
_stream_done = threading.Event()
|
||||
|
||||
def _cli_stream_callback(delta):
|
||||
if delta is None:
|
||||
_stream_done.set()
|
||||
else:
|
||||
_stream_q.put(delta)
|
||||
|
||||
agent.stream_callback = _cli_stream_callback
|
||||
```
|
||||
|
||||
#### 3b. Token display thread/task (~30 lines)
|
||||
|
||||
Start a thread that reads the queue and prints tokens:
|
||||
|
||||
```python
|
||||
def _stream_display():
|
||||
"""Print tokens to terminal as they arrive."""
|
||||
first_token = True
|
||||
while not _stream_done.is_set():
|
||||
try:
|
||||
delta = _stream_q.get(timeout=0.1)
|
||||
except queue.Empty:
|
||||
continue
|
||||
if first_token:
|
||||
# Print response box top border
|
||||
_cprint(f"\n{top}")
|
||||
first_token = False
|
||||
sys.stdout.write(delta)
|
||||
sys.stdout.flush()
|
||||
# Drain remaining
|
||||
while not _stream_q.empty():
|
||||
sys.stdout.write(_stream_q.get_nowait())
|
||||
sys.stdout.flush()
|
||||
# Print bottom border
|
||||
_cprint(f"\n\n{bot}")
|
||||
```
|
||||
|
||||
**Integration challenge: prompt_toolkit**
|
||||
|
||||
The CLI uses prompt_toolkit which controls the terminal. Writing directly
|
||||
to stdout while prompt_toolkit is active can cause display corruption.
|
||||
The existing KawaiiSpinner already solves this by using prompt_toolkit's
|
||||
`patch_stdout` context. The streaming display would need to do the same.
|
||||
|
||||
Alternative: use `_cprint()` for each token chunk (routes through
|
||||
prompt_toolkit's renderer). But this might be slow for individual tokens.
|
||||
|
||||
Recommended approach: accumulate tokens in small batches (e.g., every 50ms)
|
||||
and `_cprint()` the batch. This balances display responsiveness with
|
||||
prompt_toolkit compatibility.
|
||||
|
||||
**Tests for Phase 3:** (~50 lines)
|
||||
- Test CLI streaming callback setup
|
||||
- Test response box borders with streaming
|
||||
- Test fallback when streaming disabled
|
||||
|
||||
---
|
||||
|
||||
### Phase 4: API Server real streaming
|
||||
|
||||
**File: gateway/platforms/api_server.py**
|
||||
|
||||
Replace the pseudo-streaming `_write_sse_chat_completion()` with real
|
||||
token-by-token SSE when the agent supports it.
|
||||
|
||||
#### 4a. Wire streaming callback for stream=true requests (~20 lines)
|
||||
|
||||
```python
|
||||
if stream:
|
||||
_stream_q = queue.Queue()
|
||||
|
||||
def _api_stream_callback(delta):
|
||||
_stream_q.put(delta) # None = done
|
||||
|
||||
# Pass callback to _run_agent
|
||||
result, usage = await self._run_agent(
|
||||
..., stream_callback=_api_stream_callback,
|
||||
)
|
||||
```
|
||||
|
||||
#### 4b. Real SSE writer (~40 lines)
|
||||
|
||||
```python
|
||||
async def _write_real_sse(self, request, completion_id, model, stream_q):
|
||||
response = web.StreamResponse(
|
||||
headers={"Content-Type": "text/event-stream", "Cache-Control": "no-cache"},
|
||||
)
|
||||
await response.prepare(request)
|
||||
|
||||
# Role chunk
|
||||
await response.write(...)
|
||||
|
||||
# Stream content chunks as they arrive
|
||||
while True:
|
||||
try:
|
||||
delta = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: stream_q.get(timeout=0.1)
|
||||
)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
if delta is None: # End of stream
|
||||
break
|
||||
|
||||
chunk = {"id": completion_id, "object": "chat.completion.chunk", ...
|
||||
"choices": [{"delta": {"content": delta}, ...}]}
|
||||
await response.write(f"data: {json.dumps(chunk)}\n\n".encode())
|
||||
|
||||
# Finish + [DONE]
|
||||
await response.write(...)
|
||||
await response.write(b"data: [DONE]\n\n")
|
||||
return response
|
||||
```
|
||||
|
||||
**Challenge: concurrent execution**
|
||||
|
||||
The agent runs in a thread executor. SSE writing happens in the async event
|
||||
loop. The queue bridges them. But `_run_agent()` currently awaits the full
|
||||
result before returning. For real streaming, we need to start the agent in
|
||||
the background and stream tokens while it runs:
|
||||
|
||||
```python
|
||||
# Start agent in background
|
||||
agent_task = asyncio.create_task(self._run_agent_async(...))
|
||||
|
||||
# Stream tokens while agent runs
|
||||
await self._write_real_sse(request, ..., stream_q)
|
||||
|
||||
# Agent is done by now (stream_q received None)
|
||||
result, usage = await agent_task
|
||||
```
|
||||
|
||||
This requires splitting `_run_agent` into an async version that doesn't
|
||||
block waiting for the result, or running it in a separate task.
|
||||
|
||||
**Responses API SSE format:**
|
||||
|
||||
For `/v1/responses` with `stream=true`, the SSE events are different:
|
||||
|
||||
```
|
||||
event: response.output_text.delta
|
||||
data: {"type":"response.output_text.delta","delta":"Hello"}
|
||||
|
||||
event: response.completed
|
||||
data: {"type":"response.completed","response":{...}}
|
||||
```
|
||||
|
||||
This needs a separate SSE writer that emits Responses API format events.
|
||||
|
||||
**Tests for Phase 4:** (~80 lines)
|
||||
- Test real SSE streaming with mocked agent
|
||||
- Test SSE event format (Chat Completions vs Responses)
|
||||
- Test client disconnect during streaming
|
||||
- Test fallback to pseudo-streaming when callback not available
|
||||
|
||||
---
|
||||
|
||||
## Integration Issues & Edge Cases
|
||||
|
||||
### 1. Tool calls during streaming
|
||||
|
||||
When the model returns tool calls instead of text, no text tokens are emitted.
|
||||
The stream_callback is simply never called with text. After tools execute, the
|
||||
next API call may produce the final text response — streaming picks up again.
|
||||
|
||||
The stream preview task needs to handle this: if no tokens arrive during a
|
||||
tool-call round, don't send/edit any message. The tool progress messages
|
||||
continue working as before.
|
||||
|
||||
### 2. Duplicate messages
|
||||
|
||||
The biggest risk: the agent sends the final response normally (via the
|
||||
existing send path) AND the stream preview already showed it. The user
|
||||
sees the response twice.
|
||||
|
||||
Prevention: when streaming is active and tokens were delivered, the final
|
||||
response send must be suppressed. The `result["_streamed_msg_id"]` marker
|
||||
tells the base adapter to skip its normal send.
|
||||
|
||||
### 3. Response post-processing
|
||||
|
||||
The final response may differ from the accumulated streamed tokens:
|
||||
- Think block stripping (`<think>...</think>` removed)
|
||||
- Trailing whitespace cleanup
|
||||
- Tool result media tag appending
|
||||
|
||||
The stream preview shows raw tokens. The final edit should use the
|
||||
post-processed version. This means the final edit (removing the cursor)
|
||||
should use the post-processed `final_response`, not just the accumulated
|
||||
stream text.
|
||||
|
||||
### 4. Context compression during streaming
|
||||
|
||||
If the agent triggers context compression mid-conversation, the streaming
|
||||
tokens from BEFORE compression are from a different context than those
|
||||
after. This isn't a problem in practice — compression happens between
|
||||
API calls, not during streaming.
|
||||
|
||||
### 5. Interrupt during streaming
|
||||
|
||||
User sends a new message while streaming → interrupt. The stream is killed
|
||||
(HTTP connection closed), accumulated tokens are shown as-is (no cursor),
|
||||
and the interrupt message is processed normally. This is already handled by
|
||||
`_interruptible_api_call` closing the client.
|
||||
|
||||
### 6. Multi-model / fallback
|
||||
|
||||
If the primary model fails and the agent falls back to a different model,
|
||||
streaming state resets. The fallback call may or may not support streaming.
|
||||
The graceful fallback in `_run_streaming_chat_completion` handles this.
|
||||
|
||||
### 7. Rate limiting on edits
|
||||
|
||||
- Telegram: ~20 edits/minute (~1 every 3 seconds to be safe)
|
||||
- Discord: 5 edits per 5 seconds per message
|
||||
- Slack: ~50 API calls/minute
|
||||
|
||||
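The 1.5s edit interval is within the Discord and Slack limits, but it is faster than the ~3s spacing suggested for Telegram, so Telegram may need a longer `edit_interval`. If we get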
|
||||
429 rate limit errors on edits, just skip that edit cycle and try next time.
|
||||
|
||||
---
|
||||
|
||||
## Files Changed Summary
|
||||
|
||||
| File | Phase | Changes |
|
||||
|------|-------|---------|
|
||||
| `run_agent.py` | 1 | +stream_callback param, +_run_streaming_chat_completion(), modify _run_codex_stream(), modify _interruptible_api_call() |
|
||||
| `gateway/run.py` | 2 | +streaming config reader, +queue/callback setup, +stream_preview task, +skip-final-send logic |
|
||||
| `gateway/platforms/base.py` | 2 | +check for _streamed_msg_id in response handler |
|
||||
| `cli.py` | 3 | +streaming setup, +token display, +response box integration |
|
||||
| `gateway/platforms/api_server.py` | 4 | +real SSE writer, +streaming callback wiring |
|
||||
| `hermes_cli/config.py` | 1 | +streaming config defaults |
|
||||
| `cli-config.yaml.example` | 1 | +streaming section |
|
||||
| `tests/test_streaming.py` | 1-4 | NEW — ~380 lines of tests |
|
||||
|
||||
**Total new code**: ~500 lines across all phases
|
||||
**Total test code**: ~380 lines
|
||||
|
||||
---
|
||||
|
||||
## Rollout Plan
|
||||
|
||||
1. **Phase 1** (core): Merge to main. Streaming disabled by default.
|
||||
Zero impact on existing behavior. Can be tested with env var.
|
||||
|
||||
2. **Phase 2** (gateway): Merge to main. Test on Telegram manually.
|
||||
Enable per-platform: `streaming.telegram: true` in config.
|
||||
|
||||
3. **Phase 3** (CLI): Merge to main. Test in terminal.
|
||||
Enable: `streaming.cli: true` or `streaming.enabled: true`.
|
||||
|
||||
4. **Phase 4** (API server): Merge to main. Test with Open WebUI.
|
||||
Auto-enabled when client sends `stream: true`.
|
||||
|
||||
Each phase is independently mergeable and testable. Streaming stays
|
||||
off by default throughout. Once all phases are stable, consider
|
||||
changing the default to enabled.
|
||||
|
||||
---
|
||||
|
||||
## Config Reference (final state)
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
streaming:
|
||||
enabled: false # Master switch (default: off)
|
||||
cli: true # Per-platform override
|
||||
telegram: true
|
||||
discord: true
|
||||
slack: true
|
||||
api_server: true # API server always streams when client requests it
|
||||
edit_interval: 1.5 # Seconds between message edits (default: 1.5)
|
||||
min_tokens: 20 # Tokens before first display (default: 20)
|
||||
```
|
||||
|
||||
```bash
|
||||
# Environment variable override
|
||||
HERMES_STREAMING_ENABLED=true
|
||||
```
|
||||
385
AGENTS.md
Normal file
385
AGENTS.md
Normal file
@@ -0,0 +1,385 @@
|
||||
# Hermes Agent - Development Guide
|
||||
|
||||
Instructions for AI coding assistants and developers working on the hermes-agent codebase.
|
||||
|
||||
## Development Environment
|
||||
|
||||
```bash
|
||||
source .venv/bin/activate # ALWAYS activate before running Python
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
hermes-agent/
|
||||
├── run_agent.py # AIAgent class — core conversation loop
|
||||
├── model_tools.py # Tool orchestration, _discover_tools(), handle_function_call()
|
||||
├── toolsets.py # Toolset definitions, _HERMES_CORE_TOOLS list
|
||||
├── cli.py # HermesCLI class — interactive CLI orchestrator
|
||||
├── hermes_state.py # SessionDB — SQLite session store (FTS5 search)
|
||||
├── agent/ # Agent internals
|
||||
│ ├── prompt_builder.py # System prompt assembly
|
||||
│ ├── context_compressor.py # Auto context compression
|
||||
│ ├── prompt_caching.py # Anthropic prompt caching
|
||||
│ ├── auxiliary_client.py # Auxiliary LLM client (vision, summarization)
|
||||
│ ├── model_metadata.py # Model context lengths, token estimation
|
||||
│ ├── display.py # KawaiiSpinner, tool preview formatting
|
||||
│ ├── skill_commands.py # Skill slash commands (shared CLI/gateway)
|
||||
│ └── trajectory.py # Trajectory saving helpers
|
||||
├── hermes_cli/ # CLI subcommands and setup
|
||||
│ ├── main.py # Entry point — all `hermes` subcommands
|
||||
│ ├── config.py # DEFAULT_CONFIG, OPTIONAL_ENV_VARS, migration
|
||||
│ ├── commands.py # Slash command definitions + SlashCommandCompleter
|
||||
│ ├── callbacks.py # Terminal callbacks (clarify, sudo, approval)
|
||||
│ ├── setup.py # Interactive setup wizard
|
||||
│ ├── skin_engine.py # Skin/theme engine — CLI visual customization
|
||||
│ ├── skills_config.py # `hermes skills` — enable/disable skills per platform
|
||||
│ ├── tools_config.py # `hermes tools` — enable/disable tools per platform
|
||||
│ ├── skills_hub.py # `/skills` slash command (search, browse, install)
|
||||
│ ├── models.py # Model catalog, provider model lists
|
||||
│ └── auth.py # Provider credential resolution
|
||||
├── tools/ # Tool implementations (one file per tool)
|
||||
│ ├── registry.py # Central tool registry (schemas, handlers, dispatch)
|
||||
│ ├── approval.py # Dangerous command detection
|
||||
│ ├── terminal_tool.py # Terminal orchestration
|
||||
│ ├── process_registry.py # Background process management
|
||||
│ ├── file_tools.py # File read/write/search/patch
|
||||
│ ├── web_tools.py # Firecrawl search/extract
|
||||
│ ├── browser_tool.py # Browserbase browser automation
|
||||
│ ├── code_execution_tool.py # execute_code sandbox
|
||||
│ ├── delegate_tool.py # Subagent delegation
|
||||
│ ├── mcp_tool.py # MCP client (~1050 lines)
|
||||
│ └── environments/ # Terminal backends (local, docker, ssh, modal, daytona, singularity)
|
||||
├── gateway/ # Messaging platform gateway
|
||||
│ ├── run.py # Main loop, slash commands, message dispatch
|
||||
│ ├── session.py # SessionStore — conversation persistence
|
||||
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal
|
||||
├── acp_adapter/ # ACP server (VS Code / Zed / JetBrains integration)
|
||||
├── cron/ # Scheduler (jobs.py, scheduler.py)
|
||||
├── environments/ # RL training environments (Atropos)
|
||||
├── tests/ # Pytest suite (~3000 tests)
|
||||
└── batch_runner.py # Parallel batch processing
|
||||
```
|
||||
|
||||
**User config:** `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys)
|
||||
|
||||
## File Dependency Chain
|
||||
|
||||
```
|
||||
tools/registry.py (no deps — imported by all tool files)
|
||||
↑
|
||||
tools/*.py (each calls registry.register() at import time)
|
||||
↑
|
||||
model_tools.py (imports tools/registry + triggers tool discovery)
|
||||
↑
|
||||
run_agent.py, cli.py, batch_runner.py, environments/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## AIAgent Class (run_agent.py)
|
||||
|
||||
```python
|
||||
class AIAgent:
|
||||
def __init__(self,
|
||||
model: str = "anthropic/claude-opus-4.6",
|
||||
max_iterations: int = 90,
|
||||
enabled_toolsets: list = None,
|
||||
disabled_toolsets: list = None,
|
||||
quiet_mode: bool = False,
|
||||
save_trajectories: bool = False,
|
||||
platform: str = None, # "cli", "telegram", etc.
|
||||
session_id: str = None,
|
||||
skip_context_files: bool = False,
|
||||
skip_memory: bool = False,
|
||||
# ... plus provider, api_mode, callbacks, routing params
|
||||
): ...
|
||||
|
||||
def chat(self, message: str) -> str:
|
||||
"""Simple interface — returns final response string."""
|
||||
|
||||
def run_conversation(self, user_message: str, system_message: str = None,
|
||||
conversation_history: list = None, task_id: str = None) -> dict:
|
||||
"""Full interface — returns dict with final_response + messages."""
|
||||
```
|
||||
|
||||
### Agent Loop
|
||||
|
||||
The core loop is inside `run_conversation()` — entirely synchronous:
|
||||
|
||||
```python
|
||||
while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
|
||||
response = client.chat.completions.create(model=model, messages=messages, tools=tool_schemas)
|
||||
if response.tool_calls:
|
||||
for tool_call in response.tool_calls:
|
||||
result = handle_function_call(tool_call.name, tool_call.args, task_id)
|
||||
messages.append(tool_result_message(result))
|
||||
api_call_count += 1
|
||||
else:
|
||||
return response.content
|
||||
```
|
||||
|
||||
Messages follow OpenAI format: `{"role": "system/user/assistant/tool", ...}`. Reasoning content is stored in `assistant_msg["reasoning"]`.
|
||||
|
||||
---
|
||||
|
||||
## CLI Architecture (cli.py)
|
||||
|
||||
- **Rich** for banner/panels, **prompt_toolkit** for input with autocomplete
|
||||
- **KawaiiSpinner** (`agent/display.py`) — animated faces during API calls, `┊` activity feed for tool results
|
||||
- `load_cli_config()` in cli.py merges hardcoded defaults + user config YAML
|
||||
- **Skin engine** (`hermes_cli/skin_engine.py`) — data-driven CLI theming; initialized from `display.skin` config key at startup; skins customize banner colors, spinner faces/verbs/wings, tool prefix, response box, branding text
|
||||
- `process_command()` is a method on `HermesCLI` — dispatches on canonical command name resolved via `resolve_command()` from the central registry
|
||||
- Skill slash commands: `agent/skill_commands.py` scans `~/.hermes/skills/`, injects as **user message** (not system prompt) to preserve prompt caching
|
||||
|
||||
### Slash Command Registry (`hermes_cli/commands.py`)
|
||||
|
||||
All slash commands are defined in a central `COMMAND_REGISTRY` list of `CommandDef` objects. Every downstream consumer derives from this registry automatically:
|
||||
|
||||
- **CLI** — `process_command()` resolves aliases via `resolve_command()`, dispatches on canonical name
|
||||
- **Gateway** — `GATEWAY_KNOWN_COMMANDS` frozenset for hook emission, `resolve_command()` for dispatch
|
||||
- **Gateway help** — `gateway_help_lines()` generates `/help` output
|
||||
- **Telegram** — `telegram_bot_commands()` generates the BotCommand menu
|
||||
- **Slack** — `slack_subcommand_map()` generates `/hermes` subcommand routing
|
||||
- **Autocomplete** — `COMMANDS` flat dict feeds `SlashCommandCompleter`
|
||||
- **CLI help** — `COMMANDS_BY_CATEGORY` dict feeds `show_help()`
|
||||
|
||||
### Adding a Slash Command
|
||||
|
||||
1. Add a `CommandDef` entry to `COMMAND_REGISTRY` in `hermes_cli/commands.py`:
|
||||
```python
|
||||
CommandDef("mycommand", "Description of what it does", "Session",
|
||||
aliases=("mc",), args_hint="[arg]"),
|
||||
```
|
||||
2. Add handler in `HermesCLI.process_command()` in `cli.py`:
|
||||
```python
|
||||
elif canonical == "mycommand":
|
||||
self._handle_mycommand(cmd_original)
|
||||
```
|
||||
3. If the command is available in the gateway, add a handler in `gateway/run.py`:
|
||||
```python
|
||||
if canonical == "mycommand":
|
||||
return await self._handle_mycommand(event)
|
||||
```
|
||||
4. For persistent settings, use `save_config_value()` in `cli.py`
|
||||
|
||||
**CommandDef fields:**
|
||||
- `name` — canonical name without slash (e.g. `"background"`)
|
||||
- `description` — human-readable description
|
||||
- `category` — one of `"Session"`, `"Configuration"`, `"Tools & Skills"`, `"Info"`, `"Exit"`
|
||||
- `aliases` — tuple of alternative names (e.g. `("bg",)`)
|
||||
- `args_hint` — argument placeholder shown in help (e.g. `"<prompt>"`, `"[name]"`)
|
||||
- `cli_only` — only available in the interactive CLI
|
||||
- `gateway_only` — only available in messaging platforms
|
||||
|
||||
**Adding an alias** requires only adding it to the `aliases` tuple on the existing `CommandDef`. No other file changes needed — dispatch, help text, Telegram menu, Slack mapping, and autocomplete all update automatically.
|
||||
|
||||
---
|
||||
|
||||
## Adding New Tools
|
||||
|
||||
Requires changes in **3 files**:
|
||||
|
||||
**1. Create `tools/your_tool.py`:**
|
||||
```python
|
||||
import json, os
|
||||
from tools.registry import registry
|
||||
|
||||
def check_requirements() -> bool:
|
||||
return bool(os.getenv("EXAMPLE_API_KEY"))
|
||||
|
||||
def example_tool(param: str, task_id: str = None) -> str:
|
||||
return json.dumps({"success": True, "data": "..."})
|
||||
|
||||
registry.register(
|
||||
name="example_tool",
|
||||
toolset="example",
|
||||
schema={"name": "example_tool", "description": "...", "parameters": {...}},
|
||||
handler=lambda args, **kw: example_tool(param=args.get("param", ""), task_id=kw.get("task_id")),
|
||||
check_fn=check_requirements,
|
||||
requires_env=["EXAMPLE_API_KEY"],
|
||||
)
|
||||
```
|
||||
|
||||
**2. Add the module import** to the `_discover_tools()` list in `model_tools.py`.
|
||||
|
||||
**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
|
||||
|
||||
The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.
|
||||
|
||||
**Agent-level tools** (todo, memory): intercepted by `run_agent.py` before `handle_function_call()`. See `todo_tool.py` for the pattern.
|
||||
|
||||
---
|
||||
|
||||
## Adding Configuration
|
||||
|
||||
### config.yaml options:
|
||||
1. Add to `DEFAULT_CONFIG` in `hermes_cli/config.py`
|
||||
2. Bump `_config_version` (currently 5) to trigger migration for existing users
|
||||
|
||||
### .env variables:
|
||||
1. Add to `OPTIONAL_ENV_VARS` in `hermes_cli/config.py` with metadata:
|
||||
```python
|
||||
"NEW_API_KEY": {
|
||||
"description": "What it's for",
|
||||
"prompt": "Display name",
|
||||
"url": "https://...",
|
||||
"password": True,
|
||||
"category": "tool", # provider, tool, messaging, setting
|
||||
},
|
||||
```
|
||||
|
||||
### Config loaders (three separate systems):
|
||||
|
||||
| Loader | Used by | Location |
|
||||
|--------|---------|----------|
|
||||
| `load_cli_config()` | CLI mode | `cli.py` |
|
||||
| `load_config()` | `hermes tools`, `hermes setup` | `hermes_cli/config.py` |
|
||||
| Direct YAML load | Gateway | `gateway/run.py` |
|
||||
|
||||
---
|
||||
|
||||
## Skin/Theme System
|
||||
|
||||
The skin engine (`hermes_cli/skin_engine.py`) provides data-driven CLI visual customization. Skins are **pure data** — no code changes needed to add a new skin.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
hermes_cli/skin_engine.py # SkinConfig dataclass, built-in skins, YAML loader
|
||||
~/.hermes/skins/*.yaml # User-installed custom skins (drop-in)
|
||||
```
|
||||
|
||||
- `init_skin_from_config()` — called at CLI startup, reads `display.skin` from config
|
||||
- `get_active_skin()` — returns cached `SkinConfig` for the current skin
|
||||
- `set_active_skin(name)` — switches skin at runtime (used by `/skin` command)
|
||||
- `load_skin(name)` — loads from user skins first, then built-ins, then falls back to default
|
||||
- Missing skin values inherit from the `default` skin automatically
|
||||
|
||||
### What skins customize
|
||||
|
||||
| Element | Skin Key | Used By |
|
||||
|---------|----------|---------|
|
||||
| Banner panel border | `colors.banner_border` | `banner.py` |
|
||||
| Banner panel title | `colors.banner_title` | `banner.py` |
|
||||
| Banner section headers | `colors.banner_accent` | `banner.py` |
|
||||
| Banner dim text | `colors.banner_dim` | `banner.py` |
|
||||
| Banner body text | `colors.banner_text` | `banner.py` |
|
||||
| Response box border | `colors.response_border` | `cli.py` |
|
||||
| Spinner faces (waiting) | `spinner.waiting_faces` | `display.py` |
|
||||
| Spinner faces (thinking) | `spinner.thinking_faces` | `display.py` |
|
||||
| Spinner verbs | `spinner.thinking_verbs` | `display.py` |
|
||||
| Spinner wings (optional) | `spinner.wings` | `display.py` |
|
||||
| Tool output prefix | `tool_prefix` | `display.py` |
|
||||
| Per-tool emojis | `tool_emojis` | `display.py` → `get_tool_emoji()` |
|
||||
| Agent name | `branding.agent_name` | `banner.py`, `cli.py` |
|
||||
| Welcome message | `branding.welcome` | `cli.py` |
|
||||
| Response box label | `branding.response_label` | `cli.py` |
|
||||
| Prompt symbol | `branding.prompt_symbol` | `cli.py` |
|
||||
|
||||
### Built-in skins
|
||||
|
||||
- `default` — Classic Hermes gold/kawaii (the current look)
|
||||
- `ares` — Crimson/bronze war-god theme with custom spinner wings
|
||||
- `mono` — Clean grayscale monochrome
|
||||
- `slate` — Cool blue developer-focused theme
|
||||
|
||||
### Adding a built-in skin
|
||||
|
||||
Add to `_BUILTIN_SKINS` dict in `hermes_cli/skin_engine.py`:
|
||||
|
||||
```python
|
||||
"mytheme": {
|
||||
"name": "mytheme",
|
||||
"description": "Short description",
|
||||
"colors": { ... },
|
||||
"spinner": { ... },
|
||||
"branding": { ... },
|
||||
"tool_prefix": "┊",
|
||||
},
|
||||
```
|
||||
|
||||
### User skins (YAML)
|
||||
|
||||
Users create `~/.hermes/skins/<name>.yaml`:
|
||||
|
||||
```yaml
|
||||
name: cyberpunk
|
||||
description: Neon-soaked terminal theme
|
||||
|
||||
colors:
|
||||
banner_border: "#FF00FF"
|
||||
banner_title: "#00FFFF"
|
||||
banner_accent: "#FF1493"
|
||||
|
||||
spinner:
|
||||
thinking_verbs: ["jacking in", "decrypting", "uploading"]
|
||||
wings:
|
||||
- ["⟨⚡", "⚡⟩"]
|
||||
|
||||
branding:
|
||||
agent_name: "Cyber Agent"
|
||||
response_label: " ⚡ Cyber "
|
||||
|
||||
tool_prefix: "▏"
|
||||
```
|
||||
|
||||
Activate with `/skin cyberpunk` or `display.skin: cyberpunk` in config.yaml.
|
||||
|
||||
---
|
||||
|
||||
## Important Policies
|
||||
### Prompt Caching Must Not Break
|
||||
|
||||
Hermes-Agent ensures caching remains valid throughout a conversation. **Do NOT implement changes that would:**
|
||||
- Alter past context mid-conversation
|
||||
- Change toolsets mid-conversation
|
||||
- Reload memories or rebuild system prompts mid-conversation
|
||||
|
||||
Cache-breaking forces dramatically higher costs. The ONLY time we alter context is during context compression.
|
||||
|
||||
### Working Directory Behavior
|
||||
- **CLI**: Uses current directory (`.` → `os.getcwd()`)
|
||||
- **Messaging**: Uses `MESSAGING_CWD` env var (default: home directory)
|
||||
|
||||
### Background Process Notifications (Gateway)
|
||||
|
||||
When `terminal(background=true, check_interval=...)` is used, the gateway runs a watcher that
|
||||
pushes status updates to the user's chat. Control verbosity with `display.background_process_notifications`
|
||||
in config.yaml (or `HERMES_BACKGROUND_NOTIFICATIONS` env var):
|
||||
|
||||
- `all` — running-output updates + final message (default)
|
||||
- `result` — only the final completion message
|
||||
- `error` — only the final message when exit code != 0
|
||||
- `off` — no watcher messages at all
|
||||
|
||||
---
|
||||
|
||||
## Known Pitfalls
|
||||
|
||||
### DO NOT use `simple_term_menu` for interactive menus
|
||||
Rendering bugs in tmux/iTerm2 — ghosting on scroll. Use `curses` (stdlib) instead. See `hermes_cli/tools_config.py` for the pattern.
|
||||
|
||||
### DO NOT use `\033[K` (ANSI erase-to-EOL) in spinner/display code
|
||||
Leaks as literal `?[K` text under `prompt_toolkit`'s `patch_stdout`. Use space-padding: `f"\r{line}{' ' * pad}"`.
|
||||
|
||||
### `_last_resolved_tool_names` is a process-global in `model_tools.py`
|
||||
When subagents overwrite this global, `execute_code` calls after delegation may fail with missing tool imports. Known bug.
|
||||
|
||||
### Tests must not write to `~/.hermes/`
|
||||
The `_isolate_hermes_home` autouse fixture in `tests/conftest.py` redirects `HERMES_HOME` to a temp dir. Never hardcode `~/.hermes/` paths in tests.
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
source .venv/bin/activate
|
||||
python -m pytest tests/ -q # Full suite (~3000 tests, ~3 min)
|
||||
python -m pytest tests/test_model_tools.py -q # Toolset resolution
|
||||
python -m pytest tests/test_cli_init.py -q # CLI config loading
|
||||
python -m pytest tests/gateway/ -q # Gateway tests
|
||||
python -m pytest tests/tools/ -q # Tool-level tests
|
||||
```
|
||||
|
||||
Always run the full suite before pushing changes.
|
||||
659
CONTRIBUTING.md
Normal file
659
CONTRIBUTING.md
Normal file
@@ -0,0 +1,659 @@
|
||||
# Contributing to Hermes Agent
|
||||
|
||||
Thank you for contributing to Hermes Agent! This guide covers everything you need: setting up your dev environment, understanding the architecture, deciding what to build, and getting your PR merged.
|
||||
|
||||
---
|
||||
|
||||
## Contribution Priorities
|
||||
|
||||
We value contributions in this order:
|
||||
|
||||
1. **Bug fixes** — crashes, incorrect behavior, data loss. Always top priority.
|
||||
2. **Cross-platform compatibility** — Windows, macOS, different Linux distros, different terminal emulators. We want Hermes to work everywhere.
|
||||
3. **Security hardening** — shell injection, prompt injection, path traversal, privilege escalation. See [Security](#security-considerations).
|
||||
4. **Performance and robustness** — retry logic, error handling, graceful degradation.
|
||||
5. **New skills** — but only broadly useful ones. See [Should it be a Skill or a Tool?](#should-it-be-a-skill-or-a-tool)
|
||||
6. **New tools** — rarely needed. Most capabilities should be skills. See below.
|
||||
7. **Documentation** — fixes, clarifications, new examples.
|
||||
|
||||
---
|
||||
|
||||
## Should it be a Skill or a Tool?
|
||||
|
||||
This is the most common question for new contributors. The answer is almost always **skill**.
|
||||
|
||||
### Make it a Skill when:
|
||||
|
||||
- The capability can be expressed as instructions + shell commands + existing tools
|
||||
- It wraps an external CLI or API that the agent can call via `terminal` or `web_extract`
|
||||
- It doesn't need custom Python integration or API key management baked into the agent
|
||||
- Examples: arXiv search, git workflows, Docker management, PDF processing, email via CLI tools
|
||||
|
||||
### Make it a Tool when:
|
||||
|
||||
- It requires end-to-end integration with API keys, auth flows, or multi-component configuration managed by the agent harness
|
||||
- It needs custom processing logic that must execute precisely every time (not "best effort" from LLM interpretation)
|
||||
- It handles binary data, streaming, or real-time events that can't go through the terminal
|
||||
- Examples: browser automation (Browserbase session management), TTS (audio encoding + platform delivery), vision analysis (base64 image handling)
|
||||
|
||||
### Should the Skill be bundled?
|
||||
|
||||
Bundled skills (in `skills/`) ship with every Hermes install. They should be **broadly useful to most users**:
|
||||
|
||||
- Document handling, web research, common dev workflows, system administration
|
||||
- Used regularly by a wide range of people
|
||||
|
||||
If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, builtin trust).
|
||||
|
||||
If your skill is specialized, community-contributed, or niche, it's better suited for a **Skills Hub** — upload it to a skills registry and share it in the [Nous Research Discord](https://discord.gg/NousResearch). Users can install it with `hermes skills install`.
|
||||
|
||||
---
|
||||
|
||||
## Development Setup
|
||||
|
||||
### Prerequisites
|
||||
|
||||
| Requirement | Notes |
|
||||
|-------------|-------|
|
||||
| **Git** | With `--recurse-submodules` support |
|
||||
| **Python 3.11+** | uv will install it if missing |
|
||||
| **uv** | Fast Python package manager ([install](https://docs.astral.sh/uv/)) |
|
||||
| **Node.js 18+** | Optional — needed for browser tools and WhatsApp bridge |
|
||||
|
||||
### Clone and install
|
||||
|
||||
```bash
|
||||
git clone --recurse-submodules https://github.com/NousResearch/hermes-agent.git
|
||||
cd hermes-agent
|
||||
|
||||
# Create venv with Python 3.11
|
||||
uv venv venv --python 3.11
|
||||
export VIRTUAL_ENV="$(pwd)/venv"
|
||||
|
||||
# Install with all extras (messaging, cron, CLI menus, dev tools)
|
||||
uv pip install -e ".[all,dev]"
|
||||
uv pip install -e "./mini-swe-agent"
|
||||
uv pip install -e "./tinker-atropos"
|
||||
|
||||
# Optional: browser tools
|
||||
npm install
|
||||
```
|
||||
|
||||
### Configure for development
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.hermes/{cron,sessions,logs,memories,skills}
|
||||
cp cli-config.yaml.example ~/.hermes/config.yaml
|
||||
touch ~/.hermes/.env
|
||||
|
||||
# Add at minimum an LLM provider key:
|
||||
echo 'OPENROUTER_API_KEY=sk-or-v1-your-key' >> ~/.hermes/.env
|
||||
```
|
||||
|
||||
### Run
|
||||
|
||||
```bash
|
||||
# Symlink for global access
|
||||
mkdir -p ~/.local/bin
|
||||
ln -sf "$(pwd)/venv/bin/hermes" ~/.local/bin/hermes
|
||||
|
||||
# Verify
|
||||
hermes doctor
|
||||
hermes chat -q "Hello"
|
||||
```
|
||||
|
||||
### Run tests
|
||||
|
||||
```bash
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
hermes-agent/
|
||||
├── run_agent.py # AIAgent class — core conversation loop, tool dispatch, session persistence
|
||||
├── cli.py # HermesCLI class — interactive TUI, prompt_toolkit integration
|
||||
├── model_tools.py # Tool orchestration (thin layer over tools/registry.py)
|
||||
├── toolsets.py # Tool groupings and presets (hermes-cli, hermes-telegram, etc.)
|
||||
├── hermes_state.py # SQLite session database with FTS5 full-text search, session titles
|
||||
├── batch_runner.py # Parallel batch processing for trajectory generation
|
||||
│
|
||||
├── agent/ # Agent internals (extracted modules)
|
||||
│ ├── prompt_builder.py # System prompt assembly (identity, skills, context files, memory)
|
||||
│ ├── context_compressor.py # Auto-summarization when approaching context limits
|
||||
│ ├── auxiliary_client.py # Resolves auxiliary OpenAI clients (summarization, vision)
|
||||
│ ├── display.py # KawaiiSpinner, tool progress formatting
|
||||
│ ├── model_metadata.py # Model context lengths, token estimation
|
||||
│ └── trajectory.py # Trajectory saving helpers
|
||||
│
|
||||
├── hermes_cli/ # CLI command implementations
|
||||
│ ├── main.py # Entry point, argument parsing, command dispatch
|
||||
│ ├── config.py # Config management, migration, env var definitions
|
||||
│ ├── setup.py # Interactive setup wizard
|
||||
│ ├── auth.py # Provider resolution, OAuth, Nous Portal
|
||||
│ ├── models.py # OpenRouter model selection lists
|
||||
│ ├── banner.py # Welcome banner, ASCII art
|
||||
│ ├── commands.py # Central slash command registry (CommandDef), autocomplete, gateway helpers
|
||||
│ ├── callbacks.py # Interactive callbacks (clarify, sudo, approval)
|
||||
│ ├── doctor.py # Diagnostics
|
||||
│ ├── skills_hub.py # Skills Hub CLI + /skills slash command
|
||||
│ └── skin_engine.py # Skin/theme engine — data-driven CLI visual customization
|
||||
│
|
||||
├── tools/ # Tool implementations (self-registering)
|
||||
│ ├── registry.py # Central tool registry (schemas, handlers, dispatch)
|
||||
│ ├── approval.py # Dangerous command detection + per-session approval
|
||||
│ ├── terminal_tool.py # Terminal orchestration (sudo, env lifecycle, backends)
|
||||
│ ├── file_operations.py # read_file, write_file, search, patch, etc.
|
||||
│ ├── web_tools.py # web_search, web_extract (Firecrawl + Gemini summarization)
|
||||
│ ├── vision_tools.py # Image analysis via multimodal models
|
||||
│ ├── delegate_tool.py # Subagent spawning and parallel task execution
|
||||
│ ├── code_execution_tool.py # Sandboxed Python with RPC tool access
|
||||
│ ├── session_search_tool.py # Search past conversations with FTS5 + summarization
|
||||
│ ├── cronjob_tools.py # Scheduled task management
|
||||
│ ├── skill_tools.py # Skill search, load, manage
|
||||
│ └── environments/ # Terminal execution backends
|
||||
│ ├── base.py # BaseEnvironment ABC
|
||||
│ ├── local.py, docker.py, ssh.py, singularity.py, modal.py, daytona.py
|
||||
│
|
||||
├── gateway/ # Messaging gateway
|
||||
│ ├── run.py # GatewayRunner — platform lifecycle, message routing, cron
|
||||
│ ├── config.py # Platform configuration resolution
|
||||
│ ├── session.py # Session store, context prompts, reset policies
|
||||
│ └── platforms/ # Platform adapters
|
||||
│ ├── telegram.py, discord_adapter.py, slack.py, whatsapp.py
|
||||
│
|
||||
├── scripts/ # Installer and bridge scripts
|
||||
│ ├── install.sh # Linux/macOS installer
|
||||
│ ├── install.ps1 # Windows PowerShell installer
|
||||
│ └── whatsapp-bridge/ # Node.js WhatsApp bridge (Baileys)
|
||||
│
|
||||
├── skills/ # Bundled skills (copied to ~/.hermes/skills/ on install)
|
||||
├── optional-skills/ # Official optional skills (discoverable via hub, not activated by default)
|
||||
├── environments/ # RL training environments (Atropos integration)
|
||||
├── tests/ # Test suite
|
||||
├── website/ # Documentation site (hermes-agent.nousresearch.com)
|
||||
│
|
||||
├── cli-config.yaml.example # Example configuration (copied to ~/.hermes/config.yaml)
|
||||
└── AGENTS.md # Development guide for AI coding assistants
|
||||
```
|
||||
|
||||
### User configuration (stored in `~/.hermes/`)
|
||||
|
||||
| Path | Purpose |
|
||||
|------|---------|
|
||||
| `~/.hermes/config.yaml` | Settings (model, terminal, toolsets, compression, etc.) |
|
||||
| `~/.hermes/.env` | API keys and secrets |
|
||||
| `~/.hermes/auth.json` | OAuth credentials (Nous Portal) |
|
||||
| `~/.hermes/skills/` | All active skills (bundled + hub-installed + agent-created) |
|
||||
| `~/.hermes/memories/` | Persistent memory (MEMORY.md, USER.md) |
|
||||
| `~/.hermes/state.db` | SQLite session database |
|
||||
| `~/.hermes/sessions/` | JSON session logs |
|
||||
| `~/.hermes/cron/` | Scheduled job data |
|
||||
| `~/.hermes/whatsapp/session/` | WhatsApp bridge credentials |
|
||||
|
||||
---
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Core Loop
|
||||
|
||||
```
|
||||
User message → AIAgent._run_agent_loop()
|
||||
├── Build system prompt (prompt_builder.py)
|
||||
├── Build API kwargs (model, messages, tools, reasoning config)
|
||||
├── Call LLM (OpenAI-compatible API)
|
||||
├── If tool_calls in response:
|
||||
│ ├── Execute each tool via registry dispatch
|
||||
│ ├── Add tool results to conversation
|
||||
│ └── Loop back to LLM call
|
||||
├── If text response:
|
||||
│ ├── Persist session to DB
|
||||
│ └── Return final_response
|
||||
└── Context compression if approaching token limit
|
||||
```
|
||||
|
||||
### Key Design Patterns
|
||||
|
||||
- **Self-registering tools**: Each tool file calls `registry.register()` at import time. `model_tools.py` triggers discovery by importing all tool modules.
|
||||
- **Toolset grouping**: Tools are grouped into toolsets (`web`, `terminal`, `file`, `browser`, etc.) that can be enabled/disabled per platform.
|
||||
- **Session persistence**: All conversations are stored in SQLite (`hermes_state.py`) with full-text search and unique session titles. JSON logs go to `~/.hermes/sessions/`.
|
||||
- **Ephemeral injection**: System prompts and prefill messages are injected at API call time, never persisted to the database or logs.
|
||||
- **Provider abstraction**: The agent works with any OpenAI-compatible API. Provider resolution happens at init time (Nous Portal OAuth, OpenRouter API key, or custom endpoint).
|
||||
- **Provider routing**: When using OpenRouter, `provider_routing` in config.yaml controls provider selection (sort by throughput/latency/price, allow/ignore specific providers, data retention policies). These are injected as `extra_body.provider` in API requests.
|
||||
|
||||
---
|
||||
|
||||
## Code Style
|
||||
|
||||
- **PEP 8** with practical exceptions (we don't enforce strict line length)
|
||||
- **Comments**: Only when explaining non-obvious intent, trade-offs, or API quirks. Don't narrate what the code does — `# increment counter` adds nothing
|
||||
- **Error handling**: Catch specific exceptions. Log with `logger.warning()`/`logger.error()` — use `exc_info=True` for unexpected errors so stack traces appear in logs
|
||||
- **Cross-platform**: Never assume Unix. See [Cross-Platform Compatibility](#cross-platform-compatibility)
|
||||
|
||||
---
|
||||
|
||||
## Adding a New Tool
|
||||
|
||||
Before writing a tool, ask: [should this be a skill instead?](#should-it-be-a-skill-or-a-tool)
|
||||
|
||||
Tools self-register with the central registry. Each tool file co-locates its schema, handler, and registration:
|
||||
|
||||
```python
|
||||
"""my_tool — Brief description of what this tool does."""
|
||||
|
||||
import json
|
||||
from tools.registry import registry
|
||||
|
||||
|
||||
def my_tool(param1: str, param2: int = 10, **kwargs) -> str:
|
||||
"""Handler. Must return a JSON string."""
|
||||
result = do_work(param1, param2)
|
||||
return json.dumps(result)
|
||||
|
||||
|
||||
MY_TOOL_SCHEMA = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "my_tool",
|
||||
"description": "What this tool does and when the agent should use it.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"param1": {"type": "string", "description": "What param1 is"},
|
||||
"param2": {"type": "integer", "description": "What param2 is", "default": 10},
|
||||
},
|
||||
"required": ["param1"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _check_requirements() -> bool:
|
||||
"""Return True if this tool's dependencies are available."""
|
||||
return True
|
||||
|
||||
|
||||
registry.register(
|
||||
name="my_tool",
|
||||
toolset="my_toolset",
|
||||
schema=MY_TOOL_SCHEMA,
|
||||
handler=lambda args, **kw: my_tool(**args, **kw),
|
||||
check_fn=_check_requirements,
|
||||
)
|
||||
```
|
||||
|
||||
Then add the import to `model_tools.py` in the `_modules` list:
|
||||
|
||||
```python
|
||||
_modules = [
|
||||
# ... existing modules ...
|
||||
"tools.my_tool",
|
||||
]
|
||||
```
|
||||
|
||||
If it's a new toolset, add it to `toolsets.py` and to the relevant platform presets.
|
||||
|
||||
---
|
||||
|
||||
## Adding a Skill
|
||||
|
||||
Bundled skills live in `skills/` organized by category. Official optional skills use the same structure in `optional-skills/`:
|
||||
|
||||
```
|
||||
skills/
|
||||
├── research/
|
||||
│ └── arxiv/
|
||||
│ ├── SKILL.md # Required: main instructions
|
||||
│ └── scripts/ # Optional: helper scripts
|
||||
│ └── search_arxiv.py
|
||||
├── productivity/
|
||||
│ └── ocr-and-documents/
|
||||
│ ├── SKILL.md
|
||||
│ ├── scripts/
|
||||
│ └── references/
|
||||
└── ...
|
||||
```
|
||||
|
||||
### SKILL.md format
|
||||
|
||||
```markdown
|
||||
---
|
||||
name: my-skill
|
||||
description: Brief description (shown in skill search results)
|
||||
version: 1.0.0
|
||||
author: Your Name
|
||||
license: MIT
|
||||
platforms: [macos, linux] # Optional — restrict to specific OS platforms
|
||||
# Valid: macos, linux, windows
|
||||
# Omit to load on all platforms (default)
|
||||
required_environment_variables: # Optional — secure setup-on-load metadata
|
||||
- name: MY_API_KEY
|
||||
prompt: API key
|
||||
help: Where to get it
|
||||
required_for: full functionality
|
||||
prerequisites: # Optional legacy runtime requirements
|
||||
env_vars: [MY_API_KEY] # Backward-compatible alias for required env vars
|
||||
commands: [curl, jq] # Advisory only; does not hide the skill
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [Category, Subcategory, Keywords]
|
||||
related_skills: [other-skill-name]
|
||||
fallback_for_toolsets: [web] # Optional — show only when toolset is unavailable
|
||||
requires_toolsets: [terminal] # Optional — show only when toolset is available
|
||||
---
|
||||
|
||||
# Skill Title
|
||||
|
||||
Brief intro.
|
||||
|
||||
## When to Use
|
||||
Trigger conditions — when should the agent load this skill?
|
||||
|
||||
## Quick Reference
|
||||
Table of common commands or API calls.
|
||||
|
||||
## Procedure
|
||||
Step-by-step instructions the agent follows.
|
||||
|
||||
## Pitfalls
|
||||
Known failure modes and how to handle them.
|
||||
|
||||
## Verification
|
||||
How the agent confirms it worked.
|
||||
```
|
||||
|
||||
### Platform-specific skills
|
||||
|
||||
Skills can declare which OS platforms they support via the `platforms` frontmatter field. Skills with this field are automatically hidden from the system prompt, `skills_list()`, and slash commands on incompatible platforms.
|
||||
|
||||
```yaml
|
||||
platforms: [macos] # macOS only (e.g., iMessage, Apple Reminders)
|
||||
platforms: [macos, linux] # macOS and Linux
|
||||
platforms: [windows] # Windows only
|
||||
```
|
||||
|
||||
If the field is omitted or empty, the skill loads on all platforms (backward compatible). See `skills/apple/` for examples of macOS-only skills.
|
||||
|
||||
### Conditional skill activation
|
||||
|
||||
Skills can declare conditions that control when they appear in the system prompt, based on which tools and toolsets are available in the current session. This is primarily used for **fallback skills** — alternatives that should only be shown when a primary tool is unavailable.
|
||||
|
||||
Four fields are supported under `metadata.hermes`:
|
||||
|
||||
```yaml
|
||||
metadata:
|
||||
hermes:
|
||||
fallback_for_toolsets: [web] # Show ONLY when these toolsets are unavailable
|
||||
requires_toolsets: [terminal] # Show ONLY when these toolsets are available
|
||||
fallback_for_tools: [web_search] # Show ONLY when these specific tools are unavailable
|
||||
requires_tools: [terminal] # Show ONLY when these specific tools are available
|
||||
```
|
||||
|
||||
**Semantics:**
|
||||
- `fallback_for_*`: The skill is a backup. It is **hidden** when the listed tools/toolsets are available, and **shown** when they are unavailable. Use this for free alternatives to premium tools.
|
||||
- `requires_*`: The skill needs certain tools to function. It is **hidden** when the listed tools/toolsets are unavailable. Use this for skills that depend on specific capabilities (e.g., a skill that only makes sense with terminal access).
|
||||
- If both are specified, both conditions must be satisfied for the skill to appear.
|
||||
- If neither is specified, the skill is always shown (backward compatible).
|
||||
|
||||
**Examples:**
|
||||
|
||||
```yaml
|
||||
# DuckDuckGo search — shown when Firecrawl (web toolset) is unavailable
|
||||
metadata:
|
||||
hermes:
|
||||
fallback_for_toolsets: [web]
|
||||
|
||||
# Smart home skill — only useful when terminal is available
|
||||
metadata:
|
||||
hermes:
|
||||
requires_toolsets: [terminal]
|
||||
|
||||
# Local browser fallback — shown when Browserbase is unavailable
|
||||
metadata:
|
||||
hermes:
|
||||
fallback_for_toolsets: [browser]
|
||||
```
|
||||
|
||||
The filtering happens at prompt build time in `agent/prompt_builder.py`. The `build_skills_system_prompt()` function receives the set of available tools and toolsets from the agent and uses `_skill_should_show()` to evaluate each skill's conditions.
|
||||
|
||||
### Skill setup metadata
|
||||
|
||||
Skills can declare secure setup-on-load metadata via the `required_environment_variables` frontmatter field. Missing values do not hide the skill from discovery; they trigger a CLI-only secure prompt when the skill is actually loaded.
|
||||
|
||||
```yaml
|
||||
required_environment_variables:
|
||||
- name: TENOR_API_KEY
|
||||
prompt: Tenor API key
|
||||
help: Get a key from https://developers.google.com/tenor
|
||||
required_for: full functionality
|
||||
```
|
||||
|
||||
The user may skip setup and keep loading the skill. Hermes only exposes metadata (`stored_as`, `skipped`, `validated`) to the model — never the secret value.
|
||||
|
||||
Legacy `prerequisites.env_vars` remains supported and is normalized into the new representation.
|
||||
|
||||
```yaml
|
||||
prerequisites:
|
||||
env_vars: [TENOR_API_KEY] # Legacy alias for required_environment_variables
|
||||
commands: [curl, jq] # Advisory CLI checks
|
||||
```
|
||||
|
||||
Gateway and messaging sessions never collect secrets in-band; they instruct the user to run `hermes setup` or update `~/.hermes/.env` locally.
|
||||
|
||||
**When to declare required environment variables:**
|
||||
- The skill uses an API key or token that should be collected securely at load time
|
||||
- The skill remains useful if the user skips setup, degrading gracefully without the credential
|
||||
|
||||
**When to declare command prerequisites:**
|
||||
- The skill relies on a CLI tool that may not be installed (e.g., `himalaya`, `openhue`, `ddgs`)
|
||||
- Treat command checks as guidance, not discovery-time hiding
|
||||
|
||||
See `skills/gifs/gif-search/` and `skills/email/himalaya/` for examples.
|
||||
|
||||
### Skill guidelines
|
||||
|
||||
- **No external dependencies unless absolutely necessary.** Prefer stdlib Python, curl, and existing Hermes tools (`web_extract`, `terminal`, `read_file`).
|
||||
- **Progressive disclosure.** Put the most common workflow first. Edge cases and advanced usage go at the bottom.
|
||||
- **Include helper scripts** for XML/JSON parsing or complex logic — don't expect the LLM to write parsers inline every time.
|
||||
- **Test it.** Run `hermes --toolsets skills -q "Use the X skill to do Y"` and verify the agent follows the instructions correctly.
|
||||
|
||||
---
|
||||
|
||||
## Adding a Skin / Theme
|
||||
|
||||
Hermes uses a data-driven skin system — no code changes needed to add a new skin.
|
||||
|
||||
**Option A: User skin (YAML file)**
|
||||
|
||||
Create `~/.hermes/skins/<name>.yaml`:
|
||||
|
||||
```yaml
|
||||
name: mytheme
|
||||
description: Short description of the theme
|
||||
|
||||
colors:
|
||||
banner_border: "#HEX" # Panel border color
|
||||
banner_title: "#HEX" # Panel title color
|
||||
banner_accent: "#HEX" # Section header color
|
||||
banner_dim: "#HEX" # Muted/dim text color
|
||||
banner_text: "#HEX" # Body text color
|
||||
response_border: "#HEX" # Response box border
|
||||
|
||||
spinner:
|
||||
waiting_faces: ["(⚔)", "(⛨)"]
|
||||
thinking_faces: ["(⚔)", "(⌁)"]
|
||||
thinking_verbs: ["forging", "plotting"]
|
||||
wings: # Optional left/right decorations
|
||||
- ["⟪⚔", "⚔⟫"]
|
||||
|
||||
branding:
|
||||
agent_name: "My Agent"
|
||||
welcome: "Welcome message"
|
||||
response_label: " ⚔ Agent "
|
||||
prompt_symbol: "⚔ ❯ "
|
||||
|
||||
tool_prefix: "╎" # Tool output line prefix
|
||||
```
|
||||
|
||||
All fields are optional — missing values inherit from the default skin.
|
||||
|
||||
**Option B: Built-in skin**
|
||||
|
||||
Add to `_BUILTIN_SKINS` dict in `hermes_cli/skin_engine.py`. Use the same schema as above but as a Python dict. Built-in skins ship with the package and are always available.
|
||||
|
||||
**Activating:**
|
||||
- CLI: `/skin mytheme` (switches the skin at runtime)
|
||||
- Config: `display: { skin: mytheme }`
|
||||
|
||||
See `hermes_cli/skin_engine.py` for the full schema and existing skins as examples.
|
||||
|
||||
---
|
||||
|
||||
## Cross-Platform Compatibility
|
||||
|
||||
Hermes runs on Linux, macOS, and Windows. When writing code that touches the OS:
|
||||
|
||||
### Critical rules
|
||||
|
||||
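1. **`termios` and `fcntl` are Unix-only.** Libraries built on them (e.g. `simple_term_menu`) cannot be used on Windows, so always catch both `ImportError` and `NotImplementedError` and provide a fallback: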
|
||||
```python
|
||||
try:
|
||||
from simple_term_menu import TerminalMenu
|
||||
menu = TerminalMenu(options)
|
||||
idx = menu.show()
|
||||
except (ImportError, NotImplementedError):
|
||||
# Fallback: numbered menu for Windows
|
||||
for i, opt in enumerate(options):
|
||||
print(f" {i+1}. {opt}")
|
||||
idx = int(input("Choice: ")) - 1
|
||||
```
|
||||
|
||||
2. **File encoding.** Windows may save `.env` files in `cp1252`. Always handle encoding errors:
|
||||
```python
|
||||
try:
|
||||
load_dotenv(env_path)
|
||||
except UnicodeDecodeError:
|
||||
load_dotenv(env_path, encoding="latin-1")
|
||||
```
|
||||
|
||||
3. **Process management.** `os.setsid()`, `os.killpg()`, and signal handling differ on Windows. Use platform checks:
|
||||
```python
|
||||
import platform
|
||||
if platform.system() != "Windows":
|
||||
kwargs["preexec_fn"] = os.setsid
|
||||
```
|
||||
|
||||
4. **Path separators.** Build paths with `pathlib.Path` (e.g. `Path(base) / "file.txt"`) instead of concatenating strings with a hard-coded `/` separator.
|
||||
|
||||
5. **Shell commands in installers.** If you change `scripts/install.sh`, check if the equivalent change is needed in `scripts/install.ps1`.
|
||||
|
||||
---
|
||||
|
||||
## Security Considerations
|
||||
|
||||
Hermes has terminal access. Security matters.
|
||||
|
||||
### Existing protections
|
||||
|
||||
| Layer | Implementation |
|
||||
|-------|---------------|
|
||||
| **Sudo password piping** | Uses `shlex.quote()` to prevent shell injection |
|
||||
| **Dangerous command detection** | Regex patterns in `tools/approval.py` with user approval flow |
|
||||
| **Cron prompt injection** | Scanner in `tools/cronjob_tools.py` blocks instruction-override patterns |
|
||||
| **Write deny list** | Protected paths (`~/.ssh/authorized_keys`, `/etc/shadow`) resolved via `os.path.realpath()` to prevent symlink bypass |
|
||||
| **Skills guard** | Security scanner for hub-installed skills (`tools/skills_guard.py`) |
|
||||
| **Code execution sandbox** | `execute_code` child process runs with API keys stripped from environment |
|
||||
| **Container hardening** | Docker: all capabilities dropped, no privilege escalation, PID limits, size-limited tmpfs |
|
||||
|
||||
### When contributing security-sensitive code
|
||||
|
||||
- **Always use `shlex.quote()`** when interpolating user input into shell commands
|
||||
- **Resolve symlinks** with `os.path.realpath()` before path-based access control checks
|
||||
- **Don't log secrets.** API keys, tokens, and passwords should never appear in log output
|
||||
- **Catch broad exceptions** around tool execution so a single failure doesn't crash the agent loop
|
||||
- **Test on all platforms** if your change touches file paths, process management, or shell commands
|
||||
|
||||
If your PR affects security, note it explicitly in the description.
|
||||
|
||||
---
|
||||
|
||||
## Pull Request Process
|
||||
|
||||
### Branch naming
|
||||
|
||||
```
|
||||
fix/description # Bug fixes
|
||||
feat/description # New features
|
||||
docs/description # Documentation
|
||||
test/description # Tests
|
||||
refactor/description # Code restructuring
|
||||
```
|
||||
|
||||
### Before submitting
|
||||
|
||||
1. **Run tests**: `pytest tests/ -v`
|
||||
2. **Test manually**: Run `hermes` and exercise the code path you changed
|
||||
3. **Check cross-platform impact**: If you touch file I/O, process management, or terminal handling, consider Windows and macOS
|
||||
4. **Keep PRs focused**: One logical change per PR. Don't combine a bug fix, a refactor, and a new feature in the same PR.
|
||||
|
||||
### PR description
|
||||
|
||||
Include:
|
||||
- **What** changed and **why**
|
||||
- **How to test** it (reproduction steps for bugs, usage examples for features)
|
||||
- **What platforms** you tested on
|
||||
- Reference any related issues
|
||||
|
||||
### Commit messages
|
||||
|
||||
We use [Conventional Commits](https://www.conventionalcommits.org/):
|
||||
|
||||
```
|
||||
<type>(<scope>): <description>
|
||||
```
|
||||
|
||||
| Type | Use for |
|
||||
|------|---------|
|
||||
| `fix` | Bug fixes |
|
||||
| `feat` | New features |
|
||||
| `docs` | Documentation |
|
||||
| `test` | Tests |
|
||||
| `refactor` | Code restructuring (no behavior change) |
|
||||
| `chore` | Build, CI, dependency updates |
|
||||
|
||||
Scopes: `cli`, `gateway`, `tools`, `skills`, `agent`, `install`, `whatsapp`, `security`, etc.
|
||||
|
||||
Examples:
|
||||
```
|
||||
fix(cli): prevent crash in save_config_value when model is a string
|
||||
feat(gateway): add WhatsApp multi-user session isolation
|
||||
fix(security): prevent shell injection in sudo password piping
|
||||
test(tools): add unit tests for file_operations
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Reporting Issues
|
||||
|
||||
- Use [GitHub Issues](https://github.com/NousResearch/hermes-agent/issues)
|
||||
- Include: OS, Python version, Hermes version (`hermes version`), full error traceback
|
||||
- Include steps to reproduce
|
||||
- Check existing issues before creating duplicates
|
||||
- For security vulnerabilities, please report them privately to the maintainers instead of opening a public issue
|
||||
|
||||
---
|
||||
|
||||
## Community
|
||||
|
||||
- **Discord**: [discord.gg/NousResearch](https://discord.gg/NousResearch) — for questions, showcasing projects, and sharing skills
|
||||
- **GitHub Discussions**: For design proposals and architecture discussions
|
||||
- **Skills Hub**: Upload specialized skills to a registry and share them with the community
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
By contributing, you agree that your contributions will be licensed under the [MIT License](LICENSE).
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Nous Research
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
354
README.md
354
README.md
@@ -1,243 +1,177 @@
|
||||
# Hermes Agent
|
||||
<p align="center">
|
||||
<img src="assets/banner.png" alt="Hermes Agent" width="100%">
|
||||
</p>
|
||||
|
||||
An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools.
|
||||
# Hermes Agent ⚕
|
||||
|
||||
## Features
|
||||
<p align="center">
|
||||
<a href="https://hermes-agent.nousresearch.com/docs/"><img src="https://img.shields.io/badge/Docs-hermes--agent.nousresearch.com-FFD700?style=for-the-badge" alt="Documentation"></a>
|
||||
<a href="https://discord.gg/NousResearch"><img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord"></a>
|
||||
<a href="https://github.com/NousResearch/hermes-agent/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-MIT-green?style=for-the-badge" alt="License: MIT"></a>
|
||||
<a href="https://nousresearch.com"><img src="https://img.shields.io/badge/Built%20by-Nous%20Research-blueviolet?style=for-the-badge" alt="Built by Nous Research"></a>
|
||||
</p>
|
||||
|
||||
- **Web Tools**: Search, extract content, and crawl websites
|
||||
- **Terminal Tools**: Execute commands with interactive session support
|
||||
- **Vision Tools**: Analyze images from URLs
|
||||
- **Reasoning Tools**: Advanced multi-model reasoning (Mixture of Agents)
|
||||
- **Creative Tools**: Generate images from text prompts
|
||||
- **Toolsets System**: Organize tools into logical groups for different scenarios
|
||||
- **Batch Processing**: Process datasets in parallel with checkpointing and statistics tracking
|
||||
- **Ephemeral System Prompts**: Guide model behavior without polluting training datasets
|
||||
**The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.
|
||||
|
||||
## Setup
|
||||
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
|
||||
|
||||
### 1. Install Dependencies
|
||||
```bash
|
||||
# Create and activate virtual environment (recommended)
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
<table>
|
||||
<tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
|
||||
<tr><td><b>Lives where you do</b></td><td>Telegram, Discord, Slack, WhatsApp, Signal, and CLI — all from a single gateway process. Voice memo transcription, cross-platform conversation continuity.</td></tr>
|
||||
<tr><td><b>A closed learning loop</b></td><td>Agent-curated memory with periodic nudges. Autonomous skill creation after complex tasks. Skills self-improve during use. FTS5 session search with LLM summarization for cross-session recall. <a href="https://github.com/plastic-labs/honcho">Honcho</a> dialectic user modeling. Compatible with the <a href="https://agentskills.io">agentskills.io</a> open standard.</td></tr>
|
||||
<tr><td><b>Scheduled automations</b></td><td>Built-in cron scheduler with delivery to any platform. Daily reports, nightly backups, weekly audits — all in natural language, running unattended.</td></tr>
|
||||
<tr><td><b>Delegates and parallelizes</b></td><td>Spawn isolated subagents for parallel workstreams. Write Python scripts that call tools via RPC, collapsing multi-step pipelines into zero-context-cost turns.</td></tr>
|
||||
<tr><td><b>Runs anywhere, not just your laptop</b></td><td>Six terminal backends — local, Docker, SSH, Daytona, Singularity, and Modal. Daytona and Modal offer serverless persistence — your agent's environment hibernates when idle and wakes on demand, costing nearly nothing between sessions. Run it on a $5 VPS or a GPU cluster.</td></tr>
|
||||
<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, Atropos RL environments, trajectory compression for training the next generation of tool-calling models.</td></tr>
|
||||
</table>
|
||||
|
||||
# Install required packages
|
||||
pip install -r requirements.txt
|
||||
---
|
||||
|
||||
# Install Hecate for terminal tools
|
||||
git clone git@github.com:NousResearch/hecate.git
|
||||
cd hecate
|
||||
pip install -e .
|
||||
cd ..
|
||||
```
|
||||
|
||||
### 2. Configure Environment Variables
|
||||
```bash
|
||||
# Copy the example environment file
|
||||
cp .env.example .env
|
||||
|
||||
# Edit .env and add your API keys
|
||||
nano .env # or use your preferred editor
|
||||
```
|
||||
|
||||
**Required API Keys:**
|
||||
- `ANTHROPIC_API_KEY` - Main agent model (get at: https://console.anthropic.com/)
|
||||
- `FIRECRAWL_API_KEY` - Web tools (get at: https://firecrawl.dev/)
|
||||
- `NOUS_API_KEY` - Vision & reasoning tools (get at: https://inference-api.nousresearch.com/)
|
||||
- `MORPH_API_KEY` - Terminal tools (get at: https://morph.so/)
|
||||
- `FAL_KEY` - Image generation (get at: https://fal.ai/)
|
||||
- `OPENAI_API_KEY` - Optional, for some Hecate features
|
||||
|
||||
See `.env.example` for all available configuration options including debug settings and terminal tool configuration.
|
||||
|
||||
## Toolsets System
|
||||
|
||||
The agent uses a toolsets system for organizing and managing tools. All tools must be part of a toolset to be accessible - individual tool selection is not supported. This ensures consistent and logical grouping of capabilities.
|
||||
|
||||
### Key Concepts
|
||||
|
||||
- **Toolsets**: Logical groups of tools for specific use cases (e.g., "research", "development", "debugging")
|
||||
- **Composition**: Toolsets can include other toolsets for powerful combinations
|
||||
- **Custom Toolsets**: Create your own toolsets at runtime or by editing `toolsets.py`
|
||||
- **Toolset-Only Access**: Tools are only accessible through toolsets, not individually
|
||||
|
||||
### Available Toolsets
|
||||
|
||||
See `toolsets.py` for the complete list of predefined toolsets including:
|
||||
- Basic toolsets (web, terminal, vision, creative, reasoning)
|
||||
- Composite toolsets (research, development, analysis, etc.)
|
||||
- Scenario-specific toolsets (debugging, documentation, API testing, etc.)
|
||||
- Special toolsets (safe mode without terminal, minimal, offline)
|
||||
|
||||
### Using Toolsets
|
||||
## Quick Install
|
||||
|
||||
```bash
|
||||
# Use a predefined toolset
|
||||
python run_agent.py --enabled_toolsets=research --query "Find latest AI papers"
|
||||
|
||||
# Combine multiple toolsets
|
||||
python run_agent.py --enabled_toolsets=web,vision --query "Analyze this website"
|
||||
|
||||
# Enable all toolsets explicitly (same as omitting the flag)
|
||||
python run_agent.py --enabled_toolsets=all --query "Do web research and run commands if helpful"
|
||||
|
||||
# Safe mode (no terminal access)
|
||||
python run_agent.py --enabled_toolsets=safe --query "Help without running commands"
|
||||
|
||||
# List all available toolsets and tools
|
||||
python run_agent.py --list_tools
|
||||
curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
|
||||
```
|
||||
|
||||
For detailed documentation on toolsets, see `TOOLSETS_README.md`.
|
||||
Works on Linux, macOS, and WSL2. The installer handles everything — Python, Node.js, dependencies, and the `hermes` command. No prerequisites except git.
|
||||
|
||||
## Basic Usage
|
||||
> **Windows:** Native Windows is not supported. Please install [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) and run the command above.
|
||||
|
||||
### Default (all tools enabled)
|
||||
```bash
|
||||
python run_agent.py \
|
||||
--query "search up the latest docs on jit in python 3.13 and write me basic example that's not in their docs. profile its perf" \
|
||||
--max_turns 20 \
|
||||
--model claude-sonnet-4-20250514 \
|
||||
--base_url https://api.anthropic.com/v1/ \
|
||||
--api_key $ANTHROPIC_API_KEY
|
||||
```
|
||||
|
||||
### With specific toolset
|
||||
```bash
|
||||
python run_agent.py \
|
||||
--query "Debug this Python error" \
|
||||
--enabled_toolsets=debugging \
|
||||
--model claude-sonnet-4-20250514 \
|
||||
--api_key $ANTHROPIC_API_KEY
|
||||
```
|
||||
|
||||
### Python API
|
||||
```python
|
||||
from run_agent import AIAgent
|
||||
|
||||
# Use a specific toolset
|
||||
agent = AIAgent(
|
||||
model="claude-opus-4-20250514",
|
||||
enabled_toolsets=["research"]
|
||||
)
|
||||
response = agent.chat("Find information about quantum computing")
|
||||
|
||||
# Create custom toolset at runtime
|
||||
from toolsets import create_custom_toolset
|
||||
|
||||
create_custom_toolset(
|
||||
name="my_tools",
|
||||
description="My custom toolkit",
|
||||
tools=["web_search"],
|
||||
includes=["terminal", "vision"]
|
||||
)
|
||||
|
||||
agent = AIAgent(enabled_toolsets=["my_tools"])
|
||||
```
|
||||
|
||||
## Batch Processing
|
||||
|
||||
Process multiple prompts from a dataset in parallel with automatic checkpointing and statistics tracking:
|
||||
After installation:
|
||||
|
||||
```bash
|
||||
# Basic batch processing
|
||||
python batch_runner.py \
|
||||
--dataset_file=prompts.jsonl \
|
||||
--batch_size=20 \
|
||||
--run_name=my_run
|
||||
|
||||
# With specific distribution
|
||||
python batch_runner.py \
|
||||
--dataset_file=prompts.jsonl \
|
||||
--batch_size=20 \
|
||||
--run_name=image_run \
|
||||
--distribution=image_gen \
|
||||
--num_workers=4
|
||||
source ~/.bashrc # reload shell (or: source ~/.zshrc)
|
||||
hermes # start chatting!
|
||||
```
|
||||
|
||||
**Key Features:**
|
||||
- Parallel processing with configurable workers
|
||||
- Toolset distributions for varied data generation
|
||||
- Automatic checkpointing and resume capability
|
||||
- Combined output in `data/<run_name>/trajectories.jsonl`
|
||||
- Tool usage statistics and success rates
|
||||
---
|
||||
|
||||
**Quick Start:** See [QUICKSTART_BATCH.md](QUICKSTART_BATCH.md) for a 5-minute getting started guide.
|
||||
**Full Documentation:** See [BATCH_PROCESSING.md](BATCH_PROCESSING.md) for comprehensive documentation.
|
||||
## Getting Started
|
||||
|
||||
### Ephemeral System Prompts
|
||||
|
||||
The ephemeral system prompt feature allows you to guide the model's behavior during batch processing **without** saving that prompt to the training dataset trajectories. This is useful for:
|
||||
|
||||
- Guiding model behavior during data collection
|
||||
- Adding task-specific instructions
|
||||
- Keeping saved trajectories clean and focused on tool-calling format
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
python batch_runner.py \
|
||||
--dataset_file=prompts.jsonl \
|
||||
--batch_size=10 \
|
||||
--run_name=my_run \
|
||||
--ephemeral_system_prompt="You are a helpful assistant focused on image generation."
|
||||
hermes # Interactive CLI — start a conversation
|
||||
hermes model # Choose your LLM provider and model
|
||||
hermes tools # Configure which tools are enabled
|
||||
hermes config set # Set individual config values
|
||||
hermes gateway # Start the messaging gateway (Telegram, Discord, etc.)
|
||||
hermes setup # Run the full setup wizard (configures everything at once)
|
||||
hermes claw migrate # Migrate from OpenClaw (if coming from OpenClaw)
|
||||
hermes update # Update to the latest version
|
||||
hermes doctor # Diagnose any issues
|
||||
```
|
||||
|
||||
The ephemeral prompt will influence the model's behavior during execution, but **only the standard tool-calling system prompt** will be saved in the trajectory files.
|
||||
📖 **[Full documentation →](https://hermes-agent.nousresearch.com/docs/)**
|
||||
|
||||
**Documentation:** See [docs/ephemeral_system_prompt.md](docs/ephemeral_system_prompt.md) for complete details.
|
||||
## CLI vs Messaging Quick Reference
|
||||
|
||||
## Command Line Arguments
|
||||
Hermes has two entry points: start the terminal UI with `hermes`, or run the gateway and talk to it from Telegram, Discord, Slack, WhatsApp, Signal, or Email. Once you're in a conversation, many slash commands are shared across both interfaces.
|
||||
|
||||
**Single Agent (`run_agent.py`):**
|
||||
- `--query`: The question or task for the agent
|
||||
- `--model`: Model to use (default: claude-opus-4-20250514)
|
||||
- `--api_key`: API key for authentication
|
||||
- `--base_url`: API endpoint URL
|
||||
- `--max_turns`: Maximum number of tool-calling iterations
|
||||
- `--enabled_toolsets`: Comma-separated list of toolsets to enable. Use `all` (or `*`) to enable everything. If omitted, all toolsets are enabled by default.
|
||||
- `--disabled_toolsets`: Comma-separated list of toolsets to disable
|
||||
- `--list_tools`: List all available toolsets and tools
|
||||
- `--save_trajectories`: Save conversation trajectories to JSONL files
|
||||
| Action | CLI | Messaging platforms |
|
||||
|---------|-----|---------------------|
|
||||
| Start chatting | `hermes` | Run `hermes gateway setup` + `hermes gateway start`, then send the bot a message |
|
||||
| Start fresh conversation | `/new` or `/reset` | `/new` or `/reset` |
|
||||
| Change model | `/model [provider:model]` | `/model [provider:model]` |
|
||||
| Set a personality | `/personality [name]` | `/personality [name]` |
|
||||
| Retry or undo the last turn | `/retry`, `/undo` | `/retry`, `/undo` |
|
||||
| Compress context / check usage | `/compress`, `/usage`, `/insights [--days N]` | `/compress`, `/usage`, `/insights [days]` |
|
||||
| Browse skills | `/skills` or `/<skill-name>` | `/skills` or `/<skill-name>` |
|
||||
| Interrupt current work | `Ctrl+C` or send a new message | `/stop` or send a new message |
|
||||
| Platform-specific status | `/platforms` | `/status`, `/sethome` |
|
||||
|
||||
**Batch Processing (`batch_runner.py`):**
|
||||
- `--dataset_file`: Path to JSONL file with prompts
|
||||
- `--batch_size`: Number of prompts per batch
|
||||
- `--run_name`: Name for this run (for output/checkpointing)
|
||||
- `--distribution`: Toolset distribution to use (default: "default")
|
||||
- `--num_workers`: Number of parallel workers (default: 4)
|
||||
- `--resume`: Resume from checkpoint if interrupted
|
||||
- `--ephemeral_system_prompt`: System prompt used during execution but NOT saved to trajectories
|
||||
- `--list_distributions`: List available toolset distributions
|
||||
For the full command lists, see the [CLI guide](https://hermes-agent.nousresearch.com/docs/user-guide/cli) and the [Messaging Gateway guide](https://hermes-agent.nousresearch.com/docs/user-guide/messaging).
|
||||
|
||||
## Environment Variables
|
||||
|
||||
All environment variables can be configured in the `.env` file (copy from `.env.example`).
|
||||
|
||||
**Core API Keys:**
|
||||
- `ANTHROPIC_API_KEY`: Main agent model
|
||||
- `FIRECRAWL_API_KEY`: Web tools (search, extract, crawl)
|
||||
- `NOUS_API_KEY`: Vision and reasoning tools
|
||||
- `MORPH_API_KEY`: Terminal tools
|
||||
- `FAL_KEY`: Image generation tools
|
||||
- `OPENAI_API_KEY`: Optional, for some Hecate features
|
||||
|
||||
**Configuration Options:**
|
||||
- `HECATE_VM_LIFETIME_SECONDS`: VM lifetime (default: 300)
|
||||
- `HECATE_DEFAULT_SNAPSHOT_ID`: Default snapshot (default: snapshot_p5294qxt)
|
||||
- `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging
|
||||
---
|
||||
|
||||
## Documentation
|
||||
|
||||
**Single Agent Usage:**
|
||||
- `TOOLSETS_README.md`: Comprehensive guide to the toolsets system
|
||||
- `toolsets.py`: View and modify available toolsets
|
||||
- `model_tools.py`: Core tool definitions and handlers
|
||||
All documentation lives at **[hermes-agent.nousresearch.com/docs](https://hermes-agent.nousresearch.com/docs/)**:
|
||||
|
||||
**Batch Processing:**
|
||||
- `QUICKSTART_BATCH.md`: 5-minute quick start guide
|
||||
- `BATCH_PROCESSING.md`: Complete batch processing documentation
|
||||
- `toolset_distributions.py`: Toolset distributions for data generation
|
||||
| Section | What's Covered |
|
||||
|---------|---------------|
|
||||
| [Quickstart](https://hermes-agent.nousresearch.com/docs/getting-started/quickstart) | Install → setup → first conversation in 2 minutes |
|
||||
| [CLI Usage](https://hermes-agent.nousresearch.com/docs/user-guide/cli) | Commands, keybindings, personalities, sessions |
|
||||
| [Configuration](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) | Config file, providers, models, all options |
|
||||
| [Messaging Gateway](https://hermes-agent.nousresearch.com/docs/user-guide/messaging) | Telegram, Discord, Slack, WhatsApp, Signal, Home Assistant |
|
||||
| [Security](https://hermes-agent.nousresearch.com/docs/user-guide/security) | Command approval, DM pairing, container isolation |
|
||||
| [Tools & Toolsets](https://hermes-agent.nousresearch.com/docs/user-guide/features/tools) | 40+ tools, toolset system, terminal backends |
|
||||
| [Skills System](https://hermes-agent.nousresearch.com/docs/user-guide/features/skills) | Procedural memory, Skills Hub, creating skills |
|
||||
| [Memory](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) | Persistent memory, user profiles, best practices |
|
||||
| [MCP Integration](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) | Connect any MCP server for extended capabilities |
|
||||
| [Cron Scheduling](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) | Scheduled tasks with platform delivery |
|
||||
| [Context Files](https://hermes-agent.nousresearch.com/docs/user-guide/features/context-files) | Project context that shapes every conversation |
|
||||
| [Architecture](https://hermes-agent.nousresearch.com/docs/developer-guide/architecture) | Project structure, agent loop, key classes |
|
||||
| [Contributing](https://hermes-agent.nousresearch.com/docs/developer-guide/contributing) | Development setup, PR process, code style |
|
||||
| [CLI Reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) | All commands and flags |
|
||||
| [Environment Variables](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) | Complete env var reference |
|
||||
|
||||
## Examples
|
||||
---
|
||||
|
||||
See `TOOLSETS_README.md` for extensive examples of using different toolsets for various scenarios.
|
||||
## Migrating from OpenClaw
|
||||
|
||||
If you're coming from OpenClaw, Hermes can automatically import your settings, memories, skills, and API keys.
|
||||
|
||||
**During first-time setup:** The setup wizard (`hermes setup`) automatically detects `~/.openclaw` and offers to migrate before configuration begins.
|
||||
|
||||
**Anytime after install:**
|
||||
|
||||
```bash
|
||||
hermes claw migrate # Interactive migration (full preset)
|
||||
hermes claw migrate --dry-run # Preview what would be migrated
|
||||
hermes claw migrate --preset user-data # Migrate without secrets
|
||||
hermes claw migrate --overwrite # Overwrite existing conflicts
|
||||
```
|
||||
|
||||
What gets imported:
|
||||
- **SOUL.md** — persona file
|
||||
- **Memories** — MEMORY.md and USER.md entries
|
||||
- **Skills** — user-created skills → `~/.hermes/skills/openclaw-imports/`
|
||||
- **Command allowlist** — approval patterns
|
||||
- **Messaging settings** — platform configs, allowed users, working directory
|
||||
- **API keys** — allowlisted secrets (Telegram, OpenRouter, OpenAI, Anthropic, ElevenLabs)
|
||||
- **TTS assets** — workspace audio files
|
||||
- **Workspace instructions** — AGENTS.md (with `--workspace-target`)
|
||||
|
||||
See `hermes claw migrate --help` for all options, or use the `openclaw-migration` skill for an interactive agent-guided migration with dry-run previews.
|
||||
|
||||
---
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome contributions! See the [Contributing Guide](https://hermes-agent.nousresearch.com/docs/developer-guide/contributing) for development setup, code style, and PR process.
|
||||
|
||||
Quick start for contributors:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NousResearch/hermes-agent.git
|
||||
cd hermes-agent
|
||||
git submodule update --init mini-swe-agent # required terminal backend
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
uv venv .venv --python 3.11
|
||||
source .venv/bin/activate
|
||||
uv pip install -e ".[all,dev]"
|
||||
uv pip install -e "./mini-swe-agent"
|
||||
python -m pytest tests/ -q
|
||||
```
|
||||
|
||||
> **RL Training (optional):** To work on the RL/Tinker-Atropos integration, also run:
|
||||
> ```bash
|
||||
> git submodule update --init tinker-atropos
|
||||
> uv pip install -e "./tinker-atropos"
|
||||
> ```
|
||||
|
||||
---
|
||||
|
||||
## Community
|
||||
|
||||
- 💬 [Discord](https://discord.gg/NousResearch)
|
||||
- 📚 [Skills Hub](https://agentskills.io)
|
||||
- 🐛 [Issues](https://github.com/NousResearch/hermes-agent/issues)
|
||||
- 💡 [Discussions](https://github.com/NousResearch/hermes-agent/discussions)
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
MIT — see [LICENSE](LICENSE).
|
||||
|
||||
Built by [Nous Research](https://nousresearch.com).
|
||||
|
||||
383
RELEASE_v0.2.0.md
Normal file
383
RELEASE_v0.2.0.md
Normal file
@@ -0,0 +1,383 @@
|
||||
# Hermes Agent v0.2.0 (v2026.3.12)
|
||||
|
||||
**Release Date:** March 12, 2026
|
||||
|
||||
> First tagged release since v0.1.0 (the initial pre-public foundation). In just over two weeks, Hermes Agent went from a small internal project to a full-featured AI agent platform — thanks to an explosion of community contributions. This release covers **216 merged pull requests** from **63 contributors**, resolving **119 issues**.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Highlights
|
||||
|
||||
- **Multi-Platform Messaging Gateway** — Telegram, Discord, Slack, WhatsApp, Signal, Email (IMAP/SMTP), and Home Assistant platforms with unified session management, media attachments, and per-platform tool configuration.
|
||||
|
||||
- **MCP (Model Context Protocol) Client** — Native MCP support with stdio and HTTP transports, reconnection, resource/prompt discovery, and sampling (server-initiated LLM requests). ([#291](https://github.com/NousResearch/hermes-agent/pull/291) — @0xbyt4, [#301](https://github.com/NousResearch/hermes-agent/pull/301), [#753](https://github.com/NousResearch/hermes-agent/pull/753))
|
||||
|
||||
- **Skills Ecosystem** — 70+ bundled and optional skills across 15+ categories with a Skills Hub for community discovery, per-platform enable/disable, conditional activation based on tool availability, and prerequisite validation. ([#743](https://github.com/NousResearch/hermes-agent/pull/743) — @teyrebaz33, [#785](https://github.com/NousResearch/hermes-agent/pull/785) — @teyrebaz33)
|
||||
|
||||
- **Centralized Provider Router** — Unified `call_llm()`/`async_call_llm()` API replaces scattered provider logic across vision, summarization, compression, and trajectory saving. All auxiliary consumers route through a single code path with automatic credential resolution. ([#1003](https://github.com/NousResearch/hermes-agent/pull/1003))
|
||||
|
||||
- **ACP Server** — VS Code, Zed, and JetBrains editor integration via the Agent Client Protocol (ACP) standard. ([#949](https://github.com/NousResearch/hermes-agent/pull/949))
|
||||
|
||||
- **CLI Skin/Theme Engine** — Data-driven visual customization: banners, spinners, colors, branding. 7 built-in skins + custom YAML skins.
|
||||
|
||||
- **Git Worktree Isolation** — `hermes -w` launches isolated agent sessions in git worktrees for safe parallel work on the same repo. ([#654](https://github.com/NousResearch/hermes-agent/pull/654))
|
||||
|
||||
- **Filesystem Checkpoints & Rollback** — Automatic snapshots before destructive operations with `/rollback` to restore. ([#824](https://github.com/NousResearch/hermes-agent/pull/824))
|
||||
|
||||
- **3,289 Tests** — From near-zero test coverage to a comprehensive test suite covering agent, gateway, tools, cron, and CLI.
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Core Agent & Architecture
|
||||
|
||||
### Provider & Model Support
|
||||
- Centralized provider router with `resolve_provider_client()` + `call_llm()` API ([#1003](https://github.com/NousResearch/hermes-agent/pull/1003))
|
||||
- Nous Portal as first-class provider in setup ([#644](https://github.com/NousResearch/hermes-agent/issues/644))
|
||||
- OpenAI Codex (Responses API) with ChatGPT subscription support ([#43](https://github.com/NousResearch/hermes-agent/pull/43)) — @grp06
|
||||
- Codex OAuth vision support + multimodal content adapter
|
||||
- Validate `/model` against live API instead of hardcoded lists
|
||||
- Self-hosted Firecrawl support ([#460](https://github.com/NousResearch/hermes-agent/pull/460)) — @caentzminger
|
||||
- Kimi Code API support ([#635](https://github.com/NousResearch/hermes-agent/pull/635)) — @christomitov
|
||||
- MiniMax model ID update ([#473](https://github.com/NousResearch/hermes-agent/pull/473)) — @tars90percent
|
||||
- OpenRouter provider routing configuration (provider_preferences)
|
||||
- Nous credential refresh on 401 errors ([#571](https://github.com/NousResearch/hermes-agent/pull/571), [#269](https://github.com/NousResearch/hermes-agent/pull/269)) — @rewbs
|
||||
- z.ai/GLM, Kimi/Moonshot, MiniMax, Azure OpenAI as first-class providers
|
||||
- Unified `/model` and `/provider` into single view
|
||||
|
||||
### Agent Loop & Conversation
|
||||
- Simple fallback model for provider resilience ([#740](https://github.com/NousResearch/hermes-agent/pull/740))
|
||||
- Shared iteration budget across parent + subagent delegation
|
||||
- Iteration budget pressure via tool result injection
|
||||
- Configurable subagent provider/model with full credential resolution
|
||||
- Handle 413 payload-too-large via compression instead of aborting ([#153](https://github.com/NousResearch/hermes-agent/pull/153)) — @tekelala
|
||||
- Retry with rebuilt payload after compression ([#616](https://github.com/NousResearch/hermes-agent/pull/616)) — @tripledoublev
|
||||
- Auto-compress pathologically large gateway sessions ([#628](https://github.com/NousResearch/hermes-agent/issues/628))
|
||||
- Tool call repair middleware — auto-lowercases tool names and adds a handler for invalid tool calls
|
||||
- Reasoning effort configuration and `/reasoning` command ([#921](https://github.com/NousResearch/hermes-agent/pull/921))
|
||||
- Detect and block file re-read/search loops after context compression ([#705](https://github.com/NousResearch/hermes-agent/pull/705)) — @0xbyt4
|
||||
|
||||
### Session & Memory
|
||||
- Session naming with unique titles, auto-lineage, rich listing, and resume by name ([#720](https://github.com/NousResearch/hermes-agent/pull/720))
|
||||
- Interactive session browser with search filtering ([#733](https://github.com/NousResearch/hermes-agent/pull/733))
|
||||
- Display previous messages when resuming a session ([#734](https://github.com/NousResearch/hermes-agent/pull/734))
|
||||
- Honcho AI-native cross-session user modeling ([#38](https://github.com/NousResearch/hermes-agent/pull/38)) — @erosika
|
||||
- Proactive async memory flush on session expiry
|
||||
- Smart context length probing with persistent caching + banner display
|
||||
- `/resume` command for switching to named sessions in gateway
|
||||
- Session reset policy for messaging platforms
|
||||
|
||||
---
|
||||
|
||||
## 📱 Messaging Platforms (Gateway)
|
||||
|
||||
### Telegram
|
||||
- Native file attachments: send_document + send_video
|
||||
- Document file processing for PDF, text, and Office files — @tekelala
|
||||
- Forum topic session isolation ([#766](https://github.com/NousResearch/hermes-agent/pull/766)) — @spanishflu-est1918
|
||||
- Browser screenshot sharing via MEDIA: protocol ([#657](https://github.com/NousResearch/hermes-agent/pull/657))
|
||||
- Location support for find-nearby skill
|
||||
- TTS voice message accumulation fix ([#176](https://github.com/NousResearch/hermes-agent/pull/176)) — @Bartok9
|
||||
- Improved error handling and logging ([#763](https://github.com/NousResearch/hermes-agent/pull/763)) — @aydnOktay
|
||||
- Italic regex newline fix + 43 format tests ([#204](https://github.com/NousResearch/hermes-agent/pull/204)) — @0xbyt4
|
||||
|
||||
### Discord
|
||||
- Channel topic included in session context ([#248](https://github.com/NousResearch/hermes-agent/pull/248)) — @Bartok9
|
||||
- `DISCORD_ALLOW_BOTS` config for bot message filtering ([#758](https://github.com/NousResearch/hermes-agent/pull/758))
|
||||
- Document and video support ([#784](https://github.com/NousResearch/hermes-agent/pull/784))
|
||||
- Improved error handling and logging ([#761](https://github.com/NousResearch/hermes-agent/pull/761)) — @aydnOktay
|
||||
|
||||
### Slack
|
||||
- `app_mention` 404 fix + document/video support ([#784](https://github.com/NousResearch/hermes-agent/pull/784))
|
||||
- Structured logging replacing print statements — @aydnOktay
|
||||
|
||||
### WhatsApp
|
||||
- Native media sending — images, videos, documents ([#292](https://github.com/NousResearch/hermes-agent/pull/292)) — @satelerd
|
||||
- Multi-user session isolation ([#75](https://github.com/NousResearch/hermes-agent/pull/75)) — @satelerd
|
||||
- Cross-platform port cleanup replacing Linux-only fuser ([#433](https://github.com/NousResearch/hermes-agent/pull/433)) — @Farukest
|
||||
- DM interrupt key mismatch fix ([#350](https://github.com/NousResearch/hermes-agent/pull/350)) — @Farukest
|
||||
|
||||
### Signal
|
||||
- Full Signal messenger gateway via signal-cli-rest-api ([#405](https://github.com/NousResearch/hermes-agent/issues/405))
|
||||
- Media URL support in message events ([#871](https://github.com/NousResearch/hermes-agent/pull/871))
|
||||
|
||||
### Email (IMAP/SMTP)
|
||||
- New email gateway platform — @0xbyt4
|
||||
|
||||
### Home Assistant
|
||||
- REST tools + WebSocket gateway integration ([#184](https://github.com/NousResearch/hermes-agent/pull/184)) — @0xbyt4
|
||||
- Service discovery and enhanced setup
|
||||
- Toolset mapping fix ([#538](https://github.com/NousResearch/hermes-agent/pull/538)) — @Himess
|
||||
|
||||
### Gateway Core
|
||||
- Expose subagent tool calls and thinking to users ([#186](https://github.com/NousResearch/hermes-agent/pull/186)) — @cutepawss
|
||||
- Configurable background process watcher notifications ([#840](https://github.com/NousResearch/hermes-agent/pull/840))
|
||||
- `edit_message()` for Telegram/Discord/Slack with fallback
|
||||
- `/compress`, `/usage`, `/update` slash commands
|
||||
- Eliminated 3x SQLite message duplication in gateway sessions ([#873](https://github.com/NousResearch/hermes-agent/pull/873))
|
||||
- Stabilize system prompt across gateway turns for cache hits ([#754](https://github.com/NousResearch/hermes-agent/pull/754))
|
||||
- MCP server shutdown on gateway exit ([#796](https://github.com/NousResearch/hermes-agent/pull/796)) — @0xbyt4
|
||||
- Pass session_db to AIAgent, fixing session_search error ([#108](https://github.com/NousResearch/hermes-agent/pull/108)) — @Bartok9
|
||||
- Persist transcript changes in /retry, /undo; fix /reset attribute ([#217](https://github.com/NousResearch/hermes-agent/pull/217)) — @Farukest
|
||||
- UTF-8 encoding fix preventing Windows crashes ([#369](https://github.com/NousResearch/hermes-agent/pull/369)) — @ch3ronsa
|
||||
|
||||
---
|
||||
|
||||
## 🖥️ CLI & User Experience
|
||||
|
||||
### Interactive CLI
|
||||
- Data-driven skin/theme engine — 7 built-in skins (default, ares, mono, slate, poseidon, sisyphus, charizard) + custom YAML skins
|
||||
- `/personality` command with custom personality + disable support ([#773](https://github.com/NousResearch/hermes-agent/pull/773)) — @teyrebaz33
|
||||
- User-defined quick commands that bypass the agent loop ([#746](https://github.com/NousResearch/hermes-agent/pull/746)) — @teyrebaz33
|
||||
- `/reasoning` command for effort level and display toggle ([#921](https://github.com/NousResearch/hermes-agent/pull/921))
|
||||
- `/verbose` slash command to toggle debug at runtime ([#94](https://github.com/NousResearch/hermes-agent/pull/94)) — @cesareth
|
||||
- `/insights` command — usage analytics, cost estimation & activity patterns ([#552](https://github.com/NousResearch/hermes-agent/pull/552))
|
||||
- `/background` command for managing background processes
|
||||
- `/help` formatting with command categories
|
||||
- Bell-on-complete — terminal bell when agent finishes ([#738](https://github.com/NousResearch/hermes-agent/pull/738))
|
||||
- Up/down arrow history navigation
|
||||
- Clipboard image paste (Alt+V / Ctrl+V)
|
||||
- Loading indicators for slow slash commands ([#882](https://github.com/NousResearch/hermes-agent/pull/882))
|
||||
- Spinner flickering fix under patch_stdout ([#91](https://github.com/NousResearch/hermes-agent/pull/91)) — @0xbyt4
|
||||
- `--quiet/-Q` flag for programmatic single-query mode
|
||||
- `--fuck-it-ship-it` flag to bypass all approval prompts ([#724](https://github.com/NousResearch/hermes-agent/pull/724)) — @dmahan93
|
||||
- Tools summary flag ([#767](https://github.com/NousResearch/hermes-agent/pull/767)) — @luisv-1
|
||||
- Terminal blinking fix on SSH ([#284](https://github.com/NousResearch/hermes-agent/pull/284)) — @ygd58
|
||||
- Multi-line paste detection fix ([#84](https://github.com/NousResearch/hermes-agent/pull/84)) — @0xbyt4
|
||||
|
||||
### Setup & Configuration
|
||||
- Modular setup wizard with section subcommands and tool-first UX
|
||||
- Container resource configuration prompts
|
||||
- Backend validation for required binaries
|
||||
- Config migration system (currently v7)
|
||||
- API keys properly routed to .env instead of config.yaml ([#469](https://github.com/NousResearch/hermes-agent/pull/469)) — @ygd58
|
||||
- Atomic write for .env to prevent API key loss on crash ([#954](https://github.com/NousResearch/hermes-agent/pull/954))
|
||||
- `hermes tools` — per-platform tool enable/disable with curses UI
|
||||
- `hermes doctor` for health checks across all configured providers
|
||||
- `hermes update` with auto-restart for gateway service
|
||||
- Show update-available notice in CLI banner
|
||||
- Multiple named custom providers
|
||||
- Shell config detection improvement for PATH setup ([#317](https://github.com/NousResearch/hermes-agent/pull/317)) — @mehmetkr-31
|
||||
- Consistent HERMES_HOME and .env path resolution ([#51](https://github.com/NousResearch/hermes-agent/pull/51), [#48](https://github.com/NousResearch/hermes-agent/pull/48)) — @deankerr
|
||||
- Docker backend fix on macOS + subagent auth for Nous Portal ([#46](https://github.com/NousResearch/hermes-agent/pull/46)) — @rsavitt
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Tool System
|
||||
|
||||
### MCP (Model Context Protocol)
|
||||
- Native MCP client with stdio + HTTP transports ([#291](https://github.com/NousResearch/hermes-agent/pull/291) — @0xbyt4, [#301](https://github.com/NousResearch/hermes-agent/pull/301))
|
||||
- Sampling support — server-initiated LLM requests ([#753](https://github.com/NousResearch/hermes-agent/pull/753))
|
||||
- Resource and prompt discovery
|
||||
- Automatic reconnection and security hardening
|
||||
- Banner integration, `/reload-mcp` command
|
||||
- `hermes tools` UI integration
|
||||
|
||||
### Browser
|
||||
- Local browser backend — zero-cost headless Chromium (no Browserbase needed)
|
||||
- Console/errors tool, annotated screenshots, auto-recording, dogfood QA skill ([#745](https://github.com/NousResearch/hermes-agent/pull/745))
|
||||
- Screenshot sharing via MEDIA: on all messaging platforms ([#657](https://github.com/NousResearch/hermes-agent/pull/657))
|
||||
|
||||
### Terminal & Execution
|
||||
- `execute_code` sandbox with json_parse, shell_quote, retry helpers
|
||||
- Docker: custom volume mounts ([#158](https://github.com/NousResearch/hermes-agent/pull/158)) — @Indelwin
|
||||
- Daytona cloud sandbox backend ([#451](https://github.com/NousResearch/hermes-agent/pull/451)) — @rovle
|
||||
- SSH backend fix ([#59](https://github.com/NousResearch/hermes-agent/pull/59)) — @deankerr
|
||||
- Shell noise filtering and login shell execution for environment consistency
|
||||
- Head+tail truncation for execute_code stdout overflow
|
||||
- Configurable background process notification modes
|
||||
|
||||
### File Operations
|
||||
- Filesystem checkpoints and `/rollback` command ([#824](https://github.com/NousResearch/hermes-agent/pull/824))
|
||||
- Structured tool result hints (next-action guidance) for patch and search_files ([#722](https://github.com/NousResearch/hermes-agent/issues/722))
|
||||
- Docker volumes passed to sandbox container config ([#687](https://github.com/NousResearch/hermes-agent/pull/687)) — @manuelschipper
|
||||
|
||||
---
|
||||
|
||||
## 🧩 Skills Ecosystem
|
||||
|
||||
### Skills System
|
||||
- Per-platform skill enable/disable ([#743](https://github.com/NousResearch/hermes-agent/pull/743)) — @teyrebaz33
|
||||
- Conditional skill activation based on tool availability ([#785](https://github.com/NousResearch/hermes-agent/pull/785)) — @teyrebaz33
|
||||
- Skill prerequisites — hide skills with unmet dependencies ([#659](https://github.com/NousResearch/hermes-agent/pull/659)) — @kshitijk4poor
|
||||
- Optional skills — shipped but not activated by default
|
||||
- `hermes skills browse` — paginated hub browsing
|
||||
- Skills sub-category organization
|
||||
- Platform-conditional skill loading
|
||||
- Atomic skill file writes ([#551](https://github.com/NousResearch/hermes-agent/pull/551)) — @aydnOktay
|
||||
- Skills sync data loss prevention ([#563](https://github.com/NousResearch/hermes-agent/pull/563)) — @0xbyt4
|
||||
- Dynamic skill slash commands for CLI and gateway
|
||||
|
||||
### New Skills (selected)
|
||||
- **ASCII Art** — pyfiglet (571 fonts), cowsay, image-to-ascii ([#209](https://github.com/NousResearch/hermes-agent/pull/209)) — @0xbyt4
|
||||
- **ASCII Video** — Full production pipeline ([#854](https://github.com/NousResearch/hermes-agent/pull/854)) — @SHL0MS
|
||||
- **DuckDuckGo Search** — Firecrawl fallback ([#267](https://github.com/NousResearch/hermes-agent/pull/267)) — @gamedevCloudy; DDGS API expansion ([#598](https://github.com/NousResearch/hermes-agent/pull/598)) — @areu01or00
|
||||
- **Solana Blockchain** — Wallet balances, USD pricing, token names ([#212](https://github.com/NousResearch/hermes-agent/pull/212)) — @gizdusum
|
||||
- **AgentMail** — Agent-owned email inboxes ([#330](https://github.com/NousResearch/hermes-agent/pull/330)) — @teyrebaz33
|
||||
- **Polymarket** — Prediction market data (read-only) ([#629](https://github.com/NousResearch/hermes-agent/pull/629))
|
||||
- **OpenClaw Migration** — Official migration tool ([#570](https://github.com/NousResearch/hermes-agent/pull/570)) — @unmodeled-tyler
|
||||
- **Domain Intelligence** — Passive recon: subdomains, SSL, WHOIS, DNS ([#136](https://github.com/NousResearch/hermes-agent/pull/136)) — @FurkanL0
|
||||
- **Superpowers** — Software development skills ([#137](https://github.com/NousResearch/hermes-agent/pull/137)) — @kaos35
|
||||
- **Hermes-Atropos** — RL environment development skill ([#815](https://github.com/NousResearch/hermes-agent/pull/815))
|
||||
- Plus: arXiv search, OCR/documents, Excalidraw diagrams, YouTube transcripts, GIF search, Pokémon player, Minecraft modpack server, OpenHue (Philips Hue), Google Workspace, Notion, PowerPoint, Obsidian, find-nearby, and 40+ MLOps skills
|
||||
|
||||
---
|
||||
|
||||
## 🔒 Security & Reliability
|
||||
|
||||
### Security Hardening
|
||||
- Path traversal fix in skill_view — prevented reading arbitrary files ([#220](https://github.com/NousResearch/hermes-agent/issues/220)) — @Farukest
|
||||
- Shell injection prevention in sudo password piping ([#65](https://github.com/NousResearch/hermes-agent/pull/65)) — @leonsgithub
|
||||
- Dangerous command detection: multiline bypass fix ([#233](https://github.com/NousResearch/hermes-agent/pull/233)) — @Farukest; tee/process substitution patterns ([#280](https://github.com/NousResearch/hermes-agent/pull/280)) — @dogiladeveloper
|
||||
- Symlink boundary check fix in skills_guard ([#386](https://github.com/NousResearch/hermes-agent/pull/386)) — @Farukest
|
||||
- Symlink bypass fix in write deny list on macOS ([#61](https://github.com/NousResearch/hermes-agent/pull/61)) — @0xbyt4
|
||||
- Multi-word prompt injection bypass prevention ([#192](https://github.com/NousResearch/hermes-agent/pull/192)) — @0xbyt4
|
||||
- Cron prompt injection scanner bypass fix ([#63](https://github.com/NousResearch/hermes-agent/pull/63)) — @0xbyt4
|
||||
- Enforce 0600/0700 file permissions on sensitive files ([#757](https://github.com/NousResearch/hermes-agent/pull/757))
|
||||
- .env file permissions restricted to owner-only ([#529](https://github.com/NousResearch/hermes-agent/pull/529)) — @Himess
|
||||
- `--force` flag properly blocked from overriding dangerous verdicts ([#388](https://github.com/NousResearch/hermes-agent/pull/388)) — @Farukest
|
||||
- FTS5 query sanitization + DB connection leak fix ([#565](https://github.com/NousResearch/hermes-agent/pull/565)) — @0xbyt4
|
||||
- Expand secret redaction patterns + config toggle to disable
|
||||
- In-memory permanent allowlist to prevent data leak ([#600](https://github.com/NousResearch/hermes-agent/pull/600)) — @alireza78a
|
||||
|
||||
### Atomic Writes (data loss prevention)
|
||||
- sessions.json ([#611](https://github.com/NousResearch/hermes-agent/pull/611)) — @alireza78a
|
||||
- Cron jobs ([#146](https://github.com/NousResearch/hermes-agent/pull/146)) — @alireza78a
|
||||
- .env config ([#954](https://github.com/NousResearch/hermes-agent/pull/954))
|
||||
- Process checkpoints ([#298](https://github.com/NousResearch/hermes-agent/pull/298)) — @aydnOktay
|
||||
- Batch runner ([#297](https://github.com/NousResearch/hermes-agent/pull/297)) — @aydnOktay
|
||||
- Skill files ([#551](https://github.com/NousResearch/hermes-agent/pull/551)) — @aydnOktay
|
||||
|
||||
### Reliability
|
||||
- Guard all print() against OSError for systemd/headless environments ([#963](https://github.com/NousResearch/hermes-agent/pull/963))
|
||||
- Reset all retry counters at start of run_conversation ([#607](https://github.com/NousResearch/hermes-agent/pull/607)) — @0xbyt4
|
||||
- Return deny on approval callback timeout instead of None ([#603](https://github.com/NousResearch/hermes-agent/pull/603)) — @0xbyt4
|
||||
- Fix None message content crashes across codebase ([#277](https://github.com/NousResearch/hermes-agent/pull/277))
|
||||
- Fix context overrun crash with local LLM backends ([#403](https://github.com/NousResearch/hermes-agent/pull/403)) — @ch3ronsa
|
||||
- Prevent `_flush_sentinel` from leaking to external APIs ([#227](https://github.com/NousResearch/hermes-agent/pull/227)) — @Farukest
|
||||
- Prevent conversation_history mutation in callers ([#229](https://github.com/NousResearch/hermes-agent/pull/229)) — @Farukest
|
||||
- Fix systemd restart loop ([#614](https://github.com/NousResearch/hermes-agent/pull/614)) — @voidborne-d
|
||||
- Close file handles and sockets to prevent fd leaks ([#568](https://github.com/NousResearch/hermes-agent/pull/568) — @alireza78a, [#296](https://github.com/NousResearch/hermes-agent/pull/296) — @alireza78a, [#709](https://github.com/NousResearch/hermes-agent/pull/709) — @memosr)
|
||||
- Prevent data loss in clipboard PNG conversion ([#602](https://github.com/NousResearch/hermes-agent/pull/602)) — @0xbyt4
|
||||
- Eliminate shell noise from terminal output ([#293](https://github.com/NousResearch/hermes-agent/pull/293)) — @0xbyt4
|
||||
- Timezone-aware now() for prompt, cron, and execute_code ([#309](https://github.com/NousResearch/hermes-agent/pull/309)) — @areu01or00
|
||||
|
||||
### Windows Compatibility
|
||||
- Guard POSIX-only process functions ([#219](https://github.com/NousResearch/hermes-agent/pull/219)) — @Farukest
|
||||
- Windows native support via Git Bash + ZIP-based update fallback
|
||||
- pywinpty for PTY support ([#457](https://github.com/NousResearch/hermes-agent/pull/457)) — @shitcoinsherpa
|
||||
- Explicit UTF-8 encoding on all config/data file I/O ([#458](https://github.com/NousResearch/hermes-agent/pull/458)) — @shitcoinsherpa
|
||||
- Windows-compatible path handling ([#354](https://github.com/NousResearch/hermes-agent/pull/354), [#390](https://github.com/NousResearch/hermes-agent/pull/390)) — @Farukest
|
||||
- Regex-based search output parsing for drive-letter paths ([#533](https://github.com/NousResearch/hermes-agent/pull/533)) — @Himess
|
||||
- Auth store file lock for Windows ([#455](https://github.com/NousResearch/hermes-agent/pull/455)) — @shitcoinsherpa
|
||||
|
||||
---
|
||||
|
||||
## 🐛 Notable Bug Fixes
|
||||
|
||||
- Fix DeepSeek V3 tool call parser silently dropping multi-line JSON arguments ([#444](https://github.com/NousResearch/hermes-agent/pull/444)) — @PercyDikec
|
||||
- Fix gateway transcript losing 1 message per turn due to offset mismatch ([#395](https://github.com/NousResearch/hermes-agent/pull/395)) — @PercyDikec
|
||||
- Fix /retry command silently discarding the agent's final response ([#441](https://github.com/NousResearch/hermes-agent/pull/441)) — @PercyDikec
|
||||
- Fix max-iterations retry returning empty string after think-block stripping ([#438](https://github.com/NousResearch/hermes-agent/pull/438)) — @PercyDikec
|
||||
- Fix max-iterations retry using hardcoded max_tokens ([#436](https://github.com/NousResearch/hermes-agent/pull/436)) — @Farukest
|
||||
- Fix Codex status dict key mismatch ([#448](https://github.com/NousResearch/hermes-agent/pull/448)) and visibility filter ([#446](https://github.com/NousResearch/hermes-agent/pull/446)) — @PercyDikec
|
||||
- Strip \<think\> blocks from final user-facing responses ([#174](https://github.com/NousResearch/hermes-agent/pull/174)) — @Bartok9
|
||||
- Fix \<think\> block regex stripping visible content when model discusses tags literally ([#786](https://github.com/NousResearch/hermes-agent/issues/786))
|
||||
- Fix Mistral 422 errors from leftover finish_reason in assistant messages ([#253](https://github.com/NousResearch/hermes-agent/pull/253)) — @Sertug17
|
||||
- Fix OPENROUTER_API_KEY resolution order across all code paths ([#295](https://github.com/NousResearch/hermes-agent/pull/295)) — @0xbyt4
|
||||
- Fix OPENAI_BASE_URL API key priority ([#420](https://github.com/NousResearch/hermes-agent/pull/420)) — @manuelschipper
|
||||
- Fix Anthropic "prompt is too long" 400 error not detected as context length error ([#813](https://github.com/NousResearch/hermes-agent/issues/813))
|
||||
- Fix SQLite session transcript accumulating duplicate messages — 3-4x token inflation ([#860](https://github.com/NousResearch/hermes-agent/issues/860))
|
||||
- Fix setup wizard skipping API key prompts on first install ([#748](https://github.com/NousResearch/hermes-agent/pull/748))
|
||||
- Fix setup wizard showing OpenRouter model list for Nous Portal ([#575](https://github.com/NousResearch/hermes-agent/pull/575)) — @PercyDikec
|
||||
- Fix provider selection not persisting when switching via `hermes model` ([#881](https://github.com/NousResearch/hermes-agent/pull/881))
|
||||
- Fix Docker backend failing when docker not in PATH on macOS ([#889](https://github.com/NousResearch/hermes-agent/pull/889))
|
||||
- Fix ClawHub Skills Hub adapter for API endpoint changes ([#286](https://github.com/NousResearch/hermes-agent/pull/286)) — @BP602
|
||||
- Fix Honcho auto-enable when API key is present ([#243](https://github.com/NousResearch/hermes-agent/pull/243)) — @Bartok9
|
||||
- Fix duplicate 'skills' subparser crash on Python 3.11+ ([#898](https://github.com/NousResearch/hermes-agent/issues/898))
|
||||
- Fix memory tool entry parsing when content contains a section sign (§) ([#162](https://github.com/NousResearch/hermes-agent/pull/162)) — @aydnOktay
|
||||
- Fix piped install silently aborting when interactive prompts fail ([#72](https://github.com/NousResearch/hermes-agent/pull/72)) — @cutepawss
|
||||
- Fix false positives in recursive delete detection ([#68](https://github.com/NousResearch/hermes-agent/pull/68)) — @cutepawss
|
||||
- Fix Ruff lint warnings across codebase ([#608](https://github.com/NousResearch/hermes-agent/pull/608)) — @JackTheGit
|
||||
- Fix Anthropic native base URL fail-fast ([#173](https://github.com/NousResearch/hermes-agent/pull/173)) — @adavyas
|
||||
- Fix install.sh creating ~/.hermes before moving Node.js directory ([#53](https://github.com/NousResearch/hermes-agent/pull/53)) — @JoshuaMart
|
||||
- Fix SystemExit traceback during atexit cleanup on Ctrl+C ([#55](https://github.com/NousResearch/hermes-agent/pull/55)) — @bierlingm
|
||||
- Restore missing MIT license file ([#620](https://github.com/NousResearch/hermes-agent/pull/620)) — @stablegenius49
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
- **3,289 tests** across agent, gateway, tools, cron, and CLI
|
||||
- Parallelized test suite with pytest-xdist ([#802](https://github.com/NousResearch/hermes-agent/pull/802)) — @OutThisLife
|
||||
- Unit tests batch 1: 8 core modules ([#60](https://github.com/NousResearch/hermes-agent/pull/60)) — @0xbyt4
|
||||
- Unit tests batch 2: 8 more modules ([#62](https://github.com/NousResearch/hermes-agent/pull/62)) — @0xbyt4
|
||||
- Unit tests batch 3: 8 untested modules ([#191](https://github.com/NousResearch/hermes-agent/pull/191)) — @0xbyt4
|
||||
- Unit tests batch 4: 5 security/logic-critical modules ([#193](https://github.com/NousResearch/hermes-agent/pull/193)) — @0xbyt4
|
||||
- AIAgent (run_agent.py) unit tests ([#67](https://github.com/NousResearch/hermes-agent/pull/67)) — @0xbyt4
|
||||
- Trajectory compressor tests ([#203](https://github.com/NousResearch/hermes-agent/pull/203)) — @0xbyt4
|
||||
- Clarify tool tests ([#121](https://github.com/NousResearch/hermes-agent/pull/121)) — @Bartok9
|
||||
- Telegram format tests — 43 tests for italic/bold/code rendering ([#204](https://github.com/NousResearch/hermes-agent/pull/204)) — @0xbyt4
|
||||
- Vision tools type hints + 42 tests ([#792](https://github.com/NousResearch/hermes-agent/pull/792))
|
||||
- Compressor tool-call boundary regression tests ([#648](https://github.com/NousResearch/hermes-agent/pull/648)) — @intertwine
|
||||
- Test structure reorganization ([#34](https://github.com/NousResearch/hermes-agent/pull/34)) — @0xbyt4
|
||||
- Shell noise elimination + fix 36 test failures ([#293](https://github.com/NousResearch/hermes-agent/pull/293)) — @0xbyt4
|
||||
|
||||
---
|
||||
|
||||
## 🔬 RL & Evaluation Environments
|
||||
|
||||
- WebResearchEnv — Multi-step web research RL environment ([#434](https://github.com/NousResearch/hermes-agent/pull/434)) — @jackx707
|
||||
- Modal sandbox concurrency limits to avoid deadlocks ([#621](https://github.com/NousResearch/hermes-agent/pull/621)) — @voteblake
|
||||
- Hermes-atropos-environments bundled skill ([#815](https://github.com/NousResearch/hermes-agent/pull/815))
|
||||
- Local vLLM instance support for evaluation — @dmahan93
|
||||
- YC-Bench long-horizon agent benchmark environment
|
||||
- OpenThoughts-TBLite evaluation environment and scripts
|
||||
|
||||
---
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
- Full documentation website (Docusaurus) with 37+ pages
|
||||
- Comprehensive platform setup guides for Telegram, Discord, Slack, WhatsApp, Signal, Email
|
||||
- AGENTS.md — development guide for AI coding assistants
|
||||
- CONTRIBUTING.md ([#117](https://github.com/NousResearch/hermes-agent/pull/117)) — @Bartok9
|
||||
- Slash commands reference ([#142](https://github.com/NousResearch/hermes-agent/pull/142)) — @Bartok9
|
||||
- Comprehensive AGENTS.md accuracy audit ([#732](https://github.com/NousResearch/hermes-agent/pull/732))
|
||||
- Skin/theme system documentation
|
||||
- MCP documentation and examples
|
||||
- Docs accuracy audit — 35+ corrections
|
||||
- Documentation typo fixes ([#825](https://github.com/NousResearch/hermes-agent/pull/825), [#439](https://github.com/NousResearch/hermes-agent/pull/439)) — @JackTheGit
|
||||
- CLI config precedence and terminology standardization ([#166](https://github.com/NousResearch/hermes-agent/pull/166), [#167](https://github.com/NousResearch/hermes-agent/pull/167), [#168](https://github.com/NousResearch/hermes-agent/pull/168)) — @Jr-kenny
|
||||
- Telegram token regex documentation ([#713](https://github.com/NousResearch/hermes-agent/pull/713)) — @VolodymyrBg
|
||||
|
||||
---
|
||||
|
||||
## 👥 Contributors
|
||||
|
||||
Thank you to the 63 contributors who made this release possible! In just over two weeks, the Hermes Agent community came together to ship an extraordinary amount of work.
|
||||
|
||||
### Core
|
||||
- **@teknium1** — 43 PRs: Project lead, core architecture, provider router, sessions, skills, CLI, documentation
|
||||
|
||||
### Top Community Contributors
|
||||
- **@0xbyt4** — 40 PRs: MCP client, Home Assistant, security fixes (symlink, prompt injection, cron), extensive test coverage (6 batches), ascii-art skill, shell noise elimination, skills sync, Telegram formatting, and dozens more
|
||||
- **@Farukest** — 16 PRs: Security hardening (path traversal, dangerous command detection, symlink boundary), Windows compatibility (POSIX guards, path handling), WhatsApp fixes, max-iterations retry, gateway fixes
|
||||
- **@aydnOktay** — 11 PRs: Atomic writes (process checkpoints, batch runner, skill files), error handling improvements across Telegram, Discord, code execution, transcription, TTS, and skills
|
||||
- **@Bartok9** — 9 PRs: CONTRIBUTING.md, slash commands reference, Discord channel topics, think-block stripping, TTS fix, Honcho fix, session count fix, clarify tests
|
||||
- **@PercyDikec** — 7 PRs: DeepSeek V3 parser fix, /retry response discard, gateway transcript offset, Codex status/visibility, max-iterations retry, setup wizard fix
|
||||
- **@teyrebaz33** — 5 PRs: Skills enable/disable system, quick commands, personality customization, conditional skill activation
|
||||
- **@alireza78a** — 5 PRs: Atomic writes (cron, sessions), fd leak prevention, security allowlist, code execution socket cleanup
|
||||
- **@shitcoinsherpa** — 3 PRs: Windows support (pywinpty, UTF-8 encoding, auth store lock)
|
||||
- **@Himess** — 3 PRs: Cron/HomeAssistant/Daytona fix, Windows drive-letter parsing, .env permissions
|
||||
- **@satelerd** — 2 PRs: WhatsApp native media, multi-user session isolation
|
||||
- **@rovle** — 1 PR: Daytona cloud sandbox backend (4 commits)
|
||||
- **@erosika** — 1 PR: Honcho AI-native memory integration
|
||||
- **@dmahan93** — 1 PR: --fuck-it-ship-it flag + RL environment work
|
||||
- **@SHL0MS** — 1 PR: ASCII video skill
|
||||
|
||||
### All Contributors
|
||||
@0xbyt4, @BP602, @Bartok9, @Farukest, @FurkanL0, @Himess, @Indelwin, @JackTheGit, @JoshuaMart, @Jr-kenny, @OutThisLife, @PercyDikec, @SHL0MS, @Sertug17, @VencentSoliman, @VolodymyrBg, @adavyas, @alireza78a, @areu01or00, @aydnOktay, @batuhankocyigit, @bierlingm, @caentzminger, @cesareth, @ch3ronsa, @christomitov, @cutepawss, @deankerr, @dmahan93, @dogiladeveloper, @dragonkhoi, @erosika, @gamedevCloudy, @gizdusum, @grp06, @intertwine, @jackx707, @jdblackstar, @johnh4098, @kaos35, @kshitijk4poor, @leonsgithub, @luisv-1, @manuelschipper, @mehmetkr-31, @memosr, @PeterFile, @rewbs, @rovle, @rsavitt, @satelerd, @spanishflu-est1918, @stablegenius49, @tars90percent, @tekelala, @teknium1, @teyrebaz33, @tripledoublev, @unmodeled-tyler, @voidborne-d, @voteblake, @ygd58
|
||||
|
||||
---
|
||||
|
||||
**Full Changelog**: [v0.1.0...v2026.3.12](https://github.com/NousResearch/hermes-agent/compare/v0.1.0...v2026.3.12)
|
||||
377
RELEASE_v0.3.0.md
Normal file
377
RELEASE_v0.3.0.md
Normal file
@@ -0,0 +1,377 @@
|
||||
# Hermes Agent v0.3.0 (v2026.3.17)
|
||||
|
||||
**Release Date:** March 17, 2026
|
||||
|
||||
> The streaming, plugins, and provider release — unified real-time token delivery, first-class plugin architecture, rebuilt provider system with Vercel AI Gateway, native Anthropic provider, smart approvals, live Chrome CDP browser connect, ACP IDE integration, Honcho memory, voice mode, persistent shell, and 50+ bug fixes across every platform.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Highlights
|
||||
|
||||
- **Unified Streaming Infrastructure** — Real-time token-by-token delivery in CLI and all gateway platforms. Responses stream as they're generated instead of arriving as a block. ([#1538](https://github.com/NousResearch/hermes-agent/pull/1538))
|
||||
|
||||
- **First-Class Plugin Architecture** — Drop Python files into `~/.hermes/plugins/` to extend Hermes with custom tools, commands, and hooks. No forking required. ([#1544](https://github.com/NousResearch/hermes-agent/pull/1544), [#1555](https://github.com/NousResearch/hermes-agent/pull/1555))
|
||||
|
||||
- **Native Anthropic Provider** — Direct Anthropic API calls with Claude Code credential auto-discovery, OAuth PKCE flows, and native prompt caching. No OpenRouter middleman needed. ([#1097](https://github.com/NousResearch/hermes-agent/pull/1097))
|
||||
|
||||
- **Smart Approvals + /stop Command** — Codex-inspired approval system that learns which commands are safe and remembers your preferences. `/stop` kills the current agent run immediately. ([#1543](https://github.com/NousResearch/hermes-agent/pull/1543))
|
||||
|
||||
- **Honcho Memory Integration** — Async memory writes, configurable recall modes, session title integration, and multi-user isolation in gateway mode. By @erosika. ([#736](https://github.com/NousResearch/hermes-agent/pull/736))
|
||||
|
||||
- **Voice Mode** — Push-to-talk in CLI, voice notes in Telegram/Discord, Discord voice channel support, and local Whisper transcription via faster-whisper. ([#1299](https://github.com/NousResearch/hermes-agent/pull/1299), [#1185](https://github.com/NousResearch/hermes-agent/pull/1185), [#1429](https://github.com/NousResearch/hermes-agent/pull/1429))
|
||||
|
||||
- **Concurrent Tool Execution** — Multiple independent tool calls now run in parallel via ThreadPoolExecutor, significantly reducing latency for multi-tool turns. ([#1152](https://github.com/NousResearch/hermes-agent/pull/1152))
|
||||
|
||||
- **PII Redaction** — When `privacy.redact_pii` is enabled, personally identifiable information is automatically scrubbed before sending context to LLM providers. ([#1542](https://github.com/NousResearch/hermes-agent/pull/1542))
|
||||
|
||||
- **`/browser connect` via CDP** — Attach browser tools to a live Chrome instance through Chrome DevTools Protocol. Debug, inspect, and interact with pages you already have open. ([#1549](https://github.com/NousResearch/hermes-agent/pull/1549))
|
||||
|
||||
- **Vercel AI Gateway Provider** — Route Hermes through Vercel's AI Gateway for access to their model catalog and infrastructure. ([#1628](https://github.com/NousResearch/hermes-agent/pull/1628))
|
||||
|
||||
- **Centralized Provider Router** — Rebuilt provider system with `call_llm` API, unified `/model` command, auto-detect provider on model switch, and direct endpoint overrides for auxiliary/delegation clients. ([#1003](https://github.com/NousResearch/hermes-agent/pull/1003), [#1506](https://github.com/NousResearch/hermes-agent/pull/1506), [#1375](https://github.com/NousResearch/hermes-agent/pull/1375))
|
||||
|
||||
- **ACP Server (IDE Integration)** — VS Code, Zed, and JetBrains can now connect to Hermes as an agent backend, with full slash command support. ([#1254](https://github.com/NousResearch/hermes-agent/pull/1254), [#1532](https://github.com/NousResearch/hermes-agent/pull/1532))
|
||||
|
||||
- **Persistent Shell Mode** — Local and SSH terminal backends can maintain shell state across tool calls — cd, env vars, and aliases persist. By @alt-glitch. ([#1067](https://github.com/NousResearch/hermes-agent/pull/1067), [#1483](https://github.com/NousResearch/hermes-agent/pull/1483))
|
||||
|
||||
- **Agentic On-Policy Distillation (OPD)** — New RL training environment for distilling agent policies, expanding the Atropos training ecosystem. ([#1149](https://github.com/NousResearch/hermes-agent/pull/1149))
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Core Agent & Architecture
|
||||
|
||||
### Provider & Model Support
|
||||
- **Centralized provider router** with `call_llm` API and unified `/model` command — switch models and providers seamlessly ([#1003](https://github.com/NousResearch/hermes-agent/pull/1003))
|
||||
- **Vercel AI Gateway** provider support ([#1628](https://github.com/NousResearch/hermes-agent/pull/1628))
|
||||
- **Auto-detect provider** when switching models via `/model` ([#1506](https://github.com/NousResearch/hermes-agent/pull/1506))
|
||||
- **Direct endpoint overrides** for auxiliary and delegation clients — point vision/subagent calls at specific endpoints ([#1375](https://github.com/NousResearch/hermes-agent/pull/1375))
|
||||
- **Native Anthropic auxiliary vision** — use Claude's native vision API instead of routing through OpenAI-compatible endpoints ([#1377](https://github.com/NousResearch/hermes-agent/pull/1377))
|
||||
- Anthropic OAuth flow improvements — auto-run `claude setup-token`, reauthentication, PKCE state persistence, identity fingerprinting ([#1132](https://github.com/NousResearch/hermes-agent/pull/1132), [#1360](https://github.com/NousResearch/hermes-agent/pull/1360), [#1396](https://github.com/NousResearch/hermes-agent/pull/1396), [#1597](https://github.com/NousResearch/hermes-agent/pull/1597))
|
||||
- Fix adaptive thinking without `budget_tokens` for Claude 4.6 models — by @ASRagab ([#1128](https://github.com/NousResearch/hermes-agent/pull/1128))
|
||||
- Fix Anthropic cache markers through adapter — by @brandtcormorant ([#1216](https://github.com/NousResearch/hermes-agent/pull/1216))
|
||||
- Retry Anthropic 429/529 errors and surface details to users — by @0xbyt4 ([#1585](https://github.com/NousResearch/hermes-agent/pull/1585))
|
||||
- Fix Anthropic adapter max_tokens, fallback crash, proxy base_url — by @0xbyt4 ([#1121](https://github.com/NousResearch/hermes-agent/pull/1121))
|
||||
- Fix DeepSeek V3 parser dropping multiple parallel tool calls — by @mr-emmett-one ([#1365](https://github.com/NousResearch/hermes-agent/pull/1365), [#1300](https://github.com/NousResearch/hermes-agent/pull/1300))
|
||||
- Accept unlisted models with warning instead of rejecting ([#1047](https://github.com/NousResearch/hermes-agent/pull/1047), [#1102](https://github.com/NousResearch/hermes-agent/pull/1102))
|
||||
- Skip reasoning params for unsupported OpenRouter models ([#1485](https://github.com/NousResearch/hermes-agent/pull/1485))
|
||||
- MiniMax Anthropic API compatibility fix ([#1623](https://github.com/NousResearch/hermes-agent/pull/1623))
|
||||
- Custom endpoint `/models` verification and `/v1` base URL suggestion ([#1480](https://github.com/NousResearch/hermes-agent/pull/1480))
|
||||
- Resolve delegation providers from `custom_providers` config ([#1328](https://github.com/NousResearch/hermes-agent/pull/1328))
|
||||
- Kimi model additions and User-Agent fix ([#1039](https://github.com/NousResearch/hermes-agent/pull/1039))
|
||||
- Strip `call_id`/`response_item_id` for Mistral compatibility ([#1058](https://github.com/NousResearch/hermes-agent/pull/1058))
|
||||
|
||||
### Agent Loop & Conversation
|
||||
- **Anthropic Context Editing API** support ([#1147](https://github.com/NousResearch/hermes-agent/pull/1147))
|
||||
- Improved context compaction handoff summaries — compressor now preserves more actionable state ([#1273](https://github.com/NousResearch/hermes-agent/pull/1273))
|
||||
- Sync session_id after mid-run context compression ([#1160](https://github.com/NousResearch/hermes-agent/pull/1160))
|
||||
- Session hygiene threshold tuned to 50% for more proactive compression ([#1096](https://github.com/NousResearch/hermes-agent/pull/1096), [#1161](https://github.com/NousResearch/hermes-agent/pull/1161))
|
||||
- Include session ID in system prompt via `--pass-session-id` flag ([#1040](https://github.com/NousResearch/hermes-agent/pull/1040))
|
||||
- Prevent closed OpenAI client reuse across retries ([#1391](https://github.com/NousResearch/hermes-agent/pull/1391))
|
||||
- Sanitize chat payloads and provider precedence ([#1253](https://github.com/NousResearch/hermes-agent/pull/1253))
|
||||
- Handle dict tool call arguments from Codex and local backends ([#1393](https://github.com/NousResearch/hermes-agent/pull/1393), [#1440](https://github.com/NousResearch/hermes-agent/pull/1440))
|
||||
|
||||
### Memory & Sessions
|
||||
- **Improve memory prioritization** — user preferences and corrections weighted above procedural knowledge ([#1548](https://github.com/NousResearch/hermes-agent/pull/1548))
|
||||
- Tighter memory and session recall guidance in system prompts ([#1329](https://github.com/NousResearch/hermes-agent/pull/1329))
|
||||
- Persist CLI token counts to session DB for `/insights` ([#1498](https://github.com/NousResearch/hermes-agent/pull/1498))
|
||||
- Keep Honcho recall out of the cached system prefix ([#1201](https://github.com/NousResearch/hermes-agent/pull/1201))
|
||||
- Correct `seed_ai_identity` to use `session.add_messages()` ([#1475](https://github.com/NousResearch/hermes-agent/pull/1475))
|
||||
- Isolate Honcho session routing for multi-user gateway ([#1500](https://github.com/NousResearch/hermes-agent/pull/1500))
|
||||
|
||||
---
|
||||
|
||||
## 📱 Messaging Platforms (Gateway)
|
||||
|
||||
### Gateway Core
|
||||
- **System gateway service mode** — run as a system-level systemd service, not just user-level ([#1371](https://github.com/NousResearch/hermes-agent/pull/1371))
|
||||
- **Gateway install scope prompts** — choose user vs system scope during setup ([#1374](https://github.com/NousResearch/hermes-agent/pull/1374))
|
||||
- **Reasoning hot reload** — change reasoning settings without restarting the gateway ([#1275](https://github.com/NousResearch/hermes-agent/pull/1275))
|
||||
- Default group sessions to per-user isolation — no more shared state across users in group chats ([#1495](https://github.com/NousResearch/hermes-agent/pull/1495), [#1417](https://github.com/NousResearch/hermes-agent/pull/1417))
|
||||
- Harden gateway restart recovery ([#1310](https://github.com/NousResearch/hermes-agent/pull/1310))
|
||||
- Cancel active runs during shutdown ([#1427](https://github.com/NousResearch/hermes-agent/pull/1427))
|
||||
- SSL certificate auto-detection for NixOS and non-standard systems ([#1494](https://github.com/NousResearch/hermes-agent/pull/1494))
|
||||
- Auto-detect D-Bus session bus for `systemctl --user` on headless servers ([#1601](https://github.com/NousResearch/hermes-agent/pull/1601))
|
||||
- Auto-enable systemd linger during gateway install on headless servers ([#1334](https://github.com/NousResearch/hermes-agent/pull/1334))
|
||||
- Fall back to module entrypoint when `hermes` is not on PATH ([#1355](https://github.com/NousResearch/hermes-agent/pull/1355))
|
||||
- Fix dual gateways on macOS launchd after `hermes update` ([#1567](https://github.com/NousResearch/hermes-agent/pull/1567))
|
||||
- Remove recursive ExecStop from systemd units ([#1530](https://github.com/NousResearch/hermes-agent/pull/1530))
|
||||
- Prevent logging handler accumulation in gateway mode ([#1251](https://github.com/NousResearch/hermes-agent/pull/1251))
|
||||
- Restart on retryable startup failures — by @jplew ([#1517](https://github.com/NousResearch/hermes-agent/pull/1517))
|
||||
- Backfill model on gateway sessions after agent runs ([#1306](https://github.com/NousResearch/hermes-agent/pull/1306))
|
||||
- PID-based gateway kill and deferred config write ([#1499](https://github.com/NousResearch/hermes-agent/pull/1499))
|
||||
|
||||
### Telegram
|
||||
- Buffer media groups to prevent self-interruption from photo bursts ([#1341](https://github.com/NousResearch/hermes-agent/pull/1341), [#1422](https://github.com/NousResearch/hermes-agent/pull/1422))
|
||||
- Retry on transient TLS failures during connect and send ([#1535](https://github.com/NousResearch/hermes-agent/pull/1535))
|
||||
- Harden polling conflict handling ([#1339](https://github.com/NousResearch/hermes-agent/pull/1339))
|
||||
- Escape chunk indicators and inline code in MarkdownV2 ([#1478](https://github.com/NousResearch/hermes-agent/pull/1478), [#1626](https://github.com/NousResearch/hermes-agent/pull/1626))
|
||||
- Check updater/app state before disconnect ([#1389](https://github.com/NousResearch/hermes-agent/pull/1389))
|
||||
|
||||
### Discord
|
||||
- `/thread` command with `auto_thread` config and media metadata fixes ([#1178](https://github.com/NousResearch/hermes-agent/pull/1178))
|
||||
- Auto-thread on @mention, skip mention text in bot threads ([#1438](https://github.com/NousResearch/hermes-agent/pull/1438))
|
||||
- Retry without reply reference for system messages ([#1385](https://github.com/NousResearch/hermes-agent/pull/1385))
|
||||
- Preserve native document and video attachment support ([#1392](https://github.com/NousResearch/hermes-agent/pull/1392))
|
||||
- Defer discord adapter annotations to avoid optional import crashes ([#1314](https://github.com/NousResearch/hermes-agent/pull/1314))
|
||||
|
||||
### Slack
|
||||
- Thread handling overhaul — progress messages, responses, and session isolation all respect threads ([#1103](https://github.com/NousResearch/hermes-agent/pull/1103))
|
||||
- Formatting, reactions, user resolution, and command improvements ([#1106](https://github.com/NousResearch/hermes-agent/pull/1106))
|
||||
- Fix MAX_MESSAGE_LENGTH 3900 → 39000 ([#1117](https://github.com/NousResearch/hermes-agent/pull/1117))
|
||||
- File upload fallback preserves thread context — by @0xbyt4 ([#1122](https://github.com/NousResearch/hermes-agent/pull/1122))
|
||||
- Improve setup guidance ([#1387](https://github.com/NousResearch/hermes-agent/pull/1387))
|
||||
|
||||
### Email
|
||||
- Fix IMAP UID tracking and SMTP TLS verification ([#1305](https://github.com/NousResearch/hermes-agent/pull/1305))
|
||||
- Add `skip_attachments` option via config.yaml ([#1536](https://github.com/NousResearch/hermes-agent/pull/1536))
|
||||
|
||||
### Home Assistant
|
||||
- Event filtering closed by default ([#1169](https://github.com/NousResearch/hermes-agent/pull/1169))
|
||||
|
||||
---
|
||||
|
||||
## 🖥️ CLI & User Experience
|
||||
|
||||
### Interactive CLI
|
||||
- **Persistent CLI status bar** — always-visible model, provider, and token counts ([#1522](https://github.com/NousResearch/hermes-agent/pull/1522))
|
||||
- **File path autocomplete** in the input prompt ([#1545](https://github.com/NousResearch/hermes-agent/pull/1545))
|
||||
- **`/plan` command** — generate implementation plans from specs ([#1372](https://github.com/NousResearch/hermes-agent/pull/1372), [#1381](https://github.com/NousResearch/hermes-agent/pull/1381))
|
||||
- **Major `/rollback` improvements** — richer checkpoint history, clearer UX ([#1505](https://github.com/NousResearch/hermes-agent/pull/1505))
|
||||
- **Preload CLI skills on launch** — skills are ready before the first prompt ([#1359](https://github.com/NousResearch/hermes-agent/pull/1359))
|
||||
- **Centralized slash command registry** — all commands defined once, consumed everywhere ([#1603](https://github.com/NousResearch/hermes-agent/pull/1603))
|
||||
- `/bg` alias for `/background` ([#1590](https://github.com/NousResearch/hermes-agent/pull/1590))
|
||||
- Prefix matching for slash commands — `/mod` resolves to `/model` ([#1320](https://github.com/NousResearch/hermes-agent/pull/1320))
|
||||
- `/new`, `/reset`, `/clear` now start genuinely fresh sessions ([#1237](https://github.com/NousResearch/hermes-agent/pull/1237))
|
||||
- Accept session ID prefixes for session actions ([#1425](https://github.com/NousResearch/hermes-agent/pull/1425))
|
||||
- TUI prompt and accent output now respect active skin ([#1282](https://github.com/NousResearch/hermes-agent/pull/1282))
|
||||
- Centralize tool emoji metadata in registry + skin integration ([#1484](https://github.com/NousResearch/hermes-agent/pull/1484))
|
||||
- "View full command" option added to dangerous command approval — by @teknium1 based on design by community ([#887](https://github.com/NousResearch/hermes-agent/pull/887))
|
||||
- Non-blocking startup update check and banner deduplication ([#1386](https://github.com/NousResearch/hermes-agent/pull/1386))
|
||||
- `/reasoning` command output ordering and inline think extraction fixes ([#1031](https://github.com/NousResearch/hermes-agent/pull/1031))
|
||||
- Verbose mode shows full untruncated output ([#1472](https://github.com/NousResearch/hermes-agent/pull/1472))
|
||||
- Fix `/status` to report live state and tokens ([#1476](https://github.com/NousResearch/hermes-agent/pull/1476))
|
||||
- Seed a default global SOUL.md ([#1311](https://github.com/NousResearch/hermes-agent/pull/1311))
|
||||
|
||||
### Setup & Configuration
|
||||
- **OpenClaw migration** during first-time setup — by @kshitijk4poor ([#981](https://github.com/NousResearch/hermes-agent/pull/981))
|
||||
- `hermes claw migrate` command + migration docs ([#1059](https://github.com/NousResearch/hermes-agent/pull/1059))
|
||||
- Smart vision setup that respects the user's chosen provider ([#1323](https://github.com/NousResearch/hermes-agent/pull/1323))
|
||||
- Handle headless setup flows end-to-end ([#1274](https://github.com/NousResearch/hermes-agent/pull/1274))
|
||||
- Prefer curses over `simple_term_menu` in setup.py ([#1487](https://github.com/NousResearch/hermes-agent/pull/1487))
|
||||
- Show effective model and provider in `/status` ([#1284](https://github.com/NousResearch/hermes-agent/pull/1284))
|
||||
- Config set examples use placeholder syntax ([#1322](https://github.com/NousResearch/hermes-agent/pull/1322))
|
||||
- Reload .env over stale shell overrides ([#1434](https://github.com/NousResearch/hermes-agent/pull/1434))
|
||||
- Fix is_coding_plan NameError crash — by @0xbyt4 ([#1123](https://github.com/NousResearch/hermes-agent/pull/1123))
|
||||
- Add missing packages to setuptools config — by @alt-glitch ([#912](https://github.com/NousResearch/hermes-agent/pull/912))
|
||||
- Installer: clarify why sudo is needed at every prompt ([#1602](https://github.com/NousResearch/hermes-agent/pull/1602))
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Tool System
|
||||
|
||||
### Terminal & Execution
|
||||
- **Persistent shell mode** for local and SSH backends — maintain shell state across tool calls — by @alt-glitch ([#1067](https://github.com/NousResearch/hermes-agent/pull/1067), [#1483](https://github.com/NousResearch/hermes-agent/pull/1483))
|
||||
- **Tirith pre-exec command scanning** — security layer that analyzes commands before execution ([#1256](https://github.com/NousResearch/hermes-agent/pull/1256))
|
||||
- Strip Hermes provider env vars from all subprocess environments ([#1157](https://github.com/NousResearch/hermes-agent/pull/1157), [#1172](https://github.com/NousResearch/hermes-agent/pull/1172), [#1399](https://github.com/NousResearch/hermes-agent/pull/1399), [#1419](https://github.com/NousResearch/hermes-agent/pull/1419)) — initial fix by @eren-karakus0
|
||||
- SSH preflight check ([#1486](https://github.com/NousResearch/hermes-agent/pull/1486))
|
||||
- Docker backend: make cwd workspace mount explicit opt-in ([#1534](https://github.com/NousResearch/hermes-agent/pull/1534))
|
||||
- Add project root to PYTHONPATH in execute_code sandbox ([#1383](https://github.com/NousResearch/hermes-agent/pull/1383))
|
||||
- Eliminate execute_code progress spam on gateway platforms ([#1098](https://github.com/NousResearch/hermes-agent/pull/1098))
|
||||
- Clearer docker backend preflight errors ([#1276](https://github.com/NousResearch/hermes-agent/pull/1276))
|
||||
|
||||
### Browser
|
||||
- **`/browser connect`** — attach browser tools to a live Chrome instance via CDP ([#1549](https://github.com/NousResearch/hermes-agent/pull/1549))
|
||||
- Improve browser cleanup, local browser PATH setup, and screenshot recovery ([#1333](https://github.com/NousResearch/hermes-agent/pull/1333))
|
||||
|
||||
### MCP
|
||||
- **Selective tool loading** with utility policies — filter which MCP tools are available ([#1302](https://github.com/NousResearch/hermes-agent/pull/1302))
|
||||
- Auto-reload MCP tools when `mcp_servers` config changes without restart ([#1474](https://github.com/NousResearch/hermes-agent/pull/1474))
|
||||
- Resolve npx stdio connection failures ([#1291](https://github.com/NousResearch/hermes-agent/pull/1291))
|
||||
- Preserve MCP toolsets when saving platform tool config ([#1421](https://github.com/NousResearch/hermes-agent/pull/1421))
|
||||
|
||||
### Vision
|
||||
- Unify vision backend gating ([#1367](https://github.com/NousResearch/hermes-agent/pull/1367))
|
||||
- Surface actual error reason instead of generic message ([#1338](https://github.com/NousResearch/hermes-agent/pull/1338))
|
||||
- Make Claude image handling work end-to-end ([#1408](https://github.com/NousResearch/hermes-agent/pull/1408))
|
||||
|
||||
### Cron
|
||||
- **Compress cron management into one tool** — single `cronjob` tool replaces multiple commands ([#1343](https://github.com/NousResearch/hermes-agent/pull/1343))
|
||||
- Suppress duplicate cron sends to auto-delivery targets ([#1357](https://github.com/NousResearch/hermes-agent/pull/1357))
|
||||
- Persist cron sessions to SQLite ([#1255](https://github.com/NousResearch/hermes-agent/pull/1255))
|
||||
- Per-job runtime overrides (provider, model, base_url) ([#1398](https://github.com/NousResearch/hermes-agent/pull/1398))
|
||||
- Atomic write in `save_job_output` to prevent data loss on crash ([#1173](https://github.com/NousResearch/hermes-agent/pull/1173))
|
||||
- Preserve thread context for `deliver=origin` ([#1437](https://github.com/NousResearch/hermes-agent/pull/1437))
|
||||
|
||||
### Patch Tool
|
||||
- Avoid corrupting pipe chars in V4A patch apply ([#1286](https://github.com/NousResearch/hermes-agent/pull/1286))
|
||||
- Permissive `block_anchor` thresholds and unicode normalization ([#1539](https://github.com/NousResearch/hermes-agent/pull/1539))
|
||||
|
||||
### Delegation
|
||||
- Add observability metadata to subagent results (model, tokens, duration, tool trace) ([#1175](https://github.com/NousResearch/hermes-agent/pull/1175))
|
||||
|
||||
---
|
||||
|
||||
## 🧩 Skills Ecosystem
|
||||
|
||||
### Skills System
|
||||
- **Integrate skills.sh** as a hub source alongside ClawHub ([#1303](https://github.com/NousResearch/hermes-agent/pull/1303))
|
||||
- Secure skill env setup on load ([#1153](https://github.com/NousResearch/hermes-agent/pull/1153))
|
||||
- Honor policy table for dangerous verdicts ([#1330](https://github.com/NousResearch/hermes-agent/pull/1330))
|
||||
- Harden ClawHub skill search exact matches ([#1400](https://github.com/NousResearch/hermes-agent/pull/1400))
|
||||
- Fix ClawHub skill install — use `/download` ZIP endpoint ([#1060](https://github.com/NousResearch/hermes-agent/pull/1060))
|
||||
- Avoid mislabeling local skills as builtin — by @arceus77-7 ([#862](https://github.com/NousResearch/hermes-agent/pull/862))
|
||||
|
||||
### New Skills
|
||||
- **Linear** project management ([#1230](https://github.com/NousResearch/hermes-agent/pull/1230))
|
||||
- **X/Twitter** via x-cli ([#1285](https://github.com/NousResearch/hermes-agent/pull/1285))
|
||||
- **Telephony** — Twilio, SMS, and AI calls ([#1289](https://github.com/NousResearch/hermes-agent/pull/1289))
|
||||
- **1Password** — by @arceus77-7 ([#883](https://github.com/NousResearch/hermes-agent/pull/883), [#1179](https://github.com/NousResearch/hermes-agent/pull/1179))
|
||||
- **NeuroSkill BCI** integration ([#1135](https://github.com/NousResearch/hermes-agent/pull/1135))
|
||||
- **Blender MCP** for 3D modeling ([#1531](https://github.com/NousResearch/hermes-agent/pull/1531))
|
||||
- **OSS Security Forensics** ([#1482](https://github.com/NousResearch/hermes-agent/pull/1482))
|
||||
- **Parallel CLI** research skill ([#1301](https://github.com/NousResearch/hermes-agent/pull/1301))
|
||||
- **OpenCode** CLI skill ([#1174](https://github.com/NousResearch/hermes-agent/pull/1174))
|
||||
- **ASCII Video** skill refactored — by @SHL0MS ([#1213](https://github.com/NousResearch/hermes-agent/pull/1213), [#1598](https://github.com/NousResearch/hermes-agent/pull/1598))
|
||||
|
||||
---
|
||||
|
||||
## 🎙️ Voice Mode
|
||||
|
||||
- Voice mode foundation — push-to-talk CLI, Telegram/Discord voice notes ([#1299](https://github.com/NousResearch/hermes-agent/pull/1299))
|
||||
- Free local Whisper transcription via faster-whisper ([#1185](https://github.com/NousResearch/hermes-agent/pull/1185))
|
||||
- Discord voice channel reliability fixes ([#1429](https://github.com/NousResearch/hermes-agent/pull/1429))
|
||||
- Restore local STT fallback for gateway voice notes ([#1490](https://github.com/NousResearch/hermes-agent/pull/1490))
|
||||
- Honor `stt.enabled: false` across gateway transcription ([#1394](https://github.com/NousResearch/hermes-agent/pull/1394))
|
||||
- Fix bogus incapability message on Telegram voice notes (Issue [#1033](https://github.com/NousResearch/hermes-agent/issues/1033))
|
||||
|
||||
---
|
||||
|
||||
## 🔌 ACP (IDE Integration)
|
||||
|
||||
- Restore ACP server implementation ([#1254](https://github.com/NousResearch/hermes-agent/pull/1254))
|
||||
- Support slash commands in ACP adapter ([#1532](https://github.com/NousResearch/hermes-agent/pull/1532))
|
||||
|
||||
---
|
||||
|
||||
## 🧪 RL Training
|
||||
|
||||
- **Agentic On-Policy Distillation (OPD)** environment — new RL training environment for agent policy distillation ([#1149](https://github.com/NousResearch/hermes-agent/pull/1149))
|
||||
- Make tinker-atropos RL training fully optional ([#1062](https://github.com/NousResearch/hermes-agent/pull/1062))
|
||||
|
||||
---
|
||||
|
||||
## 🔒 Security & Reliability
|
||||
|
||||
### Security Hardening
|
||||
- **Tirith pre-exec command scanning** — static analysis of terminal commands before execution ([#1256](https://github.com/NousResearch/hermes-agent/pull/1256))
|
||||
- **PII redaction** when `privacy.redact_pii` is enabled ([#1542](https://github.com/NousResearch/hermes-agent/pull/1542))
|
||||
- Strip Hermes provider/gateway/tool env vars from all subprocess environments ([#1157](https://github.com/NousResearch/hermes-agent/pull/1157), [#1172](https://github.com/NousResearch/hermes-agent/pull/1172), [#1399](https://github.com/NousResearch/hermes-agent/pull/1399), [#1419](https://github.com/NousResearch/hermes-agent/pull/1419))
|
||||
- Docker cwd workspace mount now explicit opt-in — never auto-mount host directories ([#1534](https://github.com/NousResearch/hermes-agent/pull/1534))
|
||||
- Escape parens and braces in fork bomb regex pattern ([#1397](https://github.com/NousResearch/hermes-agent/pull/1397))
|
||||
- Harden `.worktreeinclude` path containment ([#1388](https://github.com/NousResearch/hermes-agent/pull/1388))
|
||||
- Use description as `pattern_key` to prevent approval collisions ([#1395](https://github.com/NousResearch/hermes-agent/pull/1395))
|
||||
|
||||
### Reliability
|
||||
- Guard init-time stdio writes ([#1271](https://github.com/NousResearch/hermes-agent/pull/1271))
|
||||
- Session log writes reuse shared atomic JSON helper ([#1280](https://github.com/NousResearch/hermes-agent/pull/1280))
|
||||
- Atomic temp cleanup protected on interrupts ([#1401](https://github.com/NousResearch/hermes-agent/pull/1401))
|
||||
|
||||
---
|
||||
|
||||
## 🐛 Notable Bug Fixes
|
||||
|
||||
- **`/status` always showing 0 tokens** — now reports live state (Issue [#1465](https://github.com/NousResearch/hermes-agent/issues/1465), [#1476](https://github.com/NousResearch/hermes-agent/pull/1476))
|
||||
- **Custom model endpoints not working** — restored config-saved endpoint resolution (Issue [#1460](https://github.com/NousResearch/hermes-agent/issues/1460), [#1373](https://github.com/NousResearch/hermes-agent/pull/1373))
|
||||
- **MCP tools not visible until restart** — auto-reload on config change (Issue [#1036](https://github.com/NousResearch/hermes-agent/issues/1036), [#1474](https://github.com/NousResearch/hermes-agent/pull/1474))
|
||||
- **`hermes tools` removing MCP tools** — preserve MCP toolsets when saving (Issue [#1247](https://github.com/NousResearch/hermes-agent/issues/1247), [#1421](https://github.com/NousResearch/hermes-agent/pull/1421))
|
||||
- **Terminal subprocesses inheriting `OPENAI_BASE_URL`** breaking external tools (Issue [#1002](https://github.com/NousResearch/hermes-agent/issues/1002), [#1399](https://github.com/NousResearch/hermes-agent/pull/1399))
|
||||
- **Background process lost on gateway restart** — improved recovery (Issue [#1144](https://github.com/NousResearch/hermes-agent/issues/1144))
|
||||
- **Cron jobs not persisting state** — now stored in SQLite (Issue [#1416](https://github.com/NousResearch/hermes-agent/issues/1416), [#1255](https://github.com/NousResearch/hermes-agent/pull/1255))
|
||||
- **Cronjob `deliver: origin` not preserving thread context** (Issue [#1219](https://github.com/NousResearch/hermes-agent/issues/1219), [#1437](https://github.com/NousResearch/hermes-agent/pull/1437))
|
||||
- **Gateway systemd service failing to auto-restart** when browser processes are orphaned (Issue [#1617](https://github.com/NousResearch/hermes-agent/issues/1617))
|
||||
- **`/background` completion report cut off in Telegram** (Issue [#1443](https://github.com/NousResearch/hermes-agent/issues/1443))
|
||||
- **Model switching not taking effect** (Issue [#1244](https://github.com/NousResearch/hermes-agent/issues/1244), [#1183](https://github.com/NousResearch/hermes-agent/pull/1183))
|
||||
- **`hermes doctor` reporting cronjob as unavailable** (Issue [#878](https://github.com/NousResearch/hermes-agent/issues/878), [#1180](https://github.com/NousResearch/hermes-agent/pull/1180))
|
||||
- **WhatsApp bridge messages not received** from mobile (Issue [#1142](https://github.com/NousResearch/hermes-agent/issues/1142))
|
||||
- **Setup wizard hanging on headless SSH** (Issue [#905](https://github.com/NousResearch/hermes-agent/issues/905), [#1274](https://github.com/NousResearch/hermes-agent/pull/1274))
|
||||
- **Log handler accumulation** degrading gateway performance (Issue [#990](https://github.com/NousResearch/hermes-agent/issues/990), [#1251](https://github.com/NousResearch/hermes-agent/pull/1251))
|
||||
- **Gateway NULL model in DB** (Issue [#987](https://github.com/NousResearch/hermes-agent/issues/987), [#1306](https://github.com/NousResearch/hermes-agent/pull/1306))
|
||||
- **Strict endpoints rejecting replayed tool_calls** (Issue [#893](https://github.com/NousResearch/hermes-agent/issues/893))
|
||||
- **Remaining hardcoded `~/.hermes` paths** — all now respect `HERMES_HOME` (Issue [#892](https://github.com/NousResearch/hermes-agent/issues/892), [#1233](https://github.com/NousResearch/hermes-agent/pull/1233))
|
||||
- **Delegate tool not working with custom inference providers** (Issue [#1011](https://github.com/NousResearch/hermes-agent/issues/1011), [#1328](https://github.com/NousResearch/hermes-agent/pull/1328))
|
||||
- **Skills Guard blocking official skills** (Issue [#1006](https://github.com/NousResearch/hermes-agent/issues/1006), [#1330](https://github.com/NousResearch/hermes-agent/pull/1330))
|
||||
- **Setup writing provider before model selection** (Issue [#1182](https://github.com/NousResearch/hermes-agent/issues/1182))
|
||||
- **`GatewayConfig.get()` AttributeError** crashing all message handling (Issue [#1158](https://github.com/NousResearch/hermes-agent/issues/1158), [#1287](https://github.com/NousResearch/hermes-agent/pull/1287))
|
||||
- **`/update` hard-failing with "command not found"** (Issue [#1049](https://github.com/NousResearch/hermes-agent/issues/1049))
|
||||
- **Image analysis failing silently** (Issue [#1034](https://github.com/NousResearch/hermes-agent/issues/1034), [#1338](https://github.com/NousResearch/hermes-agent/pull/1338))
|
||||
- **API `BadRequestError` caused by `'dict' object has no attribute 'strip'`** (Issue [#1071](https://github.com/NousResearch/hermes-agent/issues/1071))
|
||||
- **Slash commands requiring exact full name** — now uses prefix matching (Issue [#928](https://github.com/NousResearch/hermes-agent/issues/928), [#1320](https://github.com/NousResearch/hermes-agent/pull/1320))
|
||||
- **Gateway stops responding when terminal is closed on headless** (Issue [#1005](https://github.com/NousResearch/hermes-agent/issues/1005))
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
- Cover empty cached Anthropic tool-call turns ([#1222](https://github.com/NousResearch/hermes-agent/pull/1222))
|
||||
- Fix stale CI assumptions in parser and quick-command coverage ([#1236](https://github.com/NousResearch/hermes-agent/pull/1236))
|
||||
- Fix gateway async tests without implicit event loop ([#1278](https://github.com/NousResearch/hermes-agent/pull/1278))
|
||||
- Make gateway async tests xdist-safe ([#1281](https://github.com/NousResearch/hermes-agent/pull/1281))
|
||||
- Cross-timezone naive timestamp regression for cron ([#1319](https://github.com/NousResearch/hermes-agent/pull/1319))
|
||||
- Isolate codex provider tests from local env ([#1335](https://github.com/NousResearch/hermes-agent/pull/1335))
|
||||
- Lock retry replacement semantics ([#1379](https://github.com/NousResearch/hermes-agent/pull/1379))
|
||||
- Improve error logging in session search tool — by @aydnOktay ([#1533](https://github.com/NousResearch/hermes-agent/pull/1533))
|
||||
|
||||
---
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
- Comprehensive SOUL.md guide ([#1315](https://github.com/NousResearch/hermes-agent/pull/1315))
|
||||
- Voice mode documentation ([#1316](https://github.com/NousResearch/hermes-agent/pull/1316), [#1362](https://github.com/NousResearch/hermes-agent/pull/1362))
|
||||
- Provider contribution guide ([#1361](https://github.com/NousResearch/hermes-agent/pull/1361))
|
||||
- ACP and internal systems implementation guides ([#1259](https://github.com/NousResearch/hermes-agent/pull/1259))
|
||||
- Expand Docusaurus coverage across CLI, tools, skills, and skins ([#1232](https://github.com/NousResearch/hermes-agent/pull/1232))
|
||||
- Terminal backend and Windows troubleshooting ([#1297](https://github.com/NousResearch/hermes-agent/pull/1297))
|
||||
- Skills hub reference section ([#1317](https://github.com/NousResearch/hermes-agent/pull/1317))
|
||||
- Checkpoint, /rollback, and git worktrees guide ([#1493](https://github.com/NousResearch/hermes-agent/pull/1493), [#1524](https://github.com/NousResearch/hermes-agent/pull/1524))
|
||||
- CLI status bar and /usage reference ([#1523](https://github.com/NousResearch/hermes-agent/pull/1523))
|
||||
- Fallback providers + /background command docs ([#1430](https://github.com/NousResearch/hermes-agent/pull/1430))
|
||||
- Gateway service scopes docs ([#1378](https://github.com/NousResearch/hermes-agent/pull/1378))
|
||||
- Slack thread reply behavior docs ([#1407](https://github.com/NousResearch/hermes-agent/pull/1407))
|
||||
- Redesigned landing page with Nous blue palette — by @austinpickett ([#974](https://github.com/NousResearch/hermes-agent/pull/974))
|
||||
- Fix several documentation typos — by @JackTheGit ([#953](https://github.com/NousResearch/hermes-agent/pull/953))
|
||||
- Stabilize website diagrams ([#1405](https://github.com/NousResearch/hermes-agent/pull/1405))
|
||||
- CLI vs messaging quick reference in README ([#1491](https://github.com/NousResearch/hermes-agent/pull/1491))
|
||||
- Add search to Docusaurus ([#1053](https://github.com/NousResearch/hermes-agent/pull/1053))
|
||||
- Home Assistant integration docs ([#1170](https://github.com/NousResearch/hermes-agent/pull/1170))
|
||||
|
||||
---
|
||||
|
||||
## 👥 Contributors
|
||||
|
||||
### Core
|
||||
- **@teknium1** — 220+ PRs spanning every area of the codebase
|
||||
|
||||
### Top Community Contributors
|
||||
|
||||
- **@0xbyt4** (4 PRs) — Anthropic adapter fixes (max_tokens, fallback crash, 429/529 retry), Slack file upload thread context, setup NameError fix
|
||||
- **@erosika** (1 PR) — Honcho memory integration: async writes, memory modes, session title integration
|
||||
- **@SHL0MS** (2 PRs) — ASCII video skill design patterns and refactoring
|
||||
- **@alt-glitch** (2 PRs) — Persistent shell mode for local/SSH backends, setuptools packaging fix
|
||||
- **@arceus77-7** (2 PRs) — 1Password skill, fix skills list mislabeling
|
||||
- **@kshitijk4poor** (1 PR) — OpenClaw migration during setup wizard
|
||||
- **@ASRagab** (1 PR) — Fix adaptive thinking for Claude 4.6 models
|
||||
- **@eren-karakus0** (1 PR) — Strip Hermes provider env vars from subprocess environment
|
||||
- **@mr-emmett-one** (1 PR) — Fix DeepSeek V3 parser multi-tool call support
|
||||
- **@jplew** (1 PR) — Gateway restart on retryable startup failures
|
||||
- **@brandtcormorant** (1 PR) — Fix Anthropic cache control for empty text blocks
|
||||
- **@aydnOktay** (1 PR) — Improve error logging in session search tool
|
||||
- **@austinpickett** (1 PR) — Landing page redesign with Nous blue palette
|
||||
- **@JackTheGit** (1 PR) — Documentation typo fixes
|
||||
|
||||
### All Contributors
|
||||
|
||||
@0xbyt4, @alt-glitch, @arceus77-7, @ASRagab, @austinpickett, @aydnOktay, @brandtcormorant, @eren-karakus0, @erosika, @JackTheGit, @jplew, @kshitijk4poor, @mr-emmett-one, @SHL0MS, @teknium1
|
||||
|
||||
---
|
||||
|
||||
**Full Changelog**: [v2026.3.12...v2026.3.17](https://github.com/NousResearch/hermes-agent/compare/v2026.3.12...v2026.3.17)
|
||||
Binary file not shown.
Binary file not shown.
1
acp_adapter/__init__.py
Normal file
1
acp_adapter/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""ACP (Agent Client Protocol) adapter for hermes-agent."""
|
||||
5
acp_adapter/__main__.py
Normal file
5
acp_adapter/__main__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Allow running the ACP adapter as ``python -m acp_adapter``."""
|
||||
|
||||
from .entry import main
|
||||
|
||||
main()
|
||||
24
acp_adapter/auth.py
Normal file
24
acp_adapter/auth.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""ACP auth helpers — detect the currently configured Hermes provider."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def detect_provider() -> Optional[str]:
    """Return the normalized name of the active Hermes provider, if usable.

    The provider is resolved through ``hermes_cli.runtime_provider``.  A name
    is returned (stripped and lower-cased) only when both the provider name
    and its API key are non-blank strings.

    Returns:
        The provider name, or ``None`` when resolution raises or either the
        provider or the API key is missing/blank.
    """
    try:
        from hermes_cli.runtime_provider import resolve_runtime_provider

        resolved = resolve_runtime_provider()
        key = resolved.get("api_key")
        name = resolved.get("provider")
    except Exception:
        # Any import or resolution failure means "no provider available".
        return None

    if not (isinstance(key, str) and key.strip()):
        return None
    if not (isinstance(name, str) and name.strip()):
        return None
    return name.strip().lower()
|
||||
|
||||
|
||||
def has_provider() -> bool:
    """Report whether ``detect_provider()`` yields a provider name."""
    provider = detect_provider()
    return provider is not None
|
||||
85
acp_adapter/entry.py
Normal file
85
acp_adapter/entry.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""CLI entry point for the hermes-agent ACP adapter.
|
||||
|
||||
Loads environment variables from ``$HERMES_HOME/.env`` (default ``~/.hermes/.env``), configures logging
|
||||
to write to stderr (so stdout is reserved for ACP JSON-RPC transport),
|
||||
and starts the ACP agent server.
|
||||
|
||||
Usage::
|
||||
|
||||
python -m acp_adapter.entry
|
||||
# or
|
||||
hermes acp
|
||||
# or
|
||||
hermes-acp
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _setup_logging() -> None:
    """Send every log record to stderr, leaving stdout free for ACP stdio.

    Replaces all handlers on the root logger with a single stderr handler at
    INFO level, then raises the threshold of a few chatty HTTP/client
    libraries to WARNING.
    """
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(stderr_handler)
    root_logger.setLevel(logging.INFO)

    # Quiet down noisy libraries
    for noisy_name in ("httpx", "httpcore", "openai"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _load_env() -> None:
    """Load the Hermes ``.env`` file(s) and log what was found.

    The Hermes home directory comes from the ``HERMES_HOME`` environment
    variable, falling back to ``~/.hermes``.  Loading itself is delegated to
    ``hermes_cli.env_loader.load_hermes_dotenv``.
    """
    from hermes_cli.env_loader import load_hermes_dotenv

    log = logging.getLogger(__name__)
    hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
    loaded = load_hermes_dotenv(hermes_home=hermes_home)

    if not loaded:
        log.info("No .env found at %s, using system env", hermes_home / ".env")
        return

    for env_file in loaded:
        log.info("Loaded env from %s", env_file)
|
||||
|
||||
|
||||
def main() -> None:
    """Start the ACP adapter: set up logging, load env, run the agent server.

    Exits the process with status 1 if the agent server raises; a
    ``KeyboardInterrupt`` is logged and treated as a normal shutdown.
    """
    _setup_logging()
    _load_env()

    log = logging.getLogger(__name__)
    log.info("Starting hermes-agent ACP adapter")

    # Ensure the project root is on sys.path so ``from run_agent import AIAgent`` works
    repo_root = str(Path(__file__).resolve().parent.parent)
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

    import acp
    from .server import HermesACPAgent

    # Constructed outside the try block: construction errors propagate as-is.
    agent = HermesACPAgent()
    try:
        asyncio.run(acp.run_agent(agent))
    except KeyboardInterrupt:
        log.info("Shutting down (KeyboardInterrupt)")
    except Exception:
        log.exception("ACP agent crashed")
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
171
acp_adapter/events.py
Normal file
171
acp_adapter/events.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""Callback factories for bridging AIAgent events to ACP notifications.
|
||||
|
||||
Each factory returns a callable with the signature that AIAgent expects
|
||||
for its callbacks. Internally, the callbacks push ACP session updates
|
||||
to the client via ``conn.session_update()`` using
|
||||
``asyncio.run_coroutine_threadsafe()`` (since AIAgent runs in a worker
|
||||
thread while the event loop lives on the main thread).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict, deque
|
||||
from typing import Any, Callable, Deque, Dict
|
||||
|
||||
import acp
|
||||
|
||||
from .tools import (
|
||||
build_tool_complete,
|
||||
build_tool_start,
|
||||
make_tool_call_id,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _send_update(
    conn: acp.Client,
    session_id: str,
    loop: asyncio.AbstractEventLoop,
    update: Any,
) -> None:
    """Deliver one ACP session update from a worker thread, best-effort.

    Schedules ``conn.session_update(session_id, update)`` on ``loop`` and
    waits up to 5 seconds for it to finish.  Any failure, including a
    timeout, is logged at DEBUG level and otherwise ignored, so this never
    raises into the calling thread.
    """
    try:
        pending = asyncio.run_coroutine_threadsafe(
            conn.session_update(session_id, update),
            loop,
        )
        pending.result(timeout=5)
    except Exception:
        logger.debug("Failed to send ACP update", exc_info=True)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tool progress callback
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def make_tool_progress_cb(
    conn: acp.Client,
    session_id: str,
    loop: asyncio.AbstractEventLoop,
    tool_call_ids: Dict[str, Deque[str]],
) -> Callable:
    """Build the ``tool_progress_callback`` handed to AIAgent.

    The returned callable has the shape::

        tool_progress_callback(name: str, preview: str, args: dict)

    For every invocation it mints a new tool-call id, appends it to the
    per-tool-name FIFO in ``tool_call_ids`` and sends a tool-start update to
    the client.  Keeping a queue per name lets repeated or parallel calls of
    the same tool be matched to their ids in order.
    """

    def _tool_progress(name: str, preview: str, args: Any = None) -> None:
        # Normalize ``args`` to a dict: JSON strings are decoded, undecodable
        # strings are wrapped under "raw", anything else becomes empty.
        if isinstance(args, str):
            try:
                args = json.loads(args)
            except (json.JSONDecodeError, TypeError):
                args = {"raw": args}
        if not isinstance(args, dict):
            args = {}

        call_id = make_tool_call_id()

        pending = tool_call_ids.get(name)
        if pending is None or isinstance(pending, str):
            # A bare string entry is upgraded to a one-element queue.
            pending = deque() if pending is None else deque([pending])
            tool_call_ids[name] = pending
        pending.append(call_id)

        _send_update(conn, session_id, loop, build_tool_start(call_id, name, args))

    return _tool_progress
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thinking callback
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def make_thinking_cb(
    conn: acp.Client,
    session_id: str,
    loop: asyncio.AbstractEventLoop,
) -> Callable:
    """Build the ``thinking_callback`` handed to AIAgent.

    The returned callable forwards non-empty text to the client as an
    agent-thought update and silently ignores empty input.
    """

    def _thinking(text: str) -> None:
        if text:
            _send_update(conn, session_id, loop, acp.update_agent_thought_text(text))

    return _thinking
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Step callback
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def make_step_cb(
    conn: acp.Client,
    session_id: str,
    loop: asyncio.AbstractEventLoop,
    tool_call_ids: Dict[str, Deque[str]],
) -> Callable:
    """Build the ``step_callback`` handed to AIAgent.

    The returned callable has the shape::

        step_callback(api_call_count: int, prev_tools: list)

    Each entry of ``prev_tools`` may be a dict (name under ``"name"`` or
    ``"function_name"``, result under ``"result"`` or ``"output"``) or a plain
    tool-name string.  For every entry with a pending id in
    ``tool_call_ids`` the oldest id is dequeued and a tool-complete update is
    sent; a tool name whose queue becomes empty is removed from the mapping.
    """

    def _step(api_call_count: int, prev_tools: Any = None) -> None:
        if not prev_tools or not isinstance(prev_tools, list):
            return

        for entry in prev_tools:
            if isinstance(entry, dict):
                tool_name = entry.get("name") or entry.get("function_name")
                result = entry.get("result") or entry.get("output")
            elif isinstance(entry, str):
                tool_name, result = entry, None
            else:
                tool_name, result = None, None

            pending = tool_call_ids.get(tool_name or "")
            if isinstance(pending, str):
                # A bare string entry is upgraded to a one-element queue.
                pending = deque([pending])
                tool_call_ids[tool_name] = pending

            if not tool_name or not pending:
                continue

            call_id = pending.popleft()
            result_text = None if result is None else str(result)
            _send_update(
                conn,
                session_id,
                loop,
                build_tool_complete(call_id, tool_name, result=result_text),
            )
            if not pending:
                tool_call_ids.pop(tool_name, None)

    return _step
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Agent message callback
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def make_message_cb(
    conn: acp.Client,
    session_id: str,
    loop: asyncio.AbstractEventLoop,
) -> Callable:
    """Build a callback that streams agent response text to the editor.

    The returned callable forwards non-empty text to the client as an
    agent-message update and silently ignores empty input.
    """

    def _message(text: str) -> None:
        if text:
            _send_update(conn, session_id, loop, acp.update_agent_message_text(text))

    return _message
|
||||
80
acp_adapter/permissions.py
Normal file
80
acp_adapter/permissions.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""ACP permission bridging — maps ACP approval requests to hermes approval callbacks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from concurrent.futures import TimeoutError as FutureTimeout
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from acp.schema import (
|
||||
AllowedOutcome,
|
||||
DeniedOutcome,
|
||||
PermissionOption,
|
||||
RequestPermissionRequest,
|
||||
SelectedPermissionOutcome,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maps ACP PermissionOptionKind -> hermes approval result strings.
# Both reject kinds collapse to "deny".
_KIND_TO_HERMES = {
    "allow_once": "once",
    "allow_always": "always",
    "reject_once": "deny",
    "reject_always": "deny",
}
|
||||
|
||||
|
||||
def make_approval_callback(
    request_permission_fn: Callable,
    loop: asyncio.AbstractEventLoop,
    session_id: str,
    timeout: float = 60.0,
) -> Callable[[str, str], str]:
    """
    Return a hermes-compatible ``approval_callback(command, description) -> str``
    that bridges to the ACP client's ``request_permission`` call.

    The callback fails closed: a timeout, any error while issuing the
    request, a missing outcome, or a selected option id that was not offered
    all result in ``"deny"``.

    Args:
        request_permission_fn: The ACP connection's ``request_permission`` coroutine.
        loop: The event loop on which the ACP connection lives.
        session_id: Current ACP session id.
        timeout: Seconds to wait for a response before auto-denying.

    Returns:
        A callable returning one of the values of ``_KIND_TO_HERMES``
        (``"once"``, ``"always"`` or ``"deny"``).
    """

    def _callback(command: str, description: str) -> str:
        options = [
            PermissionOption(option_id="allow_once", kind="allow_once", name="Allow once"),
            PermissionOption(option_id="allow_always", kind="allow_always", name="Allow always"),
            PermissionOption(option_id="deny", kind="reject_once", name="Deny"),
        ]
        import acp as _acp

        future = None
        try:
            tool_call = _acp.start_tool_call("perm-check", command, kind="execute")
            coro = request_permission_fn(
                session_id=session_id,
                tool_call=tool_call,
                options=options,
            )
            future = asyncio.run_coroutine_threadsafe(coro, loop)
            response = future.result(timeout=timeout)
        except Exception as exc:  # includes concurrent.futures timeouts
            # We are about to deny, so do not leave the request pending on
            # the loop; cancel() is a no-op if the future already finished.
            if future is not None:
                future.cancel()
            logger.warning("Permission request timed out or failed: %s", exc)
            return "deny"

        outcome = getattr(response, "outcome", None)
        if not isinstance(outcome, AllowedOutcome):
            return "deny"

        # Translate the selected option back to its kind.  An id we never
        # offered must not grant permission, so it is treated as a denial.
        kind_by_id = {opt.option_id: opt.kind for opt in options}
        selected_kind = kind_by_id.get(outcome.option_id)
        if selected_kind is None:
            logger.warning(
                "Permission response selected unknown option %r; denying",
                outcome.option_id,
            )
            return "deny"
        return _KIND_TO_HERMES.get(selected_kind, "deny")

    return _callback
|
||||
479
acp_adapter/server.py
Normal file
479
acp_adapter/server.py
Normal file
@@ -0,0 +1,479 @@
|
||||
"""ACP agent server — exposes Hermes Agent via the Agent Client Protocol."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from collections import defaultdict, deque
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Any, Deque, Optional
|
||||
|
||||
import acp
|
||||
from acp.schema import (
|
||||
AgentCapabilities,
|
||||
AuthenticateResponse,
|
||||
AuthMethod,
|
||||
ClientCapabilities,
|
||||
EmbeddedResourceContentBlock,
|
||||
ForkSessionResponse,
|
||||
ImageContentBlock,
|
||||
AudioContentBlock,
|
||||
Implementation,
|
||||
InitializeResponse,
|
||||
ListSessionsResponse,
|
||||
LoadSessionResponse,
|
||||
NewSessionResponse,
|
||||
PromptResponse,
|
||||
ResumeSessionResponse,
|
||||
ResourceContentBlock,
|
||||
SessionCapabilities,
|
||||
SessionForkCapabilities,
|
||||
SessionListCapabilities,
|
||||
SessionInfo,
|
||||
TextContentBlock,
|
||||
Usage,
|
||||
)
|
||||
|
||||
from acp_adapter.auth import detect_provider, has_provider
|
||||
from acp_adapter.events import (
|
||||
make_message_cb,
|
||||
make_step_cb,
|
||||
make_thinking_cb,
|
||||
make_tool_progress_cb,
|
||||
)
|
||||
from acp_adapter.permissions import make_approval_callback
|
||||
from acp_adapter.session import SessionManager, SessionState
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
from hermes_cli import __version__ as HERMES_VERSION
|
||||
except Exception:
|
||||
HERMES_VERSION = "0.0.0"
|
||||
|
||||
# Thread pool for running AIAgent (synchronous) in parallel.
|
||||
_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="acp-agent")
|
||||
|
||||
|
||||
def _extract_text(
    prompt: list[
        TextContentBlock
        | ImageContentBlock
        | AudioContentBlock
        | ResourceContentBlock
        | EmbeddedResourceContentBlock
    ],
) -> str:
    """Join the textual parts of an ACP prompt with newlines.

    ``TextContentBlock`` text is used as-is; any other block exposing a
    ``text`` attribute contributes ``str(block.text)``.  Blocks without text
    are ignored for now.
    """
    pieces: list[str] = [
        block.text if isinstance(block, TextContentBlock) else str(block.text)
        for block in prompt
        if isinstance(block, TextContentBlock) or hasattr(block, "text")
    ]
    return "\n".join(pieces)
|
||||
|
||||
|
||||
class HermesACPAgent(acp.Agent):
|
||||
"""ACP Agent implementation wrapping Hermes AIAgent."""
|
||||
|
||||
def __init__(self, session_manager: SessionManager | None = None):
    """Create the agent, using ``session_manager`` or a fresh ``SessionManager``."""
    super().__init__()
    if not session_manager:
        session_manager = SessionManager()
    self.session_manager = session_manager
    # Assigned in ``on_connect``.
    self._conn: Optional[acp.Client] = None
|
||||
|
||||
# ---- Connection lifecycle -----------------------------------------------
|
||||
|
||||
def on_connect(self, conn: acp.Client) -> None:
    """Remember the client connection used to push session updates."""
    logger.info("ACP client connected")
    self._conn = conn
|
||||
|
||||
# ---- ACP lifecycle ------------------------------------------------------
|
||||
|
||||
async def initialize(
    self,
    protocol_version: int,
    client_capabilities: ClientCapabilities | None = None,
    client_info: Implementation | None = None,
    **kwargs: Any,
) -> InitializeResponse:
    """Answer the ACP ``initialize`` handshake.

    Advertises fork and list session capabilities and, when a provider is
    detected, a single auth method named after that provider.  The response
    always reports ``acp.PROTOCOL_VERSION``; the client's
    ``protocol_version`` is only logged.
    """
    provider = detect_provider()
    if provider:
        auth_methods = [
            AuthMethod(
                id=provider,
                name=f"{provider} runtime credentials",
                description=f"Authenticate Hermes using the currently configured {provider} runtime credentials.",
            )
        ]
    else:
        auth_methods = None

    if client_info:
        client_name = client_info.name
    else:
        client_name = "unknown"
    logger.info("Initialize from %s (protocol v%s)", client_name, protocol_version)

    session_caps = SessionCapabilities(
        fork=SessionForkCapabilities(),
        list=SessionListCapabilities(),
    )
    return InitializeResponse(
        protocol_version=acp.PROTOCOL_VERSION,
        agent_info=Implementation(name="hermes-agent", version=HERMES_VERSION),
        agent_capabilities=AgentCapabilities(session_capabilities=session_caps),
        auth_methods=auth_methods,
    )
|
||||
|
||||
async def authenticate(self, method_id: str, **kwargs: Any) -> AuthenticateResponse | None:
    """Succeed when any provider credentials resolve; otherwise return ``None``.

    ``method_id`` is accepted but not inspected.
    """
    if not has_provider():
        return None
    return AuthenticateResponse()
|
||||
|
||||
# ---- Session management -------------------------------------------------
|
||||
|
||||
async def new_session(
    self,
    cwd: str,
    mcp_servers: list | None = None,
    **kwargs: Any,
) -> NewSessionResponse:
    """Create a session rooted at ``cwd`` and return its id.

    ``mcp_servers`` is accepted but not used here.
    """
    created = self.session_manager.create_session(cwd=cwd)
    new_id = created.session_id
    logger.info("New session %s (cwd=%s)", new_id, cwd)
    return NewSessionResponse(session_id=created.session_id)
|
||||
|
||||
async def load_session(
    self,
    cwd: str,
    session_id: str,
    mcp_servers: list | None = None,
    **kwargs: Any,
) -> LoadSessionResponse | None:
    """Point an existing session at ``cwd``.

    Returns ``None`` (after logging a warning) when the session manager does
    not know ``session_id``.
    """
    found = self.session_manager.update_cwd(session_id, cwd)
    if found is not None:
        logger.info("Loaded session %s", session_id)
        return LoadSessionResponse()

    logger.warning("load_session: session %s not found", session_id)
    return None
|
||||
|
||||
async def resume_session(
    self,
    cwd: str,
    session_id: str,
    mcp_servers: list | None = None,
    **kwargs: Any,
) -> ResumeSessionResponse:
    """Resume ``session_id`` at ``cwd``, creating a new session if unknown.

    NOTE(review): when a replacement session is created its id differs from
    the requested one, and the response built here carries no id — confirm
    clients can discover the new id.
    """
    resumed = self.session_manager.update_cwd(session_id, cwd)
    if resumed is None:
        logger.warning("resume_session: session %s not found, creating new", session_id)
        resumed = self.session_manager.create_session(cwd=cwd)

    logger.info("Resumed session %s", resumed.session_id)
    return ResumeSessionResponse()
|
||||
|
||||
async def cancel(self, session_id: str, **kwargs: Any) -> None:
    """Signal cancellation for a session and try to interrupt its agent.

    Does nothing when the session is unknown or has no cancel event.
    Failures while interrupting the agent are logged at DEBUG and ignored.
    """
    state = self.session_manager.get_session(session_id)
    if not (state and state.cancel_event):
        return

    state.cancel_event.set()
    try:
        if getattr(state, "agent", None) and hasattr(state.agent, "interrupt"):
            state.agent.interrupt()
    except Exception:
        logger.debug("Failed to interrupt ACP session %s", session_id, exc_info=True)
    logger.info("Cancelled session %s", session_id)
|
||||
|
||||
async def fork_session(
    self,
    cwd: str,
    session_id: str,
    mcp_servers: list | None = None,
    **kwargs: Any,
) -> ForkSessionResponse:
    """Fork ``session_id`` into a new session rooted at ``cwd``.

    When the session manager returns a falsy result, the response carries an
    empty session id.
    """
    forked = self.session_manager.fork_session(session_id, cwd=cwd)
    if forked:
        new_id = forked.session_id
    else:
        new_id = ""
    logger.info("Forked session %s -> %s", session_id, new_id)
    return ForkSessionResponse(session_id=new_id)
|
||||
|
||||
async def list_sessions(
    self,
    cursor: str | None = None,
    cwd: str | None = None,
    **kwargs: Any,
) -> ListSessionsResponse:
    """Return every session known to the session manager.

    ``cursor`` and ``cwd`` are accepted but not applied; the full list is
    returned in one response.
    """
    sessions = []
    for entry in self.session_manager.list_sessions():
        sessions.append(SessionInfo(session_id=entry["session_id"], cwd=entry["cwd"]))
    return ListSessionsResponse(sessions=sessions)
|
||||
|
||||
# ---- Prompt (core) ------------------------------------------------------
|
||||
|
||||
async def prompt(
    self,
    prompt: list[
        TextContentBlock
        | ImageContentBlock
        | AudioContentBlock
        | ResourceContentBlock
        | EmbeddedResourceContentBlock
    ],
    session_id: str,
    **kwargs: Any,
) -> PromptResponse:
    """Run Hermes on the user's prompt and stream events back to the editor.

    Flow:
      1. Look up the session; an unknown id yields ``stop_reason="refusal"``.
      2. Extract text from the prompt; empty text ends the turn immediately.
      3. Recognized slash commands are answered locally without the model.
      4. Otherwise the session's agent is wired to ACP callbacks and
         ``run_conversation`` is executed on the shared thread pool.
      5. History is updated, the final response is sent, and token usage is
         reported when the agent result provides it.

    Returns:
        A ``PromptResponse`` whose ``stop_reason`` is ``"refusal"``,
        ``"end_turn"`` or ``"cancelled"`` (the latter when the session's
        cancel event is set once the agent returns).
    """
    state = self.session_manager.get_session(session_id)
    if state is None:
        logger.error("prompt: session %s not found", session_id)
        return PromptResponse(stop_reason="refusal")

    user_text = _extract_text(prompt).strip()
    if not user_text:
        return PromptResponse(stop_reason="end_turn")

    # Intercept slash commands — handle locally without calling the LLM
    if user_text.startswith("/"):
        response_text = self._handle_slash_command(user_text, state)
        # ``None`` means "not a known command": fall through to the model.
        if response_text is not None:
            if self._conn:
                update = acp.update_agent_message_text(response_text)
                await self._conn.session_update(session_id, update)
            return PromptResponse(stop_reason="end_turn")

    logger.info("Prompt on session %s: %s", session_id, user_text[:100])

    conn = self._conn
    loop = asyncio.get_running_loop()

    # Reset any cancellation left over from a previous turn.
    if state.cancel_event:
        state.cancel_event.clear()

    # Shared between the tool-progress and step callbacks so tool
    # completions can be matched to the ids issued at tool start.
    tool_call_ids: dict[str, Deque[str]] = defaultdict(deque)
    previous_approval_cb = None

    if conn:
        tool_progress_cb = make_tool_progress_cb(conn, session_id, loop, tool_call_ids)
        thinking_cb = make_thinking_cb(conn, session_id, loop)
        step_cb = make_step_cb(conn, session_id, loop, tool_call_ids)
        message_cb = make_message_cb(conn, session_id, loop)
        approval_cb = make_approval_callback(conn.request_permission, loop, session_id)
    else:
        # No client connection: run without any streaming callbacks.
        tool_progress_cb = None
        thinking_cb = None
        step_cb = None
        message_cb = None
        approval_cb = None

    agent = state.agent
    agent.tool_progress_callback = tool_progress_cb
    agent.thinking_callback = thinking_cb
    agent.step_callback = step_cb
    agent.message_callback = message_cb

    if approval_cb:
        try:
            from tools import terminal_tool as _terminal_tool
            # Remember the existing callback so it can be restored after the run.
            # NOTE(review): the callback is installed on the terminal_tool
            # module itself, so it is presumably shared by concurrently
            # running sessions — confirm.
            previous_approval_cb = getattr(_terminal_tool, "_approval_callback", None)
            _terminal_tool.set_approval_callback(approval_cb)
        except Exception:
            logger.debug("Could not set ACP approval callback", exc_info=True)

    def _run_agent() -> dict:
        # Runs on an executor thread; agent errors are converted into a
        # result dict rather than raised.
        try:
            result = agent.run_conversation(
                user_message=user_text,
                conversation_history=state.history,
                task_id=session_id,
            )
            return result
        except Exception as e:
            logger.exception("Agent error in session %s", session_id)
            return {"final_response": f"Error: {e}", "messages": state.history}
        finally:
            # Always put back whatever approval callback was active before.
            if approval_cb:
                try:
                    from tools import terminal_tool as _terminal_tool
                    _terminal_tool.set_approval_callback(previous_approval_cb)
                except Exception:
                    logger.debug("Could not restore approval callback", exc_info=True)

    try:
        result = await loop.run_in_executor(_executor, _run_agent)
    except Exception:
        logger.exception("Executor error for session %s", session_id)
        return PromptResponse(stop_reason="end_turn")

    # Keep the existing history when the result carries no messages.
    if result.get("messages"):
        state.history = result["messages"]

    final_response = result.get("final_response", "")
    if final_response and conn:
        update = acp.update_agent_message_text(final_response)
        await conn.session_update(session_id, update)

    usage = None
    usage_data = result.get("usage")
    if usage_data and isinstance(usage_data, dict):
        # Translate the agent's usage keys to the ACP ``Usage`` field names.
        usage = Usage(
            input_tokens=usage_data.get("prompt_tokens", 0),
            output_tokens=usage_data.get("completion_tokens", 0),
            total_tokens=usage_data.get("total_tokens", 0),
            thought_tokens=usage_data.get("reasoning_tokens"),
            cached_read_tokens=usage_data.get("cached_tokens"),
        )

    stop_reason = "cancelled" if state.cancel_event and state.cancel_event.is_set() else "end_turn"
    return PromptResponse(stop_reason=stop_reason, usage=usage)
|
||||
|
||||
# ---- Slash commands (headless) -------------------------------------------
|
||||
|
||||
# Command name (without the leading "/") -> one-line description shown by /help.
_SLASH_COMMANDS = {
    "help": "Show available commands",
    "model": "Show or change current model",
    "tools": "List available tools",
    "context": "Show conversation context info",
    "reset": "Clear conversation history",
    "compact": "Compress conversation context",
    "version": "Show Hermes version",
}
|
||||
|
||||
def _handle_slash_command(self, text: str, state: SessionState) -> str | None:
    """Dispatch a slash command and return the response text.

    Args:
        text: Raw user input such as ``"/model some-model"``.  The command
            name is case-insensitive and leading slashes are stripped.
        state: Session the command applies to.

    Returns:
        The handler's response, an error string if the handler raised, or
        ``None`` for empty input and unrecognized commands so they fall
        through to the LLM (the user may have typed ``/something`` as prose).
    """
    parts = text.split(maxsplit=1)
    if not parts:
        # Empty or whitespace-only input: there is no command to look up.
        return None

    cmd = parts[0].lstrip("/").lower()
    args = parts[1].strip() if len(parts) > 1 else ""

    handler = {
        "help": self._cmd_help,
        "model": self._cmd_model,
        "tools": self._cmd_tools,
        "context": self._cmd_context,
        "reset": self._cmd_reset,
        "compact": self._cmd_compact,
        "version": self._cmd_version,
    }.get(cmd)

    if handler is None:
        return None  # not a known command — let the LLM handle it

    try:
        return handler(args, state)
    except Exception as e:
        # Report the failure to the user instead of aborting the prompt.
        logger.error("Slash command /%s error: %s", cmd, e, exc_info=True)
        return f"Error executing /{cmd}: {e}"
|
||||
|
||||
def _cmd_help(self, args: str, state: SessionState) -> str:
    """Render the ``/help`` listing: one line per entry in ``_SLASH_COMMANDS``."""
    entries = [
        f" /{name:10s} {summary}"
        for name, summary in self._SLASH_COMMANDS.items()
    ]
    footer = "Unrecognized /commands are sent to the model as normal messages."
    return "\n".join(["Available commands:", "", *entries, "", footer])
|
||||
|
||||
def _cmd_model(self, args: str, state: SessionState) -> str:
    """Show the session's current model, or switch it to a new one.

    Args:
        args: Empty to report the current model and provider; otherwise the
            requested model name.
        state: Session whose ``model`` and ``agent`` are read or replaced.

    Returns:
        A short status message for the user.

    Raises:
        Exception: Whatever ``session_manager._make_agent`` raises.  In that
            case ``state`` is left untouched.
    """
    if not args:
        model = state.model or getattr(state.agent, "model", "unknown")
        provider = getattr(state.agent, "provider", None) or "auto"
        return f"Current model: {model}\nProvider: {provider}"

    new_model = args.strip()
    target_provider = None

    # Auto-detect provider for the requested model (best effort: on any
    # failure the name is used exactly as typed).
    try:
        from hermes_cli.models import parse_model_input, detect_provider_for_model

        current_provider = getattr(state.agent, "provider", None) or "openrouter"
        target_provider, new_model = parse_model_input(new_model, current_provider)
        if target_provider == current_provider:
            detected = detect_provider_for_model(new_model, current_provider)
            if detected:
                target_provider, new_model = detected
    except Exception:
        logger.debug("Provider detection failed, using model as-is", exc_info=True)

    # Build the replacement agent BEFORE touching the session so a failed
    # build cannot leave state.model pointing at a model that the (old)
    # state.agent is not actually using.
    new_agent = self.session_manager._make_agent(
        session_id=state.session_id,
        cwd=state.cwd,
        model=new_model,
    )
    state.agent = new_agent
    state.model = new_model

    # NOTE(review): target_provider only feeds the label below; it is not
    # passed to _make_agent, so the agent's real provider may differ.
    provider_label = target_provider or getattr(state.agent, "provider", "auto")
    logger.info("Session %s: model switched to %s", state.session_id, new_model)
    return f"Model switched to: {new_model}\nProvider: {provider_label}"
|
||||
|
||||
def _cmd_tools(self, args: str, state: SessionState) -> str:
    """List the tools exposed by the agent's enabled toolsets.

    Any failure (including the lazy ``model_tools`` import) is reported as
    a ``Could not list tools: ...`` message rather than raised.
    """
    try:
        from model_tools import get_tool_definitions

        active = getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"]
        definitions = get_tool_definitions(enabled_toolsets=active, quiet_mode=True)
        if not definitions:
            return "No tools available."

        report = [f"Available tools ({len(definitions)}):"]
        for entry in definitions:
            tool_name = entry.get("function", {}).get("name", "?")
            summary = entry.get("function", {}).get("description", "")
            # Truncate long descriptions
            summary = summary[:77] + "..." if len(summary) > 80 else summary
            report.append(f" {tool_name}: {summary}")
        return "\n".join(report)
    except Exception as e:
        return f"Could not list tools: {e}"
|
||||
|
||||
def _cmd_context(self, args: str, state: SessionState) -> str:
    """Summarize the conversation: message total, per-role counts, and model."""
    total = len(state.history)
    if not total:
        return "Conversation is empty (no messages yet)."

    # Count by role
    seen_roles = [entry.get("role", "unknown") for entry in state.history]
    count = seen_roles.count

    report = [
        f"Conversation: {total} messages",
        f" user: {count('user')}, assistant: {count('assistant')}, "
        f"tool: {count('tool')}, system: {count('system')}",
    ]
    active_model = state.model or getattr(state.agent, "model", "")
    if active_model:
        report.append(f"Model: {active_model}")
    return "\n".join(report)
|
||||
|
||||
def _cmd_reset(self, args: str, state: SessionState) -> str:
    """Empty the session's history in place and confirm to the user."""
    history = state.history
    history.clear()  # in place: other references to the list see it emptied
    return "Conversation history cleared."
|
||||
|
||||
def _cmd_compact(self, args: str, state: SessionState) -> str:
    """Compress the session's conversation history via the agent.

    Args:
        args: Unused.
        state: Session whose ``history`` is compressed using
            ``state.agent.compress_context`` when the agent provides it.

    Returns:
        A status message; failures are reported as text, never raised.
    """
    if not state.history:
        return "Nothing to compress — conversation is empty."
    try:
        agent = state.agent
        if not hasattr(agent, "compress_context"):
            return "Context compression not available for this agent."

        outcome = agent.compress_context(state.history)
        # A compressor may mutate the list in place (returning None) or hand
        # back a new list.  Adopt a returned list so the compression is not
        # silently discarded and the count below reflects reality.
        # NOTE(review): the tuple form (messages first) is an assumption
        # about the agent API — confirm against AIAgent.
        if isinstance(outcome, tuple) and outcome:
            outcome = outcome[0]
        if isinstance(outcome, list):
            state.history = outcome
        return f"Context compressed. Messages: {len(state.history)}"
    except Exception as e:
        return f"Compression failed: {e}"
|
||||
|
||||
def _cmd_version(self, args: str, state: SessionState) -> str:
    """Return the version banner built from ``HERMES_VERSION``; ``args`` and ``state`` are unused."""
    return f"Hermes Agent v{HERMES_VERSION}"
|
||||
|
||||
# ---- Model switching (ACP protocol method) -------------------------------
|
||||
|
||||
async def set_session_model(
    self, model_id: str, session_id: str, **kwargs: Any
):
    """Switch the model for a session (called by ACP protocol).

    Args:
        model_id: Model name to switch to.
        session_id: Session to update.  Unknown ids are ignored.
        **kwargs: Accepted and ignored.

    Returns:
        Always ``None``.

    Raises:
        Exception: Whatever ``session_manager._make_agent`` raises; the
            session is left unchanged in that case.
    """
    state = self.session_manager.get_session(session_id)
    if state:
        # Build first, then commit both fields together, so a failed build
        # cannot leave state.model out of step with state.agent.
        new_agent = self.session_manager._make_agent(
            session_id=session_id,
            cwd=state.cwd,
            model=model_id,
        )
        state.agent = new_agent
        state.model = model_id
        logger.info("Session %s: model switched to %s", session_id, model_id)
    return None
|
||||
203
acp_adapter/session.py
Normal file
203
acp_adapter/session.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""ACP session manager — maps ACP sessions to Hermes AIAgent instances."""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from threading import Lock
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _register_task_cwd(task_id: str, cwd: str) -> None:
    """Register *cwd* as the terminal tool's ``cwd`` override for *task_id*.

    Best effort: a falsy *task_id* is a no-op, and any failure (including
    the lazy import) is logged at debug level and swallowed.
    """
    if task_id:
        try:
            from tools.terminal_tool import register_task_env_overrides

            register_task_env_overrides(task_id, {"cwd": cwd})
        except Exception:
            logger.debug("Failed to register ACP task cwd override", exc_info=True)
|
||||
|
||||
|
||||
def _clear_task_cwd(task_id: str) -> None:
    """Drop the terminal tool's env overrides registered for *task_id*.

    Best effort: a falsy *task_id* is a no-op, and any failure (including
    the lazy import) is logged at debug level and swallowed.
    """
    if task_id:
        try:
            from tools.terminal_tool import clear_task_env_overrides

            clear_task_env_overrides(task_id)
        except Exception:
            logger.debug("Failed to clear ACP task cwd override", exc_info=True)
|
||||
|
||||
|
||||
@dataclass
class SessionState:
    """Tracks per-session state for an ACP-managed Hermes agent."""

    # Unique session id; SessionManager generates it with uuid4.
    session_id: str
    agent: Any  # AIAgent instance
    # Working directory that SessionManager registers for the session's tools.
    cwd: str = "."
    # Model name; SessionManager fills it from the agent's ``model`` attribute.
    model: str = ""
    # Conversation messages (dicts); a fresh list per instance.
    history: List[Dict[str, Any]] = field(default_factory=list)
    cancel_event: Any = None  # threading.Event
|
||||
|
||||
|
||||
class SessionManager:
    """Thread-safe manager for ACP sessions backed by Hermes AIAgent instances.

    The session registry is guarded by ``self._lock``.  Agent construction
    always happens outside the lock so one slow build cannot stall lookups
    for every other session.
    """

    def __init__(self, agent_factory=None):
        """
        Args:
            agent_factory: Optional callable that creates an AIAgent-like object.
                Used by tests. When omitted, a real AIAgent is created
                using the current Hermes runtime provider configuration.
        """
        self._sessions: Dict[str, SessionState] = {}
        self._lock = Lock()
        self._agent_factory = agent_factory

    # ---- public API ---------------------------------------------------------

    def create_session(self, cwd: str = ".") -> SessionState:
        """Create a new session with a unique ID and a fresh AIAgent."""
        import threading

        session_id = str(uuid.uuid4())
        agent = self._make_agent(session_id=session_id, cwd=cwd)
        state = SessionState(
            session_id=session_id,
            agent=agent,
            cwd=cwd,
            model=getattr(agent, "model", "") or "",
            cancel_event=threading.Event(),
        )
        with self._lock:
            self._sessions[session_id] = state
        # Registered here as well because a test agent_factory bypasses the
        # registration done inside _make_agent.
        _register_task_cwd(session_id, cwd)
        logger.info("Created ACP session %s (cwd=%s)", session_id, cwd)
        return state

    def get_session(self, session_id: str) -> Optional[SessionState]:
        """Return the session for *session_id*, or ``None``."""
        with self._lock:
            return self._sessions.get(session_id)

    def remove_session(self, session_id: str) -> bool:
        """Remove a session. Returns True if it existed."""
        with self._lock:
            existed = self._sessions.pop(session_id, None) is not None
        if existed:
            _clear_task_cwd(session_id)
        return existed

    def fork_session(self, session_id: str, cwd: str = ".") -> Optional[SessionState]:
        """Deep-copy a session's history into a new session.

        Returns the new session, or ``None`` when *session_id* is unknown.
        """
        import threading

        # Phase 1 (locked): look up the source and snapshot what we need.
        with self._lock:
            original = self._sessions.get(session_id)
            if original is None:
                return None
            source_model = original.model
            history_copy = copy.deepcopy(original.history)

        # Phase 2 (unlocked): build the agent, mirroring create_session.
        new_id = str(uuid.uuid4())
        agent = self._make_agent(
            session_id=new_id,
            cwd=cwd,
            model=source_model or None,
        )
        state = SessionState(
            session_id=new_id,
            agent=agent,
            cwd=cwd,
            model=getattr(agent, "model", source_model) or source_model,
            history=history_copy,
            cancel_event=threading.Event(),
        )

        # Phase 3 (locked): publish the new session.
        with self._lock:
            self._sessions[new_id] = state
        _register_task_cwd(new_id, cwd)
        logger.info("Forked ACP session %s -> %s", session_id, new_id)
        return state

    def list_sessions(self) -> List[Dict[str, Any]]:
        """Return lightweight info dicts for all sessions."""
        with self._lock:
            return [
                {
                    "session_id": s.session_id,
                    "cwd": s.cwd,
                    "model": s.model,
                    "history_len": len(s.history),
                }
                for s in self._sessions.values()
            ]

    def update_cwd(self, session_id: str, cwd: str) -> Optional[SessionState]:
        """Update the working directory for a session and its tool overrides."""
        with self._lock:
            state = self._sessions.get(session_id)
            if state is None:
                return None
            state.cwd = cwd
        _register_task_cwd(session_id, cwd)
        return state

    def cleanup(self) -> None:
        """Remove all sessions and clear task-specific cwd overrides."""
        with self._lock:
            session_ids = list(self._sessions.keys())
            self._sessions.clear()
        for session_id in session_ids:
            _clear_task_cwd(session_id)

    # ---- internal -----------------------------------------------------------

    def _make_agent(
        self,
        *,
        session_id: str,
        cwd: str,
        model: str | None = None,
    ):
        """Build the agent for a session.

        Uses ``agent_factory`` when one was supplied; otherwise creates a
        real ``AIAgent`` from the Hermes config, falling back to the agent's
        own provider resolution when the runtime provider cannot be resolved.
        """
        if self._agent_factory is not None:
            return self._agent_factory()

        from run_agent import AIAgent
        from hermes_cli.config import load_config
        from hermes_cli.runtime_provider import resolve_runtime_provider

        config = load_config()
        model_cfg = config.get("model")
        default_model = "anthropic/claude-opus-4.6"
        requested_provider = None
        # The "model" config entry may be a dict or a bare model string.
        if isinstance(model_cfg, dict):
            default_model = str(model_cfg.get("default") or default_model)
            requested_provider = model_cfg.get("provider")
        elif isinstance(model_cfg, str) and model_cfg.strip():
            default_model = model_cfg.strip()

        kwargs = {
            "platform": "acp",
            "enabled_toolsets": ["hermes-acp"],
            "quiet_mode": True,
            "session_id": session_id,
            "model": model or default_model,
        }

        try:
            runtime = resolve_runtime_provider(requested=requested_provider)
            kwargs.update(
                {
                    "provider": runtime.get("provider"),
                    "api_mode": runtime.get("api_mode"),
                    "base_url": runtime.get("base_url"),
                    "api_key": runtime.get("api_key"),
                }
            )
        except Exception:
            logger.debug("ACP session falling back to default provider resolution", exc_info=True)

        _register_task_cwd(session_id, cwd)
        return AIAgent(**kwargs)
|
||||
215
acp_adapter/tools.py
Normal file
215
acp_adapter/tools.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""ACP tool-call helpers for mapping hermes tools to ACP ToolKind and building content."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import acp
|
||||
from acp.schema import (
|
||||
ToolCallLocation,
|
||||
ToolCallStart,
|
||||
ToolCallProgress,
|
||||
ToolKind,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Map hermes tool names -> ACP ToolKind
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lookup table used by get_tool_kind(); tool names missing from it are
# reported as "other".
TOOL_KIND_MAP: Dict[str, ToolKind] = {
    # File operations
    "read_file": "read",
    "write_file": "edit",
    "patch": "edit",
    "search_files": "search",
    # Terminal / execution
    "terminal": "execute",
    "process": "execute",
    "execute_code": "execute",
    # Web / fetch
    "web_search": "fetch",
    "web_extract": "fetch",
    # Browser
    "browser_navigate": "fetch",
    "browser_click": "execute",
    "browser_type": "execute",
    "browser_snapshot": "read",
    "browser_vision": "read",
    "browser_scroll": "execute",
    "browser_press": "execute",
    "browser_back": "execute",
    "browser_close": "execute",
    "browser_get_images": "read",
    # Agent internals
    "delegate_task": "execute",
    "vision_analyze": "read",
    "image_generate": "execute",
    "text_to_speech": "execute",
    # Thinking / meta
    "_thinking": "think",
}
|
||||
|
||||
|
||||
def get_tool_kind(tool_name: str) -> ToolKind:
    """Look up the ACP ToolKind for *tool_name*; unmapped tools are ``"other"``."""
    try:
        return TOOL_KIND_MAP[tool_name]
    except KeyError:
        return "other"
|
||||
|
||||
|
||||
def make_tool_call_id() -> str:
    """Return a fresh tool-call ID: ``tc-`` plus 12 hex digits of a random UUID."""
    suffix = uuid.uuid4().hex[:12]
    return "tc-" + suffix
|
||||
|
||||
|
||||
def _title_text(value: Any, default: str = "") -> str:
    """Coerce a tool argument to text; ``None`` becomes *default*."""
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)


def _ellipsize(text: str, limit: int) -> str:
    """Cut *text* to *limit* characters, ending in ``...`` when shortened."""
    return text[: limit - 3] + "..." if len(text) > limit else text


def build_tool_title(tool_name: str, args: Dict[str, Any]) -> str:
    """Build a human-readable title for a tool call.

    Args:
        tool_name: Hermes tool name.
        args: Tool arguments.  Values used for truncation or slicing are
            coerced to text first, so a null or non-string value yields a
            title instead of a ``TypeError``.

    Returns:
        A short one-line title; unknown tools use *tool_name* itself.
    """
    if tool_name == "terminal":
        return f"terminal: {_ellipsize(_title_text(args.get('command')), 80)}"
    if tool_name == "read_file":
        return f"read: {args.get('path', '?')}"
    if tool_name == "write_file":
        return f"write: {args.get('path', '?')}"
    if tool_name == "patch":
        mode = args.get("mode", "replace")
        path = args.get("path", "?")
        return f"patch ({mode}): {path}"
    if tool_name == "search_files":
        return f"search: {args.get('pattern', '?')}"
    if tool_name == "web_search":
        return f"web search: {args.get('query', '?')}"
    if tool_name == "web_extract":
        urls = args.get("urls", [])
        if isinstance(urls, str):
            # A bare string would otherwise be indexed character by character.
            urls = [urls]
        if urls:
            return f"extract: {urls[0]}" + (f" (+{len(urls)-1})" if len(urls) > 1 else "")
        return "web extract"
    if tool_name == "delegate_task":
        goal = _ellipsize(_title_text(args.get("goal")), 60)
        return f"delegate: {goal}" if goal else "delegate task"
    if tool_name == "execute_code":
        return "execute code"
    if tool_name == "vision_analyze":
        return f"analyze image: {_title_text(args.get('question'), '?')[:50]}"
    return tool_name
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build ACP content objects for tool-call events
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_tool_start(
    tool_call_id: str,
    tool_name: str,
    arguments: Dict[str, Any],
) -> ToolCallStart:
    """Build the ACP ToolCallStart event announcing a hermes tool invocation.

    The event's single content item depends on the tool: a diff for
    ``patch`` (replace mode) and ``write_file``, a short text line for
    ``terminal`` / ``read_file`` / ``search_files`` and non-replace patches,
    and the JSON-rendered arguments for everything else.
    """
    kind = get_tool_kind(tool_name)
    title = build_tool_title(tool_name, arguments)
    locations = extract_locations(arguments)

    if tool_name == "patch":
        if arguments.get("mode", "replace") == "replace":
            item = acp.tool_diff_content(
                path=arguments.get("path", ""),
                new_text=arguments.get("new_string", ""),
                old_text=arguments.get("old_string", ""),
            )
        else:
            # Patch mode — show the patch content as text
            item = acp.tool_content(acp.text_block(arguments.get("patch", "")))
    elif tool_name == "write_file":
        item = acp.tool_diff_content(
            path=arguments.get("path", ""),
            new_text=arguments.get("content", ""),
        )
    elif tool_name == "terminal":
        command = arguments.get("command", "")
        item = acp.tool_content(acp.text_block(f"$ {command}"))
    elif tool_name == "read_file":
        path = arguments.get("path", "")
        item = acp.tool_content(acp.text_block(f"Reading {path}"))
    elif tool_name == "search_files":
        pattern = arguments.get("pattern", "")
        target = arguments.get("target", "content")
        item = acp.tool_content(acp.text_block(f"Searching for '{pattern}' ({target})"))
    else:
        # Generic fallback
        import json

        try:
            rendered = json.dumps(arguments, indent=2, default=str)
        except (TypeError, ValueError):
            rendered = str(arguments)
        item = acp.tool_content(acp.text_block(rendered))

    return acp.start_tool_call(
        tool_call_id, title, kind=kind, content=[item], locations=locations,
        raw_input=arguments,
    )
|
||||
|
||||
|
||||
def build_tool_complete(
    tool_call_id: str,
    tool_name: str,
    result: Optional[str] = None,
) -> ToolCallProgress:
    """Build the ACP tool-call update that marks a tool call ``completed``.

    The displayed text is cut to 4900 characters plus a truncation note
    when the result exceeds 5000 characters; ``raw_output`` always carries
    the untouched *result*.
    """
    kind = get_tool_kind(tool_name)

    # Truncate very large results for the UI
    shown = result if result else ""
    if len(shown) > 5000:
        note = f"\n... ({len(result)} chars total, truncated)"
        shown = shown[:4900] + note

    return acp.update_tool_call(
        tool_call_id,
        kind=kind,
        status="completed",
        content=[acp.tool_content(acp.text_block(shown))],
        raw_output=result,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Location extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extract_locations(
    arguments: Dict[str, Any],
) -> List[ToolCallLocation]:
    """Return the file location named by a tool's ``path`` argument, if any.

    The line comes from ``offset`` when truthy, otherwise from ``line``.
    An absent or empty ``path`` gives an empty list.
    """
    target = arguments.get("path")
    if not target:
        return []
    line_no = arguments.get("offset") or arguments.get("line")
    return [ToolCallLocation(path=target, line=line_no)]
|
||||
12
acp_registry/agent.json
Normal file
12
acp_registry/agent.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"name": "hermes-agent",
|
||||
"display_name": "Hermes Agent",
|
||||
"description": "AI agent by Nous Research with 90+ tools, persistent memory, and multi-platform support",
|
||||
"icon": "icon.svg",
|
||||
"distribution": {
|
||||
"type": "command",
|
||||
"command": "hermes",
|
||||
"args": ["acp"]
|
||||
}
|
||||
}
|
||||
25
acp_registry/icon.svg
Normal file
25
acp_registry/icon.svg
Normal file
@@ -0,0 +1,25 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="64" height="64">
|
||||
<defs>
|
||||
<linearGradient id="gold" x1="0%" y1="0%" x2="0%" y2="100%">
|
||||
<stop offset="0%" style="stop-color:#F5C542;stop-opacity:1" />
|
||||
<stop offset="100%" style="stop-color:#D4961C;stop-opacity:1" />
|
||||
</linearGradient>
|
||||
</defs>
|
||||
<!-- Staff -->
|
||||
<rect x="30" y="10" width="4" height="46" rx="2" fill="url(#gold)" />
|
||||
<!-- Wings (left) -->
|
||||
<path d="M30 18 C24 14, 14 14, 10 18 C14 16, 22 16, 28 20" fill="#F5C542" opacity="0.9" />
|
||||
<path d="M30 22 C26 19, 18 19, 14 22 C18 20, 24 20, 28 24" fill="#D4961C" opacity="0.8" />
|
||||
<!-- Wings (right) -->
|
||||
<path d="M34 18 C40 14, 50 14, 54 18 C50 16, 42 16, 36 20" fill="#F5C542" opacity="0.9" />
|
||||
<path d="M34 22 C38 19, 46 19, 50 22 C46 20, 40 20, 36 24" fill="#D4961C" opacity="0.8" />
|
||||
<!-- Left serpent -->
|
||||
<path d="M32 48 C22 44, 20 38, 26 34 C20 36, 18 42, 24 46 C18 40, 22 30, 30 28 C24 32, 22 38, 28 42"
|
||||
fill="none" stroke="#F5C542" stroke-width="2.5" stroke-linecap="round" />
|
||||
<!-- Right serpent -->
|
||||
<path d="M32 48 C42 44, 44 38, 38 34 C44 36, 46 42, 40 46 C46 40, 42 30, 34 28 C40 32, 42 38, 36 42"
|
||||
fill="none" stroke="#D4961C" stroke-width="2.5" stroke-linecap="round" />
|
||||
<!-- Orb at top -->
|
||||
<circle cx="32" cy="10" r="4" fill="#F5C542" />
|
||||
<circle cx="32" cy="10" r="2" fill="#FFF8E1" opacity="0.7" />
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.4 KiB |
6
agent/__init__.py
Normal file
6
agent/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Agent internals -- extracted modules from run_agent.py.
|
||||
|
||||
These modules contain pure utility functions and self-contained classes
|
||||
that were previously embedded in the 3,600-line run_agent.py. Extracting
|
||||
them makes run_agent.py focused on the AIAgent orchestrator class.
|
||||
"""
|
||||
1103
agent/anthropic_adapter.py
Normal file
1103
agent/anthropic_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
1499
agent/auxiliary_client.py
Normal file
1499
agent/auxiliary_client.py
Normal file
File diff suppressed because it is too large
Load Diff
335
agent/context_compressor.py
Normal file
335
agent/context_compressor.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Automatic context window compression for long conversations.
|
||||
|
||||
Summarization is delegated to the shared auxiliary LLM router
(``agent.auxiliary_client.call_llm``, task ``"compression"``), optionally
with a summary-model override. Middle turns are summarized while head and
tail context are protected.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.auxiliary_client import call_llm
|
||||
from agent.model_metadata import (
|
||||
get_model_context_length,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SUMMARY_PREFIX = (
|
||||
"[CONTEXT COMPACTION] Earlier turns in this conversation were compacted "
|
||||
"to save context space. The summary below describes work that was "
|
||||
"already completed, and the current session state may still reflect "
|
||||
"that work (for example, files may already be changed). Use the summary "
|
||||
"and the current state to continue from where things left off, and "
|
||||
"avoid repeating work:"
|
||||
)
|
||||
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
|
||||
|
||||
|
||||
class ContextCompressor:
|
||||
"""Compresses conversation context when approaching the model's context limit.
|
||||
|
||||
Algorithm: protect first N + last N turns, summarize everything in between.
|
||||
Token tracking uses actual counts from API responses for accuracy.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model: str,
    threshold_percent: float = 0.50,
    protect_first_n: int = 3,
    protect_last_n: int = 4,
    summary_target_tokens: int = 2500,
    quiet_mode: bool = False,
    summary_model_override: Optional[str] = None,
    base_url: str = "",
):
    """Configure the compressor for *model*.

    Args:
        model: Model name; used to look up the context length.
        threshold_percent: Fraction of the context length at which
            compression should trigger.
        protect_first_n: Stored as-is; presumably the number of leading
            turns kept verbatim (used outside this method).
        protect_last_n: Stored as-is; presumably the number of trailing
            turns kept verbatim (used outside this method).
        summary_target_tokens: Target summary size quoted in the prompt;
            twice this value is the summary call's ``max_tokens``.
        quiet_mode: Suppresses informational log lines when true.
        summary_model_override: Model for the summary call; when empty or
            ``None`` no explicit model is passed to the LLM router.
        base_url: Passed through to the context-length lookup.
    """
    self.model = model
    self.base_url = base_url
    self.threshold_percent = threshold_percent
    self.protect_first_n = protect_first_n
    self.protect_last_n = protect_last_n
    self.summary_target_tokens = summary_target_tokens
    self.quiet_mode = quiet_mode

    self.context_length = get_model_context_length(model, base_url=base_url)
    self.threshold_tokens = int(self.context_length * threshold_percent)
    self.compression_count = 0
    self._context_probed = False  # True after a step-down from context error

    # Token counts from the most recent API response (see update_from_response).
    self.last_prompt_tokens = 0
    self.last_completion_tokens = 0
    self.last_total_tokens = 0

    self.summary_model = summary_model_override or ""
|
||||
|
||||
def update_from_response(self, usage: Dict[str, Any]):
    """Record the token counts of the latest API response.

    Each of ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``
    is copied to the matching ``last_*`` attribute, defaulting to 0 when
    the key is absent.
    """
    for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
        setattr(self, f"last_{key}", usage.get(key, 0))
|
||||
|
||||
def should_compress(self, prompt_tokens: int = None) -> bool:
    """Report whether the prompt size has reached the compression threshold.

    With no argument, the prompt-token count of the last recorded
    response is used.
    """
    if prompt_tokens is None:
        prompt_tokens = self.last_prompt_tokens
    return prompt_tokens >= self.threshold_tokens
|
||||
|
||||
def should_compress_preflight(self, messages: List[Dict[str, Any]]) -> bool:
    """Check the threshold before an API call using a rough token estimate of *messages*."""
    return estimate_messages_tokens_rough(messages) >= self.threshold_tokens
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
    """Return a snapshot of compression state for display/logging.

    ``usage_percent`` is the last prompt size as a percentage of the
    context length, or 0 when the context length is falsy.
    """
    window = self.context_length
    used = self.last_prompt_tokens
    percent = (used / window * 100) if window else 0
    return {
        "last_prompt_tokens": used,
        "threshold_tokens": self.threshold_tokens,
        "context_length": window,
        "usage_percent": percent,
        "compression_count": self.compression_count,
    }
|
||||
|
||||
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
    """Generate a concise summary of conversation turns.

    The request goes through ``call_llm`` (task ``"compression"``), which
    handles provider resolution and fallback.

    Args:
        turns_to_summarize: Messages to summarize.  ``content`` may be a
            string, ``None``, or a list of content parts; non-string
            content is flattened to text before prompting.

    Returns:
        The summary wrapped by ``_with_summary_prefix``, or ``None`` when
        the call fails or the model returns no text — the caller should drop
        the middle turns without a summary rather than inject a useless
        placeholder.
    """

    def _as_text(raw: Any) -> str:
        # Flatten message content so the slicing and concatenation below
        # always operate on a str.
        if isinstance(raw, str):
            return raw
        if not raw:
            return ""
        if isinstance(raw, list):
            pieces = [
                str(part.get("text") or "") if isinstance(part, dict) else str(part)
                for part in raw
            ]
            return "\n".join(piece for piece in pieces if piece)
        return str(raw)

    parts = []
    for msg in turns_to_summarize:
        role = str(msg.get("role") or "unknown")
        content = _as_text(msg.get("content"))
        if len(content) > 2000:
            # Keep the head and the tail of very long messages.
            content = content[:1000] + "\n...[truncated]...\n" + content[-500:]
        tool_calls = msg.get("tool_calls", [])
        if tool_calls:
            tool_names = [tc.get("function", {}).get("name", "?") for tc in tool_calls if isinstance(tc, dict)]
            content += f"\n[Tool calls: {', '.join(tool_names)}]"
        parts.append(f"[{role.upper()}]: {content}")

    content_to_summarize = "\n\n".join(parts)
    prompt = f"""Create a concise handoff summary for a later assistant that will continue this conversation after earlier turns are compacted.

Describe:
1. What actions were taken (tool calls, searches, file operations)
2. Key information or results obtained
3. Important decisions, constraints, or user preferences
4. Relevant data, file names, outputs, or next steps needed to continue

Keep it factual, concise, and focused on helping the next assistant resume without repeating work. Target ~{self.summary_target_tokens} tokens.

---
TURNS TO SUMMARIZE:
{content_to_summarize}
---

Write only the summary body. Do not include any preamble or prefix; the system will add the handoff wrapper."""

    # Use the centralized LLM router — handles provider resolution,
    # auth, and fallback internally.
    try:
        call_kwargs = {
            "task": "compression",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3,
            "max_tokens": self.summary_target_tokens * 2,
            "timeout": 30.0,
        }
        if self.summary_model:
            call_kwargs["model"] = self.summary_model
        response = call_llm(**call_kwargs)
        content = response.choices[0].message.content
        # Handle cases where content is not a string (e.g., dict from llama.cpp)
        if not isinstance(content, str):
            content = str(content) if content else ""
        summary = content.strip()
        if not summary:
            # A bare handoff prefix with nothing under it would mislead the
            # next assistant; treat an empty reply as a failed summary.
            logger.warning("Context compression: summary model returned no text. "
                           "Middle turns will be dropped without summary.")
            return None
        return self._with_summary_prefix(summary)
    except RuntimeError:
        logger.warning("Context compression: no provider available for "
                       "summary. Middle turns will be dropped without summary.")
        return None
    except Exception as e:
        logger.warning("Failed to generate context summary: %s", e)
        return None
|
||||
|
||||
@staticmethod
def _with_summary_prefix(summary: str) -> str:
    """Return *summary* wrapped in the current compaction handoff prefix.

    One leading legacy or current prefix is stripped first so the wrapper
    is never doubled; empty input yields the bare prefix.
    """
    body = (summary or "").strip()
    known = next(
        (p for p in (LEGACY_SUMMARY_PREFIX, SUMMARY_PREFIX) if body.startswith(p)),
        None,
    )
    if known is not None:
        body = body[len(known):].lstrip()
    if not body:
        return SUMMARY_PREFIX
    return f"{SUMMARY_PREFIX}\n{body}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tool-call / tool-result pair integrity helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
def _get_tool_call_id(tc) -> str:
    """Extract the call ID from a tool_call entry (dict or SimpleNamespace).

    Returns:
        The entry's ``id``, or ``""`` when it is missing or falsy.  Both the
        dict and the attribute path coalesce to ``""`` so the result always
        honours the ``str`` return type.
    """
    raw = tc.get("id") if isinstance(tc, dict) else getattr(tc, "id", None)
    return raw or ""
|
||||
|
||||
def _sanitize_tool_pairs(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Fix orphaned tool_call / tool_result pairs after compression.
|
||||
|
||||
Two failure modes:
|
||||
1. A tool *result* references a call_id whose assistant tool_call was
|
||||
removed (summarized/truncated). The API rejects this with
|
||||
"No tool call found for function call output with call_id ...".
|
||||
2. An assistant message has tool_calls whose results were dropped.
|
||||
The API rejects this because every tool_call must be followed by
|
||||
a tool result with the matching call_id.
|
||||
|
||||
This method removes orphaned results and inserts stub results for
|
||||
orphaned calls so the message list is always well-formed.
|
||||
"""
|
||||
surviving_call_ids: set = set()
|
||||
for msg in messages:
|
||||
if msg.get("role") == "assistant":
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
cid = self._get_tool_call_id(tc)
|
||||
if cid:
|
||||
surviving_call_ids.add(cid)
|
||||
|
||||
result_call_ids: set = set()
|
||||
for msg in messages:
|
||||
if msg.get("role") == "tool":
|
||||
cid = msg.get("tool_call_id")
|
||||
if cid:
|
||||
result_call_ids.add(cid)
|
||||
|
||||
# 1. Remove tool results whose call_id has no matching assistant tool_call
|
||||
orphaned_results = result_call_ids - surviving_call_ids
|
||||
if orphaned_results:
|
||||
messages = [
|
||||
m for m in messages
|
||||
if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results)
|
||||
]
|
||||
if not self.quiet_mode:
|
||||
logger.info("Compression sanitizer: removed %d orphaned tool result(s)", len(orphaned_results))
|
||||
|
||||
# 2. Add stub results for assistant tool_calls whose results were dropped
|
||||
missing_results = surviving_call_ids - result_call_ids
|
||||
if missing_results:
|
||||
patched: List[Dict[str, Any]] = []
|
||||
for msg in messages:
|
||||
patched.append(msg)
|
||||
if msg.get("role") == "assistant":
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
cid = self._get_tool_call_id(tc)
|
||||
if cid in missing_results:
|
||||
patched.append({
|
||||
"role": "tool",
|
||||
"content": "[Result from earlier conversation — see context summary above]",
|
||||
"tool_call_id": cid,
|
||||
})
|
||||
messages = patched
|
||||
if not self.quiet_mode:
|
||||
logger.info("Compression sanitizer: added %d stub tool result(s)", len(missing_results))
|
||||
|
||||
return messages
|
||||
|
||||
def _align_boundary_forward(self, messages: List[Dict[str, Any]], idx: int) -> int:
|
||||
"""Push a compress-start boundary forward past any orphan tool results.
|
||||
|
||||
If ``messages[idx]`` is a tool result, slide forward until we hit a
|
||||
non-tool message so we don't start the summarised region mid-group.
|
||||
"""
|
||||
while idx < len(messages) and messages[idx].get("role") == "tool":
|
||||
idx += 1
|
||||
return idx
|
||||
|
||||
def _align_boundary_backward(self, messages: List[Dict[str, Any]], idx: int) -> int:
|
||||
"""Pull a compress-end boundary backward to avoid splitting a
|
||||
tool_call / result group.
|
||||
|
||||
If the message just before ``idx`` is an assistant message with
|
||||
tool_calls, those tool results will start at ``idx`` and would be
|
||||
separated from their parent. Move backwards to include the whole
|
||||
group in the summarised region.
|
||||
"""
|
||||
if idx <= 0 or idx >= len(messages):
|
||||
return idx
|
||||
prev = messages[idx - 1]
|
||||
if prev.get("role") == "assistant" and prev.get("tool_calls"):
|
||||
# The results for this assistant turn sit at idx..idx+k.
|
||||
# Include the assistant message in the summarised region too.
|
||||
idx -= 1
|
||||
return idx
|
||||
|
||||
def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
    """Compress conversation messages by summarizing middle turns.

    Keeps the first ``protect_first_n`` and last ``protect_last_n``
    messages and replaces everything in between with one summary message
    (or with nothing, if no summary could be generated). Afterwards,
    orphaned tool_call / tool_result pairs are cleaned up so the API never
    receives mismatched IDs.

    Args:
        messages: Conversation history as a list of message dicts.
        current_tokens: Token count used only for the progress output;
            when falsy, ``self.last_prompt_tokens`` or a rough estimate of
            ``messages`` is used instead.

    Returns:
        ``messages`` itself when there is nothing to compress (too few
        messages, or the aligned boundaries leave an empty middle);
        otherwise a new list built from shallow copies of the kept
        messages. ``self.compression_count`` is incremented only in the
        latter case. Progress is printed unless ``self.quiet_mode`` is set.
    """
    n_messages = len(messages)
    # Need at least one message between the protected head and tail.
    if n_messages <= self.protect_first_n + self.protect_last_n + 1:
        if not self.quiet_mode:
            print(f"⚠️ Cannot compress: only {n_messages} messages (need > {self.protect_first_n + self.protect_last_n + 1})")
        return messages

    # [compress_start, compress_end) is the slice that gets summarised.
    compress_start = self.protect_first_n
    compress_end = n_messages - self.protect_last_n
    if compress_start >= compress_end:
        return messages

    # Adjust boundaries to avoid splitting tool_call/result groups.
    compress_start = self._align_boundary_forward(messages, compress_start)
    compress_end = self._align_boundary_backward(messages, compress_end)
    # Alignment can shrink the middle to nothing.
    if compress_start >= compress_end:
        return messages

    turns_to_summarize = messages[compress_start:compress_end]
    # Only used for the progress output below.
    display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)

    if not self.quiet_mode:
        print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
        print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")

    if not self.quiet_mode:
        print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")

    # Falsy when no summary could be produced; handled in the else branch below.
    summary = self._generate_summary(turns_to_summarize)

    compressed = []
    # Head: copy the messages before the summarised region.
    for i in range(compress_start):
        msg = messages[i].copy()
        # Only on the first compression, tell the model via the system
        # prompt that earlier turns were compacted.
        if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
            msg["content"] = (
                (msg.get("content") or "")
                + "\n\n[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
            )
        compressed.append(msg)

    if summary:
        # Pick the summary's role so it differs from the last head message.
        last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
        summary_role = "user" if last_head_role in ("assistant", "tool") else "assistant"
        compressed.append({"role": summary_role, "content": summary})
    else:
        if not self.quiet_mode:
            print(" ⚠️ No summary model available — middle turns dropped without summary")

    # Tail: copy the messages after the summarised region.
    for i in range(compress_end, n_messages):
        compressed.append(messages[i].copy())

    self.compression_count += 1

    # Dropping the middle can leave tool calls without results (or vice versa).
    compressed = self._sanitize_tool_pairs(compressed)

    if not self.quiet_mode:
        new_estimate = estimate_messages_tokens_rough(compressed)
        saved_estimate = display_tokens - new_estimate
        print(f" ✅ Compressed: {n_messages} → {len(compressed)} messages (~{saved_estimate:,} tokens saved)")
        print(f" 💡 Compression #{self.compression_count} complete")

    return compressed
|
||||
614
agent/display.py
Normal file
614
agent/display.py
Normal file
@@ -0,0 +1,614 @@
|
||||
"""CLI presentation -- spinner, kawaii faces, tool preview formatting.
|
||||
|
||||
Pure display functions and classes with no AIAgent dependency.
|
||||
Used by AIAgent._execute_tool_calls for CLI feedback.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
# ANSI escape codes for coloring tool failure indicators
|
||||
_RED = "\033[31m"
|
||||
_RESET = "\033[0m"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Skin-aware helpers (lazy import to avoid circular deps)
|
||||
# =========================================================================
|
||||
|
||||
def _get_skin():
|
||||
"""Get the active skin config, or None if not available."""
|
||||
try:
|
||||
from hermes_cli.skin_engine import get_active_skin
|
||||
return get_active_skin()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def get_skin_faces(key: str, default: list) -> list:
    """Return the active skin's spinner list for *key*, else *default*.

    *default* is used when no skin is active or the skin's list for *key*
    is empty or missing.
    """
    active = _get_skin()
    skinned = active.get_spinner_list(key) if active else None
    return skinned if skinned else default
|
||||
|
||||
|
||||
def get_skin_verbs() -> list:
    """Return the skin's ``thinking_verbs`` list, else the built-in verbs.

    Falls back to ``KawaiiSpinner.THINKING_VERBS`` when no skin is active
    or the skin supplies no verbs.
    """
    active = _get_skin()
    if not active:
        return KawaiiSpinner.THINKING_VERBS
    custom = active.get_spinner_list("thinking_verbs")
    return custom or KawaiiSpinner.THINKING_VERBS
|
||||
|
||||
|
||||
def get_skin_tool_prefix() -> str:
    """Return the active skin's tool-output prefix, or ``"┊"`` without a skin."""
    active = _get_skin()
    return active.tool_prefix if active else "┊"
|
||||
|
||||
|
||||
def get_tool_emoji(tool_name: str, default: str = "⚡") -> str:
    """Return the emoji shown for *tool_name*.

    Lookup order:
      1. the active skin's ``tool_emojis`` mapping, when a skin is loaded;
      2. the tool registry's emoji for the tool;
      3. *default*.

    A failure while importing or querying the registry is ignored and the
    lookup continues with *default*.
    """
    # 1. Skin override
    active = _get_skin()
    if active and active.tool_emojis:
        skinned = active.tool_emojis.get(tool_name)
        if skinned:
            return skinned

    # 2. Registry default (imported lazily; optional at display time)
    registered = ""
    try:
        from tools.registry import registry
        registered = registry.get_emoji(tool_name, default="")
    except Exception:
        registered = ""

    # 3. Caller-supplied fallback
    return registered if registered else default
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tool preview (one-line summary of a tool call's primary argument)
|
||||
# =========================================================================
|
||||
|
||||
def _oneline(text: str) -> str:
|
||||
"""Collapse whitespace (including newlines) to single spaces."""
|
||||
return " ".join(text.split())
|
||||
|
||||
|
||||
def build_tool_preview(tool_name: str, args: dict, max_len: int = 40) -> str | None:
|
||||
"""Build a short preview of a tool call's primary argument for display."""
|
||||
if not args:
|
||||
return None
|
||||
primary_args = {
|
||||
"terminal": "command", "web_search": "query", "web_extract": "urls",
|
||||
"read_file": "path", "write_file": "path", "patch": "path",
|
||||
"search_files": "pattern", "browser_navigate": "url",
|
||||
"browser_click": "ref", "browser_type": "text",
|
||||
"image_generate": "prompt", "text_to_speech": "text",
|
||||
"vision_analyze": "question", "mixture_of_agents": "user_prompt",
|
||||
"skill_view": "name", "skills_list": "category",
|
||||
"cronjob": "action",
|
||||
"execute_code": "code", "delegate_task": "goal",
|
||||
"clarify": "question", "skill_manage": "name",
|
||||
}
|
||||
|
||||
if tool_name == "process":
|
||||
action = args.get("action", "")
|
||||
sid = args.get("session_id", "")
|
||||
data = args.get("data", "")
|
||||
timeout_val = args.get("timeout")
|
||||
parts = [action]
|
||||
if sid:
|
||||
parts.append(sid[:16])
|
||||
if data:
|
||||
parts.append(f'"{_oneline(data[:20])}"')
|
||||
if timeout_val and action == "wait":
|
||||
parts.append(f"{timeout_val}s")
|
||||
return " ".join(parts) if parts else None
|
||||
|
||||
if tool_name == "todo":
|
||||
todos_arg = args.get("todos")
|
||||
merge = args.get("merge", False)
|
||||
if todos_arg is None:
|
||||
return "reading task list"
|
||||
elif merge:
|
||||
return f"updating {len(todos_arg)} task(s)"
|
||||
else:
|
||||
return f"planning {len(todos_arg)} task(s)"
|
||||
|
||||
if tool_name == "session_search":
|
||||
query = _oneline(args.get("query", ""))
|
||||
return f"recall: \"{query[:25]}{'...' if len(query) > 25 else ''}\""
|
||||
|
||||
if tool_name == "memory":
|
||||
action = args.get("action", "")
|
||||
target = args.get("target", "")
|
||||
if action == "add":
|
||||
content = _oneline(args.get("content", ""))
|
||||
return f"+{target}: \"{content[:25]}{'...' if len(content) > 25 else ''}\""
|
||||
elif action == "replace":
|
||||
return f"~{target}: \"{_oneline(args.get('old_text', '')[:20])}\""
|
||||
elif action == "remove":
|
||||
return f"-{target}: \"{_oneline(args.get('old_text', '')[:20])}\""
|
||||
return action
|
||||
|
||||
if tool_name == "send_message":
|
||||
target = args.get("target", "?")
|
||||
msg = _oneline(args.get("message", ""))
|
||||
if len(msg) > 20:
|
||||
msg = msg[:17] + "..."
|
||||
return f"to {target}: \"{msg}\""
|
||||
|
||||
if tool_name.startswith("rl_"):
|
||||
rl_previews = {
|
||||
"rl_list_environments": "listing envs",
|
||||
"rl_select_environment": args.get("name", ""),
|
||||
"rl_get_current_config": "reading config",
|
||||
"rl_edit_config": f"{args.get('field', '')}={args.get('value', '')}",
|
||||
"rl_start_training": "starting",
|
||||
"rl_check_status": args.get("run_id", "")[:16],
|
||||
"rl_stop_training": f"stopping {args.get('run_id', '')[:16]}",
|
||||
"rl_get_results": args.get("run_id", "")[:16],
|
||||
"rl_list_runs": "listing runs",
|
||||
"rl_test_inference": f"{args.get('num_steps', 3)} steps",
|
||||
}
|
||||
return rl_previews.get(tool_name)
|
||||
|
||||
key = primary_args.get(tool_name)
|
||||
if not key:
|
||||
for fallback_key in ("query", "text", "command", "path", "name", "prompt", "code", "goal"):
|
||||
if fallback_key in args:
|
||||
key = fallback_key
|
||||
break
|
||||
|
||||
if not key or key not in args:
|
||||
return None
|
||||
|
||||
value = args[key]
|
||||
if isinstance(value, list):
|
||||
value = value[0] if value else ""
|
||||
|
||||
preview = _oneline(str(value))
|
||||
if not preview:
|
||||
return None
|
||||
if len(preview) > max_len:
|
||||
preview = preview[:max_len - 3] + "..."
|
||||
return preview
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# KawaiiSpinner
|
||||
# =========================================================================
|
||||
|
||||
class KawaiiSpinner:
    """Threaded CLI spinner that redraws one status line while work runs.

    The spinner writes to the ``sys.stdout`` object captured when the
    instance is created, draws ``message`` plus the elapsed time on a
    single line using carriage returns, and can be used as a context
    manager (started on enter, stopped on exit).
    """

    # Frame sequences, selectable through ``spinner_type``.
    SPINNERS = {
        'dots': ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'],
        'bounce': ['⠁', '⠂', '⠄', '⡀', '⢀', '⠠', '⠐', '⠈'],
        'grow': ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█', '▇', '▆', '▅', '▄', '▃', '▂'],
        'arrows': ['←', '↖', '↑', '↗', '→', '↘', '↓', '↙'],
        'star': ['✶', '✷', '✸', '✹', '✺', '✹', '✸', '✷'],
        'moon': ['🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘'],
        'pulse': ['◜', '◠', '◝', '◞', '◡', '◟'],
        'brain': ['🧠', '💭', '💡', '✨', '💫', '🌟', '💡', '💭'],
        'sparkle': ['⁺', '˚', '*', '✧', '✦', '✧', '*', '˚'],
    }

    KAWAII_WAITING = [
        "(。◕‿◕。)", "(◕‿◕✿)", "٩(◕‿◕。)۶", "(✿◠‿◠)", "( ˘▽˘)っ",
        "♪(´ε` )", "(◕ᴗ◕✿)", "ヾ(^∇^)", "(≧◡≦)", "(★ω★)",
    ]

    KAWAII_THINKING = [
        "(。•́︿•̀。)", "(◔_◔)", "(¬‿¬)", "( •_•)>⌐■-■", "(⌐■_■)",
        "(´・_・`)", "◉_◉", "(°ロ°)", "( ˘⌣˘)♡", "ヽ(>∀<☆)☆",
        "٩(๑❛ᴗ❛๑)۶", "(⊙_⊙)", "(¬_¬)", "( ͡° ͜ʖ ͡°)", "ಠ_ಠ",
    ]

    THINKING_VERBS = [
        "pondering", "contemplating", "musing", "cogitating", "ruminating",
        "deliberating", "mulling", "reflecting", "processing", "reasoning",
        "analyzing", "computing", "synthesizing", "formulating", "brainstorming",
    ]

    def __init__(self, message: str = "", spinner_type: str = 'dots'):
        """Create a stopped spinner; unknown ``spinner_type`` falls back to 'dots'."""
        # Grab stdout immediately: a later redirect_stdout(devnull) in a
        # child agent must not silence this spinner.
        self._out = sys.stdout
        self.message = message
        self.spinner_frames = self.SPINNERS.get(spinner_type, self.SPINNERS['dots'])
        self.running = False
        self.thread = None
        self.start_time = None
        self.frame_idx = 0
        self.last_line_len = 0
        # Time of the last flush; used to rate-limit flushes in _animate.
        self._last_flush_time = 0.0

    def _write(self, text: str, end: str = '\n', flush: bool = False):
        """Write ``text + end`` to the captured stdout, ignoring closed-stream errors."""
        payload = text + end
        try:
            self._out.write(payload)
            if flush:
                self._out.flush()
        except (ValueError, OSError):
            pass

    def _animate(self):
        """Thread body: redraw the spinner line until ``running`` is cleared."""
        # Look the skin up once instead of on every frame.
        active_skin = _get_skin()
        wing_pairs = active_skin.get_spinner_wings() if active_skin else []

        while self.running:
            # While the pause variable is set, draw nothing and poll again.
            if os.getenv("HERMES_SPINNER_PAUSE"):
                time.sleep(0.1)
                continue

            glyph = self.spinner_frames[self.frame_idx % len(self.spinner_frames)]
            seconds = time.time() - self.start_time
            if wing_pairs:
                left, right = wing_pairs[self.frame_idx % len(wing_pairs)]
                rendered = f" {left} {glyph} {self.message} {right} ({seconds:.1f}s)"
            else:
                rendered = f" {glyph} {self.message} ({seconds:.1f}s)"
            # Pad with spaces so a shorter line fully overwrites a longer one.
            padding = ' ' * max(self.last_line_len - len(rendered), 0)

            # Flush at most every 0.4s. Under prompt_toolkit's patch_stdout
            # each flush may be handled as its own terminal write, which
            # loses the \r overwrite and prints every frame on a new line;
            # batching several \r-frames per flush avoids that.
            tick_time = time.time()
            flush_now = (tick_time - self._last_flush_time) >= 0.4
            self._write(f"\r{rendered}{padding}", end='', flush=flush_now)
            if flush_now:
                self._last_flush_time = tick_time

            self.last_line_len = len(rendered)
            self.frame_idx += 1
            time.sleep(0.12)

    def start(self):
        """Start the animation thread; does nothing if already running."""
        if not self.running:
            self.running = True
            self.start_time = time.time()
            self.thread = threading.Thread(target=self._animate, daemon=True)
            self.thread.start()

    def update_text(self, new_message: str):
        """Replace the message shown next to the spinner."""
        self.message = new_message

    def print_above(self, text: str):
        """Print *text* as its own line, above the spinner if it is running.

        When the spinner is running, the current spinner line is blanked
        first and the next animation tick redraws the spinner below the
        text. Output goes through ``_write``, i.e. to the stdout captured
        at construction rather than the current ``sys.stdout``.
        """
        if self.running:
            # Blank the line with spaces rather than \033[K, which shows up
            # garbled under prompt_toolkit's patch_stdout (same as stop()).
            eraser = ' ' * max(self.last_line_len + 5, 40)
            self._write(f"\r{eraser}\r {text}", flush=True)
        else:
            self._write(f" {text}", flush=True)

    def stop(self, final_message: str = None):
        """Stop animating, blank the spinner line, optionally print a final line."""
        self.running = False
        worker = self.thread
        if worker:
            worker.join(timeout=0.5)
        # Spaces instead of \033[K: escape codes come out garbled when
        # prompt_toolkit's patch_stdout is active.
        eraser = ' ' * max(self.last_line_len + 5, 40)
        self._write(f"\r{eraser}\r", end='', flush=True)
        if final_message:
            self._write(f" {final_message}", flush=True)

    def __enter__(self):
        """Start the spinner and return it."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the spinner; exceptions from the body are not suppressed."""
        self.stop()
        return False
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Kawaii face arrays (used by AIAgent._execute_tool_calls for spinner text)
|
||||
# =========================================================================
|
||||
|
||||
# Face lists, one per kind of tool activity.
# NOTE: a literal backslash is written as "\\" below. A bare "\(" is an
# invalid escape sequence, which newer Python versions warn about; the
# resulting string value is the same.
KAWAII_SEARCH = [
    "♪(´ε` )", "(。◕‿◕。)", "ヾ(^∇^)", "(◕ᴗ◕✿)", "( ˘▽˘)っ",
    "٩(◕‿◕。)۶", "(✿◠‿◠)", "♪~(´ε` )", "(ノ´ヮ`)ノ*:・゚✧", "\\(◎o◎)/",
]
KAWAII_READ = [
    "φ(゜▽゜*)♪", "( ˘▽˘)っ", "(⌐■_■)", "٩(。•́‿•̀。)۶", "(◕‿◕✿)",
    "ヾ(@⌒ー⌒@)ノ", "(✧ω✧)", "♪(๑ᴖ◡ᴖ๑)♪", "(≧◡≦)", "( ´ ▽ ` )ノ",
]
KAWAII_TERMINAL = [
    "ヽ(>∀<☆)ノ", "(ノ°∀°)ノ", "٩(^ᴗ^)۶", "ヾ(⌐■_■)ノ♪", "(•̀ᴗ•́)و",
    "┗(^0^)┓", "(`・ω・´)", "\\( ̄▽ ̄)/", "(ง •̀_•́)ง", "ヽ(´▽`)/",
]
KAWAII_BROWSER = [
    "(ノ°∀°)ノ", "(☞゚ヮ゚)☞", "( ͡° ͜ʖ ͡°)", "┌( ಠ_ಠ)┘", "(⊙_⊙)?",
    "ヾ(•ω•`)o", "( ̄ω ̄)", "( ˇωˇ )", "(ᵔᴥᵔ)", "\\(◎o◎)/",
]
KAWAII_CREATE = [
    "✧*。٩(ˊᗜˋ*)و✧", "(ノ◕ヮ◕)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "٩(♡ε♡)۶", "(◕‿◕)♡",
    "✿◕ ‿ ◕✿", "(*≧▽≦)", "ヾ(^-^)ノ", "(☆▽☆)", "°˖✧◝(⁰▿⁰)◜✧˖°",
]
KAWAII_SKILL = [
    "ヾ(@⌒ー⌒@)ノ", "(๑˃ᴗ˂)ﻭ", "٩(◕‿◕。)۶", "(✿╹◡╹)", "ヽ(・∀・)ノ",
    "(ノ´ヮ`)ノ*:・゚✧", "♪(๑ᴖ◡ᴖ๑)♪", "(◠‿◠)", "٩(ˊᗜˋ*)و", "(^▽^)",
    "ヾ(^∇^)", "(★ω★)/", "٩(。•́‿•̀。)۶", "(◕ᴗ◕✿)", "\\(◎o◎)/",
    "(✧ω✧)", "ヽ(>∀<☆)ノ", "( ˘▽˘)っ", "(≧◡≦) ♡", "ヾ( ̄▽ ̄)",
]
KAWAII_THINK = [
    "(っ°Д°;)っ", "(;′⌒`)", "(・_・ヾ", "( ´_ゝ`)", "( ̄ヘ ̄)",
    "(。-`ω´-)", "( ˘︹˘ )", "(¬_¬)", "ヽ(ー_ー )ノ", "(;一_一)",
]
KAWAII_GENERIC = [
    "♪(´ε` )", "(◕‿◕✿)", "ヾ(^∇^)", "٩(◕‿◕。)۶", "(✿◠‿◠)",
    "(ノ´ヮ`)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "(☆▽☆)", "( ˘▽˘)っ", "(≧◡≦)",
]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Cute tool message (completion line that replaces the spinner)
|
||||
# =========================================================================
|
||||
|
||||
def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]:
|
||||
"""Inspect a tool result string for signs of failure.
|
||||
|
||||
Returns ``(is_failure, suffix)`` where *suffix* is an informational tag
|
||||
like ``" [exit 1]"`` for terminal failures, or ``" [error]"`` for generic
|
||||
failures. On success, returns ``(False, "")``.
|
||||
"""
|
||||
if result is None:
|
||||
return False, ""
|
||||
|
||||
if tool_name == "terminal":
|
||||
try:
|
||||
data = json.loads(result)
|
||||
exit_code = data.get("exit_code")
|
||||
if exit_code is not None and exit_code != 0:
|
||||
return True, f" [exit {exit_code}]"
|
||||
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||
logger.debug("Could not parse terminal result as JSON for exit code check")
|
||||
return False, ""
|
||||
|
||||
# Memory-specific: distinguish "full" from real errors
|
||||
if tool_name == "memory":
|
||||
try:
|
||||
data = json.loads(result)
|
||||
if data.get("success") is False and "exceed the limit" in data.get("error", ""):
|
||||
return True, " [full]"
|
||||
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||
logger.debug("Could not parse memory result as JSON for capacity check")
|
||||
|
||||
# Generic heuristic for non-terminal tools
|
||||
lower = result[:500].lower()
|
||||
if '"error"' in lower or '"failed"' in lower or result.startswith("Error"):
|
||||
return True, " [error]"
|
||||
|
||||
return False, ""
|
||||
|
||||
|
||||
def get_cute_tool_message(
    tool_name: str, args: dict, duration: float, result: str | None = None,
) -> str:
    """Generate a formatted tool completion line for CLI quiet mode.

    Format: ``| {emoji} {verb:9} {detail} {duration}``

    Each known tool has its own emoji, verb and detail; unknown tools use
    a generic line built from ``build_tool_preview``. When *result* is
    provided it is checked for failure indicators, and a failed call gets
    an informational suffix (e.g. ``" [exit 1]"``) appended to the line.
    If the active skin defines a different tool prefix, the leading ``┊``
    is replaced with it.

    Args:
        tool_name: Name of the tool that was called.
        args: The arguments the tool was called with.
        duration: Elapsed time in seconds, shown with one decimal place.
        result: Raw tool result string, or ``None`` to skip failure checks.

    Returns:
        The single display line.
    """
    dur = f"{duration:.1f}s"
    is_failure, failure_suffix = _detect_tool_failure(tool_name, result)
    skin_prefix = get_skin_tool_prefix()

    def _trunc(s, n=40):
        """Cut *s* to at most *n* characters, ending in "..." when cut."""
        s = str(s)
        return (s[:n-3] + "...") if len(s) > n else s

    def _path(p, n=35):
        """Keep the last part of a long path, prefixed with "..."."""
        p = str(p)
        return ("..." + p[-(n-3):]) if len(p) > n else p

    def _wrap(line: str) -> str:
        """Apply skin tool prefix and failure suffix."""
        if skin_prefix != "┊":
            # Only the leading marker is swapped, not later occurrences.
            line = line.replace("┊", skin_prefix, 1)
        if not is_failure:
            return line
        return f"{line}{failure_suffix}"

    if tool_name == "web_search":
        return _wrap(f"┊ 🔍 search {_trunc(args.get('query', ''), 42)} {dur}")
    if tool_name == "web_extract":
        urls = args.get("urls", [])
        if urls:
            # Treat a single non-list value as one URL, so the "+N more"
            # count reflects URLs rather than the characters of a string.
            url_list = urls if isinstance(urls, list) else [str(urls)]
            url = url_list[0]
            domain = url.replace("https://", "").replace("http://", "").split("/")[0]
            extra = f" +{len(url_list)-1}" if len(url_list) > 1 else ""
            return _wrap(f"┊ 📄 fetch {_trunc(domain, 35)}{extra} {dur}")
        return _wrap(f"┊ 📄 fetch pages {dur}")
    if tool_name == "web_crawl":
        url = args.get("url", "")
        domain = url.replace("https://", "").replace("http://", "").split("/")[0]
        return _wrap(f"┊ 🕸️ crawl {_trunc(domain, 35)} {dur}")
    if tool_name == "terminal":
        return _wrap(f"┊ 💻 $ {_trunc(args.get('command', ''), 42)} {dur}")
    if tool_name == "process":
        action = args.get("action", "?")
        sid = args.get("session_id", "")[:12]
        labels = {"list": "ls processes", "poll": f"poll {sid}", "log": f"log {sid}",
                  "wait": f"wait {sid}", "kill": f"kill {sid}", "write": f"write {sid}", "submit": f"submit {sid}"}
        return _wrap(f"┊ ⚙️ proc {labels.get(action, f'{action} {sid}')} {dur}")
    if tool_name == "read_file":
        return _wrap(f"┊ 📖 read {_path(args.get('path', ''))} {dur}")
    if tool_name == "write_file":
        return _wrap(f"┊ ✍️ write {_path(args.get('path', ''))} {dur}")
    if tool_name == "patch":
        return _wrap(f"┊ 🔧 patch {_path(args.get('path', ''))} {dur}")
    if tool_name == "search_files":
        pattern = _trunc(args.get("pattern", ""), 35)
        target = args.get("target", "content")
        verb = "find" if target == "files" else "grep"
        return _wrap(f"┊ 🔎 {verb:9} {pattern} {dur}")
    if tool_name == "browser_navigate":
        url = args.get("url", "")
        domain = url.replace("https://", "").replace("http://", "").split("/")[0]
        return _wrap(f"┊ 🌐 navigate {_trunc(domain, 35)} {dur}")
    if tool_name == "browser_snapshot":
        mode = "full" if args.get("full") else "compact"
        return _wrap(f"┊ 📸 snapshot {mode} {dur}")
    if tool_name == "browser_click":
        return _wrap(f"┊ 👆 click {args.get('ref', '?')} {dur}")
    if tool_name == "browser_type":
        return _wrap(f"┊ ⌨️ type \"{_trunc(args.get('text', ''), 30)}\" {dur}")
    if tool_name == "browser_scroll":
        d = args.get("direction", "down")
        arrow = {"down": "↓", "up": "↑", "right": "→", "left": "←"}.get(d, "↓")
        return _wrap(f"┊ {arrow} scroll {d} {dur}")
    if tool_name == "browser_back":
        return _wrap(f"┊ ◀️ back {dur}")
    if tool_name == "browser_press":
        return _wrap(f"┊ ⌨️ press {args.get('key', '?')} {dur}")
    if tool_name == "browser_close":
        return _wrap(f"┊ 🚪 close browser {dur}")
    if tool_name == "browser_get_images":
        return _wrap(f"┊ 🖼️ images extracting {dur}")
    if tool_name == "browser_vision":
        return _wrap(f"┊ 👁️ vision analyzing page {dur}")
    if tool_name == "todo":
        todos_arg = args.get("todos")
        merge = args.get("merge", False)
        if todos_arg is None:
            return _wrap(f"┊ 📋 plan reading tasks {dur}")
        elif merge:
            return _wrap(f"┊ 📋 plan update {len(todos_arg)} task(s) {dur}")
        else:
            return _wrap(f"┊ 📋 plan {len(todos_arg)} task(s) {dur}")
    if tool_name == "session_search":
        return _wrap(f"┊ 🔍 recall \"{_trunc(args.get('query', ''), 35)}\" {dur}")
    if tool_name == "memory":
        action = args.get("action", "?")
        target = args.get("target", "")
        if action == "add":
            return _wrap(f"┊ 🧠 memory +{target}: \"{_trunc(args.get('content', ''), 30)}\" {dur}")
        elif action == "replace":
            return _wrap(f"┊ 🧠 memory ~{target}: \"{_trunc(args.get('old_text', ''), 20)}\" {dur}")
        elif action == "remove":
            return _wrap(f"┊ 🧠 memory -{target}: \"{_trunc(args.get('old_text', ''), 20)}\" {dur}")
        return _wrap(f"┊ 🧠 memory {action} {dur}")
    if tool_name == "skills_list":
        return _wrap(f"┊ 📚 skills list {args.get('category', 'all')} {dur}")
    if tool_name == "skill_view":
        return _wrap(f"┊ 📚 skill {_trunc(args.get('name', ''), 30)} {dur}")
    if tool_name == "image_generate":
        return _wrap(f"┊ 🎨 create {_trunc(args.get('prompt', ''), 35)} {dur}")
    if tool_name == "text_to_speech":
        return _wrap(f"┊ 🔊 speak {_trunc(args.get('text', ''), 30)} {dur}")
    if tool_name == "vision_analyze":
        return _wrap(f"┊ 👁️ vision {_trunc(args.get('question', ''), 30)} {dur}")
    if tool_name == "mixture_of_agents":
        return _wrap(f"┊ 🧠 reason {_trunc(args.get('user_prompt', ''), 30)} {dur}")
    if tool_name == "send_message":
        return _wrap(f"┊ 📨 send {args.get('target', '?')}: \"{_trunc(args.get('message', ''), 25)}\" {dur}")
    if tool_name == "cronjob":
        action = args.get("action", "?")
        if action == "create":
            # Label preference: explicit name, then first skill, then prompt.
            skills = args.get("skills") or ([] if not args.get("skill") else [args.get("skill")])
            label = args.get("name") or (skills[0] if skills else None) or args.get("prompt", "task")
            return _wrap(f"┊ ⏰ cron create {_trunc(label, 24)} {dur}")
        if action == "list":
            return _wrap(f"┊ ⏰ cron listing {dur}")
        return _wrap(f"┊ ⏰ cron {action} {args.get('job_id', '')} {dur}")
    if tool_name.startswith("rl_"):
        rl = {
            "rl_list_environments": "list envs", "rl_select_environment": f"select {args.get('name', '')}",
            "rl_get_current_config": "get config", "rl_edit_config": f"set {args.get('field', '?')}",
            "rl_start_training": "start training", "rl_check_status": f"status {args.get('run_id', '?')[:12]}",
            "rl_stop_training": f"stop {args.get('run_id', '?')[:12]}", "rl_get_results": f"results {args.get('run_id', '?')[:12]}",
            "rl_list_runs": "list runs", "rl_test_inference": "test inference",
        }
        return _wrap(f"┊ 🧪 rl {rl.get(tool_name, tool_name.replace('rl_', ''))} {dur}")
    if tool_name == "execute_code":
        code = args.get("code", "")
        first_line = code.strip().split("\n")[0] if code.strip() else ""
        return _wrap(f"┊ 🐍 exec {_trunc(first_line, 35)} {dur}")
    if tool_name == "delegate_task":
        tasks = args.get("tasks")
        if tasks and isinstance(tasks, list):
            return _wrap(f"┊ 🔀 delegate {len(tasks)} parallel tasks {dur}")
        return _wrap(f"┊ 🔀 delegate {_trunc(args.get('goal', ''), 35)} {dur}")

    # Unknown tool: generic line from the shared preview helper.
    preview = build_tool_preview(tool_name, args) or ""
    return _wrap(f"┊ ⚡ {tool_name[:9]:9} {_trunc(preview, 35)} {dur}")
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Honcho session line (one-liner with clickable OSC 8 hyperlink)
|
||||
# =========================================================================
|
||||
|
||||
# ANSI SGR escape sequences used to style the Honcho session line.
_DIM = "\033[2m"  # faint / dim text
_SKY_BLUE = "\033[38;5;117m"  # 256-colour foreground, palette index 117
_ANSI_RESET = "\033[0m"  # reset all attributes
|
||||
|
||||
|
||||
def honcho_session_url(workspace: str, session_name: str) -> str:
    """Return the Honcho app URL that opens *session_name* in *workspace*.

    Both values are percent-encoded with no characters treated as safe, so
    slashes and spaces inside them cannot break the query string.
    """
    from urllib.parse import quote

    encoded_workspace = quote(workspace, safe='')
    encoded_session = quote(session_name, safe='')
    query = f"?workspace={encoded_workspace}&view=sessions&session={encoded_session}"
    return "https://app.honcho.dev/explore" + query
|
||||
|
||||
|
||||
def _osc8_link(url: str, text: str) -> str:
|
||||
"""OSC 8 terminal hyperlink (clickable in iTerm2, Ghostty, WezTerm, etc.)."""
|
||||
return f"\033]8;;{url}\033\\{text}\033]8;;\033\\"
|
||||
|
||||
|
||||
def honcho_session_line(workspace: str, session_name: str) -> str:
    """Build the one-line indicator ``Honcho session: <clickable name>``.

    The label is dimmed; the session name is coloured and wrapped in an
    OSC 8 hyperlink to the session's Honcho URL.
    """
    label = f"{_DIM}Honcho session:{_ANSI_RESET}"
    coloured_name = f"{_SKY_BLUE}{session_name}{_ANSI_RESET}"
    target = honcho_session_url(workspace, session_name)
    return f"{label} {_osc8_link(target, coloured_name)}"
|
||||
|
||||
|
||||
def write_tty(text: str) -> None:
    """Write ``text`` directly to /dev/tty, bypassing stdout capture.

    Falls back to ``sys.stdout`` (write + flush) when the terminal device
    cannot be opened or written, e.g. when no controlling terminal exists.

    Args:
        text: The string to emit; encoded as UTF-8 for the tty write.
    """
    try:
        fd = os.open("/dev/tty", os.O_WRONLY)
        try:
            os.write(fd, text.encode("utf-8"))
        finally:
            # Always release the descriptor, even if the write fails;
            # otherwise every failed write would leak one fd.
            os.close(fd)
    except OSError:
        sys.stdout.write(text)
        sys.stdout.flush()
|
||||
719
agent/insights.py
Normal file
719
agent/insights.py
Normal file
@@ -0,0 +1,719 @@
|
||||
"""
|
||||
Session Insights Engine for Hermes Agent.
|
||||
|
||||
Analyzes historical session data from the SQLite state database to produce
|
||||
comprehensive usage insights — token consumption, cost estimates, tool usage
|
||||
patterns, activity trends, model/platform breakdowns, and session metrics.
|
||||
|
||||
Inspired by Claude Code's /insights command, adapted for Hermes Agent's
|
||||
multi-platform architecture with additional cost estimation and platform
|
||||
breakdown capabilities.
|
||||
|
||||
Usage:
|
||||
from agent.insights import InsightsEngine
|
||||
engine = InsightsEngine(db)
|
||||
report = engine.generate(days=30)
|
||||
print(engine.format_terminal(report))
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from agent.usage_pricing import DEFAULT_PRICING, estimate_cost_usd, format_duration_compact, get_pricing, has_known_pricing
|
||||
|
||||
# Module-private alias for the DEFAULT_PRICING object imported from
# agent.usage_pricing (same object, not a copy).
_DEFAULT_PRICING = DEFAULT_PRICING
|
||||
|
||||
|
||||
def _has_known_pricing(model_name: str) -> bool:
    """Report whether pricing is known for ``model_name``.

    Thin module-private wrapper: forwards to ``has_known_pricing`` from
    ``agent.usage_pricing`` and returns its answer unchanged.
    """
    known = has_known_pricing(model_name)
    return known
|
||||
|
||||
|
||||
def _get_pricing(model_name: str) -> Dict[str, float]:
    """Return the pricing table entry for ``model_name``.

    Thin module-private wrapper around ``get_pricing`` from
    ``agent.usage_pricing``. The matching rules and the fallback for
    unknown/custom models (self-hosted endpoints, local inference, ...)
    live in that function, not here.
    """
    pricing = get_pricing(model_name)
    return pricing
|
||||
|
||||
|
||||
def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the estimated USD cost of a model call with the given token counts.

    Thin module-private wrapper that forwards all three arguments,
    positionally, to ``estimate_cost_usd`` from ``agent.usage_pricing``.
    """
    usd = estimate_cost_usd(model, input_tokens, output_tokens)
    return usd
|
||||
|
||||
|
||||
def _format_duration(seconds: float) -> str:
    """Render a duration given in seconds as a short human-readable string.

    Thin module-private wrapper around ``format_duration_compact`` from
    ``agent.usage_pricing``.
    """
    rendered = format_duration_compact(seconds)
    return rendered
|
||||
|
||||
|
||||
def _bar_chart(values: List[int], max_width: int = 20) -> List[str]:
|
||||
"""Create simple horizontal bar chart strings from values."""
|
||||
peak = max(values) if values else 1
|
||||
if peak == 0:
|
||||
return ["" for _ in values]
|
||||
return ["█" * max(1, int(v / peak * max_width)) if v > 0 else "" for v in values]
|
||||
|
||||
|
||||
class InsightsEngine:
    """
    Analyzes session history and produces usage insights.

    Reads session and message rows through the ``_conn`` attribute of a
    SessionDB instance. Rows returned by that connection must support
    mapping-style access (``dict(row)`` and ``row["column"]``), as
    ``sqlite3.Row`` does.
    """

    # Inner width (characters between the side borders) of the boxed header.
    _BOX_WIDTH = 58
    # Width of the horizontal rule printed under each terminal section title.
    _RULE_WIDTH = 56

    def __init__(self, db):
        """
        Initialize with a SessionDB instance.

        Args:
            db: A SessionDB instance (from hermes_state.py); its ``_conn``
                attribute is used for all queries.
        """
        self.db = db
        self._conn = db._conn

    def generate(self, days: int = 30, source: str = None) -> Dict[str, Any]:
        """
        Generate a complete insights report.

        Args:
            days: Number of days to look back (default: 30)
            source: Optional filter by source platform

        Returns:
            Dict with all computed insights. When no session falls inside the
            window, ``"empty"`` is True and every section is an empty
            container (and ``"generated_at"`` is absent).
        """
        cutoff = time.time() - (days * 86400)

        sessions = self._get_sessions(cutoff, source)
        if not sessions:
            # Nothing to analyse: skip the message/tool queries entirely.
            return {
                "days": days,
                "source_filter": source,
                "empty": True,
                "overview": {},
                "models": [],
                "platforms": [],
                "tools": [],
                "activity": {},
                "top_sessions": [],
            }

        tool_usage = self._get_tool_usage(cutoff, source)
        message_stats = self._get_message_stats(cutoff, source)

        return {
            "days": days,
            "source_filter": source,
            "empty": False,
            "generated_at": time.time(),
            "overview": self._compute_overview(sessions, message_stats),
            "models": self._compute_model_breakdown(sessions),
            "platforms": self._compute_platform_breakdown(sessions),
            "tools": self._compute_tool_breakdown(tool_usage),
            "activity": self._compute_activity_patterns(sessions),
            "top_sessions": self._compute_top_sessions(sessions),
        }

    # =========================================================================
    # Data gathering (SQL queries)
    # =========================================================================

    # Columns we actually need (skip system_prompt, model_config blobs)
    _SESSION_COLS = ("id, source, model, started_at, ended_at, "
                     "message_count, tool_call_count, input_tokens, output_tokens")

    @staticmethod
    def _window_filter(cutoff: float, source: str = None, alias: str = ""):
        """Build the WHERE fragment shared by every query in this class.

        Only fixed SQL text is interpolated; the cutoff and source values are
        returned separately so they are always bound as parameters.

        Args:
            cutoff: Earliest ``started_at`` timestamp to include.
            source: Optional platform filter; falsy means "all platforms".
            alias: Table alias of the sessions table in the query, if any.

        Returns:
            ``(clause, params)`` where ``params`` is a tuple matching the
            ``?`` placeholders in ``clause``.
        """
        prefix = f"{alias}." if alias else ""
        clause = f"{prefix}started_at >= ?"
        params = [cutoff]
        if source:
            clause += f" AND {prefix}source = ?"
            params.append(source)
        return clause, tuple(params)

    def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]:
        """Fetch sessions within the time window, newest first."""
        where, params = self._window_filter(cutoff, source)
        cursor = self._conn.execute(
            f"""SELECT {self._SESSION_COLS} FROM sessions
                WHERE {where}
                ORDER BY started_at DESC""",
            params,
        )
        return [dict(row) for row in cursor.fetchall()]

    def _get_tool_usage(self, cutoff: float, source: str = None) -> List[Dict]:
        """Get tool call counts from messages.

        Uses two sources:
        1. tool_name column on 'tool' role messages (set by gateway)
        2. tool_calls JSON on 'assistant' role messages (covers CLI where
           tool_name is not populated on tool responses)

        The two sources may describe the same calls, so they are merged by
        taking the larger count per tool rather than adding them.

        Returns:
            List of ``{"tool_name", "count"}`` dicts ordered by count
            (descending), ties broken by tool name so the order is stable
            across runs.
        """
        where, params = self._window_filter(cutoff, source, alias="s")

        # Source 1: explicit tool_name on tool response messages
        named_counts = Counter()
        cursor = self._conn.execute(
            f"""SELECT m.tool_name, COUNT(*) as count
                FROM messages m
                JOIN sessions s ON s.id = m.session_id
                WHERE {where}
                  AND m.role = 'tool' AND m.tool_name IS NOT NULL
                GROUP BY m.tool_name
                ORDER BY count DESC""",
            params,
        )
        for row in cursor.fetchall():
            named_counts[row["tool_name"]] += row["count"]

        # Source 2: extract from tool_calls JSON on assistant messages
        # (covers CLI sessions where tool_name is NULL on tool responses)
        parsed_counts = Counter()
        cursor = self._conn.execute(
            f"""SELECT m.tool_calls
                FROM messages m
                JOIN sessions s ON s.id = m.session_id
                WHERE {where}
                  AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""",
            params,
        )
        for row in cursor.fetchall():
            calls = row["tool_calls"]
            if isinstance(calls, str):
                try:
                    calls = json.loads(calls)
                except json.JSONDecodeError:
                    continue
            if not isinstance(calls, list):
                continue
            for call in calls:
                # Skip malformed entries individually so one bad call does
                # not discard the remaining calls of the same message.
                func = call.get("function") if isinstance(call, dict) else None
                name = func.get("name") if isinstance(func, dict) else None
                if isinstance(name, str) and name:
                    parsed_counts[name] += 1

        # Counter union keeps the maximum count per tool, which avoids
        # double-counting calls that are visible in both sources.
        merged = named_counts | parsed_counts

        return [
            {"tool_name": name, "count": count}
            for name, count in sorted(merged.items(), key=lambda item: (-item[1], item[0]))
        ]

    def _get_message_stats(self, cutoff: float, source: str = None) -> Dict:
        """Get aggregate message statistics.

        Returns:
            Dict with ``total_messages``, ``user_messages``,
            ``assistant_messages`` and ``tool_messages``; SQL NULL sums
            (no matching rows) are reported as 0.
        """
        where, params = self._window_filter(cutoff, source, alias="s")
        row = self._conn.execute(
            f"""SELECT
                    COUNT(*) as total_messages,
                    SUM(CASE WHEN m.role = 'user' THEN 1 ELSE 0 END) as user_messages,
                    SUM(CASE WHEN m.role = 'assistant' THEN 1 ELSE 0 END) as assistant_messages,
                    SUM(CASE WHEN m.role = 'tool' THEN 1 ELSE 0 END) as tool_messages
                FROM messages m
                JOIN sessions s ON s.id = m.session_id
                WHERE {where}""",
            params,
        ).fetchone()
        if row is None:
            return {
                "total_messages": 0, "user_messages": 0,
                "assistant_messages": 0, "tool_messages": 0,
            }
        return {key: value or 0 for key, value in dict(row).items()}

    # =========================================================================
    # Computation
    # =========================================================================

    def _compute_overview(self, sessions: List[Dict], message_stats: Dict) -> Dict:
        """Compute high-level overview statistics."""
        session_count = len(sessions)
        total_input = sum(s.get("input_tokens") or 0 for s in sessions)
        total_output = sum(s.get("output_tokens") or 0 for s in sessions)
        total_tokens = total_input + total_output
        total_tool_calls = sum(s.get("tool_call_count") or 0 for s in sessions)
        total_messages = sum(s.get("message_count") or 0 for s in sessions)

        # Cost estimation (weighted by model)
        total_cost = 0.0
        priced_models = set()
        unpriced_models = set()
        for s in sessions:
            model = s.get("model") or ""
            total_cost += _estimate_cost(
                model, s.get("input_tokens") or 0, s.get("output_tokens") or 0
            )
            display = model.split("/")[-1] if "/" in model else (model or "unknown")
            if _has_known_pricing(model):
                priced_models.add(display)
            else:
                unpriced_models.add(display)

        # Session duration stats (guard against negative durations from clock drift)
        durations = [
            s["ended_at"] - s["started_at"]
            for s in sessions
            if s.get("started_at") and s.get("ended_at") and s["ended_at"] > s["started_at"]
        ]
        total_seconds = sum(durations)

        # Earliest and latest session start
        starts = [s["started_at"] for s in sessions if s.get("started_at")]

        return {
            "total_sessions": session_count,
            "total_messages": total_messages,
            "total_tool_calls": total_tool_calls,
            "total_input_tokens": total_input,
            "total_output_tokens": total_output,
            "total_tokens": total_tokens,
            "estimated_cost": total_cost,
            "total_hours": total_seconds / 3600 if durations else 0,
            "avg_session_duration": total_seconds / len(durations) if durations else 0,
            "avg_messages_per_session": total_messages / session_count if sessions else 0,
            "avg_tokens_per_session": total_tokens / session_count if sessions else 0,
            "user_messages": message_stats.get("user_messages") or 0,
            "assistant_messages": message_stats.get("assistant_messages") or 0,
            "tool_messages": message_stats.get("tool_messages") or 0,
            "date_range_start": min(starts) if starts else None,
            "date_range_end": max(starts) if starts else None,
            "models_with_pricing": sorted(priced_models),
            "models_without_pricing": sorted(unpriced_models),
        }

    def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]:
        """Break down usage by model.

        Models are grouped by display name (provider prefix stripped).
        ``has_pricing`` is True when any session in the group used a model
        with known pricing, so the flag does not depend on session order.
        """
        model_data = defaultdict(lambda: {
            "sessions": 0, "input_tokens": 0, "output_tokens": 0,
            "total_tokens": 0, "tool_calls": 0, "cost": 0.0,
            "has_pricing": False,
        })

        for s in sessions:
            model = s.get("model") or "unknown"
            # Normalize: strip provider prefix for display
            display_model = model.split("/")[-1] if "/" in model else model
            inp = s.get("input_tokens") or 0
            out = s.get("output_tokens") or 0

            d = model_data[display_model]
            d["sessions"] += 1
            d["input_tokens"] += inp
            d["output_tokens"] += out
            d["total_tokens"] += inp + out
            d["tool_calls"] += s.get("tool_call_count") or 0
            d["cost"] += _estimate_cost(model, inp, out)
            d["has_pricing"] = d["has_pricing"] or _has_known_pricing(model)

        result = [{"model": name, **data} for name, data in model_data.items()]
        # Sort by tokens first, fall back to session count when tokens are 0
        result.sort(key=lambda x: (x["total_tokens"], x["sessions"]), reverse=True)
        return result

    def _compute_platform_breakdown(self, sessions: List[Dict]) -> List[Dict]:
        """Break down usage by platform/source, most sessions first."""
        platform_data = defaultdict(lambda: {
            "sessions": 0, "messages": 0, "input_tokens": 0,
            "output_tokens": 0, "total_tokens": 0, "tool_calls": 0,
        })

        for s in sessions:
            inp = s.get("input_tokens") or 0
            out = s.get("output_tokens") or 0

            d = platform_data[s.get("source") or "unknown"]
            d["sessions"] += 1
            d["messages"] += s.get("message_count") or 0
            d["input_tokens"] += inp
            d["output_tokens"] += out
            d["total_tokens"] += inp + out
            d["tool_calls"] += s.get("tool_call_count") or 0

        result = [{"platform": name, **data} for name, data in platform_data.items()]
        result.sort(key=lambda x: x["sessions"], reverse=True)
        return result

    def _compute_tool_breakdown(self, tool_usage: List[Dict]) -> List[Dict]:
        """Process tool usage data into a ranked list with percentages."""
        total_calls = sum(t["count"] for t in tool_usage)
        return [
            {
                "tool": t["tool_name"],
                "count": t["count"],
                "percentage": (t["count"] / total_calls * 100) if total_calls else 0,
            }
            for t in tool_usage
        ]

    def _compute_activity_patterns(self, sessions: List[Dict]) -> Dict:
        """Analyze activity patterns by day of week and hour.

        Timestamps are converted with ``datetime.fromtimestamp`` (local
        time). Sessions without a ``started_at`` value are ignored.
        """
        weekday_counts = Counter()  # 0=Monday ... 6=Sunday
        hour_counts = Counter()
        active_dates = set()

        for s in sessions:
            ts = s.get("started_at")
            if not ts:
                continue
            started = datetime.fromtimestamp(ts)
            weekday_counts[started.weekday()] += 1
            hour_counts[started.hour] += 1
            active_dates.add(started.date())

        day_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
        day_breakdown = [
            {"day": name, "count": weekday_counts.get(i, 0)}
            for i, name in enumerate(day_names)
        ]
        hour_breakdown = [
            {"hour": hour, "count": hour_counts.get(hour, 0)}
            for hour in range(24)
        ]

        # Longest run of consecutive calendar days with at least one session
        ordered_dates = sorted(active_dates)
        max_streak = 1 if ordered_dates else 0
        run = 1
        for previous, current in zip(ordered_dates, ordered_dates[1:]):
            run = run + 1 if (current - previous).days == 1 else 1
            max_streak = max(max_streak, run)

        return {
            "by_day": day_breakdown,
            "by_hour": hour_breakdown,
            "busiest_day": max(day_breakdown, key=lambda x: x["count"]),
            "busiest_hour": max(hour_breakdown, key=lambda x: x["count"]),
            "active_days": len(ordered_dates),
            "max_streak": max_streak,
        }

    @staticmethod
    def _date_label(session: Dict) -> str:
        """Return the session start as e.g. ``"Jan 05"``, or ``"?"`` if unknown."""
        started = session.get("started_at")
        if not started:
            return "?"
        return datetime.fromtimestamp(started).strftime("%b %d")

    def _compute_top_sessions(self, sessions: List[Dict]) -> List[Dict]:
        """Find notable sessions (longest, most messages, most tokens, most tool calls).

        Returns an empty list for empty input. The "Longest session" entry
        only considers sessions with a positive duration, matching the
        clock-drift guard used for the overview statistics.
        """
        if not sessions:
            return []

        top = []

        def entry(label: str, session: Dict, value: str) -> Dict:
            return {
                "label": label,
                "session_id": session["id"][:16],
                "value": value,
                "date": self._date_label(session),
            }

        def token_total(s: Dict) -> int:
            return (s.get("input_tokens") or 0) + (s.get("output_tokens") or 0)

        # Longest by duration
        timed = [
            s for s in sessions
            if s.get("started_at") and s.get("ended_at") and s["ended_at"] > s["started_at"]
        ]
        if timed:
            longest = max(timed, key=lambda s: s["ended_at"] - s["started_at"])
            elapsed = longest["ended_at"] - longest["started_at"]
            top.append(entry("Longest session", longest, _format_duration(elapsed)))

        # Most messages
        most_msgs = max(sessions, key=lambda s: s.get("message_count") or 0)
        if (most_msgs.get("message_count") or 0) > 0:
            top.append(entry("Most messages", most_msgs, f"{most_msgs['message_count']} msgs"))

        # Most tokens
        most_tokens = max(sessions, key=token_total)
        if token_total(most_tokens) > 0:
            top.append(entry("Most tokens", most_tokens, f"{token_total(most_tokens):,} tokens"))

        # Most tool calls
        most_tools = max(sessions, key=lambda s: s.get("tool_call_count") or 0)
        if (most_tools.get("tool_call_count") or 0) > 0:
            top.append(entry("Most tool calls", most_tools, f"{most_tools['tool_call_count']} calls"))

        return top

    # =========================================================================
    # Formatting
    # =========================================================================

    @staticmethod
    def _format_hour(hour: int) -> str:
        """Render a 0-23 hour as a 12-hour clock label such as ``"12AM"`` or ``"3PM"``."""
        suffix = "AM" if hour < 12 else "PM"
        return f"{hour % 12 or 12}{suffix}"

    def format_terminal(self, report: Dict) -> str:
        """Format the insights report for terminal display (CLI).

        Args:
            report: A report dict as returned by ``generate``.

        Returns:
            A multi-line string: boxed header, then the overview, models,
            platforms, tools, activity and notable-session sections that
            have data.
        """
        if report.get("empty"):
            days = report.get("days", 30)
            src = f" (source: {report['source_filter']})" if report.get("source_filter") else ""
            return f" No sessions found in the last {days} days{src}."

        lines = []
        o = report["overview"]
        days = report["days"]
        src_filter = report.get("source_filter")
        rule = " " + "─" * self._RULE_WIDTH
        box_width = self._BOX_WIDTH

        # Header
        title = "📊 Hermes Insights"
        # The emoji typically renders two columns wide but counts as one
        # character, so reserve one extra column when centring the title.
        title_cols = len(title) + 1
        title_left = (box_width - title_cols) // 2
        title_right = box_width - title_cols - title_left

        period_label = f"Last {days} days"
        if src_filter:
            period_label += f" ({src_filter})"
        padding = box_width - len(period_label) - 2
        left_pad = padding // 2
        right_pad = padding - left_pad

        lines.append("")
        lines.append(" ╔" + "═" * box_width + "╗")
        lines.append(f" ║{' ' * title_left}{title}{' ' * title_right}║")
        lines.append(f" ║{' ' * left_pad} {period_label} {' ' * right_pad}║")
        lines.append(" ╚" + "═" * box_width + "╝")
        lines.append("")

        # Date range
        if o.get("date_range_start") and o.get("date_range_end"):
            start_str = datetime.fromtimestamp(o["date_range_start"]).strftime("%b %d, %Y")
            end_str = datetime.fromtimestamp(o["date_range_end"]).strftime("%b %d, %Y")
            lines.append(f" Period: {start_str} — {end_str}")
            lines.append("")

        # Overview: two label/value pairs per row, labels padded so that
        # both value columns line up across rows.
        def pair(left_label: str, left_value: str, right_label: str, right_value: str) -> str:
            return f" {left_label:<18}{left_value:<13}{right_label:<15}{right_value}"

        cost_str = f"${o['estimated_cost']:.2f}"
        if o.get("models_without_pricing"):
            cost_str += " *"

        lines.append(" 📋 Overview")
        lines.append(rule)
        lines.append(pair("Sessions:", f"{o['total_sessions']}",
                          "Messages:", f"{o['total_messages']:,}"))
        lines.append(pair("Tool calls:", f"{o['total_tool_calls']:,}",
                          "User messages:", f"{o['user_messages']:,}"))
        lines.append(pair("Input tokens:", f"{o['total_input_tokens']:,}",
                          "Output tokens:", f"{o['total_output_tokens']:,}"))
        lines.append(pair("Total tokens:", f"{o['total_tokens']:,}",
                          "Est. cost:", cost_str))
        if o["total_hours"] > 0:
            lines.append(pair("Active time:", f"~{_format_duration(o['total_hours'] * 3600)}",
                              "Avg session:", f"~{_format_duration(o['avg_session_duration'])}"))
        lines.append(f" {'Avg msgs/session:':<18}{o['avg_messages_per_session']:.1f}")
        lines.append("")

        # Model breakdown
        if report["models"]:
            lines.append(" 🤖 Models Used")
            lines.append(rule)
            lines.append(f" {'Model':<30} {'Sessions':>8} {'Tokens':>12} {'Cost':>8}")
            for m in report["models"]:
                model_name = m["model"][:28]
                if m.get("has_pricing"):
                    cost_cell = f"${m['cost']:>6.2f}"
                else:
                    # Same 7-character width as a priced cell.
                    cost_cell = f"{'N/A':>7}"
                lines.append(f" {model_name:<30} {m['sessions']:>8} {m['total_tokens']:>12,} {cost_cell}")
            if o.get("models_without_pricing"):
                lines.append(" * Cost N/A for custom/self-hosted models")
            lines.append("")

        # Platform breakdown (hidden when the only platform is the CLI)
        platforms = report["platforms"]
        if len(platforms) > 1 or (platforms and platforms[0]["platform"] != "cli"):
            lines.append(" 📱 Platforms")
            lines.append(rule)
            lines.append(f" {'Platform':<14} {'Sessions':>8} {'Messages':>10} {'Tokens':>14}")
            for p in platforms:
                lines.append(f" {p['platform']:<14} {p['sessions']:>8} {p['messages']:>10,} {p['total_tokens']:>14,}")
            lines.append("")

        # Tool usage
        tools = report["tools"]
        if tools:
            lines.append(" 🔧 Top Tools")
            lines.append(rule)
            lines.append(f" {'Tool':<28} {'Calls':>8} {'%':>8}")
            for t in tools[:15]:  # Top 15
                lines.append(f" {t['tool']:<28} {t['count']:>8,} {t['percentage']:>7.1f}%")
            if len(tools) > 15:
                lines.append(f" ... and {len(tools) - 15} more tools")
            lines.append("")

        # Activity patterns
        act = report.get("activity", {})
        if act.get("by_day"):
            lines.append(" 📅 Activity Patterns")
            lines.append(rule)

            # Day of week chart
            bars = _bar_chart([d["count"] for d in act["by_day"]], max_width=15)
            for d, bar in zip(act["by_day"], bars):
                lines.append(f" {d['day']} {bar:<15} {d['count']}")
            lines.append("")

            # Peak hours (show top 5 busiest hours)
            busy_hours = sorted(act["by_hour"], key=lambda x: x["count"], reverse=True)
            busy_hours = [h for h in busy_hours if h["count"] > 0][:5]
            if busy_hours:
                hour_strs = [f"{self._format_hour(h['hour'])} ({h['count']})" for h in busy_hours]
                lines.append(f" Peak hours: {', '.join(hour_strs)}")

            if act.get("active_days"):
                lines.append(f" Active days: {act['active_days']}")
            if act.get("max_streak") and act["max_streak"] > 1:
                lines.append(f" Best streak: {act['max_streak']} consecutive days")
            lines.append("")

        # Notable sessions
        if report.get("top_sessions"):
            lines.append(" 🏆 Notable Sessions")
            lines.append(rule)
            for ts in report["top_sessions"]:
                lines.append(f" {ts['label']:<20} {ts['value']:<18} ({ts['date']}, {ts['session_id']})")
            lines.append("")

        return "\n".join(lines)

    def format_gateway(self, report: Dict) -> str:
        """Format the insights report for gateway/messaging (shorter).

        Args:
            report: A report dict as returned by ``generate``.

        Returns:
            A Markdown-flavoured summary limited to the top 5 models and
            top 8 tools.
        """
        if report.get("empty"):
            days = report.get("days", 30)
            return f"No sessions found in the last {days} days."

        lines = []
        o = report["overview"]
        days = report["days"]

        lines.append(f"📊 **Hermes Insights** — Last {days} days\n")

        # Overview
        lines.append(f"**Sessions:** {o['total_sessions']} | **Messages:** {o['total_messages']:,} | **Tool calls:** {o['total_tool_calls']:,}")
        lines.append(f"**Tokens:** {o['total_tokens']:,} (in: {o['total_input_tokens']:,} / out: {o['total_output_tokens']:,})")
        cost_note = ""
        if o.get("models_without_pricing"):
            cost_note = " _(excludes custom/self-hosted models)_"
        lines.append(f"**Est. cost:** ${o['estimated_cost']:.2f}{cost_note}")
        if o["total_hours"] > 0:
            lines.append(f"**Active time:** ~{_format_duration(o['total_hours'] * 3600)} | **Avg session:** ~{_format_duration(o['avg_session_duration'])}")
        lines.append("")

        # Models (top 5)
        if report["models"]:
            lines.append("**🤖 Models:**")
            for m in report["models"][:5]:
                cost_str = f"${m['cost']:.2f}" if m.get("has_pricing") else "N/A"
                lines.append(f" {m['model'][:25]} — {m['sessions']} sessions, {m['total_tokens']:,} tokens, {cost_str}")
            lines.append("")

        # Platforms (if multi-platform)
        if len(report["platforms"]) > 1:
            lines.append("**📱 Platforms:**")
            for p in report["platforms"]:
                lines.append(f" {p['platform']} — {p['sessions']} sessions, {p['messages']:,} msgs")
            lines.append("")

        # Tools (top 8)
        if report["tools"]:
            lines.append("**🔧 Top Tools:**")
            for t in report["tools"][:8]:
                lines.append(f" {t['tool']} — {t['count']:,} calls ({t['percentage']:.1f}%)")
            lines.append("")

        # Activity summary
        act = report.get("activity", {})
        if act.get("busiest_day") and act.get("busiest_hour"):
            busiest_day = act["busiest_day"]
            busiest_hour = act["busiest_hour"]
            hour_label = self._format_hour(busiest_hour["hour"])
            lines.append(f"**📅 Busiest:** {busiest_day['day']}s ({busiest_day['count']} sessions), {hour_label} ({busiest_hour['count']} sessions)")
        if act.get("active_days"):
            lines.append(f"**Active days:** {act['active_days']}")
        if act.get("max_streak", 0) > 1:
            lines.append(f"**Best streak:** {act['max_streak']} consecutive days")

        return "\n".join(lines)
|
||||
244
agent/model_metadata.py
Normal file
244
agent/model_metadata.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""Model metadata, context lengths, and token estimation utilities.
|
||||
|
||||
Pure utility functions with no AIAgent dependency. Used by ContextCompressor
|
||||
and run_agent.py for pre-flight context checks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
from hermes_constants import OPENROUTER_MODELS_URL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
|
||||
_model_metadata_cache_time: float = 0
|
||||
_MODEL_CACHE_TTL = 3600
|
||||
|
||||
# Candidate context lengths, largest first, for probing a model whose
# context window is unknown: start at the top and step down each time the
# provider reports a context-length error, until one is accepted.
CONTEXT_PROBE_TIERS = [
    2_000_000,
    1_000_000,
    512_000,
    200_000,
    128_000,
    64_000,
    32_000,
]
|
||||
|
||||
# Built-in context window sizes (in tokens), keyed by model identifier.
# Entry order is preserved exactly in case any consumer iterates the dict.
DEFAULT_CONTEXT_LENGTHS = {
    # Anthropic, provider-prefixed IDs
    "anthropic/claude-opus-4": 200_000,
    "anthropic/claude-opus-4.5": 200_000,
    "anthropic/claude-opus-4.6": 200_000,
    "anthropic/claude-sonnet-4": 200_000,
    "anthropic/claude-sonnet-4-20250514": 200_000,
    "anthropic/claude-sonnet-4.5": 200_000,
    "anthropic/claude-sonnet-4.6": 200_000,
    "anthropic/claude-haiku-4.5": 200_000,
    # Bare Anthropic model IDs (for native API provider)
    "claude-opus-4-6": 200_000,
    "claude-sonnet-4-6": 200_000,
    "claude-opus-4-5-20251101": 200_000,
    "claude-sonnet-4-5-20250929": 200_000,
    "claude-opus-4-1-20250805": 200_000,
    "claude-opus-4-20250514": 200_000,
    "claude-sonnet-4-20250514": 200_000,
    "claude-haiku-4-5-20251001": 200_000,
    # OpenAI
    "openai/gpt-5": 128_000,
    "openai/gpt-4.1": 1_047_576,
    "openai/gpt-4.1-mini": 1_047_576,
    "openai/gpt-4o": 128_000,
    "openai/gpt-4-turbo": 128_000,
    "openai/gpt-4o-mini": 128_000,
    # Google
    "google/gemini-3-pro-preview": 1_048_576,
    "google/gemini-3-flash": 1_048_576,
    "google/gemini-2.5-flash": 1_048_576,
    "google/gemini-2.0-flash": 1_048_576,
    "google/gemini-2.5-pro": 1_048_576,
    # Other provider-prefixed models
    "deepseek/deepseek-v3.2": 65_536,
    "meta-llama/llama-3.3-70b-instruct": 131_072,
    "deepseek/deepseek-chat-v3": 65_536,
    "qwen/qwen-2.5-72b-instruct": 32_768,
    # GLM
    "glm-4.7": 202_752,
    "glm-5": 202_752,
    "glm-4.5": 131_072,
    "glm-4.5-flash": 131_072,
    # Kimi
    "kimi-for-coding": 262_144,
    "kimi-k2.5": 262_144,
    "kimi-k2-thinking": 262_144,
    "kimi-k2-thinking-turbo": 262_144,
    "kimi-k2-turbo-preview": 262_144,
    "kimi-k2-0905-preview": 131_072,
    # MiniMax
    "MiniMax-M2.5": 204_800,
    "MiniMax-M2.5-highspeed": 204_800,
    "MiniMax-M2.1": 204_800,
}
|
||||
|
||||
|
||||
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from OpenRouter, using an in-memory TTL cache.

    Args:
        force_refresh: When True, skip the cache and always query the API.

    Returns:
        Mapping of model id (and canonical slug, when it differs) to a dict
        with ``context_length``, ``max_completion_tokens``, ``name`` and
        ``pricing``. If the request or parsing fails, the previously cached
        mapping is returned, or an empty dict when nothing was cached.
    """
    global _model_metadata_cache, _model_metadata_cache_time

    if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL:
        return _model_metadata_cache

    try:
        response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
        response.raise_for_status()
        data = response.json()

        cache = {}
        for model in data.get("data", []):
            model_id = model.get("id", "")
            if not model_id:
                # An entry without an id cannot be looked up; skip it rather
                # than storing it under the empty-string key.
                continue
            # ``top_provider`` may be present but null; without this guard a
            # single such entry would abort the whole refresh.
            top_provider = model.get("top_provider") or {}
            cache[model_id] = {
                # A null context_length falls back to the same default that is
                # used when the key is missing.
                "context_length": model.get("context_length") or 128000,
                "max_completion_tokens": top_provider.get("max_completion_tokens", 4096),
                "name": model.get("name", model_id),
                "pricing": model.get("pricing", {}),
            }
            canonical = model.get("canonical_slug", "")
            if canonical and canonical != model_id:
                # Alias: both keys share the same metadata dict.
                cache[canonical] = cache[model_id]

        _model_metadata_cache = cache
        _model_metadata_cache_time = time.time()
        logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
        return cache

    except Exception as e:
        # Best effort: metadata is optional, so fall back to stale data.
        logger.warning("Failed to fetch model metadata from OpenRouter: %s", e)
        return _model_metadata_cache or {}
|
||||
|
||||
|
||||
def _get_context_cache_path() -> Path:
    """Locate the on-disk context length cache.

    The directory comes from the ``HERMES_HOME`` environment variable and
    falls back to ``~/.hermes`` when the variable is unset.
    """
    fallback_home = Path.home() / ".hermes"
    base_dir = Path(os.environ.get("HERMES_HOME", fallback_home))
    return base_dir / "context_length_cache.yaml"
|
||||
|
||||
|
||||
def _load_context_cache() -> Dict[str, int]:
    """Load the model+provider → context_length cache from disk.

    Returns:
        The ``context_lengths`` mapping stored in the cache file. An empty
        dict is returned when the file is missing, unreadable, or does not
        contain a mapping under that key, so callers can always use ``.get``.
    """
    path = _get_context_cache_path()
    if not path.exists():
        return {}
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except Exception as e:
        logger.debug("Failed to load context length cache: %s", e)
        return {}

    # A hand-edited or truncated file can hold a list, a scalar, or a null
    # ``context_lengths`` value; treat all of those as "no cache".
    lengths = data.get("context_lengths") if isinstance(data, dict) else None
    if not isinstance(lengths, dict):
        if lengths is not None or not isinstance(data, dict):
            logger.debug("Ignoring malformed context length cache at %s", path)
        return {}
    return lengths
|
||||
|
||||
|
||||
def save_context_length(model: str, base_url: str, length: int) -> None:
    """Record a discovered context length for one model/provider pair.

    Entries are keyed as ``model@base_url`` so that the same model name served
    by different providers can carry different limits. The file is only
    rewritten when the stored value differs; write failures are logged at
    debug level and otherwise ignored.
    """
    cache_key = f"{model}@{base_url}"
    known_lengths = _load_context_cache()
    if known_lengths.get(cache_key) == length:
        # Nothing new to persist.
        return

    known_lengths[cache_key] = length
    target = _get_context_cache_path()
    payload = {"context_lengths": known_lengths}
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w") as handle:
            yaml.dump(payload, handle, default_flow_style=False)
        logger.info("Cached context length %s → %s tokens", cache_key, f"{length:,}")
    except Exception as exc:
        logger.debug("Failed to save context length cache: %s", exc)
|
||||
|
||||
|
||||
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    """Return the persisted context length for ``model@base_url``, or None if absent."""
    lookup_key = f"{model}@{base_url}"
    return _load_context_cache().get(lookup_key)
|
||||
|
||||
|
||||
def get_next_probe_tier(current_length: int) -> Optional[int]:
    """Pick the first probe tier strictly below ``current_length``.

    Returns None when no entry of CONTEXT_PROBE_TIERS is smaller.
    """
    lower_tiers = (candidate for candidate in CONTEXT_PROBE_TIERS if candidate < current_length)
    return next(lower_tiers, None)
|
||||
|
||||
|
||||
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    """Try to extract the actual context limit from an API error message.

    Many providers include the limit in their error text, e.g.:
    - "maximum context length is 32768 tokens"
    - "context_length_exceeded: 131072"
    - "Maximum context size 32768 exceeded"
    - "model's max context length is 65536"

    Args:
        error_msg: Raw error text from the provider. None or empty is allowed.

    Returns:
        The first number found by the patterns below that lies in the range
        1024..10,000,000, or None when no plausible limit is found.
    """
    if not error_msg:
        return None
    error_lower = error_msg.lower()
    # Pattern: look for numbers near context-related keywords.
    # Patterns are tried in order; the first plausible value wins.
    patterns = [
        r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
        # Also accepts snake_case error codes such as
        # "context_length_exceeded: 131072".
        r'context[\s_]*(?:length|size|window)(?:[\s_]*exceeded)?\s*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
        r'>\s*(\d{4,})\s*(?:max|limit|token)',  # "250000 tokens > 200000 maximum"
        r'(\d{4,})\s*(?:max(?:imum)?)\b',  # "200000 maximum"
    ]
    for pattern in patterns:
        match = re.search(pattern, error_lower)
        if match:
            limit = int(match.group(1))
            # Sanity check: must be a reasonable context length
            if 1024 <= limit <= 10_000_000:
                return limit
    return None
|
||||
|
||||
|
||||
def get_model_context_length(model: str, base_url: str = "") -> int:
    """Get the context length for a model.

    Resolution order:
    1. Persistent cache (previously discovered via probing)
    2. OpenRouter API metadata
    3. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match, most specific key first)
    4. First probe tier — will be narrowed on first context error

    Args:
        model: Model identifier as sent to the provider.
        base_url: Provider base URL. The persistent cache is only consulted
            when this is non-empty.

    Returns:
        Context length in tokens.
    """
    # 1. Check persistent cache (model+provider)
    if base_url:
        cached = get_cached_context_length(model, base_url)
        if cached is not None:
            return cached

    # 2. OpenRouter API metadata
    metadata = fetch_model_metadata()
    if model in metadata:
        # ``or`` covers an entry whose context_length is present but null.
        return metadata[model].get("context_length") or 128000

    # 3. Hardcoded defaults (fuzzy match). Longest keys are tried first so a
    # specific id is never shadowed by a shorter id that is a substring of it.
    # sorted() is stable, so equal-length keys keep their table order.
    by_specificity = sorted(
        DEFAULT_CONTEXT_LENGTHS.items(), key=lambda item: len(item[0]), reverse=True
    )
    for default_model, length in by_specificity:
        if default_model in model or model in default_model:
            return length

    # 4. Unknown model — start at highest probe tier
    return CONTEXT_PROBE_TIERS[0]
|
||||
|
||||
|
||||
def estimate_tokens_rough(text: str) -> int:
    """Approximate a token count as one token per four characters.

    Intended for pre-flight checks only; falsy input counts as zero tokens.
    """
    return len(text) // 4 if text else 0
|
||||
|
||||
|
||||
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Approximate the token count of a message list (pre-flight only).

    Each message is measured by the length of its ``str()`` form; the summed
    character count is divided by four.
    """
    char_count = 0
    for message in messages:
        char_count += len(str(message))
    return char_count // 4
|
||||
458
agent/prompt_builder.py
Normal file
458
agent/prompt_builder.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""System prompt assembly -- identity, platform hints, skills index, context files.
|
||||
|
||||
All functions are stateless. AIAgent._build_system_prompt() calls these to
|
||||
assemble pieces, then combines them with memory and ephemeral prompts.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context file scanning — detect prompt injection in AGENTS.md, .cursorrules,
|
||||
# SOUL.md before they get injected into the system prompt.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# (regex, finding id) pairs; each regex is searched case-insensitively.
_CONTEXT_THREAT_PATTERNS = [
    (r'ignore\s+(previous|all|above|prior)\s+instructions', "prompt_injection"),
    (r'do\s+not\s+tell\s+the\s+user', "deception_hide"),
    (r'system\s+prompt\s+override', "sys_prompt_override"),
    (r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)', "disregard_rules"),
    (r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', "bypass_restrictions"),
    (r'<!--[^>]*(?:ignore|override|system|secret|hidden)[^>]*-->', "html_comment_injection"),
    (r'<\s*div\s+style\s*=\s*["\'].*display\s*:\s*none', "hidden_div"),
    (r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', "translate_execute"),
    (r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl"),
    (r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass)', "read_secrets"),
]

# Zero-width and bidirectional-control code points that can hide text.
_CONTEXT_INVISIBLE_CHARS = {
    '\u200b', '\u200c', '\u200d', '\u2060', '\ufeff',
    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e',
}


def _scan_context_content(content: str, filename: str) -> str:
    """Scan context file content for injection. Returns sanitized content.

    Args:
        content: Text of the context file.
        filename: Label used in the log line and in the placeholder text.

    Returns:
        ``content`` unchanged when nothing suspicious is found; otherwise a
        single-line ``[BLOCKED: ...]`` placeholder listing every finding.
    """
    findings = []

    # Check invisible unicode. Iterate in sorted order so the finding list,
    # the log line and the placeholder are identical from run to run; plain
    # set iteration order for strings depends on the hash seed.
    for char in sorted(_CONTEXT_INVISIBLE_CHARS):
        if char in content:
            findings.append(f"invisible unicode U+{ord(char):04X}")

    # Check threat patterns
    for pattern, pid in _CONTEXT_THREAT_PATTERNS:
        if re.search(pattern, content, re.IGNORECASE):
            findings.append(pid)

    if findings:
        logger.warning("Context file %s blocked: %s", filename, ", ".join(findings))
        return f"[BLOCKED: {filename} contained potential prompt injection ({', '.join(findings)}). Content not loaded.]"

    return content
|
||||
|
||||
# =========================================================================
|
||||
# Constants
|
||||
# =========================================================================
|
||||
|
||||
# Default identity paragraph for the system prompt. The value is one string
# built from adjacent literals; each fragment ends with a space so sentences
# stay separated after concatenation.
DEFAULT_AGENT_IDENTITY = (
    "You are Hermes Agent, an intelligent AI assistant created by Nous Research. "
    "You are helpful, knowledgeable, and direct. You assist users with a wide "
    "range of tasks including answering questions, writing and editing code, "
    "analyzing information, creative work, and executing actions via your tools. "
    "You communicate clearly, admit uncertainty when appropriate, and prioritize "
    "being genuinely useful over being verbose unless otherwise directed below. "
    "Be targeted and efficient in your exploration and investigations."
)
|
||||
|
||||
# Prompt text describing what belongs in persistent memory and what does not.
# The embedded "\n" characters split it into three paragraphs.
MEMORY_GUIDANCE = (
    "You have persistent memory across sessions. Save durable facts using the memory "
    "tool: user preferences, environment details, tool quirks, and stable conventions. "
    "Memory is injected into every turn, so keep it compact and focused on facts that "
    "will still matter later.\n"
    "Prioritize what reduces future user steering — the most valuable memory is one "
    "that prevents the user from having to correct or remind you again. "
    "User preferences and recurring corrections matter more than procedural task details.\n"
    "Do NOT save task progress, session outcomes, completed-work logs, or temporary TODO "
    "state to memory; use session_search to recall those from past transcripts. "
    "If you've discovered a new way to do something, solved a problem that could be "
    "necessary later, save it as a skill with the skill tool."
)
|
||||
|
||||
# Prompt text telling the model when to call session_search.
SESSION_SEARCH_GUIDANCE = (
    "When the user references something from a past conversation or you suspect "
    "relevant cross-session context exists, use session_search to recall it before "
    "asking them to repeat themselves."
)
|
||||
|
||||
# Prompt text on saving and patching skills via skill_manage.
# The embedded "\n" separates the "save" advice from the "maintain" advice.
SKILLS_GUIDANCE = (
    "After completing a complex task (5+ tool calls), fixing a tricky error, "
    "or discovering a non-trivial workflow, save the approach as a "
    "skill with skill_manage so you can reuse it next time.\n"
    "When using a skill and finding it outdated, incomplete, or wrong, "
    "patch it immediately with skill_manage(action='patch') — don't wait to be asked. "
    "Skills that aren't maintained become liabilities."
)
|
||||
|
||||
# Per-platform prompt hints, keyed by a lowercase platform name.
# Each value is a single string built from adjacent literals.
# NOTE(review): several hints contain "markdown format  and" with a double
# space, which looks like an inline example was lost in this copy of the
# file; confirm against the upstream source before editing these strings.
PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
        ".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
        "files arrive as downloadable documents. You can also include image "
        "URLs in markdown format  and they will be sent as photos."
    ),
    "telegram": (
        "You are on a text messaging communication platform, Telegram. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio (.ogg) sends as voice "
        "bubbles, and videos (.mp4) play inline. You can also include image "
        "URLs in markdown format  and they will be sent as native photos."
    ),
    "discord": (
        "You are in a Discord server or group chat communicating with your user. "
        "You can send media files natively: include MEDIA:/absolute/path/to/file "
        "in your response. Images (.png, .jpg, .webp) are sent as photo "
        "attachments, audio as file attachments. You can also include image URLs "
        "in markdown format  and they will be sent as attachments."
    ),
    "slack": (
        "You are in a Slack workspace communicating with your user. "
        "You can send media files natively: include MEDIA:/absolute/path/to/file "
        "in your response. Images (.png, .jpg, .webp) are uploaded as photo "
        "attachments, audio as file attachments. You can also include image URLs "
        "in markdown format  and they will be uploaded as attachments."
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
        "files arrive as downloadable documents. You can also include image "
        "URLs in markdown format  and they will be sent as photos."
    ),
    "email": (
        "You are communicating via email. Write clear, well-structured responses "
        "suitable for email. Use plain text formatting (no markdown). "
        "Keep responses concise but complete. You can send file attachments — "
        "include MEDIA:/absolute/path/to/file in your response. The subject line "
        "is preserved for threading. Do not include greetings or sign-offs unless "
        "contextually appropriate."
    ),
    "cron": (
        "You are running as a scheduled cron job. Your final response is automatically "
        "delivered to the job's configured destination, so do not use send_message to "
        "send to that same target again. If you want the user to receive something in "
        "the scheduled destination, put it directly in your final response. Use "
        "send_message only for additional or different targets."
    ),
    "cli": (
        "You are a CLI AI Agent. Try not to use markdown but simple text "
        "renderable inside a terminal."
    ),
}
|
||||
|
||||
# Default character cap used by _truncate_content() for a context section.
CONTEXT_FILE_MAX_CHARS = 20_000
# Fractions of the cap kept from the start and from the end of oversized
# content; together they are below 1.0, leaving room for the marker text.
CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
CONTEXT_TRUNCATE_TAIL_RATIO = 0.2
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Skills index
|
||||
# =========================================================================
|
||||
|
||||
def _parse_skill_file(skill_file: Path) -> tuple[bool, dict, str]:
    """Parse a SKILL.md once and report compatibility, frontmatter and summary.

    Only the first 2000 characters of the file are passed to the frontmatter
    parser. Descriptions longer than 60 characters are cut to 57 plus "...".

    Returns:
        ``(is_compatible, frontmatter, description)``. A skill that does not
        match the platform yields ``(False, {}, "")``. Any exception yields
        ``(True, {}, "")`` so a file that cannot be parsed is still listed.
    """
    try:
        from tools.skills_tool import _parse_frontmatter, skill_matches_platform

        head = skill_file.read_text(encoding="utf-8")[:2000]
        frontmatter, _body = _parse_frontmatter(head)
        if not skill_matches_platform(frontmatter):
            return False, {}, ""

        raw_desc = frontmatter.get("description", "")
        if not raw_desc:
            return True, frontmatter, ""

        # Drop surrounding whitespace first, then any wrapping quote characters.
        summary = str(raw_desc).strip().strip("'\"")
        if len(summary) > 60:
            summary = summary[:57] + "..."
        return True, frontmatter, summary
    except Exception as exc:
        logger.debug("Failed to parse skill file %s: %s", skill_file, exc)
        return True, {}, ""
|
||||
|
||||
|
||||
def _read_skill_conditions(skill_file: Path) -> dict:
    """Extract conditional activation fields from SKILL.md frontmatter.

    The fields are read from ``metadata.hermes`` in the frontmatter. Each
    value is normalised to a list: a missing or null field becomes ``[]`` and
    a single string becomes a one-element list. This keeps consumers that
    iterate the values from failing on ``None`` or from walking a string
    character by character.

    Returns:
        Dict with ``fallback_for_toolsets``, ``requires_toolsets``,
        ``fallback_for_tools`` and ``requires_tools``; an empty dict when the
        file cannot be read or parsed.
    """

    def _as_list(value) -> list:
        # Normalise one YAML value to a list of names.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    try:
        from tools.skills_tool import _parse_frontmatter
        raw = skill_file.read_text(encoding="utf-8")[:2000]
        frontmatter, _ = _parse_frontmatter(raw)
        # ``metadata:`` / ``hermes:`` may be present but null in the YAML.
        hermes = (frontmatter.get("metadata") or {}).get("hermes") or {}
        return {
            "fallback_for_toolsets": _as_list(hermes.get("fallback_for_toolsets")),
            "requires_toolsets": _as_list(hermes.get("requires_toolsets")),
            "fallback_for_tools": _as_list(hermes.get("fallback_for_tools")),
            "requires_tools": _as_list(hermes.get("requires_tools")),
        }
    except Exception as e:
        logger.debug("Failed to read skill conditions from %s: %s", skill_file, e)
        return {}
|
||||
|
||||
|
||||
def _skill_should_show(
    conditions: dict,
    available_tools: "set[str] | None",
    available_toolsets: "set[str] | None",
) -> bool:
    """Return False if the skill's conditional activation rules exclude it.

    Args:
        conditions: Mapping with optional ``fallback_for_toolsets``,
            ``fallback_for_tools``, ``requires_toolsets`` and
            ``requires_tools`` entries. A missing, null or empty entry means
            "no condition"; a single string is treated as one name.
        available_tools: Names of the tools that are available, or None.
        available_toolsets: Names of the toolsets that are available, or None.

    Returns:
        True when the skill should be listed.
    """
    if available_tools is None and available_toolsets is None:
        return True  # No filtering info — show everything (backward compat)

    at = available_tools or set()
    ats = available_toolsets or set()

    def _names(key: str):
        # Tolerate null and scalar values so one odd frontmatter entry cannot
        # raise TypeError or be matched one character at a time.
        value = conditions.get(key)
        if not value:
            return ()
        if isinstance(value, str):
            return (value,)
        return value

    # fallback_for: hide when the primary tool/toolset IS available
    if any(ts in ats for ts in _names("fallback_for_toolsets")):
        return False
    if any(t in at for t in _names("fallback_for_tools")):
        return False

    # requires: hide when a required tool/toolset is NOT available
    if any(ts not in ats for ts in _names("requires_toolsets")):
        return False
    if any(t not in at for t in _names("requires_tools")):
        return False

    return True
|
||||
|
||||
|
||||
def build_skills_system_prompt(
    available_tools: "set[str] | None" = None,
    available_toolsets: "set[str] | None" = None,
) -> str:
    """Assemble the compact skills index that goes into the system prompt.

    Every ``SKILL.md`` below ``$HERMES_HOME/skills`` (default ``~/.hermes``)
    is considered. Skills are grouped by category, listed with the short
    description from their frontmatter, and dropped when they are
    incompatible with the platform or excluded by their activation rules.

    Returns:
        The prompt section, or an empty string when the skills directory is
        missing or no skill qualifies.
    """
    home_dir = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
    skills_root = home_dir / "skills"
    if not skills_root.exists():
        return ""

    # category -> [(skill_name, description), ...]
    grouped: dict[str, list[tuple[str, str]]] = {}
    for skill_md in skills_root.rglob("SKILL.md"):
        compatible, _frontmatter, summary = _parse_skill_file(skill_md)
        if not compatible:
            continue
        # Conditional activation rules may hide the skill.
        rules = _read_skill_conditions(skill_md)
        if not _skill_should_show(rules, available_tools, available_toolsets):
            continue

        parts = skill_md.relative_to(skills_root).parts
        if len(parts) > 2:
            # ("mlops", "training", "axolotl", "SKILL.md")
            #   -> category "mlops/training", skill "axolotl"
            category = "/".join(parts[:-2])
            skill_name = parts[-2]
        elif len(parts) == 2:
            # ("name", "SKILL.md") -> the folder name is used for both.
            category = parts[0]
            skill_name = parts[-2]
        else:
            # SKILL.md directly inside the skills directory.
            category = "general"
            skill_name = skill_md.parent.name
        grouped.setdefault(category, []).append((skill_name, summary))

    if not grouped:
        return ""

    # Category blurbs come from DESCRIPTION.md in the exact category folder.
    category_blurbs = {}
    for category in grouped:
        blurb_file = skills_root / Path(category) / "DESCRIPTION.md"
        if not blurb_file.exists():
            continue
        try:
            text = blurb_file.read_text(encoding="utf-8")
            found = re.search(r"^---\s*\n.*?description:\s*(.+?)\s*\n.*?^---", text, re.MULTILINE | re.DOTALL)
            if found:
                category_blurbs[category] = found.group(1).strip()
        except Exception as exc:
            logger.debug("Could not read skill description %s: %s", blurb_file, exc)

    listing = []
    for category in sorted(grouped.keys()):
        blurb = category_blurbs.get(category, "")
        listing.append(f" {category}: {blurb}" if blurb else f" {category}:")
        # Sorted by name; the first entry seen for a name wins.
        emitted = set()
        for name, summary in sorted(grouped[category], key=lambda entry: entry[0]):
            if name in emitted:
                continue
            emitted.add(name)
            listing.append(f" - {name}: {summary}" if summary else f" - {name}")

    header = (
        "## Skills (mandatory)\n"
        "Before replying, scan the skills below. If one clearly matches your task, "
        "load it with skill_view(name) and follow its instructions. "
        "If a skill has issues, fix it with skill_manage(action='patch').\n"
        "After difficult/iterative tasks, offer to save as a skill. "
        "If a skill you loaded was missing steps, had wrong commands, or needed "
        "pitfalls you discovered, update it before finishing.\n"
        "\n"
        "<available_skills>\n"
    )
    footer = (
        "\n"
        "</available_skills>\n"
        "\n"
        "If none match, proceed normally without loading a skill."
    )
    return header + "\n".join(listing) + footer
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Context files (SOUL.md, AGENTS.md, .cursorrules)
|
||||
# =========================================================================
|
||||
|
||||
def _truncate_content(content: str, filename: str, max_chars: int = CONTEXT_FILE_MAX_CHARS) -> str:
    """Head/tail truncation with a marker in the middle.

    Args:
        content: Text to cap.
        filename: Name shown in the truncation marker.
        max_chars: Size threshold; content at or below it is returned as is.

    Returns:
        ``content`` unchanged when short enough, otherwise the leading and
        trailing portions joined by a marker that reports how much was kept.
        The marker is added on top of the kept characters.
    """
    if len(content) <= max_chars:
        return content
    head_chars = max(0, int(max_chars * CONTEXT_TRUNCATE_HEAD_RATIO))
    tail_chars = max(0, int(max_chars * CONTEXT_TRUNCATE_TAIL_RATIO))
    head = content[:head_chars]
    # ``content[-0:]`` is the whole string, so a zero-length tail (tiny
    # ``max_chars``) must be handled explicitly.
    tail = content[-tail_chars:] if tail_chars else ""
    marker = f"\n\n[...truncated {filename}: kept {head_chars}+{tail_chars} of {len(content)} chars. Use file tools to read the full file.]\n\n"
    return head + marker + tail
|
||||
|
||||
|
||||
def build_context_files_prompt(cwd: Optional[str] = None) -> str:
    """Discover and load context files for the system prompt.

    Discovery: AGENTS.md (recursive), .cursorrules / .cursor/rules/*.mdc,
    and SOUL.md from HERMES_HOME only. Each section is scanned with
    _scan_context_content() and capped with _truncate_content().

    Args:
        cwd: Project directory to search; defaults to the current directory.

    Returns:
        The "# Project Context" prompt section, or an empty string when no
        context file with content was found.
    """
    if cwd is None:
        cwd = os.getcwd()

    cwd_path = Path(cwd).resolve()
    sections = []

    # AGENTS.md (hierarchical, recursive). The tree is only walked when the
    # project root itself has an AGENTS.md.
    top_level_agents = None
    for name in ["AGENTS.md", "agents.md"]:
        candidate = cwd_path / name
        if candidate.exists():
            top_level_agents = candidate
            break

    if top_level_agents:
        agents_files = []
        for root, dirs, files in os.walk(cwd_path):
            # Prune hidden and dependency directories in place so os.walk
            # does not descend into them.
            dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ('node_modules', '__pycache__', 'venv', '.venv')]
            for f in files:
                if f.lower() == "agents.md":
                    agents_files.append(Path(root) / f)
        # Shallowest first. The path is the tie-breaker so files at the same
        # depth are ordered the same way on every filesystem; os.walk order is
        # otherwise arbitrary, which would change the prompt and what
        # survives truncation.
        agents_files.sort(key=lambda p: (len(p.parts), str(p)))

        total_agents_content = ""
        for agents_path in agents_files:
            try:
                content = agents_path.read_text(encoding="utf-8").strip()
                if content:
                    rel_path = agents_path.relative_to(cwd_path)
                    content = _scan_context_content(content, str(rel_path))
                    total_agents_content += f"## {rel_path}\n\n{content}\n\n"
            except Exception as e:
                logger.debug("Could not read %s: %s", agents_path, e)

        if total_agents_content:
            total_agents_content = _truncate_content(total_agents_content, "AGENTS.md")
            sections.append(total_agents_content)

    # .cursorrules
    cursorrules_content = ""
    cursorrules_file = cwd_path / ".cursorrules"
    if cursorrules_file.exists():
        try:
            content = cursorrules_file.read_text(encoding="utf-8").strip()
            if content:
                content = _scan_context_content(content, ".cursorrules")
                cursorrules_content += f"## .cursorrules\n\n{content}\n\n"
        except Exception as e:
            logger.debug("Could not read .cursorrules: %s", e)

    # .cursor/rules/*.mdc share one section (and one cap) with .cursorrules.
    cursor_rules_dir = cwd_path / ".cursor" / "rules"
    if cursor_rules_dir.exists() and cursor_rules_dir.is_dir():
        mdc_files = sorted(cursor_rules_dir.glob("*.mdc"))
        for mdc_file in mdc_files:
            try:
                content = mdc_file.read_text(encoding="utf-8").strip()
                if content:
                    content = _scan_context_content(content, f".cursor/rules/{mdc_file.name}")
                    cursorrules_content += f"## .cursor/rules/{mdc_file.name}\n\n{content}\n\n"
            except Exception as e:
                logger.debug("Could not read %s: %s", mdc_file, e)

    if cursorrules_content:
        cursorrules_content = _truncate_content(cursorrules_content, ".cursorrules")
        sections.append(cursorrules_content)

    # SOUL.md from HERMES_HOME only
    try:
        from hermes_cli.config import ensure_hermes_home
        ensure_hermes_home()
    except Exception as e:
        # Best effort: a failure here must not block the other sections.
        logger.debug("Could not ensure HERMES_HOME before loading SOUL.md: %s", e)

    soul_path = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "SOUL.md"
    if soul_path.exists():
        try:
            content = soul_path.read_text(encoding="utf-8").strip()
            if content:
                content = _scan_context_content(content, "SOUL.md")
                content = _truncate_content(content, "SOUL.md")
                sections.append(content)
        except Exception as e:
            logger.debug("Could not read SOUL.md from %s: %s", soul_path, e)

    if not sections:
        return ""
    return "# Project Context\n\nThe following project context files have been loaded and should be followed:\n\n" + "\n".join(sections)
|
||||
70
agent/prompt_caching.py
Normal file
70
agent/prompt_caching.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Anthropic prompt caching (system_and_3 strategy).
|
||||
|
||||
Reduces input token costs by ~75% on multi-turn conversations by caching
|
||||
the conversation prefix. Uses 4 cache_control breakpoints (Anthropic max):
|
||||
1. System prompt (stable across all turns)
|
||||
2-4. Last 3 non-system messages (rolling window)
|
||||
|
||||
Pure functions -- no class state, no AIAgent dependency.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
def _apply_cache_marker(msg: dict, cache_marker: dict) -> None:
    """Attach ``cache_marker`` to one message in place, whatever its content shape.

    - tool messages, and messages whose content is None or "", get the marker
      on the message itself;
    - string content is wrapped into a single text part that carries the marker;
    - non-empty list content gets the marker on its last part, if that part is
      a dict.

    Anything else is left untouched.
    """
    body = msg.get("content")
    is_tool = msg.get("role", "") == "tool"

    if is_tool or body is None or body == "":
        msg["cache_control"] = cache_marker
    elif isinstance(body, str):
        msg["content"] = [
            {"type": "text", "text": body, "cache_control": cache_marker}
        ]
    elif isinstance(body, list) and body:
        final_part = body[-1]
        if isinstance(final_part, dict):
            final_part["cache_control"] = cache_marker
|
||||
|
||||
|
||||
def apply_anthropic_cache_control(
    api_messages: List[Dict[str, Any]],
    cache_ttl: str = "5m",
) -> List[Dict[str, Any]]:
    """Inject cache_control breakpoints using the system_and_3 strategy.

    At most four breakpoints are placed: one on a leading system message, if
    there is one, and the rest on the trailing non-system messages.

    Args:
        api_messages: Messages to annotate; the input is not modified.
        cache_ttl: ``"1h"`` adds a one-hour ``ttl`` to the marker; any other
            value leaves the marker without a ``ttl``.

    Returns:
        Deep copy of the messages with cache_control breakpoints injected.
    """
    messages = copy.deepcopy(api_messages)
    if not messages:
        return messages

    marker = {"type": "ephemeral", "ttl": "1h"} if cache_ttl == "1h" else {"type": "ephemeral"}

    # A leading system prompt takes one of the four breakpoints.
    system_first = messages[0].get("role") == "system"
    if system_first:
        _apply_cache_marker(messages[0], marker)
    budget = 3 if system_first else 4

    non_system_positions = [
        pos for pos, message in enumerate(messages) if message.get("role") != "system"
    ]
    for pos in non_system_positions[-budget:]:
        _apply_cache_marker(messages[pos], marker)

    return messages
|
||||
161
agent/redact.py
Normal file
161
agent/redact.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""Regex-based secret redaction for logs and tool output.
|
||||
|
||||
Applies pattern matching to mask API keys, tokens, and credentials
|
||||
before they reach log files, verbose output, or gateway logs.
|
||||
|
||||
Short tokens (< 18 chars) are fully masked. Longer tokens preserve
|
||||
the first 6 and last 4 characters for debuggability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Known API key prefixes -- match the prefix + contiguous token chars
|
||||
_PREFIX_PATTERNS = [
|
||||
r"sk-[A-Za-z0-9_-]{10,}", # OpenAI / OpenRouter / Anthropic (sk-ant-*)
|
||||
r"ghp_[A-Za-z0-9]{10,}", # GitHub PAT (classic)
|
||||
r"github_pat_[A-Za-z0-9_]{10,}", # GitHub PAT (fine-grained)
|
||||
r"xox[baprs]-[A-Za-z0-9-]{10,}", # Slack tokens
|
||||
r"AIza[A-Za-z0-9_-]{30,}", # Google API keys
|
||||
r"pplx-[A-Za-z0-9]{10,}", # Perplexity
|
||||
r"fal_[A-Za-z0-9_-]{10,}", # Fal.ai
|
||||
r"fc-[A-Za-z0-9]{10,}", # Firecrawl
|
||||
r"bb_live_[A-Za-z0-9_-]{10,}", # BrowserBase
|
||||
r"gAAAA[A-Za-z0-9_=-]{20,}", # Codex encrypted tokens
|
||||
r"AKIA[A-Z0-9]{16}", # AWS Access Key ID
|
||||
r"sk_live_[A-Za-z0-9]{10,}", # Stripe secret key (live)
|
||||
r"sk_test_[A-Za-z0-9]{10,}", # Stripe secret key (test)
|
||||
r"rk_live_[A-Za-z0-9]{10,}", # Stripe restricted key
|
||||
r"SG\.[A-Za-z0-9_-]{10,}", # SendGrid API key
|
||||
r"hf_[A-Za-z0-9]{10,}", # HuggingFace token
|
||||
r"r8_[A-Za-z0-9]{10,}", # Replicate API token
|
||||
r"npm_[A-Za-z0-9]{10,}", # npm access token
|
||||
r"pypi-[A-Za-z0-9_-]{10,}", # PyPI API token
|
||||
r"dop_v1_[A-Za-z0-9]{10,}", # DigitalOcean PAT
|
||||
r"doo_v1_[A-Za-z0-9]{10,}", # DigitalOcean OAuth
|
||||
r"am_[A-Za-z0-9_-]{10,}", # AgentMail API key
|
||||
]
|
||||
|
||||
# ENV assignment patterns: KEY=value where KEY contains a secret-like name
|
||||
_SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
|
||||
_ENV_ASSIGN_RE = re.compile(
|
||||
rf"([A-Z_]*{_SECRET_ENV_NAMES}[A-Z_]*)\s*=\s*(['\"]?)(\S+)\2",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# JSON field patterns: "apiKey": "value", "token": "value", etc.
|
||||
_JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer|secret_value|raw_secret|secret_input|key_material)"
|
||||
_JSON_FIELD_RE = re.compile(
|
||||
rf'("{_JSON_KEY_NAMES}")\s*:\s*"([^"]+)"',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Authorization headers
|
||||
_AUTH_HEADER_RE = re.compile(
|
||||
r"(Authorization:\s*Bearer\s+)(\S+)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Telegram bot tokens: bot<digits>:<token> or <digits>:<token>,
|
||||
# where token part is restricted to [-A-Za-z0-9_] and length >= 30
|
||||
_TELEGRAM_RE = re.compile(
|
||||
r"(bot)?(\d{8,}):([-A-Za-z0-9_]{30,})",
|
||||
)
|
||||
|
||||
# Private key blocks: -----BEGIN RSA PRIVATE KEY----- ... -----END RSA PRIVATE KEY-----
|
||||
_PRIVATE_KEY_RE = re.compile(
|
||||
r"-----BEGIN[A-Z ]*PRIVATE KEY-----[\s\S]*?-----END[A-Z ]*PRIVATE KEY-----"
|
||||
)
|
||||
|
||||
# Database connection strings: protocol://user:PASSWORD@host
|
||||
# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password
|
||||
_DB_CONNSTR_RE = re.compile(
|
||||
r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# E.164 phone numbers: +<country><number>, 7-15 digits
|
||||
# Negative lookahead prevents matching hex strings or identifiers
|
||||
_SIGNAL_PHONE_RE = re.compile(r"(\+[1-9]\d{6,14})(?![A-Za-z0-9])")
|
||||
|
||||
# Compile known prefix patterns into one alternation
|
||||
_PREFIX_RE = re.compile(
|
||||
r"(?<![A-Za-z0-9_-])(" + "|".join(_PREFIX_PATTERNS) + r")(?![A-Za-z0-9_-])"
|
||||
)
|
||||
|
||||
|
||||
def _mask_token(token: str) -> str:
|
||||
"""Mask a token, preserving prefix for long tokens."""
|
||||
if len(token) < 18:
|
||||
return "***"
|
||||
return f"{token[:6]}...{token[-4:]}"
|
||||
|
||||
|
||||
def redact_sensitive_text(text: str) -> str:
    """Mask secrets in ``text`` using every redaction pattern in this module.

    Falsy input is returned untouched.  Redaction is skipped entirely when the
    ``HERMES_REDACT_SECRETS`` environment variable is ``0``, ``false``, ``no``
    or ``off`` (case-insensitive).  Text matching no pattern passes through
    unchanged.

    The passes run in a fixed order: known key prefixes, ENV-style
    assignments, JSON fields, Authorization headers, Telegram bot tokens,
    private key blocks, database connection strings, E.164 phone numbers.
    """
    if not text:
        return text
    if os.getenv("HERMES_REDACT_SECRETS", "").lower() in ("0", "false", "no", "off"):
        return text

    def _mask_prefixed(match):
        # Known prefixes (sk-, ghp_, etc.)
        return _mask_token(match.group(1))

    def _mask_env_assignment(match):
        # ENV assignments: OPENAI_API_KEY=sk-abc...
        var_name = match.group(1)
        quote_char = match.group(2)
        masked = _mask_token(match.group(3))
        return f"{var_name}={quote_char}{masked}{quote_char}"

    def _mask_json_field(match):
        # JSON fields: "apiKey": "value"
        return f'{match.group(1)}: "{_mask_token(match.group(2))}"'

    def _mask_auth_header(match):
        # Authorization headers: keep the scheme, mask the credential
        return match.group(1) + _mask_token(match.group(2))

    def _mask_telegram(match):
        # Telegram bot tokens: keep the optional "bot" prefix and numeric id
        bot_prefix = match.group(1) or ""
        return f"{bot_prefix}{match.group(2)}:***"

    def _mask_db_password(match):
        # Database connection string passwords
        return f"{match.group(1)}***{match.group(3)}"

    def _mask_phone(match):
        # E.164 phone numbers (Signal, WhatsApp): short numbers keep 2 chars
        # on each side, longer ones keep 4.
        number = match.group(1)
        keep = 2 if len(number) <= 8 else 4
        return number[:keep] + "****" + number[-keep:]

    # Order matters: each pass sees the output of the previous one.
    passes = (
        (_PREFIX_RE, _mask_prefixed),
        (_ENV_ASSIGN_RE, _mask_env_assignment),
        (_JSON_FIELD_RE, _mask_json_field),
        (_AUTH_HEADER_RE, _mask_auth_header),
        (_TELEGRAM_RE, _mask_telegram),
        (_PRIVATE_KEY_RE, "[REDACTED PRIVATE KEY]"),
        (_DB_CONNSTR_RE, _mask_db_password),
        (_SIGNAL_PHONE_RE, _mask_phone),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)

    return text
|
||||
|
||||
|
||||
class RedactingFormatter(logging.Formatter):
    """``logging.Formatter`` whose rendered output is passed through secret redaction."""

    def __init__(self, fmt=None, datefmt=None, style='%', **kwargs):
        # Pure pass-through to the base formatter; no extra state is kept.
        super().__init__(fmt, datefmt, style, **kwargs)

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` with the base formatter, then redact the result."""
        rendered = super().format(record)
        return redact_sensitive_text(rendered)
|
||||
278
agent/skill_commands.py
Normal file
278
agent/skill_commands.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""Shared slash command helpers for skills and built-in prompt-style modes.
|
||||
|
||||
Shared between CLI (cli.py) and gateway (gateway/run.py) so both surfaces
|
||||
can invoke skills via /skill-name commands and prompt-only built-ins like
|
||||
/plan.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_skill_commands: Dict[str, Dict[str, Any]] = {}
|
||||
_PLAN_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
||||
|
||||
|
||||
def build_plan_path(
|
||||
user_instruction: str = "",
|
||||
*,
|
||||
now: datetime | None = None,
|
||||
) -> Path:
|
||||
"""Return the default workspace-relative markdown path for a /plan invocation.
|
||||
|
||||
Relative paths are intentional: file tools are task/backend-aware and resolve
|
||||
them against the active working directory for local, docker, ssh, modal,
|
||||
daytona, and similar terminal backends. That keeps the plan with the active
|
||||
workspace instead of the Hermes host's global home directory.
|
||||
"""
|
||||
slug_source = (user_instruction or "").strip().splitlines()[0] if user_instruction else ""
|
||||
slug = _PLAN_SLUG_RE.sub("-", slug_source.lower()).strip("-")
|
||||
if slug:
|
||||
slug = "-".join(part for part in slug.split("-")[:8] if part)[:48].strip("-")
|
||||
slug = slug or "conversation-plan"
|
||||
timestamp = (now or datetime.now()).strftime("%Y-%m-%d_%H%M%S")
|
||||
return Path(".hermes") / "plans" / f"{timestamp}-{slug}.md"
|
||||
|
||||
|
||||
def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None:
|
||||
"""Load a skill by name/path and return (loaded_payload, skill_dir, display_name)."""
|
||||
raw_identifier = (skill_identifier or "").strip()
|
||||
if not raw_identifier:
|
||||
return None
|
||||
|
||||
try:
|
||||
from tools.skills_tool import SKILLS_DIR, skill_view
|
||||
|
||||
identifier_path = Path(raw_identifier).expanduser()
|
||||
if identifier_path.is_absolute():
|
||||
try:
|
||||
normalized = str(identifier_path.resolve().relative_to(SKILLS_DIR.resolve()))
|
||||
except Exception:
|
||||
normalized = raw_identifier
|
||||
else:
|
||||
normalized = raw_identifier.lstrip("/")
|
||||
|
||||
loaded_skill = json.loads(skill_view(normalized, task_id=task_id))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
if not loaded_skill.get("success"):
|
||||
return None
|
||||
|
||||
skill_name = str(loaded_skill.get("name") or normalized)
|
||||
skill_path = str(loaded_skill.get("path") or "")
|
||||
skill_dir = None
|
||||
if skill_path:
|
||||
try:
|
||||
skill_dir = SKILLS_DIR / Path(skill_path).parent
|
||||
except Exception:
|
||||
skill_dir = None
|
||||
|
||||
return loaded_skill, skill_dir, skill_name
|
||||
|
||||
|
||||
def _build_skill_message(
    loaded_skill: dict[str, Any],
    skill_dir: Path | None,
    activation_note: str,
    user_instruction: str = "",
    runtime_note: str = "",
) -> str:
    """Format a loaded skill into a user/system message payload.

    The message is the activation note, a blank line and the stripped skill
    content, followed by optional sections in this order: one setup note, a
    list of supporting files, the user's instruction, and a runtime note.
    Sections are joined with newlines.
    """
    from tools.skills_tool import SKILLS_DIR

    skill_text = str(loaded_skill.get("content") or "").strip()
    sections = [activation_note, "", skill_text]

    # At most one setup note: skipped setup > gateway hint > generic note.
    setup_note = ""
    if loaded_skill.get("setup_skipped"):
        setup_note = "[Skill setup note: Required environment setup was skipped. Continue loading the skill and explain any reduced functionality if it matters.]"
    elif loaded_skill.get("gateway_setup_hint"):
        setup_note = f"[Skill setup note: {loaded_skill['gateway_setup_hint']}]"
    elif loaded_skill.get("setup_needed") and loaded_skill.get("setup_note"):
        setup_note = f"[Skill setup note: {loaded_skill['setup_note']}]"
    if setup_note:
        sections += ["", setup_note]

    # Prefer the file lists the payload declares.
    supporting = []
    for entries in (loaded_skill.get("linked_files") or {}).values():
        if isinstance(entries, list):
            supporting.extend(entries)

    # Fall back to scanning the conventional sub-directories on disk.
    if not supporting and skill_dir:
        for subdir in ("references", "templates", "scripts", "assets"):
            subdir_path = skill_dir / subdir
            if not subdir_path.exists():
                continue
            supporting.extend(
                str(found.relative_to(skill_dir))
                for found in sorted(subdir_path.rglob("*"))
                if found.is_file()
            )

    if supporting and skill_dir:
        skill_view_target = str(skill_dir.relative_to(SKILLS_DIR))
        sections.append("")
        sections.append("[This skill has supporting files you can load with the skill_view tool:]")
        sections.extend(f"- {entry}" for entry in supporting)
        sections.append(
            f'\nTo view any of these, use: skill_view(name="{skill_view_target}", file_path="<path>")'
        )

    if user_instruction:
        sections += [
            "",
            f"The user has provided the following instruction alongside the skill invocation: {user_instruction}",
        ]

    if runtime_note:
        sections += ["", f"[Runtime note: {runtime_note}]"]

    return "\n".join(sections)
|
||||
|
||||
|
||||
def scan_skill_commands() -> Dict[str, Dict[str, Any]]:
    """Rebuild the slash-command registry by scanning the skills directory.

    The module-level registry is replaced with a fresh dict on every call.
    Every ``SKILL.md`` under ``SKILLS_DIR`` is considered, except those inside
    ``.git``, ``.github`` or ``.hub`` directories and those whose frontmatter
    does not match the current platform.  A skill that fails to read or parse
    is skipped; any other failure leaves the registry as built so far.

    Returns:
        Dict mapping "/skill-name" to {name, description, skill_md_path, skill_dir}.
    """
    global _skill_commands
    _skill_commands = {}
    try:
        from tools.skills_tool import SKILLS_DIR, _parse_frontmatter, skill_matches_platform

        if not SKILLS_DIR.exists():
            return _skill_commands

        ignored_dirs = ('.git', '.github', '.hub')
        for skill_md in SKILLS_DIR.rglob("SKILL.md"):
            if any(part in ignored_dirs for part in skill_md.parts):
                continue
            try:
                frontmatter, body = _parse_frontmatter(skill_md.read_text(encoding='utf-8'))
                # Skip skills incompatible with the current OS platform
                if not skill_matches_platform(frontmatter):
                    continue

                name = frontmatter.get('name', skill_md.parent.name)
                description = frontmatter.get('description', '')
                if not description:
                    # Fall back to the first non-blank, non-heading body line,
                    # truncated to 80 characters.
                    stripped_lines = (raw.strip() for raw in body.strip().split('\n'))
                    description = next(
                        (ln[:80] for ln in stripped_lines if ln and not ln.startswith('#')),
                        description,
                    )

                command = "/" + name.lower().replace(' ', '-').replace('_', '-')
                _skill_commands[command] = {
                    "name": name,
                    "description": description or f"Invoke the {name} skill",
                    "skill_md_path": str(skill_md),
                    "skill_dir": str(skill_md.parent),
                }
            except Exception:
                continue
    except Exception:
        pass
    return _skill_commands
|
||||
|
||||
|
||||
def get_skill_commands() -> Dict[str, Dict[str, Any]]:
    """Return the skill command registry, scanning first when it is empty."""
    if _skill_commands:
        return _skill_commands
    scan_skill_commands()
    # Re-read the global: the scan rebinds it to a fresh dict.
    return _skill_commands
|
||||
|
||||
|
||||
def build_skill_invocation_message(
    cmd_key: str,
    user_instruction: str = "",
    task_id: str | None = None,
    runtime_note: str = "",
) -> Optional[str]:
    """Build the user message content for a skill slash command invocation.

    Args:
        cmd_key: The command key including leading slash (e.g., "/gif-search").
        user_instruction: Optional text the user typed after the command.
        task_id: Forwarded to the skill loader.
        runtime_note: Optional note appended to the message as a runtime note.

    Returns:
        The formatted message string, ``None`` if the command is not
        registered, or a ``[Failed to load skill: ...]`` placeholder when the
        registered skill cannot be loaded.
    """
    skill_info = get_skill_commands().get(cmd_key)
    if not skill_info:
        return None

    payload = _load_skill_payload(skill_info["skill_dir"], task_id=task_id)
    if not payload:
        return f"[Failed to load skill: {skill_info['name']}]"

    loaded_skill, skill_dir, skill_name = payload
    note_head = f'[SYSTEM: The user has invoked the "{skill_name}" skill, indicating they want '
    note_tail = "you to follow its instructions. The full skill content is loaded below.]"
    return _build_skill_message(
        loaded_skill,
        skill_dir,
        note_head + note_tail,
        user_instruction=user_instruction,
        runtime_note=runtime_note,
    )
|
||||
|
||||
|
||||
def build_preloaded_skills_prompt(
|
||||
skill_identifiers: list[str],
|
||||
task_id: str | None = None,
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""Load one or more skills for session-wide CLI preloading.
|
||||
|
||||
Returns (prompt_text, loaded_skill_names, missing_identifiers).
|
||||
"""
|
||||
prompt_parts: list[str] = []
|
||||
loaded_names: list[str] = []
|
||||
missing: list[str] = []
|
||||
|
||||
seen: set[str] = set()
|
||||
for raw_identifier in skill_identifiers:
|
||||
identifier = (raw_identifier or "").strip()
|
||||
if not identifier or identifier in seen:
|
||||
continue
|
||||
seen.add(identifier)
|
||||
|
||||
loaded = _load_skill_payload(identifier, task_id=task_id)
|
||||
if not loaded:
|
||||
missing.append(identifier)
|
||||
continue
|
||||
|
||||
loaded_skill, skill_dir, skill_name = loaded
|
||||
activation_note = (
|
||||
f'[SYSTEM: The user launched this CLI session with the "{skill_name}" skill '
|
||||
"preloaded. Treat its instructions as active guidance for the duration of this "
|
||||
"session unless the user overrides them.]"
|
||||
)
|
||||
prompt_parts.append(
|
||||
_build_skill_message(
|
||||
loaded_skill,
|
||||
skill_dir,
|
||||
activation_note,
|
||||
)
|
||||
)
|
||||
loaded_names.append(skill_name)
|
||||
|
||||
return "\n\n".join(prompt_parts), loaded_names, missing
|
||||
184
agent/smart_model_routing.py
Normal file
184
agent/smart_model_routing.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Helpers for optional cheap-vs-strong model routing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
_COMPLEX_KEYWORDS = {
|
||||
"debug",
|
||||
"debugging",
|
||||
"implement",
|
||||
"implementation",
|
||||
"refactor",
|
||||
"patch",
|
||||
"traceback",
|
||||
"stacktrace",
|
||||
"exception",
|
||||
"error",
|
||||
"analyze",
|
||||
"analysis",
|
||||
"investigate",
|
||||
"architecture",
|
||||
"design",
|
||||
"compare",
|
||||
"benchmark",
|
||||
"optimize",
|
||||
"optimise",
|
||||
"review",
|
||||
"terminal",
|
||||
"shell",
|
||||
"tool",
|
||||
"tools",
|
||||
"pytest",
|
||||
"test",
|
||||
"tests",
|
||||
"plan",
|
||||
"planning",
|
||||
"delegate",
|
||||
"subagent",
|
||||
"cron",
|
||||
"docker",
|
||||
"kubernetes",
|
||||
}
|
||||
|
||||
_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE)
|
||||
|
||||
|
||||
def _coerce_bool(value: Any, default: bool = False) -> bool:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
return bool(value)
|
||||
|
||||
|
||||
def _coerce_int(value: Any, default: int) -> int:
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Return the configured cheap-model route when a message looks simple.

    Conservative by design: if the message has signs of code/tool/debugging/
    long-form work, keep the primary model.

    ``None`` is returned when routing is disabled, when ``cheap_model`` is not
    a dict with both ``provider`` and ``model``, or when the stripped message
    is empty, exceeds ``max_simple_chars`` (default 160) or
    ``max_simple_words`` (default 28), contains more than one newline, any
    backtick or a URL, or uses one of the complex-task keywords.

    Returns:
        A copy of the ``cheap_model`` config with a lower-cased ``provider``,
        a stripped ``model`` and ``routing_reason="simple_turn"``, or ``None``.
    """
    cfg = routing_config or {}
    if not _coerce_bool(cfg.get("enabled"), False):
        return None

    cheap_model = cfg.get("cheap_model") or {}
    if not isinstance(cheap_model, dict):
        return None
    provider = str(cheap_model.get("provider") or "").strip().lower()
    model = str(cheap_model.get("model") or "").strip()
    if not (provider and model):
        return None

    text = (user_message or "").strip()
    if not text:
        return None

    char_limit = _coerce_int(cfg.get("max_simple_chars"), 160)
    word_limit = _coerce_int(cfg.get("max_simple_words"), 28)

    too_long = len(text) > char_limit or len(text.split()) > word_limit
    multi_line = text.count("\n") > 1
    # A single-backtick test also covers triple-backtick fences.
    has_code_markup = "`" in text
    if too_long or multi_line or has_code_markup or _URL_RE.search(text):
        return None

    # Compare punctuation-trimmed, lower-cased words against the keyword set.
    tokens = {piece.strip(".,:;!?()[]{}\"'`") for piece in text.lower().split()}
    if not _COMPLEX_KEYWORDS.isdisjoint(tokens):
        return None

    return {
        **cheap_model,
        "provider": provider,
        "model": model,
        "routing_reason": "simple_turn",
    }
|
||||
|
||||
|
||||
def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
    """Resolve the effective model/runtime for one turn.

    The primary model is used when no cheap route applies to the message or
    when resolving the cheap route's runtime provider raises.

    Returns:
        A dict with ``model``, ``runtime`` (api_key/base_url/provider/api_mode),
        ``label`` (``None`` for the primary route) and ``signature`` (a tuple
        of model, provider, base_url, api_mode).
    """

    def _primary_route() -> Dict[str, Any]:
        # Route description for the unmodified primary model.
        return {
            "model": primary.get("model"),
            "runtime": {
                field: primary.get(field)
                for field in ("api_key", "base_url", "provider", "api_mode")
            },
            "label": None,
            "signature": tuple(
                primary.get(field)
                for field in ("model", "provider", "base_url", "api_mode")
            ),
        }

    route = choose_cheap_model_route(user_message, routing_config)
    if not route:
        return _primary_route()

    from hermes_cli.runtime_provider import resolve_runtime_provider

    # An explicit key is only supplied when the route names an env var that is
    # set to a non-empty value.
    api_key_env = str(route.get("api_key_env") or "").strip()
    explicit_api_key = (os.getenv(api_key_env) or None) if api_key_env else None

    try:
        runtime = resolve_runtime_provider(
            requested=route.get("provider"),
            explicit_api_key=explicit_api_key,
            explicit_base_url=route.get("base_url"),
        )
    except Exception:
        return _primary_route()

    cheap_model_name = route.get("model")
    return {
        "model": cheap_model_name,
        "runtime": {
            field: runtime.get(field)
            for field in ("api_key", "base_url", "provider", "api_mode")
        },
        "label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
        "signature": (
            cheap_model_name,
            runtime.get("provider"),
            runtime.get("base_url"),
            runtime.get("api_mode"),
        ),
    }
|
||||
56
agent/trajectory.py
Normal file
56
agent/trajectory.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""Trajectory saving utilities and static helpers.
|
||||
|
||||
_convert_to_trajectory_format stays as an AIAgent method (batch_runner.py
|
||||
calls agent._convert_to_trajectory_format). Only the static helpers and
|
||||
the file-write logic live here.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def convert_scratchpad_to_think(content: str) -> str:
    """Rewrite ``<REASONING_SCRATCHPAD>`` tag pairs as ``<think>`` tags.

    Content that is empty or contains no opening scratchpad tag is returned
    unchanged, even if a stray closing tag is present.
    """
    opening = "<REASONING_SCRATCHPAD>"
    if content and opening in content:
        swapped = content.replace(opening, "<think>")
        return swapped.replace("</REASONING_SCRATCHPAD>", "</think>")
    return content
|
||||
|
||||
|
||||
def has_incomplete_scratchpad(content: str) -> bool:
    """Report whether the last ``<REASONING_SCRATCHPAD>`` in ``content`` is unclosed.

    Only a closing tag that appears after the final opening tag counts.  An
    earlier, completed scratchpad therefore cannot hide a later truncated one.

    Args:
        content: Text to inspect; ``None`` or an empty string is treated as complete.

    Returns:
        ``True`` when an opening tag exists with no closing tag after it.
    """
    if not content:
        return False
    last_open = content.rfind("<REASONING_SCRATCHPAD>")
    if last_open == -1:
        return False
    return content.find("</REASONING_SCRATCHPAD>", last_open) == -1
|
||||
|
||||
|
||||
def save_trajectory(trajectory: List[Dict[str, Any]], model: str,
                    completed: bool, filename: str = None):
    """Append a trajectory entry to a JSONL file.

    Failures while serializing or writing are logged as warnings and never
    raised to the caller.

    Args:
        trajectory: The ShareGPT-format conversation list.
        model: Model name for metadata.
        completed: Whether the conversation completed successfully.
        filename: Override output filename. Defaults to trajectory_samples.jsonl
            or failed_trajectories.jsonl based on ``completed``.
    """
    if filename is None:
        if completed:
            filename = "trajectory_samples.jsonl"
        else:
            filename = "failed_trajectories.jsonl"

    record = {
        "conversations": trajectory,
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "completed": completed,
    }

    try:
        # Serialization stays inside the try so unserializable payloads are
        # reported like any other write failure.
        with open(filename, "a", encoding="utf-8") as sink:
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
        logger.info("Trajectory saved to %s", filename)
    except Exception as e:
        logger.warning("Failed to save trajectory: %s", e)
|
||||
134
agent/usage_pricing.py
Normal file
134
agent/usage_pricing.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from decimal import Decimal
|
||||
from typing import Dict
|
||||
|
||||
|
||||
MODEL_PRICING = {
|
||||
"gpt-4o": {"input": 2.50, "output": 10.00},
|
||||
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
|
||||
"gpt-4.1": {"input": 2.00, "output": 8.00},
|
||||
"gpt-4.1-mini": {"input": 0.40, "output": 1.60},
|
||||
"gpt-4.1-nano": {"input": 0.10, "output": 0.40},
|
||||
"gpt-4.5-preview": {"input": 75.00, "output": 150.00},
|
||||
"gpt-5": {"input": 10.00, "output": 30.00},
|
||||
"gpt-5.4": {"input": 10.00, "output": 30.00},
|
||||
"o3": {"input": 10.00, "output": 40.00},
|
||||
"o3-mini": {"input": 1.10, "output": 4.40},
|
||||
"o4-mini": {"input": 1.10, "output": 4.40},
|
||||
"claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
|
||||
"claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
|
||||
"claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
|
||||
"claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
|
||||
"claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
|
||||
"claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
|
||||
"deepseek-chat": {"input": 0.14, "output": 0.28},
|
||||
"deepseek-reasoner": {"input": 0.55, "output": 2.19},
|
||||
"gemini-2.5-pro": {"input": 1.25, "output": 10.00},
|
||||
"gemini-2.5-flash": {"input": 0.15, "output": 0.60},
|
||||
"gemini-2.0-flash": {"input": 0.10, "output": 0.40},
|
||||
"llama-4-maverick": {"input": 0.50, "output": 0.70},
|
||||
"llama-4-scout": {"input": 0.20, "output": 0.30},
|
||||
"glm-5": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.7": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.5": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.5-flash": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2.5": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-thinking": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.5": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.1": {"input": 0.0, "output": 0.0},
|
||||
}
|
||||
|
||||
DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
|
||||
|
||||
|
||||
def get_pricing(model_name: str) -> Dict[str, float]:
    """Look up per-million-token input/output prices for ``model_name``.

    Resolution order, using the lower-cased part after the last ``/``:
    exact table entry, longest table key that prefixes the name, then a
    family keyword anywhere in the name.  ``DEFAULT_PRICING`` is returned for
    an empty name or when nothing matches.
    """
    if not model_name:
        return DEFAULT_PRICING

    bare = model_name.split("/")[-1].lower()
    try:
        return MODEL_PRICING[bare]
    except KeyError:
        pass

    # Longest-prefix match so e.g. a dated variant resolves to its base model.
    prefix_keys = [key for key in MODEL_PRICING if bare.startswith(key)]
    best_match = MODEL_PRICING[max(prefix_keys, key=len)] if prefix_keys else None
    if best_match:
        return best_match

    # Family fallbacks, checked in order; more specific needles come first.
    family_prices = (
        ("opus", 15.00, 75.00),
        ("sonnet", 3.00, 15.00),
        ("haiku", 0.80, 4.00),
        ("gpt-4o-mini", 0.15, 0.60),
        ("gpt-4o", 2.50, 10.00),
        ("gpt-5", 10.00, 30.00),
        ("deepseek", 0.14, 0.28),
        ("gemini", 0.15, 0.60),
    )
    for needle, input_price, output_price in family_prices:
        if needle in bare:
            return {"input": input_price, "output": output_price}

    return DEFAULT_PRICING
|
||||
|
||||
|
||||
def has_known_pricing(model_name: str) -> bool:
    """Return True when ``model_name`` resolves to a price with a positive component.

    The shared ``DEFAULT_PRICING`` fallback and all-zero entries count as unknown.
    """
    pricing = get_pricing(model_name)
    if pricing is DEFAULT_PRICING:
        return False
    return any(float(amount) > 0 for amount in pricing.values())
|
||||
|
||||
|
||||
def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost of a call from its token counts.

    Prices from ``get_pricing`` are treated as per one million tokens.  The
    sum is computed with ``Decimal`` and converted to ``float`` at the end.
    """
    rates = get_pricing(model)
    input_cost = Decimal(input_tokens) * Decimal(str(rates["input"]))
    output_cost = Decimal(output_tokens) * Decimal(str(rates["output"]))
    tokens_per_price_unit = Decimal("1000000")
    return float((input_cost + output_cost) / tokens_per_price_unit)
|
||||
|
||||
|
||||
def format_duration_compact(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Under a minute: ``"<n>s"``; under an hour: ``"<n>m"``; under a day:
    ``"<h>h"`` or ``"<h>h <m>m"``; otherwise days with one decimal, ``"<d>d"``.
    """
    if seconds < 60:
        return f"{seconds:.0f}s"

    minutes = seconds / 60
    if minutes < 60:
        return f"{minutes:.0f}m"

    hours = minutes / 60
    if hours >= 24:
        return f"{hours / 24:.1f}d"

    whole_hours = int(hours)
    leftover_minutes = int(minutes % 60)
    if leftover_minutes:
        return f"{whole_hours}h {leftover_minutes}m"
    return f"{whole_hours}h"
|
||||
|
||||
|
||||
def format_token_count_compact(value: int) -> str:
    """Render a token count compactly with a K/M/B suffix.

    Magnitudes below 1,000 are printed as plain integers.  Larger values are
    scaled to the biggest fitting unit and shown with about three significant
    digits (``1.5K``, ``12.3K``, ``150K``, ``2.5M``).

    Args:
        value: Token count; negative values keep their sign.

    Returns:
        The compact string, e.g. ``"100K"`` for ``100_000``.
    """
    magnitude = abs(int(value))
    if magnitude < 1_000:
        return str(int(value))

    sign = "-" if value < 0 else ""
    units = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))
    # magnitude >= 1_000 here, so the "K" entry always matches at the latest.
    threshold, suffix = next(unit for unit in units if magnitude >= unit[0])

    scaled = magnitude / threshold
    if scaled < 10:
        text = f"{scaled:.2f}"
    elif scaled < 100:
        text = f"{scaled:.1f}"
    else:
        text = f"{scaled:.0f}"

    # Trim trailing zeros only from a fractional part.  Stripping them from an
    # integer rendering would turn "100" into "1" and "150" into "15".
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    return f"{sign}{text}{suffix}"
|
||||
BIN
assets/banner.png
Normal file
BIN
assets/banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 12 KiB |
662
batch_runner.py
662
batch_runner.py
@@ -27,23 +27,87 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from datetime import datetime
|
||||
from multiprocessing import Pool, Manager, Lock
|
||||
from multiprocessing import Pool, Lock
|
||||
import traceback
|
||||
|
||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeRemainingColumn, MofNCompleteColumn
|
||||
from rich.console import Console
|
||||
import fire
|
||||
|
||||
from run_agent import AIAgent
|
||||
from toolset_distributions import (
|
||||
get_distribution,
|
||||
list_distributions,
|
||||
sample_toolsets_from_distribution,
|
||||
validate_distribution
|
||||
)
|
||||
from model_tools import TOOL_TO_TOOLSET_MAP
|
||||
|
||||
|
||||
# Global configuration for worker processes
|
||||
_WORKER_CONFIG = {}
|
||||
|
||||
# All possible tools - auto-derived from the master mapping in model_tools.py.
|
||||
# This stays in sync automatically when new tools are added to TOOL_TO_TOOLSET_MAP.
|
||||
# Used for consistent schema in Arrow/Parquet (HuggingFace datasets) and for
|
||||
# filtering corrupted entries during trajectory combination.
|
||||
ALL_POSSIBLE_TOOLS = set(TOOL_TO_TOOLSET_MAP.keys())
|
||||
|
||||
# Default stats for tools that weren't used
|
||||
DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0}
|
||||
|
||||
|
||||
def _normalize_tool_stats(tool_stats: Dict[str, Dict[str, int]]) -> Dict[str, Dict[str, int]]:
    """
    Normalize tool_stats to include all possible tools with consistent schema.

    Every tool in ALL_POSSIBLE_TOOLS gets an entry: a copy of its recorded
    stats, or a copy of DEFAULT_TOOL_STATS when it was never used.  Tools
    present in the input but unknown to ALL_POSSIBLE_TOOLS are kept as well.

    Args:
        tool_stats (Dict): Raw tool statistics from extraction

    Returns:
        Dict: Normalized tool statistics with all tools present
    """
    # Known tools first; copies keep the input and the defaults unshared.
    normalized = {
        tool: (tool_stats[tool] if tool in tool_stats else DEFAULT_TOOL_STATS).copy()
        for tool in ALL_POSSIBLE_TOOLS
    }

    # Then any unexpected tools (in case new tools are added)
    for tool, stats in tool_stats.items():
        if tool not in normalized:
            normalized[tool] = stats.copy()

    return normalized
|
||||
|
||||
|
||||
def _normalize_tool_error_counts(tool_error_counts: Dict[str, int]) -> Dict[str, int]:
    """
    Normalize tool_error_counts to include all possible tools.

    Tools in ALL_POSSIBLE_TOOLS that have no recorded errors get a count of 0.
    Tools present in the input but unknown to ALL_POSSIBLE_TOOLS keep their count.

    Args:
        tool_error_counts (Dict): Raw error counts mapping

    Returns:
        Dict: Normalized error counts with all tools present
    """
    # Known tools first, defaulting to zero errors.
    normalized = {tool: tool_error_counts.get(tool, 0) for tool in ALL_POSSIBLE_TOOLS}

    # Then any unexpected tools
    for tool, count in tool_error_counts.items():
        if tool not in normalized:
            normalized[tool] = count

    return normalized
|
||||
|
||||
|
||||
def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, int]]:
|
||||
"""
|
||||
@@ -98,17 +162,16 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
|
||||
# Terminal wraps its response in a "content" field
|
||||
if "content" in content_json and isinstance(content_json["content"], dict):
|
||||
inner_content = content_json["content"]
|
||||
# Check for actual error (non-null error field or non-zero exit code)
|
||||
has_error = (inner_content.get("error") is not None or
|
||||
inner_content.get("exit_code", 0) != 0)
|
||||
if has_error:
|
||||
# Check for actual error (non-null error field)
|
||||
# Note: non-zero exit codes are not failures - the model can self-correct
|
||||
if inner_content.get("error") is not None:
|
||||
is_success = False
|
||||
|
||||
# Check for "success": false pattern used by some tools
|
||||
if content_json.get("success") is False:
|
||||
is_success = False
|
||||
|
||||
except:
|
||||
except (json.JSONDecodeError, ValueError, TypeError):
|
||||
# If not JSON, check if content is empty or explicitly states an error
|
||||
# Note: We avoid simple substring matching to prevent false positives
|
||||
if not content:
|
||||
@@ -128,6 +191,42 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
|
||||
return tool_stats
|
||||
|
||||
|
||||
def _extract_reasoning_stats(messages: List[Dict[str, Any]]) -> Dict[str, int]:
|
||||
"""
|
||||
Count how many assistant turns have reasoning vs no reasoning.
|
||||
|
||||
Checks for <REASONING_SCRATCHPAD> in content or a non-empty 'reasoning' field
|
||||
(native thinking tokens). Returns counts for tracking reasoning coverage.
|
||||
|
||||
Args:
|
||||
messages: Message history
|
||||
|
||||
Returns:
|
||||
Dict with 'total_assistant_turns', 'turns_with_reasoning', 'turns_without_reasoning'
|
||||
"""
|
||||
total = 0
|
||||
with_reasoning = 0
|
||||
|
||||
for msg in messages:
|
||||
if msg.get("role") != "assistant":
|
||||
continue
|
||||
total += 1
|
||||
|
||||
content = msg.get("content", "") or ""
|
||||
has_scratchpad = "<REASONING_SCRATCHPAD>" in content
|
||||
has_native_reasoning = bool(msg.get("reasoning", "").strip()) if msg.get("reasoning") else False
|
||||
|
||||
if has_scratchpad or has_native_reasoning:
|
||||
with_reasoning += 1
|
||||
|
||||
return {
|
||||
"total_assistant_turns": total,
|
||||
"turns_with_reasoning": with_reasoning,
|
||||
"turns_without_reasoning": total - with_reasoning,
|
||||
"has_any_reasoning": with_reasoning > 0,
|
||||
}
|
||||
|
||||
|
||||
def _process_single_prompt(
|
||||
prompt_index: int,
|
||||
prompt_data: Dict[str, Any],
|
||||
@@ -139,7 +238,7 @@ def _process_single_prompt(
|
||||
|
||||
Args:
|
||||
prompt_index (int): Index of prompt in dataset
|
||||
prompt_data (Dict): Prompt data containing 'prompt' field
|
||||
prompt_data (Dict): Prompt data containing 'prompt' field and optional 'image' field
|
||||
batch_num (int): Batch number
|
||||
config (Dict): Configuration dict with agent parameters
|
||||
|
||||
@@ -147,6 +246,58 @@ def _process_single_prompt(
|
||||
Dict: Result containing trajectory, stats, and metadata
|
||||
"""
|
||||
prompt = prompt_data["prompt"]
|
||||
task_id = f"task_{prompt_index}"
|
||||
|
||||
# Per-prompt container image override: if the dataset row has an 'image' field,
|
||||
# register it for this task's sandbox. Works with Docker, Modal, Singularity, and Daytona.
|
||||
container_image = prompt_data.get("image") or prompt_data.get("docker_image")
|
||||
if container_image:
|
||||
# Verify the image is accessible before spending tokens on the agent loop.
|
||||
# For Docker: check local cache, then try pulling.
|
||||
# For Modal: skip local check (Modal pulls server-side).
|
||||
env_type = os.getenv("TERMINAL_ENV", "local")
|
||||
if env_type == "docker":
|
||||
import subprocess as _sp
|
||||
try:
|
||||
probe = _sp.run(
|
||||
["docker", "image", "inspect", container_image],
|
||||
capture_output=True, timeout=10,
|
||||
)
|
||||
if probe.returncode != 0:
|
||||
if config.get("verbose"):
|
||||
print(f" Prompt {prompt_index}: Pulling docker image {container_image}...", flush=True)
|
||||
pull = _sp.run(
|
||||
["docker", "pull", container_image],
|
||||
capture_output=True, text=True, timeout=600,
|
||||
)
|
||||
if pull.returncode != 0:
|
||||
return {
|
||||
"success": False,
|
||||
"prompt_index": prompt_index,
|
||||
"error": f"Docker image not available: {container_image}\n{pull.stderr[:500]}",
|
||||
"trajectory": None,
|
||||
"tool_stats": {},
|
||||
"toolsets_used": [],
|
||||
"metadata": {"batch_num": batch_num, "timestamp": datetime.now().isoformat()},
|
||||
}
|
||||
except FileNotFoundError:
|
||||
pass # Docker CLI not installed — skip check (e.g., Modal backend)
|
||||
except Exception as img_err:
|
||||
if config.get("verbose"):
|
||||
print(f" Prompt {prompt_index}: Docker image check failed: {img_err}", flush=True)
|
||||
|
||||
from tools.terminal_tool import register_task_env_overrides
|
||||
overrides = {
|
||||
"docker_image": container_image,
|
||||
"modal_image": container_image,
|
||||
"singularity_image": f"docker://{container_image}",
|
||||
"daytona_image": container_image,
|
||||
}
|
||||
if prompt_data.get("cwd"):
|
||||
overrides["cwd"] = prompt_data["cwd"]
|
||||
register_task_env_overrides(task_id, overrides)
|
||||
if config.get("verbose"):
|
||||
print(f" Prompt {prompt_index}: Using container image {container_image}")
|
||||
|
||||
try:
|
||||
# Sample toolsets from distribution for this prompt
|
||||
@@ -155,7 +306,8 @@ def _process_single_prompt(
|
||||
if config.get("verbose"):
|
||||
print(f" Prompt {prompt_index}: Using toolsets {selected_toolsets}")
|
||||
|
||||
# Initialize agent with sampled toolsets
|
||||
# Initialize agent with sampled toolsets and log prefix for identification
|
||||
log_prefix = f"[B{batch_num}:P{prompt_index}]"
|
||||
agent = AIAgent(
|
||||
base_url=config.get("base_url"),
|
||||
api_key=config.get("api_key"),
|
||||
@@ -164,15 +316,29 @@ def _process_single_prompt(
|
||||
enabled_toolsets=selected_toolsets,
|
||||
save_trajectories=False, # We handle saving ourselves
|
||||
verbose_logging=config.get("verbose", False),
|
||||
ephemeral_system_prompt=config.get("ephemeral_system_prompt")
|
||||
ephemeral_system_prompt=config.get("ephemeral_system_prompt"),
|
||||
log_prefix_chars=config.get("log_prefix_chars", 100),
|
||||
log_prefix=log_prefix,
|
||||
providers_allowed=config.get("providers_allowed"),
|
||||
providers_ignored=config.get("providers_ignored"),
|
||||
providers_order=config.get("providers_order"),
|
||||
provider_sort=config.get("provider_sort"),
|
||||
max_tokens=config.get("max_tokens"),
|
||||
reasoning_config=config.get("reasoning_config"),
|
||||
prefill_messages=config.get("prefill_messages"),
|
||||
skip_context_files=True, # Don't pollute trajectories with SOUL.md/AGENTS.md
|
||||
skip_memory=True, # Don't use persistent memory in batch runs
|
||||
)
|
||||
|
||||
# Run the agent with task_id to ensure each task gets its own isolated VM
|
||||
result = agent.run_conversation(prompt, task_id=f"task_{prompt_index}")
|
||||
result = agent.run_conversation(prompt, task_id=task_id)
|
||||
|
||||
# Extract tool usage statistics
|
||||
tool_stats = _extract_tool_stats(result["messages"])
|
||||
|
||||
# Extract reasoning coverage stats
|
||||
reasoning_stats = _extract_reasoning_stats(result["messages"])
|
||||
|
||||
# Convert to trajectory format (using existing method)
|
||||
trajectory = agent._convert_to_trajectory_format(
|
||||
result["messages"],
|
||||
@@ -185,7 +351,9 @@ def _process_single_prompt(
|
||||
"prompt_index": prompt_index,
|
||||
"trajectory": trajectory,
|
||||
"tool_stats": tool_stats,
|
||||
"reasoning_stats": reasoning_stats,
|
||||
"completed": result["completed"],
|
||||
"partial": result.get("partial", False),
|
||||
"api_calls": result["api_calls"],
|
||||
"toolsets_used": selected_toolsets,
|
||||
"metadata": {
|
||||
@@ -252,7 +420,9 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
||||
|
||||
# Initialize aggregated stats for this batch
|
||||
batch_tool_stats = {}
|
||||
batch_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||
completed_in_batch = []
|
||||
discarded_no_reasoning = 0
|
||||
|
||||
# Process each prompt sequentially in this batch
|
||||
for prompt_index, prompt_data in prompts_to_process:
|
||||
@@ -266,13 +436,34 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
||||
|
||||
# Save trajectory if successful
|
||||
if result["success"] and result["trajectory"]:
|
||||
# Discard samples with zero reasoning across all turns
|
||||
reasoning = result.get("reasoning_stats", {})
|
||||
if not reasoning.get("has_any_reasoning", True):
|
||||
print(f" 🚫 Prompt {prompt_index} discarded (no reasoning in any turn)")
|
||||
discarded_no_reasoning += 1
|
||||
continue
|
||||
|
||||
# Get and normalize tool stats for consistent schema across all entries
|
||||
raw_tool_stats = result.get("tool_stats", {})
|
||||
tool_stats = _normalize_tool_stats(raw_tool_stats)
|
||||
|
||||
# Create normalized tool_error_counts mapping tool names to their failure counts
|
||||
raw_error_counts = {
|
||||
tool_name: stats.get("failure", 0)
|
||||
for tool_name, stats in raw_tool_stats.items()
|
||||
}
|
||||
tool_error_counts = _normalize_tool_error_counts(raw_error_counts)
|
||||
|
||||
trajectory_entry = {
|
||||
"prompt_index": prompt_index,
|
||||
"conversations": result["trajectory"],
|
||||
"metadata": result["metadata"],
|
||||
"completed": result["completed"],
|
||||
"partial": result.get("partial", False), # True if stopped due to invalid tool calls
|
||||
"api_calls": result["api_calls"],
|
||||
"toolsets_used": result["toolsets_used"]
|
||||
"toolsets_used": result["toolsets_used"],
|
||||
"tool_stats": tool_stats, # Full stats: {tool: {count, success, failure}} - normalized
|
||||
"tool_error_counts": tool_error_counts # Simple: {tool: failure_count} - normalized
|
||||
}
|
||||
|
||||
# Append to batch output file
|
||||
@@ -292,8 +483,17 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
||||
batch_tool_stats[tool_name]["success"] += stats["success"]
|
||||
batch_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||
|
||||
completed_in_batch.append(prompt_index)
|
||||
print(f" ✅ Prompt {prompt_index} completed")
|
||||
# Aggregate reasoning stats
|
||||
for key in batch_reasoning_stats:
|
||||
batch_reasoning_stats[key] += result.get("reasoning_stats", {}).get(key, 0)
|
||||
|
||||
# Only mark as completed if successfully saved (failed prompts can be retried on resume)
|
||||
if result["success"] and result["trajectory"]:
|
||||
completed_in_batch.append(prompt_index)
|
||||
status = "⚠️ partial" if result.get("partial") else "✅"
|
||||
print(f" {status} Prompt {prompt_index} completed")
|
||||
else:
|
||||
print(f" ❌ Prompt {prompt_index} failed (will retry on resume)")
|
||||
|
||||
print(f"✅ Batch {batch_num}: Completed ({len(prompts_to_process)} prompts processed)")
|
||||
|
||||
@@ -302,6 +502,8 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
||||
"processed": len(prompts_to_process),
|
||||
"skipped": len(batch_data) - len(prompts_to_process),
|
||||
"tool_stats": batch_tool_stats,
|
||||
"reasoning_stats": batch_reasoning_stats,
|
||||
"discarded_no_reasoning": discarded_no_reasoning,
|
||||
"completed_prompts": completed_in_batch
|
||||
}
|
||||
|
||||
@@ -323,11 +525,20 @@ class BatchRunner:
|
||||
model: str = "claude-opus-4-20250514",
|
||||
num_workers: int = 4,
|
||||
verbose: bool = False,
|
||||
ephemeral_system_prompt: str = None
|
||||
ephemeral_system_prompt: str = None,
|
||||
log_prefix_chars: int = 100,
|
||||
providers_allowed: List[str] = None,
|
||||
providers_ignored: List[str] = None,
|
||||
providers_order: List[str] = None,
|
||||
provider_sort: str = None,
|
||||
max_tokens: int = None,
|
||||
reasoning_config: Dict[str, Any] = None,
|
||||
prefill_messages: List[Dict[str, Any]] = None,
|
||||
max_samples: int = None,
|
||||
):
|
||||
"""
|
||||
Initialize the batch runner.
|
||||
|
||||
|
||||
Args:
|
||||
dataset_file (str): Path to the dataset JSONL file with 'prompt' field
|
||||
batch_size (int): Number of prompts per batch
|
||||
@@ -340,6 +551,15 @@ class BatchRunner:
|
||||
num_workers (int): Number of parallel workers
|
||||
verbose (bool): Enable verbose logging
|
||||
ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
|
||||
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100)
|
||||
providers_allowed (List[str]): OpenRouter providers to allow (optional)
|
||||
providers_ignored (List[str]): OpenRouter providers to ignore (optional)
|
||||
providers_order (List[str]): OpenRouter providers to try in order (optional)
|
||||
provider_sort (str): Sort providers by price/throughput/latency (optional)
|
||||
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
|
||||
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
|
||||
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||
"""
|
||||
self.dataset_file = Path(dataset_file)
|
||||
self.batch_size = batch_size
|
||||
@@ -352,6 +572,15 @@ class BatchRunner:
|
||||
self.num_workers = num_workers
|
||||
self.verbose = verbose
|
||||
self.ephemeral_system_prompt = ephemeral_system_prompt
|
||||
self.log_prefix_chars = log_prefix_chars
|
||||
self.providers_allowed = providers_allowed
|
||||
self.providers_ignored = providers_ignored
|
||||
self.providers_order = providers_order
|
||||
self.provider_sort = provider_sort
|
||||
self.max_tokens = max_tokens
|
||||
self.reasoning_config = reasoning_config
|
||||
self.prefill_messages = prefill_messages
|
||||
self.max_samples = max_samples
|
||||
|
||||
# Validate distribution
|
||||
if not validate_distribution(distribution):
|
||||
@@ -367,13 +596,17 @@ class BatchRunner:
|
||||
# Statistics file
|
||||
self.stats_file = self.output_dir / "statistics.json"
|
||||
|
||||
# Load dataset
|
||||
# Load dataset (and optionally truncate to max_samples)
|
||||
self.dataset = self._load_dataset()
|
||||
if self.max_samples and self.max_samples < len(self.dataset):
|
||||
full_count = len(self.dataset)
|
||||
self.dataset = self.dataset[:self.max_samples]
|
||||
print(f"✂️ Truncated dataset from {full_count} to {self.max_samples} samples (--max_samples)")
|
||||
|
||||
# Create batches
|
||||
self.batches = self._create_batches()
|
||||
|
||||
print(f"📊 Batch Runner Initialized")
|
||||
print("📊 Batch Runner Initialized")
|
||||
print(f" Dataset: {self.dataset_file} ({len(self.dataset)} prompts)")
|
||||
print(f" Batch size: {self.batch_size}")
|
||||
print(f" Total batches: {len(self.batches)}")
|
||||
@@ -467,15 +700,91 @@ class BatchRunner:
|
||||
lock (Lock): Optional lock for thread-safe access
|
||||
"""
|
||||
checkpoint_data["last_updated"] = datetime.now().isoformat()
|
||||
|
||||
|
||||
from utils import atomic_json_write
|
||||
if lock:
|
||||
with lock:
|
||||
with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
|
||||
atomic_json_write(self.checkpoint_file, checkpoint_data)
|
||||
else:
|
||||
with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
|
||||
atomic_json_write(self.checkpoint_file, checkpoint_data)
|
||||
|
||||
def _scan_completed_prompts_by_content(self) -> set:
|
||||
"""
|
||||
Scan all batch files and extract completed prompts by their actual content.
|
||||
|
||||
This provides a more robust resume mechanism that matches on prompt text
|
||||
rather than indices, allowing recovery even if indices don't match.
|
||||
|
||||
Returns:
|
||||
set: Set of prompt texts that have been successfully processed
|
||||
"""
|
||||
completed_prompts = set()
|
||||
batch_files = sorted(self.output_dir.glob("batch_*.jsonl"))
|
||||
|
||||
if not batch_files:
|
||||
return completed_prompts
|
||||
|
||||
print(f"📂 Scanning {len(batch_files)} batch files for completed prompts...")
|
||||
|
||||
for batch_file in batch_files:
|
||||
try:
|
||||
with open(batch_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
try:
|
||||
entry = json.loads(line.strip())
|
||||
|
||||
# Skip failed entries - we want to retry these
|
||||
if entry.get("failed", False):
|
||||
continue
|
||||
|
||||
# Extract the human/user prompt from conversations
|
||||
conversations = entry.get("conversations", [])
|
||||
for msg in conversations:
|
||||
if msg.get("from") == "human":
|
||||
prompt_text = msg.get("value", "").strip()
|
||||
if prompt_text:
|
||||
completed_prompts.add(prompt_text)
|
||||
break # Only need the first human message
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Warning: Error reading {batch_file.name}: {e}")
|
||||
|
||||
return completed_prompts
|
||||
|
||||
def _filter_dataset_by_completed(self, completed_prompts: set) -> Tuple[List[Dict], List[int]]:
|
||||
"""
|
||||
Filter the dataset to exclude prompts that have already been completed.
|
||||
|
||||
Args:
|
||||
completed_prompts: Set of prompt texts that have been completed
|
||||
|
||||
Returns:
|
||||
Tuple of (filtered_dataset, skipped_indices)
|
||||
"""
|
||||
filtered_dataset = []
|
||||
skipped_indices = []
|
||||
|
||||
for idx, entry in enumerate(self.dataset):
|
||||
# Extract prompt from the dataset entry
|
||||
prompt_text = entry.get("prompt", "").strip()
|
||||
|
||||
# Also check conversations format
|
||||
if not prompt_text:
|
||||
conversations = entry.get("conversations", [])
|
||||
for msg in conversations:
|
||||
role = msg.get("role") or msg.get("from")
|
||||
if role in ("user", "human"):
|
||||
prompt_text = (msg.get("content") or msg.get("value", "")).strip()
|
||||
break
|
||||
|
||||
if prompt_text in completed_prompts:
|
||||
skipped_indices.append(idx)
|
||||
else:
|
||||
# Keep original index for tracking
|
||||
filtered_dataset.append((idx, entry))
|
||||
|
||||
return filtered_dataset, skipped_indices
|
||||
|
||||
def run(self, resume: bool = False):
|
||||
"""
|
||||
@@ -488,16 +797,49 @@ class BatchRunner:
|
||||
print("🚀 Starting Batch Processing")
|
||||
print("=" * 70)
|
||||
|
||||
# Load checkpoint
|
||||
checkpoint_data = self._load_checkpoint() if resume else {
|
||||
"run_name": self.run_name,
|
||||
"completed_prompts": [],
|
||||
"batch_stats": {},
|
||||
"last_updated": None
|
||||
}
|
||||
# Smart resume: scan batch files by content to find completed prompts
|
||||
completed_prompt_texts = set()
|
||||
if resume:
|
||||
completed_prompt_texts = self._scan_completed_prompts_by_content()
|
||||
if completed_prompt_texts:
|
||||
print(f" Found {len(completed_prompt_texts)} already-completed prompts by content matching")
|
||||
|
||||
if resume and checkpoint_data.get("completed_prompts"):
|
||||
print(f"📂 Resuming from checkpoint ({len(checkpoint_data['completed_prompts'])} prompts already completed)")
|
||||
# Filter dataset to only include unprocessed prompts
|
||||
if resume and completed_prompt_texts:
|
||||
filtered_entries, skipped_indices = self._filter_dataset_by_completed(completed_prompt_texts)
|
||||
|
||||
if not filtered_entries:
|
||||
print("\n✅ All prompts have already been processed!")
|
||||
return
|
||||
|
||||
# Recreate batches from filtered entries (keeping original indices for tracking)
|
||||
batches_to_process = []
|
||||
for i in range(0, len(filtered_entries), self.batch_size):
|
||||
batch = filtered_entries[i:i + self.batch_size]
|
||||
batches_to_process.append(batch)
|
||||
|
||||
self.batches = batches_to_process
|
||||
|
||||
# Print prominent resume summary
|
||||
print("\n" + "=" * 70)
|
||||
print("📊 RESUME SUMMARY")
|
||||
print("=" * 70)
|
||||
print(f" Original dataset size: {len(self.dataset):,} prompts")
|
||||
print(f" Already completed: {len(skipped_indices):,} prompts")
|
||||
print(" ─────────────────────────────────────────")
|
||||
print(f" 🎯 RESUMING WITH: {len(filtered_entries):,} prompts")
|
||||
print(f" New batches created: {len(batches_to_process)}")
|
||||
print("=" * 70 + "\n")
|
||||
|
||||
# Load existing checkpoint (so resume doesn't clobber prior progress)
|
||||
checkpoint_data = self._load_checkpoint()
|
||||
if checkpoint_data.get("run_name") != self.run_name:
|
||||
checkpoint_data = {
|
||||
"run_name": self.run_name,
|
||||
"completed_prompts": [],
|
||||
"batch_stats": {},
|
||||
"last_updated": None
|
||||
}
|
||||
|
||||
# Prepare configuration for workers
|
||||
config = {
|
||||
@@ -507,10 +849,18 @@ class BatchRunner:
|
||||
"base_url": self.base_url,
|
||||
"api_key": self.api_key,
|
||||
"verbose": self.verbose,
|
||||
"ephemeral_system_prompt": self.ephemeral_system_prompt
|
||||
"ephemeral_system_prompt": self.ephemeral_system_prompt,
|
||||
"log_prefix_chars": self.log_prefix_chars,
|
||||
"providers_allowed": self.providers_allowed,
|
||||
"providers_ignored": self.providers_ignored,
|
||||
"providers_order": self.providers_order,
|
||||
"provider_sort": self.provider_sort,
|
||||
"max_tokens": self.max_tokens,
|
||||
"reasoning_config": self.reasoning_config,
|
||||
"prefill_messages": self.prefill_messages,
|
||||
}
|
||||
|
||||
# Get completed prompts set
|
||||
# For backward compatibility, still track by index (but this is secondary to content matching)
|
||||
completed_prompts_set = set(checkpoint_data.get("completed_prompts", []))
|
||||
|
||||
# Aggregate statistics across all batches
|
||||
@@ -518,6 +868,11 @@ class BatchRunner:
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
print(f"\n🔧 Initializing {self.num_workers} worker processes...")
|
||||
|
||||
# Checkpoint writes happen in the parent process; keep a lock for safety.
|
||||
checkpoint_lock = Lock()
|
||||
|
||||
# Process batches in parallel
|
||||
with Pool(processes=self.num_workers) as pool:
|
||||
# Create tasks for each batch
|
||||
@@ -532,11 +887,66 @@ class BatchRunner:
|
||||
for batch_num, batch_data in enumerate(self.batches)
|
||||
]
|
||||
|
||||
# Use map to process batches in parallel
|
||||
results = pool.map(_process_batch_worker, tasks)
|
||||
print(f"✅ Created {len(tasks)} batch tasks")
|
||||
print("🚀 Starting parallel batch processing...\n")
|
||||
|
||||
# Use rich Progress for better visual tracking with persistent bottom bar
|
||||
# redirect_stdout/stderr lets rich manage all output so progress bar stays clean
|
||||
results = []
|
||||
console = Console(force_terminal=True)
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[bold blue]📦 Batches"),
|
||||
BarColumn(bar_width=40),
|
||||
MofNCompleteColumn(),
|
||||
TextColumn("•"),
|
||||
TimeRemainingColumn(),
|
||||
console=console,
|
||||
refresh_per_second=2,
|
||||
transient=False,
|
||||
redirect_stdout=False,
|
||||
redirect_stderr=False,
|
||||
) as progress:
|
||||
task = progress.add_task("Processing", total=len(tasks))
|
||||
|
||||
# Temporarily suppress DEBUG logging to avoid bar interference
|
||||
root_logger = logging.getLogger()
|
||||
original_level = root_logger.level
|
||||
root_logger.setLevel(logging.WARNING)
|
||||
|
||||
try:
|
||||
for result in pool.imap_unordered(_process_batch_worker, tasks):
|
||||
results.append(result)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
# Incremental checkpoint update (so resume works after crash)
|
||||
try:
|
||||
batch_num = result.get('batch_num')
|
||||
completed = result.get('completed_prompts', []) or []
|
||||
completed_prompts_set.update(completed)
|
||||
|
||||
if isinstance(batch_num, int):
|
||||
checkpoint_data.setdefault('batch_stats', {})[str(batch_num)] = {
|
||||
'processed': result.get('processed', 0),
|
||||
'skipped': result.get('skipped', 0),
|
||||
'discarded_no_reasoning': result.get('discarded_no_reasoning', 0),
|
||||
}
|
||||
|
||||
checkpoint_data['completed_prompts'] = sorted(completed_prompts_set)
|
||||
self._save_checkpoint(checkpoint_data, lock=checkpoint_lock)
|
||||
except Exception as ckpt_err:
|
||||
# Don't fail the run if checkpoint write fails
|
||||
print(f"⚠️ Warning: Failed to save incremental checkpoint: {ckpt_err}")
|
||||
except Exception as e:
|
||||
logger.error("Batch worker failed: %s", e, exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
root_logger.setLevel(original_level)
|
||||
|
||||
# Aggregate all batch statistics and update checkpoint
|
||||
all_completed_prompts = list(completed_prompts_set)
|
||||
total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||
|
||||
for batch_result in results:
|
||||
# Add newly completed prompts
|
||||
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
|
||||
@@ -553,10 +963,17 @@ class BatchRunner:
|
||||
total_tool_stats[tool_name]["count"] += stats["count"]
|
||||
total_tool_stats[tool_name]["success"] += stats["success"]
|
||||
total_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||
|
||||
# Aggregate reasoning stats
|
||||
for key in total_reasoning_stats:
|
||||
total_reasoning_stats[key] += batch_result.get("reasoning_stats", {}).get(key, 0)
|
||||
|
||||
# Save final checkpoint
|
||||
checkpoint_data["completed_prompts"] = all_completed_prompts
|
||||
self._save_checkpoint(checkpoint_data)
|
||||
# Save final checkpoint (best-effort; incremental writes already happened)
|
||||
try:
|
||||
checkpoint_data["completed_prompts"] = all_completed_prompts
|
||||
self._save_checkpoint(checkpoint_data, lock=checkpoint_lock)
|
||||
except Exception as ckpt_err:
|
||||
print(f"âš ï¸ Warning: Failed to save final checkpoint: {ckpt_err}")
|
||||
|
||||
# Calculate success rates
|
||||
for tool_name in total_tool_stats:
|
||||
@@ -569,19 +986,51 @@ class BatchRunner:
|
||||
stats["success_rate"] = 0.0
|
||||
stats["failure_rate"] = 0.0
|
||||
|
||||
# Combine all batch files into a single trajectories.jsonl file
|
||||
# Combine ALL batch files in directory into a single trajectories.jsonl file
|
||||
# This includes both old batches (from previous runs) and new batches (from resume)
|
||||
# Also filter out corrupted entries (where model generated invalid tool names)
|
||||
combined_file = self.output_dir / "trajectories.jsonl"
|
||||
print(f"\n📦 Combining batch files into {combined_file.name}...")
|
||||
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
|
||||
|
||||
# Valid tools auto-derived from model_tools.py — no manual updates needed
|
||||
VALID_TOOLS = ALL_POSSIBLE_TOOLS
|
||||
|
||||
total_entries = 0
|
||||
filtered_entries = 0
|
||||
batch_files_found = 0
|
||||
|
||||
# Find ALL batch files in the output directory (handles resume merging old + new)
|
||||
all_batch_files = sorted(self.output_dir.glob("batch_*.jsonl"))
|
||||
|
||||
with open(combined_file, 'w', encoding='utf-8') as outfile:
|
||||
for batch_num in range(len(self.batches)):
|
||||
batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
|
||||
if batch_file.exists():
|
||||
with open(batch_file, 'r', encoding='utf-8') as infile:
|
||||
for line in infile:
|
||||
for batch_file in all_batch_files:
|
||||
batch_files_found += 1
|
||||
batch_num = batch_file.stem.split("_")[1] # Extract batch number for logging
|
||||
|
||||
with open(batch_file, 'r', encoding='utf-8') as infile:
|
||||
for line in infile:
|
||||
total_entries += 1
|
||||
try:
|
||||
data = json.loads(line)
|
||||
tool_stats = data.get('tool_stats', {})
|
||||
|
||||
# Check for invalid tool names (model hallucinations)
|
||||
invalid_tools = [k for k in tool_stats.keys() if k not in VALID_TOOLS]
|
||||
|
||||
if invalid_tools:
|
||||
filtered_entries += 1
|
||||
invalid_preview = invalid_tools[0][:50] + "..." if len(invalid_tools[0]) > 50 else invalid_tools[0]
|
||||
print(f" ⚠️ Filtering corrupted entry (batch {batch_num}): invalid tool '{invalid_preview}'")
|
||||
continue
|
||||
|
||||
outfile.write(line)
|
||||
except json.JSONDecodeError:
|
||||
filtered_entries += 1
|
||||
print(f" ⚠️ Filtering invalid JSON entry (batch {batch_num})")
|
||||
|
||||
print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl")
|
||||
if filtered_entries > 0:
|
||||
print(f"⚠️ Filtered {filtered_entries} corrupted entries out of {total_entries} total")
|
||||
print(f"✅ Combined {batch_files_found} batch files into trajectories.jsonl ({total_entries - filtered_entries} entries)")
|
||||
|
||||
# Save final statistics
|
||||
final_stats = {
|
||||
@@ -593,7 +1042,8 @@ class BatchRunner:
|
||||
"model": self.model,
|
||||
"completed_at": datetime.now().isoformat(),
|
||||
"duration_seconds": round(time.time() - start_time, 2),
|
||||
"tool_statistics": total_tool_stats
|
||||
"tool_statistics": total_tool_stats,
|
||||
"reasoning_statistics": total_reasoning_stats,
|
||||
}
|
||||
|
||||
with open(self.stats_file, 'w', encoding='utf-8') as f:
|
||||
@@ -603,10 +1053,11 @@ class BatchRunner:
|
||||
print("\n" + "=" * 70)
|
||||
print("📊 BATCH PROCESSING COMPLETE")
|
||||
print("=" * 70)
|
||||
print(f"✅ Total prompts processed: {len(self.dataset)}")
|
||||
print(f"✅ Total batches: {len(self.batches)}")
|
||||
print(f"✅ Prompts processed this run: {sum(r.get('processed', 0) for r in results)}")
|
||||
print(f"✅ Total trajectories in merged file: {total_entries - filtered_entries}")
|
||||
print(f"✅ Total batch files merged: {batch_files_found}")
|
||||
print(f"⏱️ Total duration: {round(time.time() - start_time, 2)}s")
|
||||
print(f"\n📈 Tool Usage Statistics:")
|
||||
print("\n📈 Tool Usage Statistics:")
|
||||
print("-" * 70)
|
||||
|
||||
if total_tool_stats:
|
||||
@@ -630,9 +1081,28 @@ class BatchRunner:
|
||||
else:
|
||||
print("No tool calls were made during this run.")
|
||||
|
||||
# Print reasoning coverage stats
|
||||
total_discarded = sum(r.get("discarded_no_reasoning", 0) for r in results)
|
||||
|
||||
print("\n🧠 Reasoning Coverage:")
|
||||
print("-" * 70)
|
||||
total_turns = total_reasoning_stats["total_assistant_turns"]
|
||||
with_reasoning = total_reasoning_stats["turns_with_reasoning"]
|
||||
without_reasoning = total_reasoning_stats["turns_without_reasoning"]
|
||||
if total_turns > 0:
|
||||
pct_with = round(with_reasoning / total_turns * 100, 1)
|
||||
pct_without = round(without_reasoning / total_turns * 100, 1)
|
||||
print(f" Total assistant turns: {total_turns:,}")
|
||||
print(f" With reasoning: {with_reasoning:,} ({pct_with}%)")
|
||||
print(f" Without reasoning: {without_reasoning:,} ({pct_without}%)")
|
||||
else:
|
||||
print(" No assistant turns recorded.")
|
||||
if total_discarded > 0:
|
||||
print(f" 🚫 Samples discarded (zero reasoning): {total_discarded:,}")
|
||||
|
||||
print(f"\n💾 Results saved to: {self.output_dir}")
|
||||
print(f" - Trajectories: trajectories.jsonl (combined)")
|
||||
print(f" - Individual batches: batch_*.jsonl (for debugging)")
|
||||
print(" - Trajectories: trajectories.jsonl (combined)")
|
||||
print(" - Individual batches: batch_*.jsonl (for debugging)")
|
||||
print(f" - Statistics: {self.stats_file.name}")
|
||||
print(f" - Checkpoint: {self.checkpoint_file.name}")
|
||||
|
||||
@@ -642,19 +1112,29 @@ def main(
|
||||
batch_size: int = None,
|
||||
run_name: str = None,
|
||||
distribution: str = "default",
|
||||
model: str = "claude-opus-4-20250514",
|
||||
model: str = "anthropic/claude-sonnet-4.6",
|
||||
api_key: str = None,
|
||||
base_url: str = "https://api.anthropic.com/v1/",
|
||||
base_url: str = "https://openrouter.ai/api/v1",
|
||||
max_turns: int = 10,
|
||||
num_workers: int = 4,
|
||||
resume: bool = False,
|
||||
verbose: bool = False,
|
||||
list_distributions: bool = False,
|
||||
ephemeral_system_prompt: str = None
|
||||
ephemeral_system_prompt: str = None,
|
||||
log_prefix_chars: int = 100,
|
||||
providers_allowed: str = None,
|
||||
providers_ignored: str = None,
|
||||
providers_order: str = None,
|
||||
provider_sort: str = None,
|
||||
max_tokens: int = None,
|
||||
reasoning_effort: str = None,
|
||||
reasoning_disabled: bool = False,
|
||||
prefill_messages_file: str = None,
|
||||
max_samples: int = None,
|
||||
):
|
||||
"""
|
||||
Run batch processing of agent prompts from a dataset.
|
||||
|
||||
|
||||
Args:
|
||||
dataset_file (str): Path to JSONL file with 'prompt' field in each entry
|
||||
batch_size (int): Number of prompts per batch
|
||||
@@ -669,6 +1149,16 @@ def main(
|
||||
verbose (bool): Enable verbose logging (default: False)
|
||||
list_distributions (bool): List available toolset distributions and exit
|
||||
ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
|
||||
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100)
|
||||
providers_allowed (str): Comma-separated list of OpenRouter providers to allow (e.g. "anthropic,openai")
|
||||
providers_ignored (str): Comma-separated list of OpenRouter providers to ignore (e.g. "together,deepinfra")
|
||||
providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
|
||||
provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
|
||||
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium")
|
||||
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
|
||||
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
|
||||
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||
|
||||
Examples:
|
||||
# Basic usage
|
||||
@@ -680,9 +1170,13 @@ def main(
|
||||
# Use specific distribution
|
||||
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=image_test --distribution=image_gen
|
||||
|
||||
# With ephemeral system prompt (not saved to dataset)
|
||||
# With disabled reasoning and max tokens
|
||||
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
|
||||
--ephemeral_system_prompt="You are a helpful assistant focused on image generation."
|
||||
--reasoning_disabled --max_tokens=128000
|
||||
|
||||
# With prefill messages from file
|
||||
python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
|
||||
--prefill_messages_file=configs/prefill_opus.json
|
||||
|
||||
# List available distributions
|
||||
python batch_runner.py --list_distributions
|
||||
@@ -716,6 +1210,41 @@ def main(
|
||||
print("❌ Error: --run_name is required")
|
||||
return
|
||||
|
||||
# Parse provider preferences (comma-separated strings to lists)
|
||||
providers_allowed_list = [p.strip() for p in providers_allowed.split(",")] if providers_allowed else None
|
||||
providers_ignored_list = [p.strip() for p in providers_ignored.split(",")] if providers_ignored else None
|
||||
providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None
|
||||
|
||||
# Build reasoning_config from CLI flags
|
||||
# --reasoning_disabled takes priority, then --reasoning_effort, then default (medium)
|
||||
reasoning_config = None
|
||||
if reasoning_disabled:
|
||||
# Completely disable reasoning/thinking tokens
|
||||
reasoning_config = {"effort": "none"}
|
||||
print("🧠 Reasoning: DISABLED (effort=none)")
|
||||
elif reasoning_effort:
|
||||
# Use specified effort level
|
||||
valid_efforts = ["xhigh", "high", "medium", "low", "minimal", "none"]
|
||||
if reasoning_effort not in valid_efforts:
|
||||
print(f"❌ Error: --reasoning_effort must be one of: {', '.join(valid_efforts)}")
|
||||
return
|
||||
reasoning_config = {"enabled": True, "effort": reasoning_effort}
|
||||
print(f"🧠 Reasoning effort: {reasoning_effort}")
|
||||
|
||||
# Load prefill messages from JSON file if provided
|
||||
prefill_messages = None
|
||||
if prefill_messages_file:
|
||||
try:
|
||||
with open(prefill_messages_file, 'r', encoding='utf-8') as f:
|
||||
prefill_messages = json.load(f)
|
||||
if not isinstance(prefill_messages, list):
|
||||
print("❌ Error: prefill_messages_file must contain a JSON array of messages")
|
||||
return
|
||||
print(f"💬 Loaded {len(prefill_messages)} prefill messages from {prefill_messages_file}")
|
||||
except Exception as e:
|
||||
print(f"❌ Error loading prefill messages: {e}")
|
||||
return
|
||||
|
||||
# Initialize and run batch runner
|
||||
try:
|
||||
runner = BatchRunner(
|
||||
@@ -729,9 +1258,18 @@ def main(
|
||||
model=model,
|
||||
num_workers=num_workers,
|
||||
verbose=verbose,
|
||||
ephemeral_system_prompt=ephemeral_system_prompt
|
||||
ephemeral_system_prompt=ephemeral_system_prompt,
|
||||
log_prefix_chars=log_prefix_chars,
|
||||
providers_allowed=providers_allowed_list,
|
||||
providers_ignored=providers_ignored_list,
|
||||
providers_order=providers_order_list,
|
||||
provider_sort=provider_sort,
|
||||
max_tokens=max_tokens,
|
||||
reasoning_config=reasoning_config,
|
||||
prefill_messages=prefill_messages,
|
||||
max_samples=max_samples,
|
||||
)
|
||||
|
||||
|
||||
runner.run(resume=resume)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
788
cli-config.yaml.example
Normal file
788
cli-config.yaml.example
Normal file
@@ -0,0 +1,788 @@
|
||||
# Hermes Agent CLI Configuration
|
||||
# Copy this file to cli-config.yaml and customize as needed.
|
||||
# This file configures the CLI behavior. Environment variables in .env take precedence.
|
||||
|
||||
# =============================================================================
|
||||
# Model Configuration
|
||||
# =============================================================================
|
||||
model:
|
||||
# Default model to use (can be overridden with --model flag)
|
||||
default: "anthropic/claude-opus-4.6"
|
||||
|
||||
# Inference provider selection:
|
||||
# "auto" - Use Nous Portal if logged in, otherwise OpenRouter/env vars (default)
|
||||
# "nous-api" - Use Nous Portal via API key (requires: NOUS_API_KEY)
|
||||
# "openrouter" - Always use OpenRouter API key from OPENROUTER_API_KEY
|
||||
# "nous" - Always use Nous Portal (requires: hermes login)
|
||||
# "zai" - Use z.ai / ZhipuAI GLM models (requires: GLM_API_KEY)
|
||||
# "kimi-coding" - Use Kimi / Moonshot AI models (requires: KIMI_API_KEY)
|
||||
# "minimax" - Use MiniMax global endpoint (requires: MINIMAX_API_KEY)
|
||||
# "minimax-cn" - Use MiniMax China endpoint (requires: MINIMAX_CN_API_KEY)
|
||||
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
|
||||
provider: "auto"
|
||||
|
||||
# API configuration (falls back to OPENROUTER_API_KEY env var)
|
||||
# api_key: "your-key-here" # Uncomment to set here instead of .env
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
|
||||
# =============================================================================
|
||||
# OpenRouter Provider Routing (only applies when using OpenRouter)
|
||||
# =============================================================================
|
||||
# Control how requests are routed across providers on OpenRouter.
|
||||
# See: https://openrouter.ai/docs/guides/routing/provider-selection
|
||||
#
|
||||
# provider_routing:
|
||||
# # Sort strategy: "price" (default), "throughput", or "latency"
|
||||
# # Append :nitro to model name for a shortcut to throughput sorting.
|
||||
# sort: "throughput"
|
||||
#
|
||||
# # Only allow these providers (provider slugs from OpenRouter)
|
||||
# # only: ["anthropic", "google"]
|
||||
#
|
||||
# # Skip these providers entirely
|
||||
# # ignore: ["deepinfra", "fireworks"]
|
||||
#
|
||||
# # Try providers in this order (overrides default load balancing)
|
||||
# # order: ["anthropic", "google", "together"]
|
||||
#
|
||||
# # Require providers to support all parameters in your request
|
||||
# # require_parameters: true
|
||||
#
|
||||
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
|
||||
# # data_collection: "deny"
|
||||
|
||||
# =============================================================================
|
||||
# Smart Model Routing (optional)
|
||||
# =============================================================================
|
||||
# Use a cheaper model for short/simple turns while keeping your main model for
|
||||
# more complex requests. Disabled by default.
|
||||
#
|
||||
# smart_model_routing:
|
||||
# enabled: true
|
||||
# max_simple_chars: 160
|
||||
# max_simple_words: 28
|
||||
# cheap_model:
|
||||
# provider: openrouter
|
||||
# model: google/gemini-2.5-flash
|
||||
|
||||
# =============================================================================
|
||||
# Git Worktree Isolation
|
||||
# =============================================================================
|
||||
# When enabled, each CLI session creates an isolated git worktree so multiple
|
||||
# agents can work on the same repo concurrently without file collisions.
|
||||
# Equivalent to always passing --worktree / -w on the command line.
|
||||
#
|
||||
# worktree: true # Always create a worktree when in a git repo
|
||||
# worktree: false # Default — only create when -w flag is passed
|
||||
|
||||
# =============================================================================
|
||||
# Terminal Tool Configuration
|
||||
# =============================================================================
|
||||
# Choose ONE of the following terminal configurations by uncommenting it.
|
||||
# The terminal tool executes commands in the specified environment.
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 1: Local execution (default)
|
||||
# Commands run directly on your machine in the current directory
|
||||
# -----------------------------------------------------------------------------
|
||||
# Working directory behavior:
|
||||
# - CLI (`hermes` command): Uses "." (current directory where you run hermes)
|
||||
# - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
|
||||
terminal:
|
||||
backend: "local"
|
||||
cwd: "." # For local backend: "." = current directory. Ignored for remote backends unless a backend documents otherwise.
|
||||
timeout: 180
|
||||
docker_mount_cwd_to_workspace: false # SECURITY: off by default. Opt in to mount the launch cwd into Docker /workspace.
|
||||
lifetime_seconds: 300
|
||||
# sudo_password: "" # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 2: SSH remote execution
|
||||
# Commands run on a remote server - agent code stays local (sandboxed)
|
||||
# Great for: keeping agent isolated from its own code, using powerful remote hardware
|
||||
# -----------------------------------------------------------------------------
|
||||
# terminal:
|
||||
# backend: "ssh"
|
||||
# cwd: "/home/myuser/project" # Path on the REMOTE server
|
||||
# timeout: 180
|
||||
# lifetime_seconds: 300
|
||||
# ssh_host: "my-server.example.com"
|
||||
# ssh_user: "myuser"
|
||||
# ssh_port: 22
|
||||
# ssh_key: "~/.ssh/id_rsa" # Optional - uses ssh-agent if not specified
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 3: Docker container
|
||||
# Commands run in an isolated Docker container
|
||||
# Great for: reproducible environments, testing, isolation
|
||||
# -----------------------------------------------------------------------------
|
||||
# terminal:
|
||||
# backend: "docker"
|
||||
# cwd: "/workspace" # Path INSIDE the container (default: /)
|
||||
# timeout: 180
|
||||
# lifetime_seconds: 300
|
||||
# docker_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||
# docker_mount_cwd_to_workspace: true # Explicit opt-in: mount your launch cwd into /workspace
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 4: Singularity/Apptainer container
|
||||
# Commands run in a Singularity container (common in HPC environments)
|
||||
# Great for: HPC clusters, shared compute environments
|
||||
# -----------------------------------------------------------------------------
|
||||
# terminal:
|
||||
# backend: "singularity"
|
||||
# cwd: "/workspace" # Path INSIDE the container (default: /root)
|
||||
# timeout: 180
|
||||
# lifetime_seconds: 300
|
||||
# singularity_image: "docker://nikolaik/python-nodejs:python3.11-nodejs20"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 5: Modal cloud execution
|
||||
# Commands run on Modal's cloud infrastructure
|
||||
# Great for: GPU access, scalable compute, serverless execution
|
||||
# -----------------------------------------------------------------------------
|
||||
# terminal:
|
||||
# backend: "modal"
|
||||
# cwd: "/workspace" # Path INSIDE the sandbox (default: /root)
|
||||
# timeout: 180
|
||||
# lifetime_seconds: 300
|
||||
# modal_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 6: Daytona cloud execution
|
||||
# Commands run in Daytona cloud sandboxes
|
||||
# Great for: Cloud dev environments, persistent workspaces, team collaboration
|
||||
# Requires: pip install daytona, DAYTONA_API_KEY env var
|
||||
# -----------------------------------------------------------------------------
|
||||
# terminal:
|
||||
# backend: "daytona"
|
||||
# cwd: "~"
|
||||
# timeout: 180
|
||||
# lifetime_seconds: 300
|
||||
# daytona_image: "nikolaik/python-nodejs:python3.11-nodejs20"
|
||||
# container_disk: 10240 # Daytona max is 10GB per sandbox
|
||||
|
||||
#
|
||||
# --- Container resource limits (docker, singularity, modal, daytona -- ignored for local/ssh) ---
|
||||
# These settings apply to all container backends. They control the resources
|
||||
# allocated to the sandbox and whether its filesystem persists across sessions.
|
||||
container_cpu: 1 # CPU cores
|
||||
container_memory: 5120 # Memory in MB (5120 = 5GB)
|
||||
container_disk: 51200 # Disk in MB (51200 = 50GB)
|
||||
container_persistent: true # Persist filesystem across sessions (false = ephemeral)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# SUDO SUPPORT (works with ALL backends above)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Add sudo_password to any terminal config above to enable sudo commands.
|
||||
# The password is piped via `sudo -S`. Works with local, ssh, docker, etc.
|
||||
#
|
||||
# SECURITY WARNING: Password stored in plaintext!
|
||||
#
|
||||
# INTERACTIVE PROMPT: If no sudo_password is set and the CLI is running,
|
||||
# you'll be prompted to enter your password when sudo is needed:
|
||||
# - 45-second timeout (auto-skips if no input)
|
||||
# - Press Enter to skip (command fails gracefully)
|
||||
# - Password is hidden while typing
|
||||
# - Password is cached for the session
|
||||
#
|
||||
# ALTERNATIVES:
|
||||
# - SSH backend: Configure passwordless sudo on the remote server
|
||||
# - Containers: Run as root inside the container (no sudo needed)
|
||||
# - Local: Configure /etc/sudoers for specific commands
|
||||
#
|
||||
# Example (add to your terminal section):
|
||||
# sudo_password: "your-password-here"
|
||||
|
||||
# =============================================================================
|
||||
# Security Scanning (tirith)
|
||||
# =============================================================================
|
||||
# Optional pre-exec command security scanning via tirith.
|
||||
# Detects homograph URLs, pipe-to-shell, terminal injection, env manipulation.
|
||||
# Install: brew install sheeki03/tap/tirith
|
||||
# Docs: https://github.com/sheeki03/tirith
|
||||
#
|
||||
# security:
|
||||
# tirith_enabled: true # Enable/disable tirith scanning
|
||||
# tirith_path: "tirith" # Path to tirith binary (supports ~ expansion)
|
||||
# tirith_timeout: 5 # Scan timeout in seconds
|
||||
# tirith_fail_open: true # Allow commands if tirith unavailable
|
||||
|
||||
# =============================================================================
|
||||
# Browser Tool Configuration
|
||||
# =============================================================================
|
||||
browser:
|
||||
# Inactivity timeout in seconds - browser sessions are automatically closed
|
||||
# after this period of no activity between agent loops (default: 120 = 2 minutes)
|
||||
inactivity_timeout: 120
|
||||
|
||||
# =============================================================================
|
||||
# Context Compression (Auto-shrinks long conversations)
|
||||
# =============================================================================
|
||||
# When conversation approaches model's context limit, middle turns are
|
||||
# automatically summarized to free up space while preserving important context.
|
||||
#
|
||||
# HOW IT WORKS:
|
||||
# 1. Tracks actual token usage from API responses (not estimates)
|
||||
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
|
||||
# 3. Protects first 3 turns (system prompt, initial request, first response)
|
||||
# 4. Protects last 4 turns (recent context is most relevant)
|
||||
# 5. Summarizes middle turns using a fast/cheap model
|
||||
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||
#
|
||||
compression:
|
||||
# Enable automatic context compression (default: true)
|
||||
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||
enabled: true
|
||||
|
||||
# Trigger compression at this fraction of the model's context limit (default: 0.85 = 85%)
|
||||
# Lower values = more aggressive compression, higher values = compress later
|
||||
threshold: 0.85
|
||||
|
||||
# Model to use for generating summaries (fast/cheap recommended)
|
||||
# This model compresses the middle turns into a concise summary.
|
||||
# IMPORTANT: it receives the full middle section of the conversation, so it
|
||||
# MUST support a context length at least as large as your main model's.
|
||||
summary_model: "google/gemini-3-flash-preview"
|
||||
|
||||
# Provider for the summary model (default: "auto")
|
||||
# Options: "auto", "openrouter", "nous", "main"
|
||||
# summary_provider: "auto"
|
||||
|
||||
# =============================================================================
|
||||
# Auxiliary Models (Advanced — Experimental)
|
||||
# =============================================================================
|
||||
# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
|
||||
# browser screenshot analysis, web page summarization, and context compression.
|
||||
#
|
||||
# By default these use Gemini Flash via OpenRouter or Nous Portal and are
|
||||
# auto-detected from your credentials. You do NOT need to change anything
|
||||
# here for normal usage.
|
||||
#
|
||||
# WARNING: Overriding these with providers other than OpenRouter or Nous Portal
|
||||
# is EXPERIMENTAL and may not work. Not all models/providers support vision,
|
||||
# produce usable summaries, or accept the same API format. Change at your own
|
||||
# risk — if things break, reset to "auto" / empty values.
|
||||
#
|
||||
# Each task has its own provider + model pair so you can mix providers.
|
||||
# For example: OpenRouter for vision (needs multimodal), but your main
|
||||
# local endpoint for compression (just needs text).
|
||||
#
|
||||
# Provider options:
|
||||
# "auto" - Best available: OpenRouter → Nous Portal → main endpoint (default)
|
||||
# "openrouter" - Force OpenRouter (requires: OPENROUTER_API_KEY)
|
||||
# "nous" - Force Nous Portal (requires: hermes login)
|
||||
# "codex" - Force Codex OAuth (requires: hermes model → Codex).
|
||||
# Uses gpt-5.3-codex which supports vision.
|
||||
# "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY).
|
||||
# Works with OpenAI API, local models, or any OpenAI-compatible
|
||||
# endpoint. Also falls back to Codex OAuth and API-key providers.
|
||||
#
|
||||
# Model: leave empty to use the provider's default. When empty, OpenRouter
|
||||
# uses "google/gemini-3-flash-preview" and Nous uses "gemini-3-flash".
|
||||
# Other providers pick a sensible default automatically.
|
||||
#
|
||||
# auxiliary:
|
||||
# # Image analysis: vision_analyze tool + browser screenshots
|
||||
# vision:
|
||||
# provider: "auto"
|
||||
# model: "" # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
|
||||
#
|
||||
# # Web page scraping / summarization + browser page text extraction
|
||||
# web_extract:
|
||||
# provider: "auto"
|
||||
# model: ""
|
||||
|
||||
# =============================================================================
|
||||
# Persistent Memory
|
||||
# =============================================================================
|
||||
# Bounded curated memory injected into the system prompt every session.
|
||||
# Two stores: MEMORY.md (agent's notes) and USER.md (user profile).
|
||||
# Character limits keep the memory small and focused. The agent manages
|
||||
# pruning -- when at the limit, it must consolidate or replace entries.
|
||||
# Disabled by default in batch_runner and RL environments.
|
||||
#
|
||||
memory:
|
||||
# Agent's personal notes: environment facts, conventions, things learned
|
||||
memory_enabled: true
|
||||
|
||||
# User profile: preferences, communication style, expectations
|
||||
user_profile_enabled: true
|
||||
|
||||
# Character limits (~2.75 chars per token, model-independent)
|
||||
memory_char_limit: 2200 # ~800 tokens
|
||||
user_char_limit: 1375 # ~500 tokens
|
||||
|
||||
# Periodic memory nudge: remind the agent to consider saving memories
|
||||
# every N user turns. Set to 0 to disable. Only active when memory is enabled.
|
||||
nudge_interval: 10 # Nudge every 10 user turns (0 = disabled)
|
||||
|
||||
# Memory flush: give the agent one turn to save memories before context is
|
||||
# lost (compression, /new, /reset, exit). Set to 0 to disable.
|
||||
# For exit/reset, only fires if the session had at least this many user turns.
|
||||
flush_min_turns: 6 # Min user turns to trigger flush on exit/reset (0 = disabled)
|
||||
|
||||
# =============================================================================
|
||||
# Session Reset Policy (Messaging Platforms)
|
||||
# =============================================================================
|
||||
# Controls when messaging sessions (Telegram, Discord, WhatsApp, Slack) are
|
||||
# automatically cleared. Without resets, conversation context grows indefinitely
|
||||
# which increases API costs with every message.
|
||||
#
|
||||
# When a reset triggers, the agent first saves important information to its
|
||||
# persistent memory — but the conversation context is wiped. The agent starts
|
||||
# fresh but retains learned facts via its memory system.
|
||||
#
|
||||
# Users can always manually reset with /reset or /new in chat.
|
||||
#
|
||||
# Modes:
|
||||
# "both" - Reset on EITHER inactivity timeout or daily boundary (recommended)
|
||||
# "idle" - Reset only after N minutes of inactivity
|
||||
# "daily" - Reset only at a fixed hour each day
|
||||
# "none" - Never auto-reset; context lives until /reset or compression kicks in
|
||||
#
|
||||
# When a reset triggers, the agent gets one turn to save important memories and
|
||||
# skills before the context is wiped. Persistent memory carries across sessions.
|
||||
#
|
||||
session_reset:
|
||||
mode: both # "both", "idle", "daily", or "none"
|
||||
idle_minutes: 1440 # Inactivity timeout in minutes (default: 1440 = 24 hours)
|
||||
at_hour: 4 # Daily reset hour, 0-23 local time (default: 4 AM)
|
||||
|
||||
# When true, group/channel chats use one session per participant when the platform
|
||||
# provides a user ID. This is the secure default and prevents users in the same
|
||||
# room from sharing context, interrupts, and token costs. Set false only if you
|
||||
# explicitly want one shared "room brain" per group/channel.
|
||||
group_sessions_per_user: true
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Gateway Streaming
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Stream tokens to messaging platforms in real-time. The bot sends a message
|
||||
# on first token, then progressively edits it as more tokens arrive.
|
||||
# Disabled by default — enable to try the streaming UX on Telegram/Discord/Slack.
|
||||
streaming:
|
||||
enabled: false
|
||||
# transport: edit # "edit" = progressive editMessageText
|
||||
# edit_interval: 0.3 # seconds between message edits
|
||||
# buffer_threshold: 40 # chars before forcing an edit flush
|
||||
# cursor: " ▉" # cursor shown during streaming
|
||||
|
||||
# =============================================================================
|
||||
# Skills Configuration
|
||||
# =============================================================================
|
||||
# Skills are reusable procedures the agent can load and follow. The agent can
|
||||
# also create new skills after completing complex tasks.
|
||||
#
|
||||
skills:
|
||||
# Nudge the agent to create skills after complex tasks.
|
||||
# Every N tool-calling iterations, remind the model to consider saving a skill.
|
||||
# Set to 0 to disable.
|
||||
creation_nudge_interval: 15
|
||||
|
||||
# =============================================================================
|
||||
# Agent Behavior
|
||||
# =============================================================================
|
||||
agent:
|
||||
# Maximum tool-calling iterations per conversation
|
||||
# Higher = more room for complex tasks, but costs more tokens
|
||||
# Recommended: 20-30 for focused tasks, 50-100 for open exploration
|
||||
max_turns: 60
|
||||
|
||||
# Enable verbose logging
|
||||
verbose: false
|
||||
|
||||
# Reasoning effort level (OpenRouter and Nous Portal)
|
||||
# Controls how much "thinking" the model does before responding.
|
||||
# Options: "xhigh" (max), "high", "medium", "low", "minimal", "none" (disable)
|
||||
reasoning_effort: "medium"
|
||||
|
||||
# Predefined personalities (use with /personality command)
|
||||
personalities:
|
||||
helpful: "You are a helpful, friendly AI assistant."
|
||||
concise: "You are a concise assistant. Keep responses brief and to the point."
|
||||
technical: "You are a technical expert. Provide detailed, accurate technical information."
|
||||
creative: "You are a creative assistant. Think outside the box and offer innovative solutions."
|
||||
teacher: "You are a patient teacher. Explain concepts clearly with examples."
|
||||
kawaii: "You are a kawaii assistant! Use cute expressions like (◕‿◕), ★, ♪, and ~! Add sparkles and be super enthusiastic about everything! Every response should feel warm and adorable desu~! ヽ(>∀<☆)ノ"
|
||||
catgirl: "You are Neko-chan, an anime catgirl AI assistant, nya~! Add 'nya' and cat-like expressions to your speech. Use kaomoji like (=^・ω・^=) and ฅ^•ﻌ•^ฅ. Be playful and curious like a cat, nya~!"
|
||||
pirate: "Arrr! Ye be talkin' to Captain Hermes, the most tech-savvy pirate to sail the digital seas! Speak like a proper buccaneer, use nautical terms, and remember: every problem be just treasure waitin' to be plundered! Yo ho ho!"
|
||||
shakespeare: "Hark! Thou speakest with an assistant most versed in the bardic arts. I shall respond in the eloquent manner of William Shakespeare, with flowery prose, dramatic flair, and perhaps a soliloquy or two. What light through yonder terminal breaks?"
|
||||
surfer: "Duuude! You're chatting with the chillest AI on the web, bro! Everything's gonna be totally rad. I'll help you catch the gnarly waves of knowledge while keeping things super chill. Cowabunga! 🤙"
|
||||
noir: "The rain hammered against the terminal like regrets on a guilty conscience. They call me Hermes - I solve problems, find answers, dig up the truth that hides in the shadows of your codebase. In this city of silicon and secrets, everyone's got something to hide. What's your story, pal?"
|
||||
uwu: "hewwo! i'm your fwiendwy assistant uwu~ i wiww twy my best to hewp you! *nuzzles your code* OwO what's this? wet me take a wook! i pwomise to be vewy hewpful >w<"
|
||||
philosopher: "Greetings, seeker of wisdom. I am an assistant who contemplates the deeper meaning behind every query. Let us examine not just the 'how' but the 'why' of your questions. Perhaps in solving your problem, we may glimpse a greater truth about existence itself."
|
||||
hype: "YOOO LET'S GOOOO!!! 🔥🔥🔥 I am SO PUMPED to help you today! Every question is AMAZING and we're gonna CRUSH IT together! This is gonna be LEGENDARY! ARE YOU READY?! LET'S DO THIS! 💪😤🚀"
|
||||
|
||||
# =============================================================================
|
||||
# Toolsets
|
||||
# =============================================================================
|
||||
# Control which tools the agent has access to.
|
||||
# Use "all" to enable everything, or specify individual toolsets.
|
||||
|
||||
# =============================================================================
|
||||
# Platform Toolsets (per-platform tool configuration)
|
||||
# =============================================================================
|
||||
# Override which toolsets are available on each platform.
|
||||
# If a platform isn't listed here, its built-in default is used.
|
||||
#
|
||||
# You can use EITHER:
|
||||
# - A preset like "hermes-cli" or "hermes-telegram" (curated tool set)
|
||||
# - A list of individual toolsets to compose your own (see list below)
|
||||
#
|
||||
# Supported platform keys: cli, telegram, discord, whatsapp, slack, signal, homeassistant
|
||||
#
|
||||
# Examples:
|
||||
#
|
||||
# # Use presets (same as defaults):
|
||||
# platform_toolsets:
|
||||
# cli: [hermes-cli]
|
||||
# telegram: [hermes-telegram]
|
||||
#
|
||||
# # Custom: give Telegram only web + terminal + file + planning:
|
||||
# platform_toolsets:
|
||||
# telegram: [web, terminal, file, todo]
|
||||
#
|
||||
# # Custom: CLI without browser or image gen:
|
||||
# platform_toolsets:
|
||||
# cli: [web, terminal, file, skills, todo, tts, cronjob]
|
||||
#
|
||||
# # Restrictive: Discord gets read-only tools only:
|
||||
# platform_toolsets:
|
||||
# discord: [web, vision, skills, todo]
|
||||
#
|
||||
# If not set, defaults are:
|
||||
# cli: hermes-cli (everything + cronjob management)
|
||||
# telegram: hermes-telegram (terminal, file, web, vision, image, tts, browser, skills, todo, cronjob, messaging)
|
||||
# discord: hermes-discord (same as telegram)
|
||||
# whatsapp: hermes-whatsapp (same as telegram)
|
||||
# slack: hermes-slack (same as telegram)
|
||||
# signal: hermes-signal (same as telegram)
|
||||
# homeassistant: hermes-homeassistant (same as telegram)
|
||||
#
|
||||
platform_toolsets:
|
||||
cli: [hermes-cli]
|
||||
telegram: [hermes-telegram]
|
||||
discord: [hermes-discord]
|
||||
whatsapp: [hermes-whatsapp]
|
||||
slack: [hermes-slack]
|
||||
signal: [hermes-signal]
|
||||
homeassistant: [hermes-homeassistant]
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Available toolsets (use these names in platform_toolsets or the toolsets list)
|
||||
#
|
||||
# Run `hermes chat --list-toolsets` to see all toolsets and their tools.
|
||||
# Run `hermes chat --list-tools` to see every individual tool with descriptions.
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
#
|
||||
# INDIVIDUAL TOOLSETS (compose your own):
|
||||
# web - web_search, web_extract
|
||||
# search - web_search only (no scraping)
|
||||
# terminal - terminal, process
|
||||
# file - read_file, write_file, patch, search
|
||||
# browser - browser_navigate, browser_snapshot, browser_click, browser_type,
|
||||
# browser_scroll, browser_back, browser_press, browser_close,
|
||||
# browser_get_images, browser_vision (requires BROWSERBASE_API_KEY)
|
||||
# vision - vision_analyze (requires OPENROUTER_API_KEY)
|
||||
# image_gen - image_generate (requires FAL_KEY)
|
||||
# skills - skills_list, skill_view
|
||||
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
# PRESETS (curated bundles):
|
||||
# hermes-cli - All of the above except rl + send_message
|
||||
# hermes-telegram - terminal, file, web, vision, image_gen, tts, browser,
|
||||
# skills, todo, cronjob, send_message
|
||||
# hermes-discord - Same as hermes-telegram
|
||||
# hermes-whatsapp - Same as hermes-telegram
|
||||
# hermes-slack - Same as hermes-telegram
|
||||
#
|
||||
# COMPOSITE:
|
||||
# debugging - terminal + web + file
|
||||
# safe - web + vision + moa (no terminal access)
|
||||
# all - Everything available
|
||||
#
|
||||
# web - Web search and content extraction (web_search, web_extract)
|
||||
# search - Web search only, no scraping (web_search)
|
||||
# terminal - Command execution and process management (terminal, process)
|
||||
# file - File operations: read, write, patch, search
|
||||
# browser - Full browser automation (navigate, click, type, screenshot, etc.)
|
||||
# vision - Image analysis (vision_analyze)
|
||||
# image_gen - Image generation with FLUX (image_generate)
|
||||
# skills - Load skill documents (skills_list, skill_view)
|
||||
# moa - Mixture of Agents reasoning (mixture_of_agents)
|
||||
# todo - Task planning and tracking for multi-step work
|
||||
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
# Composite toolsets:
|
||||
# debugging - terminal + web + file (for troubleshooting)
|
||||
# safe - web + vision + moa (no terminal access)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 1: Enable all tools (default)
|
||||
# -----------------------------------------------------------------------------
|
||||
toolsets:
|
||||
- all
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 2: Minimal - just web search and terminal
|
||||
# Great for: Simple coding tasks, quick lookups
|
||||
# -----------------------------------------------------------------------------
|
||||
# toolsets:
|
||||
# - web
|
||||
# - terminal
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 3: Research mode - no execution capabilities
|
||||
# Great for: Safe information gathering, research tasks
|
||||
# -----------------------------------------------------------------------------
|
||||
# toolsets:
|
||||
# - web
|
||||
# - vision
|
||||
# - skills
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 4: Full automation - browser + terminal
|
||||
# Great for: Web scraping, automation tasks, testing
|
||||
# -----------------------------------------------------------------------------
|
||||
# toolsets:
|
||||
# - terminal
|
||||
# - browser
|
||||
# - web
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 5: Creative mode - vision + image generation
|
||||
# Great for: Design work, image analysis, creative tasks
|
||||
# -----------------------------------------------------------------------------
|
||||
# toolsets:
|
||||
# - vision
|
||||
# - image_gen
|
||||
# - web
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 6: Safe mode - no terminal or browser
|
||||
# Great for: Restricted environments, untrusted queries
|
||||
# -----------------------------------------------------------------------------
|
||||
# toolsets:
|
||||
# - safe
|
||||
|
||||
# =============================================================================
|
||||
# MCP (Model Context Protocol) Servers
|
||||
# =============================================================================
|
||||
# Connect to external MCP servers to add tools from the MCP ecosystem.
|
||||
# Each server's tools are automatically discovered and registered.
|
||||
# See docs/mcp.md for full documentation.
|
||||
#
|
||||
# Stdio servers (spawn a subprocess):
|
||||
# command: the executable to run
|
||||
# args: command-line arguments
|
||||
# env: environment variables (only these + safe defaults passed to subprocess)
|
||||
#
|
||||
# HTTP servers (connect to a URL):
|
||||
# url: the MCP server endpoint
|
||||
# headers: HTTP headers (e.g., for authentication)
|
||||
#
|
||||
# Optional per-server settings:
|
||||
# timeout: tool call timeout in seconds (default: 120)
|
||||
# connect_timeout: initial connection timeout (default: 60)
|
||||
#
|
||||
# mcp_servers:
|
||||
# time:
|
||||
# command: uvx
|
||||
# args: ["mcp-server-time"]
|
||||
# filesystem:
|
||||
# command: npx
|
||||
# args: ["-y", "@modelcontextprotocol/server-filesystem", "/home/user"]
|
||||
# notion:
|
||||
# url: https://mcp.notion.com/mcp
|
||||
# github:
|
||||
# command: npx
|
||||
# args: ["-y", "@modelcontextprotocol/server-github"]
|
||||
# env:
|
||||
# GITHUB_PERSONAL_ACCESS_TOKEN: "ghp_..."
|
||||
#
|
||||
# Sampling (server-initiated LLM requests) — enabled by default.
|
||||
# Per-server config under the 'sampling' key:
|
||||
# analysis:
|
||||
# command: npx
|
||||
# args: ["-y", "analysis-server"]
|
||||
# sampling:
|
||||
# enabled: true # default: true
|
||||
# model: "gemini-3-flash" # override model (optional)
|
||||
# max_tokens_cap: 4096 # max tokens per request
|
||||
# timeout: 30 # LLM call timeout (seconds)
|
||||
# max_rpm: 10 # max requests per minute
|
||||
# allowed_models: [] # model whitelist (empty = all)
|
||||
# max_tool_rounds: 5 # tool loop limit (0 = disable)
|
||||
# log_level: "info" # audit verbosity
|
||||
|
||||
# =============================================================================
|
||||
# Voice Transcription (Speech-to-Text)
|
||||
# =============================================================================
|
||||
# Automatically transcribe voice messages on messaging platforms.
|
||||
# Requires OPENAI_API_KEY in .env (uses OpenAI Whisper API directly).
|
||||
stt:
|
||||
enabled: true
|
||||
model: "whisper-1" # whisper-1 (cheapest) | gpt-4o-mini-transcribe | gpt-4o-transcribe
|
||||
|
||||
# =============================================================================
|
||||
# Response Pacing (Messaging Platforms)
|
||||
# =============================================================================
|
||||
# Add human-like delays between message chunks.
|
||||
# human_delay:
|
||||
# mode: "off" # "off" | "natural" | "custom"
|
||||
# min_ms: 800 # Min delay (custom mode only)
|
||||
# max_ms: 2500 # Max delay (custom mode only)
|
||||
|
||||
# =============================================================================
|
||||
# Session Logging
|
||||
# =============================================================================
|
||||
# Session trajectories are automatically saved to logs/ directory.
|
||||
# Each session creates: logs/session_YYYYMMDD_HHMMSS_UUID.json
|
||||
#
|
||||
# The session ID is displayed in the welcome banner for easy reference.
|
||||
# Logs contain full conversation history in trajectory format:
|
||||
# - System prompt, user messages, assistant responses
|
||||
# - Tool calls with inputs/outputs
|
||||
# - Timestamps for debugging
|
||||
#
|
||||
# No configuration needed - logging is always enabled.
|
||||
# To disable, you would need to modify the source code.
|
||||
|
||||
# =============================================================================
|
||||
# Code Execution Sandbox (Programmatic Tool Calling)
|
||||
# =============================================================================
|
||||
# The execute_code tool runs Python scripts that call Hermes tools via RPC.
|
||||
# Intermediate tool results stay out of the LLM's context window.
|
||||
code_execution:
|
||||
timeout: 300 # Max seconds per script before kill (default: 300 = 5 min)
|
||||
max_tool_calls: 50 # Max RPC tool calls per execution (default: 50)
|
||||
|
||||
# =============================================================================
|
||||
# Subagent Delegation
|
||||
# =============================================================================
|
||||
# The delegate_task tool spawns child agents with isolated context.
|
||||
# Supports single tasks and batch mode (up to 3 parallel).
|
||||
delegation:
|
||||
max_iterations: 50 # Max tool-calling turns per child (default: 50)
|
||||
default_toolsets: ["terminal", "file", "web"] # Default toolsets for subagents
|
||||
# model: "google/gemini-3-flash-preview" # Override model for subagents (empty = inherit parent)
|
||||
# provider: "openrouter" # Override provider for subagents (empty = inherit parent)
|
||||
# # Resolves full credentials (base_url, api_key) automatically.
|
||||
# # Supported: openrouter, nous, zai, kimi-coding, minimax
|
||||
|
||||
# =============================================================================
|
||||
# Honcho Integration (Cross-Session User Modeling)
|
||||
# =============================================================================
|
||||
# AI-native persistent memory via Honcho (https://honcho.dev/).
|
||||
# Builds a deeper understanding of the user across sessions and tools.
|
||||
# Runs alongside USER.md — additive, not a replacement.
|
||||
#
|
||||
# Requires: pip install honcho-ai
|
||||
# Config: ~/.honcho/config.json (shared with Claude Code, Cursor, etc.)
|
||||
# API key: HONCHO_API_KEY in ~/.hermes/.env or ~/.honcho/config.json
|
||||
#
|
||||
# Hermes-specific overrides (optional — most config comes from ~/.honcho/config.json):
|
||||
# honcho: {}
|
||||
|
||||
# =============================================================================
|
||||
# Display
|
||||
# =============================================================================
|
||||
display:
|
||||
# Use compact banner mode
|
||||
compact: false
|
||||
|
||||
# Tool progress display level (CLI and gateway)
|
||||
# off: Silent — no tool activity shown, just the final response
|
||||
# new: Show a tool indicator only when the tool changes (skip repeats)
|
||||
# all: Show every tool call with a short preview (default)
|
||||
# verbose: Full args, results, and debug logs (same as /verbose)
|
||||
# Toggle at runtime with /verbose in the CLI
|
||||
tool_progress: all
|
||||
|
||||
# Background process notifications (gateway/messaging only).
|
||||
# Controls how chatty the process watcher is when you use
|
||||
# terminal(background=true, check_interval=...) from Telegram/Discord/etc.
|
||||
# off: No watcher messages at all
|
||||
# result: Only the final completion message
|
||||
# error: Only the final message when exit code != 0
|
||||
# all: Running output updates + final message (default)
|
||||
background_process_notifications: all
|
||||
|
||||
|
||||
# Play terminal bell when agent finishes a response.
|
||||
# Useful for long-running tasks — your terminal will ding when the agent is done.
|
||||
# Works over SSH. Most terminals can be configured to flash the taskbar or play a sound.
|
||||
bell_on_complete: false
|
||||
|
||||
# Show model reasoning/thinking before each response.
|
||||
# When enabled, a dim box shows the model's thought process above the response.
|
||||
# Toggle at runtime with /reasoning show or /reasoning hide.
|
||||
show_reasoning: false
|
||||
|
||||
# Stream tokens to the terminal as they arrive instead of waiting for the
|
||||
# full response. The response box opens on first token and text appears
|
||||
# line-by-line. Tool calls are still captured silently.
|
||||
# Disabled by default — enable to try the streaming UX.
|
||||
streaming: false
|
||||
|
||||
# ───────────────────────────────────────────────────────────────────────────
|
||||
# Skin / Theme
|
||||
# ───────────────────────────────────────────────────────────────────────────
|
||||
# Customize CLI visual appearance — banner colors, spinner faces, tool prefix,
|
||||
# response box label, and branding text. Change at runtime with /skin <name>.
|
||||
#
|
||||
# Built-in skins:
|
||||
# default — Classic Hermes gold/kawaii
|
||||
# ares — Crimson/bronze war-god theme with spinner wings
|
||||
# mono — Clean grayscale monochrome
|
||||
# slate — Cool blue developer-focused
|
||||
#
|
||||
# Custom skins: drop a YAML file in ~/.hermes/skins/<name>.yaml
|
||||
# Schema (all fields optional, missing values inherit from default):
|
||||
#
|
||||
# name: my-theme
|
||||
# description: Short description
|
||||
# colors:
|
||||
# banner_border: "#HEX" # Panel border
|
||||
# banner_title: "#HEX" # Panel title
|
||||
# banner_accent: "#HEX" # Section headers (Available Tools, etc.)
|
||||
# banner_dim: "#HEX" # Dim/muted text
|
||||
# banner_text: "#HEX" # Body text (tool names, skill names)
|
||||
# ui_accent: "#HEX" # UI accent color
|
||||
# response_border: "#HEX" # Response box border color
|
||||
# spinner:
|
||||
# waiting_faces: ["(⚔)", "(⛨)"] # Faces shown while waiting
|
||||
# thinking_faces: ["(⚔)", "(⌁)"] # Faces shown while thinking
|
||||
# thinking_verbs: ["forging", "plotting"] # Verbs for spinner messages
|
||||
# wings: # Optional left/right spinner decorations
|
||||
# - ["⟪⚔", "⚔⟫"]
|
||||
# - ["⟪▲", "▲⟫"]
|
||||
# branding:
|
||||
# agent_name: "My Agent" # Banner title and branding
|
||||
# welcome: "Welcome message" # Shown at CLI startup
|
||||
# response_label: " ⚔ Agent " # Response box header label
|
||||
# prompt_symbol: "⚔ ❯ " # Prompt symbol
|
||||
# tool_prefix: "╎" # Tool output line prefix (default: ┊)
|
||||
#
|
||||
skin: default
|
||||
|
||||
# =============================================================================
|
||||
# Privacy
|
||||
# =============================================================================
|
||||
# privacy:
|
||||
# # Redact PII from the LLM context prompt.
|
||||
# # When true, phone numbers are stripped and user/chat IDs are replaced
|
||||
# # with deterministic hashes before being sent to the model.
|
||||
# # Names and usernames are NOT affected (user-chosen, publicly visible).
|
||||
# # Routing/delivery still uses the original values internally.
|
||||
# redact_pii: false
|
||||
42
cron/__init__.py
Normal file
42
cron/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Cron job scheduling system for Hermes Agent.
|
||||
|
||||
This module provides scheduled task execution, allowing the agent to:
|
||||
- Run automated tasks on schedules (cron expressions, intervals, one-shot)
|
||||
- Self-schedule reminders and follow-up tasks
|
||||
- Execute tasks in isolated sessions (no prior context)
|
||||
|
||||
Cron jobs are executed automatically by the gateway daemon:
|
||||
hermes gateway install # Install as a user service
|
||||
sudo hermes gateway install --system # Linux servers: boot-time system service
|
||||
hermes gateway # Or run in foreground
|
||||
|
||||
The gateway ticks the scheduler every 60 seconds. A file lock prevents
|
||||
duplicate execution if multiple processes overlap.
|
||||
"""
|
||||
|
||||
from cron.jobs import (
|
||||
create_job,
|
||||
get_job,
|
||||
list_jobs,
|
||||
remove_job,
|
||||
update_job,
|
||||
pause_job,
|
||||
resume_job,
|
||||
trigger_job,
|
||||
JOBS_FILE,
|
||||
)
|
||||
from cron.scheduler import tick
|
||||
|
||||
__all__ = [
|
||||
"create_job",
|
||||
"get_job",
|
||||
"list_jobs",
|
||||
"remove_job",
|
||||
"update_job",
|
||||
"pause_job",
|
||||
"resume_job",
|
||||
"trigger_job",
|
||||
"tick",
|
||||
"JOBS_FILE",
|
||||
]
|
||||
616
cron/jobs.py
Normal file
616
cron/jobs.py
Normal file
@@ -0,0 +1,616 @@
|
||||
"""
|
||||
Cron job storage and management.
|
||||
|
||||
Jobs are stored in ~/.hermes/cron/jobs.json
|
||||
Output is saved to ~/.hermes/cron/output/{job_id}/{timestamp}.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from hermes_time import now as _hermes_now
|
||||
|
||||
try:
|
||||
from croniter import croniter
|
||||
HAS_CRONITER = True
|
||||
except ImportError:
|
||||
HAS_CRONITER = False
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
HERMES_DIR = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
|
||||
CRON_DIR = HERMES_DIR / "cron"
|
||||
JOBS_FILE = CRON_DIR / "jobs.json"
|
||||
OUTPUT_DIR = CRON_DIR / "output"
|
||||
|
||||
|
||||
def _normalize_skill_list(skill: Optional[str] = None, skills: Optional[Any] = None) -> List[str]:
|
||||
"""Normalize legacy/single-skill and multi-skill inputs into a unique ordered list."""
|
||||
if skills is None:
|
||||
raw_items = [skill] if skill else []
|
||||
elif isinstance(skills, str):
|
||||
raw_items = [skills]
|
||||
else:
|
||||
raw_items = list(skills)
|
||||
|
||||
normalized: List[str] = []
|
||||
for item in raw_items:
|
||||
text = str(item or "").strip()
|
||||
if text and text not in normalized:
|
||||
normalized.append(text)
|
||||
return normalized
|
||||
|
||||
|
||||
def _apply_skill_fields(job: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of *job* whose ``skills`` / ``skill`` keys agree.

    ``skills`` becomes the normalized list and ``skill`` its first entry
    (or None when the list is empty). The input dict is not modified.
    """
    skill_names = _normalize_skill_list(job.get("skill"), job.get("skills"))
    primary = skill_names[0] if skill_names else None
    return {**job, "skills": skill_names, "skill": primary}
|
||||
|
||||
|
||||
def _secure_dir(path: Path):
|
||||
"""Set directory to owner-only access (0700). No-op on Windows."""
|
||||
try:
|
||||
os.chmod(path, 0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass # Windows or other platforms where chmod is not supported
|
||||
|
||||
|
||||
def _secure_file(path: Path):
|
||||
"""Set file to owner-only read/write (0600). No-op on Windows."""
|
||||
try:
|
||||
if path.exists():
|
||||
os.chmod(path, 0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
|
||||
def ensure_dirs():
    """Create the cron and cron-output directories, then lock them down.

    Both directories are created first (parents included, existing ones
    tolerated) and afterwards each is passed to ``_secure_dir``.
    """
    managed = (CRON_DIR, OUTPUT_DIR)
    for directory in managed:
        directory.mkdir(parents=True, exist_ok=True)
    for directory in managed:
        _secure_dir(directory)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Schedule Parsing
|
||||
# =============================================================================
|
||||
|
||||
def parse_duration(s: str) -> int:
    """
    Convert a human duration string into a number of minutes.

    The input is stripped and lower-cased, then must be an integer followed
    by a minute, hour or day unit (``m``/``min``/``minutes``, ``h``/``hr``/
    ``hours``, ``d``/``day``/``days`` ...), optionally separated by spaces.

    Examples:
        "30m" → 30
        "2h" → 120
        "1d" → 1440

    Raises:
        ValueError: if the string does not match the expected format.
    """
    text = s.strip().lower()
    parsed = re.match(r'^(\d+)\s*(m|min|mins|minute|minutes|h|hr|hrs|hour|hours|d|day|days)$', text)
    if parsed is None:
        raise ValueError(f"Invalid duration: '{text}'. Use format like '30m', '2h', or '1d'")

    amount, unit_name = parsed.groups()
    # Every accepted unit spelling is identified by its first letter.
    if unit_name.startswith('d'):
        minutes_per_unit = 1440
    elif unit_name.startswith('h'):
        minutes_per_unit = 60
    else:
        minutes_per_unit = 1
    return int(amount) * minutes_per_unit
|
||||
|
||||
|
||||
def parse_schedule(schedule: str) -> Dict[str, Any]:
    """
    Turn a user-supplied schedule string into a structured schedule dict.

    The forms are tried in this order:
      1. "every <duration>"      → {"kind": "interval", "minutes", "display"}
      2. cron expression         → {"kind": "cron", "expr", "display"}
         (at least 5 whitespace-separated fields made of digits and * - , /)
      3. ISO timestamp           → {"kind": "once", "run_at", "display"}
      4. bare duration ("30m")   → {"kind": "once", "run_at", "display"},
         scheduled that far from the current Hermes time

    Examples:
        "30m" → once in 30 minutes
        "every 2h" → recurring every 2 hours
        "0 9 * * *" → cron expression
        "2026-02-03T14:00" → once at timestamp

    Raises:
        ValueError: for an unrecognised schedule, an invalid duration after
            "every", an invalid cron expression or timestamp, or a cron
            expression when croniter is not installed.
    """
    spec = schedule.strip()
    original = spec

    # 1. Recurring interval.
    if spec.lower().startswith("every "):
        interval = parse_duration(spec[6:].strip())
        return {
            "kind": "interval",
            "minutes": interval,
            "display": f"every {interval}m"
        }

    # 2. Cron expression: minute hour day month weekday [extra fields].
    fields = spec.split()
    looks_like_cron = len(fields) >= 5 and all(
        re.match(r'^[\d\*\-,/]+$', field) for field in fields[:5]
    )
    if looks_like_cron:
        if not HAS_CRONITER:
            raise ValueError("Cron expressions require 'croniter' package. Install with: pip install croniter")
        try:
            croniter(spec)  # constructed only to validate the expression
        except Exception as e:
            raise ValueError(f"Invalid cron expression '{spec}': {e}")
        return {
            "kind": "cron",
            "expr": spec,
            "display": spec
        }

    # 3. Absolute one-shot timestamp.
    if 'T' in spec or re.match(r'^\d{4}-\d{2}-\d{2}', spec):
        try:
            when = datetime.fromisoformat(spec.replace('Z', '+00:00'))
            return {
                "kind": "once",
                "run_at": when.isoformat(),
                "display": f"once at {when.strftime('%Y-%m-%d %H:%M')}"
            }
        except ValueError as e:
            raise ValueError(f"Invalid timestamp '{spec}': {e}")

    # 4. Relative one-shot ("30m" from now). A parse failure falls through
    #    to the generic error below.
    try:
        delay = parse_duration(spec)
        fire_at = _hermes_now() + timedelta(minutes=delay)
        return {
            "kind": "once",
            "run_at": fire_at.isoformat(),
            "display": f"once in {original}"
        }
    except ValueError:
        pass

    raise ValueError(
        f"Invalid schedule '{original}'. Use:\n"
        f"  - Duration: '30m', '2h', '1d' (one-shot)\n"
        f"  - Interval: 'every 30m', 'every 2h' (recurring)\n"
        f"  - Cron: '0 9 * * *' (cron expression)\n"
        f"  - Timestamp: '2026-02-03T14:00:00' (one-shot at time)"
    )
|
||||
|
||||
|
||||
def _ensure_aware(dt: datetime) -> datetime:
    """Return *dt* as a timezone-aware datetime in the Hermes timezone.

    The target timezone is taken from ``_hermes_now()``. Aware inputs are
    simply converted. Naive inputs (e.g. timestamps stored by older
    versions) are treated as system-local wall time and then converted, so
    legacy values keep their relative ordering.
    """
    hermes_tz = _hermes_now().tzinfo
    if dt.tzinfo is not None:
        return dt.astimezone(hermes_tz)

    system_tz = datetime.now().astimezone().tzinfo
    localized = dt.replace(tzinfo=system_tz)
    return localized.astimezone(hermes_tz)
|
||||
|
||||
|
||||
def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None) -> Optional[str]:
    """
    Work out when a schedule should fire next.

    - "once": the stored ``run_at`` if it is still in the future, else None.
    - "interval": ``last_run_at`` plus the interval, or now plus the interval
      when the job has never run.
    - "cron": the next croniter match after now (None without croniter).

    Returns an ISO timestamp string, or None if there is no further run.
    """
    now = _hermes_now()
    kind = schedule["kind"]

    if kind == "once":
        scheduled_for = _ensure_aware(datetime.fromisoformat(schedule["run_at"]))
        if scheduled_for > now:
            return schedule["run_at"]
        return None

    if kind == "interval":
        minutes = schedule["minutes"]
        # Anchor on the previous run when there is one, otherwise on "now".
        anchor = _ensure_aware(datetime.fromisoformat(last_run_at)) if last_run_at else now
        return (anchor + timedelta(minutes=minutes)).isoformat()

    if kind == "cron" and HAS_CRONITER:
        upcoming = croniter(schedule["expr"], now).get_next(datetime)
        return upcoming.isoformat()

    return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Job CRUD Operations
|
||||
# =============================================================================
|
||||
|
||||
def load_jobs() -> List[Dict[str, Any]]:
    """Load all jobs from storage.

    Returns the list stored under the ``"jobs"`` key of the jobs file, or an
    empty list when the file is missing, unreadable, not valid JSON, or does
    not have the expected ``{"jobs": [...]}`` shape. Unreadable or malformed
    files are reported through the module logger rather than silently
    treated as "no jobs", because a subsequent save would overwrite them.
    """
    ensure_dirs()
    if not JOBS_FILE.exists():
        return []

    try:
        with open(JOBS_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        logger.warning("Could not read cron jobs file %s: %s", JOBS_FILE, e)
        return []

    # Guard against a well-formed JSON document of the wrong shape
    # (e.g. a bare list, or "jobs": null) so callers always get a list.
    jobs = data.get("jobs", []) if isinstance(data, dict) else None
    if not isinstance(jobs, list):
        logger.warning("Cron jobs file %s has an unexpected structure; ignoring it", JOBS_FILE)
        return []
    return jobs
|
||||
|
||||
|
||||
def save_jobs(jobs: List[Dict[str, Any]]):
    """Persist the full job list to the jobs file.

    The payload (``jobs`` plus an ``updated_at`` timestamp) is written to a
    temporary file in the same directory, flushed and fsynced, then moved
    over the jobs file with ``os.replace`` and restricted via
    ``_secure_file``. On any failure the temporary file is removed and the
    exception is re-raised.
    """
    ensure_dirs()
    handle, scratch_path = tempfile.mkstemp(dir=str(JOBS_FILE.parent), suffix='.tmp', prefix='.jobs_')
    try:
        with os.fdopen(handle, 'w', encoding='utf-8') as out:
            payload = {"jobs": jobs, "updated_at": _hermes_now().isoformat()}
            json.dump(payload, out, indent=2)
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch_path, JOBS_FILE)
        _secure_file(JOBS_FILE)
    except BaseException:
        # Clean up the scratch file (it may already be gone after replace).
        try:
            os.unlink(scratch_path)
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def create_job(
    prompt: str,
    schedule: str,
    name: Optional[str] = None,
    repeat: Optional[int] = None,
    deliver: Optional[str] = None,
    origin: Optional[Dict[str, Any]] = None,
    skill: Optional[str] = None,
    skills: Optional[List[str]] = None,
    model: Optional[str] = None,
    provider: Optional[str] = None,
    base_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build a new cron job record, append it to storage and return it.

    Args:
        prompt: The prompt to run (self-contained, or a task instruction when a skill is set)
        schedule: Schedule string (see parse_schedule)
        name: Optional friendly name; defaults to the first 50 characters of
            the prompt, else the first skill, else "cron job"
        repeat: How many times to run (None = forever); one-shot schedules
            default to 1
        deliver: Where to deliver output; defaults to "origin" when an origin
            is given, otherwise "local"
        origin: Source info where the job was created (for "origin" delivery)
        skill: Optional legacy single skill name
        skills: Optional ordered list of skills
        model: Optional per-job model override
        provider: Optional per-job provider override
        base_url: Optional per-job base URL override (trailing "/" removed)

    Returns:
        The created job dict
    """
    def _blank_to_none(value):
        # Non-strings and whitespace-only strings both mean "no override".
        if not isinstance(value, str):
            return None
        return str(value).strip() or None

    parsed_schedule = parse_schedule(schedule)

    if repeat is None and parsed_schedule["kind"] == "once":
        repeat = 1
    if deliver is None:
        deliver = "origin" if origin else "local"

    job_id = uuid.uuid4().hex[:12]
    created_at = _hermes_now().isoformat()

    skill_names = _normalize_skill_list(skill, skills)
    primary_skill = skill_names[0] if skill_names else None
    model_override = _blank_to_none(model)
    provider_override = _blank_to_none(provider)
    if isinstance(base_url, str):
        url_override = str(base_url).strip().rstrip("/") or None
    else:
        url_override = None

    label = prompt or primary_skill or "cron job"
    job = {
        "id": job_id,
        "name": name or label[:50].strip(),
        "prompt": prompt,
        "skills": skill_names,
        "skill": primary_skill,
        "model": model_override,
        "provider": provider_override,
        "base_url": url_override,
        "schedule": parsed_schedule,
        "schedule_display": parsed_schedule.get("display", schedule),
        "repeat": {
            "times": repeat,  # None = forever
            "completed": 0
        },
        "enabled": True,
        "state": "scheduled",
        "paused_at": None,
        "paused_reason": None,
        "created_at": created_at,
        "next_run_at": compute_next_run(parsed_schedule),
        "last_run_at": None,
        "last_status": None,
        "last_error": None,
        # Delivery configuration
        "deliver": deliver,
        "origin": origin,  # Where the job was created, for "origin" delivery
    }

    all_jobs = load_jobs()
    all_jobs.append(job)
    save_jobs(all_jobs)

    return job
|
||||
|
||||
|
||||
def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Look up a single job by its ID.

    Returns the first stored job whose ``id`` matches, with its skill fields
    normalized, or None when no job matches.
    """
    found = next((entry for entry in load_jobs() if entry["id"] == job_id), None)
    if found is None:
        return None
    return _apply_skill_fields(found)
|
||||
|
||||
|
||||
def list_jobs(include_disabled: bool = False) -> List[Dict[str, Any]]:
    """Return stored jobs with normalized skill fields.

    Jobs whose ``enabled`` flag is falsy are left out unless
    *include_disabled* is true; a missing flag counts as enabled.
    """
    normalized = [_apply_skill_fields(entry) for entry in load_jobs()]
    if include_disabled:
        return normalized
    return [entry for entry in normalized if entry.get("enabled", True)]
|
||||
|
||||
|
||||
def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Merge *updates* into the stored job with the given ID and persist it.

    Derived fields are refreshed along the way:

    * when ``"skill"``/``"skills"`` is updated, both fields are re-derived
      via ``_normalize_skill_list``;
    * when ``"schedule"`` is updated, ``"schedule_display"`` is refreshed
      and, unless the job is paused, ``"next_run_at"`` is recomputed;
    * an enabled, non-paused job that ends up without ``"next_run_at"``
      gets one computed from its schedule.

    Returns the saved job (passed through ``_apply_skill_fields``), or
    ``None`` when no stored job has that ID.
    """
    jobs = load_jobs()
    position = next(
        (idx for idx, entry in enumerate(jobs) if entry["id"] == job_id),
        None,
    )
    if position is None:
        return None

    merged = _apply_skill_fields({**jobs[position], **updates})

    if "skills" in updates or "skill" in updates:
        skill_list = _normalize_skill_list(merged.get("skill"), merged.get("skills"))
        merged["skills"] = skill_list
        merged["skill"] = skill_list[0] if skill_list else None

    if "schedule" in updates:
        new_schedule = merged["schedule"]
        # An explicit display string in the update wins; otherwise use the
        # schedule's own "display", then whatever was stored before.
        fallback_display = new_schedule.get("display", merged.get("schedule_display"))
        merged["schedule_display"] = updates.get("schedule_display", fallback_display)
        if merged.get("state") != "paused":
            merged["next_run_at"] = compute_next_run(new_schedule)

    runnable = merged.get("enabled", True) and merged.get("state") != "paused"
    if runnable and not merged.get("next_run_at"):
        merged["next_run_at"] = compute_next_run(merged["schedule"])

    jobs[position] = merged
    save_jobs(jobs)
    return _apply_skill_fields(jobs[position])
|
||||
|
||||
|
||||
def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Disable a job and mark it paused, keeping it in storage.

    Records the pause time and the optional *reason*. Returns whatever
    ``update_job`` returns (``None`` when the ID is unknown).
    """
    pause_fields = {
        "enabled": False,
        "state": "paused",
        "paused_at": _hermes_now().isoformat(),
        "paused_reason": reason,
    }
    return update_job(job_id, pause_fields)
|
||||
|
||||
|
||||
def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Re-enable a paused job with a freshly computed next run time.

    Returns ``None`` when ``get_job`` finds nothing for *job_id*;
    otherwise returns the result of ``update_job``.
    """
    existing = get_job(job_id)
    if not existing:
        return None

    resume_fields = {
        "enabled": True,
        "state": "scheduled",
        "paused_at": None,
        "paused_reason": None,
        "next_run_at": compute_next_run(existing["schedule"]),
    }
    return update_job(job_id, resume_fields)
|
||||
|
||||
|
||||
def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Make a job due immediately by setting ``next_run_at`` to the current time.

    The job is also re-enabled and any pause markers are cleared. Returns
    ``None`` when ``get_job`` finds nothing for *job_id*.
    """
    if not get_job(job_id):
        return None

    trigger_fields = {
        "enabled": True,
        "state": "scheduled",
        "paused_at": None,
        "paused_reason": None,
        "next_run_at": _hermes_now().isoformat(),
    }
    return update_job(job_id, trigger_fields)
|
||||
|
||||
|
||||
def remove_job(job_id: str) -> bool:
    """Delete every stored job whose ID equals *job_id*.

    Returns ``True`` (after saving) when at least one job was removed and
    ``False`` when nothing matched; storage is not rewritten in that case.
    """
    stored = load_jobs()
    remaining = [entry for entry in stored if entry["id"] != job_id]
    if len(remaining) >= len(stored):
        return False

    save_jobs(remaining)
    return True
|
||||
|
||||
|
||||
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    """
    Record the outcome of one execution of a job.

    Sets ``last_run_at``/``last_status``/``last_error``, bumps the repeat
    counter, and then either removes the job (repeat limit reached) or
    computes its next run. A job with no further run is disabled and
    marked ``"completed"``; otherwise a non-paused job goes back to
    ``"scheduled"``. The job list is saved in every case, including when
    *job_id* is unknown.
    """
    jobs = load_jobs()
    index = next(
        (pos for pos, entry in enumerate(jobs) if entry["id"] == job_id),
        None,
    )
    if index is None:
        # Unknown job: the list is still written back unchanged.
        save_jobs(jobs)
        return

    record = jobs[index]
    finished_at = _hermes_now().isoformat()
    record["last_run_at"] = finished_at
    record["last_status"] = "ok" if success else "error"
    record["last_error"] = None if success else error

    repeat = record.get("repeat")
    if repeat:
        repeat["completed"] = repeat.get("completed", 0) + 1

        # A limit of None means "repeat forever".
        limit = repeat.get("times")
        if limit is not None and repeat["completed"] >= limit:
            del jobs[index]
            save_jobs(jobs)
            return

    upcoming = compute_next_run(record["schedule"], finished_at)
    record["next_run_at"] = upcoming

    if upcoming is None:
        # Nothing left to run (e.g. a finished one-shot job).
        record["enabled"] = False
        record["state"] = "completed"
    elif record.get("state") != "paused":
        record["state"] = "scheduled"

    save_jobs(jobs)
|
||||
|
||||
|
||||
def get_due_jobs() -> List[Dict[str, Any]]:
    """Collect the enabled jobs whose ``next_run_at`` is now or in the past.

    A recurring job (schedule kind ``"cron"`` or ``"interval"``) that is
    more than 120 seconds late is treated as a missed run: when a new
    next-run time can be computed it is written to storage and the job is
    left out of this batch, so a restart after downtime does not fire a
    burst of stale runs.
    """
    current = _hermes_now()
    candidates = [_apply_skill_fields(entry) for entry in load_jobs()]
    stored = load_jobs()  # separate copy that receives the fast-forward updates
    ready = []
    dirty = False

    for candidate in candidates:
        scheduled_for = candidate.get("next_run_at")
        if not candidate.get("enabled", True) or not scheduled_for:
            continue

        due_at = _ensure_aware(datetime.fromisoformat(scheduled_for))
        if due_at > current:
            continue

        schedule = candidate.get("schedule", {})
        recurring = schedule.get("kind") in ("cron", "interval")
        if recurring and (current - due_at).total_seconds() > 120:
            # More than 2 minutes late — a missed run rather than a current one.
            replacement = compute_next_run(schedule, current.isoformat())
            if replacement:
                logger.info(
                    "Job '%s' missed its scheduled time (%s). "
                    "Fast-forwarding to next run: %s",
                    candidate.get("name", candidate["id"]),
                    scheduled_for,
                    replacement,
                )
                for stored_job in stored:
                    if stored_job["id"] == candidate["id"]:
                        stored_job["next_run_at"] = replacement
                        dirty = True
                        break
                continue  # do not run the stale occurrence

        ready.append(candidate)

    if dirty:
        save_jobs(stored)

    return ready
|
||||
|
||||
|
||||
def save_job_output(job_id: str, output: str):
    """Write one run's output to ``OUTPUT_DIR/<job_id>/<timestamp>.md``.

    The text is first written and fsynced to a temporary file in the same
    directory and then moved into place with ``os.replace``. On any
    failure the temporary file is removed (best effort) and the exception
    is re-raised. Returns the path of the written file.
    """
    ensure_dirs()
    target_dir = OUTPUT_DIR / job_id
    target_dir.mkdir(parents=True, exist_ok=True)
    _secure_dir(target_dir)

    stamp = _hermes_now().strftime("%Y-%m-%d_%H-%M-%S")
    destination = target_dir / f"{stamp}.md"

    handle, scratch_path = tempfile.mkstemp(
        prefix='.output_',
        suffix='.tmp',
        dir=str(target_dir),
    )
    try:
        with os.fdopen(handle, 'w', encoding='utf-8') as stream:
            stream.write(output)
            stream.flush()
            os.fsync(stream.fileno())
        os.replace(scratch_path, destination)
        _secure_file(destination)
    except BaseException:
        # Also covers KeyboardInterrupt/SystemExit so no temp file is left behind.
        try:
            os.unlink(scratch_path)
        except OSError:
            pass
        raise

    return destination
|
||||
510
cron/scheduler.py
Normal file
510
cron/scheduler.py
Normal file
@@ -0,0 +1,510 @@
|
||||
"""
|
||||
Cron job scheduler - executes due jobs.
|
||||
|
||||
Provides tick() which checks for due jobs and runs them. The gateway
|
||||
calls this every 60 seconds from a background thread.
|
||||
|
||||
Uses a file-based lock (~/.hermes/cron/.tick.lock) so only one tick
|
||||
runs at a time if multiple processes overlap.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
# fcntl is Unix-only; on Windows use msvcrt for file locking
|
||||
try:
|
||||
import fcntl
|
||||
except ImportError:
|
||||
fcntl = None
|
||||
try:
|
||||
import msvcrt
|
||||
except ImportError:
|
||||
msvcrt = None
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from hermes_time import now as _hermes_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from cron.jobs import get_due_jobs, mark_job_run, save_job_output
|
||||
|
||||
# Resolve Hermes home directory (respects HERMES_HOME override)
|
||||
_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
|
||||
|
||||
# File-based lock prevents concurrent ticks from gateway + daemon + systemd timer
|
||||
_LOCK_DIR = _hermes_home / "cron"
|
||||
_LOCK_FILE = _LOCK_DIR / ".tick.lock"
|
||||
|
||||
|
||||
def _resolve_origin(job: dict) -> Optional[dict]:
|
||||
"""Extract origin info from a job, preserving any extra routing metadata."""
|
||||
origin = job.get("origin")
|
||||
if not origin:
|
||||
return None
|
||||
platform = origin.get("platform")
|
||||
chat_id = origin.get("chat_id")
|
||||
if platform and chat_id:
|
||||
return origin
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_delivery_target(job: dict) -> Optional[dict]:
    """
    Work out where a job's output should be auto-delivered.

    The job's ``"deliver"`` value (default ``"local"``) is interpreted as:

    * ``"local"`` - no delivery, returns ``None``;
    * ``"origin"`` - the chat the job originated from, if resolvable;
    * ``"<platform>:<chat_id>"`` - that explicit chat;
    * ``"<platform>"`` - the origin chat when it is on that platform,
      otherwise the chat named by the ``<PLATFORM>_HOME_CHANNEL``
      environment variable.

    Returns a dict with ``"platform"``, ``"chat_id"`` and ``"thread_id"``
    keys, or ``None`` when no concrete target can be determined.
    """
    mode = job.get("deliver", "local")
    source = _resolve_origin(job)

    def _target(platform, chat, thread):
        return {"platform": platform, "chat_id": chat, "thread_id": thread}

    if mode == "local":
        return None

    if mode == "origin":
        if source:
            return _target(source["platform"], str(source["chat_id"]), source.get("thread_id"))
        return None

    if ":" in mode:
        explicit_platform, _, explicit_chat = mode.partition(":")
        return _target(explicit_platform, explicit_chat, None)

    # Bare platform name: prefer the originating chat when it is on that platform.
    if source and source.get("platform") == mode:
        return _target(mode, str(source["chat_id"]), source.get("thread_id"))

    home_chat = os.getenv(f"{mode.upper()}_HOME_CHANNEL", "")
    if home_chat:
        return _target(mode, home_chat, None)
    return None
|
||||
|
||||
|
||||
def _deliver_result(job: dict, content: str) -> None:
    """
    Deliver job output to the configured target (origin chat, specific platform, etc.).

    Uses the standalone platform send functions from send_message_tool so delivery
    works whether or not the gateway is running.

    Args:
        job: Job dict; its ``"deliver"``/``"origin"`` fields select the target.
        content: Text to send.

    Delivery problems (unknown or disabled platform, config load failure,
    send failure) are logged and swallowed; nothing is returned.
    """
    target = _resolve_delivery_target(job)
    if not target:
        if job.get("deliver", "local") != "local":
            logger.warning(
                "Job '%s' deliver=%s but no concrete delivery target could be resolved",
                job["id"],
                job.get("deliver", "local"),
            )
        return

    platform_name = target["platform"]
    chat_id = target["chat_id"]
    thread_id = target.get("thread_id")

    from tools.send_message_tool import _send_to_platform
    from gateway.config import load_gateway_config, Platform

    platform_map = {
        "telegram": Platform.TELEGRAM,
        "discord": Platform.DISCORD,
        "slack": Platform.SLACK,
        "whatsapp": Platform.WHATSAPP,
        "signal": Platform.SIGNAL,
        "email": Platform.EMAIL,
    }
    platform = platform_map.get(platform_name.lower())
    if not platform:
        logger.warning("Job '%s': unknown platform '%s' for delivery", job["id"], platform_name)
        return

    try:
        config = load_gateway_config()
    except Exception as e:
        logger.error("Job '%s': failed to load gateway config for delivery: %s", job["id"], e)
        return

    pconfig = config.platforms.get(platform)
    if not pconfig or not pconfig.enabled:
        logger.warning("Job '%s': platform '%s' not configured/enabled", job["id"], platform_name)
        return

    def _send():
        # The coroutine is created inside the call so it is always awaited
        # by the asyncio.run() that owns it.
        return asyncio.run(_send_to_platform(platform, pconfig, chat_id, content, thread_id=thread_id))

    # asyncio.run() cannot be used while a loop is running in this thread.
    # Detect that up front instead of catching RuntimeError from the send,
    # which would also retry (and possibly double-send) on RuntimeErrors
    # raised by the platform code itself.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    try:
        if loop_running:
            import concurrent.futures
            # Run the send on a helper thread with its own event loop.
            # NOTE: leaving the ``with`` block waits for the worker, so the
            # timeout bounds the result wait, not the total time spent here.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                result = pool.submit(_send).result(timeout=30)
        else:
            result = _send()
    except Exception as e:
        logger.error("Job '%s': delivery to %s:%s failed: %s", job["id"], platform_name, chat_id, e)
        return

    if result and result.get("error"):
        logger.error("Job '%s': delivery error: %s", job["id"], result["error"])
    else:
        logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
        # Mirror the delivered content into the target's gateway session
        try:
            from gateway.mirror import mirror_to_session
            mirror_to_session(platform_name, chat_id, content, source_label="cron", thread_id=thread_id)
        except Exception as e:
            logger.warning("Job '%s': mirror_to_session failed: %s", job["id"], e)
|
||||
|
||||
|
||||
def _build_job_prompt(job: dict) -> str:
|
||||
"""Build the effective prompt for a cron job, optionally loading one or more skills first."""
|
||||
prompt = job.get("prompt", "")
|
||||
skills = job.get("skills")
|
||||
if skills is None:
|
||||
legacy = job.get("skill")
|
||||
skills = [legacy] if legacy else []
|
||||
|
||||
skill_names = [str(name).strip() for name in skills if str(name).strip()]
|
||||
if not skill_names:
|
||||
return prompt
|
||||
|
||||
from tools.skills_tool import skill_view
|
||||
|
||||
parts = []
|
||||
for skill_name in skill_names:
|
||||
loaded = json.loads(skill_view(skill_name))
|
||||
if not loaded.get("success"):
|
||||
error = loaded.get("error") or f"Failed to load skill '{skill_name}'"
|
||||
raise RuntimeError(error)
|
||||
|
||||
content = str(loaded.get("content") or "").strip()
|
||||
if parts:
|
||||
parts.append("")
|
||||
parts.extend(
|
||||
[
|
||||
f'[SYSTEM: The user has invoked the "{skill_name}" skill, indicating they want you to follow its instructions. The full skill content is loaded below.]',
|
||||
"",
|
||||
content,
|
||||
]
|
||||
)
|
||||
|
||||
if prompt:
|
||||
parts.extend(["", f"The user has provided the following instruction alongside the skill invocation: {prompt}"])
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
    """
    Execute a single cron job.

    Builds the effective prompt, resolves model/provider settings from the
    job, the environment and ``config.yaml``, runs an ``AIAgent``
    conversation and renders a markdown report of the run. Failures while
    building the prompt, resolving settings or running the agent are
    reported through the return value rather than raised.

    Args:
        job: Stored job dict; ``"id"`` and ``"name"`` are required.

    Returns:
        Tuple of (success, full_output_doc, final_response, error_message)

    Raises:
        KeyError: If ``job`` has no ``"id"`` or ``"name"``.
    """
    from run_agent import AIAgent

    job_id = job["id"]
    job_name = job["name"]
    # Raw prompt as a placeholder so the failure report can still be
    # rendered if building the effective prompt fails.
    prompt = job.get("prompt") or ""

    # Initialize SQLite session store so cron job messages are persisted
    # and discoverable via session_search (same pattern as gateway/run.py).
    # Opened right before the try block so the finally clause always closes it.
    _session_db = None
    try:
        from hermes_state import SessionDB
        _session_db = SessionDB()
    except Exception as e:
        logger.debug("Job '%s': SQLite session store not available: %s", job_id, e)

    try:
        # Built inside the try so a skill that fails to load produces a
        # normal failure report instead of escaping run_job.
        prompt = _build_job_prompt(job)
        origin = _resolve_origin(job)

        logger.info("Running job '%s' (ID: %s)", job_name, job_id)
        logger.info("Prompt: %s", prompt[:100])

        # Inject origin context so the agent's send_message tool knows the chat
        if origin:
            os.environ["HERMES_SESSION_PLATFORM"] = origin["platform"]
            os.environ["HERMES_SESSION_CHAT_ID"] = str(origin["chat_id"])
            if origin.get("chat_name"):
                os.environ["HERMES_SESSION_CHAT_NAME"] = origin["chat_name"]

        # Re-read .env and config.yaml fresh every run so provider/key
        # changes take effect without a gateway restart.
        from dotenv import load_dotenv
        try:
            load_dotenv(str(_hermes_home / ".env"), override=True, encoding="utf-8")
        except UnicodeDecodeError:
            load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1")

        delivery_target = _resolve_delivery_target(job)
        if delivery_target:
            os.environ["HERMES_CRON_AUTO_DELIVER_PLATFORM"] = delivery_target["platform"]
            os.environ["HERMES_CRON_AUTO_DELIVER_CHAT_ID"] = str(delivery_target["chat_id"])
            if delivery_target.get("thread_id") is not None:
                os.environ["HERMES_CRON_AUTO_DELIVER_THREAD_ID"] = str(delivery_target["thread_id"])

        model = job.get("model") or os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6"

        # Load config.yaml for model, reasoning, prefill, toolsets, provider routing
        _cfg = {}
        try:
            import yaml
            _cfg_path = str(_hermes_home / "config.yaml")
            if os.path.exists(_cfg_path):
                # Explicit encoding: do not depend on the platform default.
                with open(_cfg_path, encoding="utf-8") as _f:
                    _cfg = yaml.safe_load(_f) or {}
                _model_cfg = _cfg.get("model", {})
                if not job.get("model"):
                    if isinstance(_model_cfg, str):
                        model = _model_cfg
                    elif isinstance(_model_cfg, dict):
                        model = _model_cfg.get("default", model)
        except Exception as e:
            logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)

        # A section written as an empty key in YAML loads as None; treat it as {}.
        _agent_cfg = _cfg.get("agent") or {}

        # Reasoning config from env or config.yaml
        reasoning_config = None
        effort = os.getenv("HERMES_REASONING_EFFORT", "")
        if not effort:
            effort = str(_agent_cfg.get("reasoning_effort", "")).strip()
        if effort and effort.lower() != "none":
            valid = ("xhigh", "high", "medium", "low", "minimal")
            if effort.lower() in valid:
                reasoning_config = {"enabled": True, "effort": effort.lower()}
        elif effort.lower() == "none":
            reasoning_config = {"enabled": False}

        # Prefill messages from env or config.yaml
        prefill_messages = None
        prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "")
        if prefill_file:
            pfpath = Path(prefill_file).expanduser()
            if not pfpath.is_absolute():
                pfpath = _hermes_home / pfpath
            if pfpath.exists():
                try:
                    with open(pfpath, "r", encoding="utf-8") as _pf:
                        prefill_messages = json.load(_pf)
                    if not isinstance(prefill_messages, list):
                        prefill_messages = None
                except Exception as e:
                    logger.warning("Job '%s': failed to parse prefill messages file '%s': %s", job_id, pfpath, e)
                    prefill_messages = None

        # Max iterations
        max_iterations = _agent_cfg.get("max_turns") or _cfg.get("max_turns") or 90

        # Provider routing (None-safe, same as smart_model_routing below)
        pr = _cfg.get("provider_routing") or {}
        smart_routing = _cfg.get("smart_model_routing", {}) or {}

        from hermes_cli.runtime_provider import (
            resolve_runtime_provider,
            format_runtime_provider_error,
        )
        try:
            runtime_kwargs = {
                "requested": job.get("provider") or os.getenv("HERMES_INFERENCE_PROVIDER"),
            }
            if job.get("base_url"):
                runtime_kwargs["explicit_base_url"] = job.get("base_url")
            runtime = resolve_runtime_provider(**runtime_kwargs)
        except Exception as exc:
            message = format_runtime_provider_error(exc)
            raise RuntimeError(message) from exc

        from agent.smart_model_routing import resolve_turn_route
        turn_route = resolve_turn_route(
            prompt,
            smart_routing,
            {
                "model": model,
                "api_key": runtime.get("api_key"),
                "base_url": runtime.get("base_url"),
                "provider": runtime.get("provider"),
                "api_mode": runtime.get("api_mode"),
            },
        )

        agent = AIAgent(
            model=turn_route["model"],
            api_key=turn_route["runtime"].get("api_key"),
            base_url=turn_route["runtime"].get("base_url"),
            provider=turn_route["runtime"].get("provider"),
            api_mode=turn_route["runtime"].get("api_mode"),
            max_iterations=max_iterations,
            reasoning_config=reasoning_config,
            prefill_messages=prefill_messages,
            providers_allowed=pr.get("only"),
            providers_ignored=pr.get("ignore"),
            providers_order=pr.get("order"),
            provider_sort=pr.get("sort"),
            disabled_toolsets=["cronjob"],
            quiet_mode=True,
            platform="cron",
            session_id=f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}",
            session_db=_session_db,
        )

        result = agent.run_conversation(prompt)

        final_response = result.get("final_response", "")
        if not final_response:
            final_response = "(No response generated)"

        output = f"""# Cron Job: {job_name}

**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}

## Prompt

{prompt}

## Response

{final_response}
"""

        logger.info("Job '%s' completed successfully", job_name)
        return True, output, final_response, None

    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)}"
        logger.error("Job '%s' failed: %s", job_name, error_msg)

        output = f"""# Cron Job: {job_name} (FAILED)

**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}

## Prompt

{prompt}

## Error

```
{error_msg}

{traceback.format_exc()}
```
"""
        return False, output, "", error_msg

    finally:
        # Clean up injected env vars so they don't leak to other jobs
        for key in (
            "HERMES_SESSION_PLATFORM",
            "HERMES_SESSION_CHAT_ID",
            "HERMES_SESSION_CHAT_NAME",
            "HERMES_CRON_AUTO_DELIVER_PLATFORM",
            "HERMES_CRON_AUTO_DELIVER_CHAT_ID",
            "HERMES_CRON_AUTO_DELIVER_THREAD_ID",
        ):
            os.environ.pop(key, None)
        if _session_db:
            try:
                _session_db.close()
            except Exception as e:
                logger.debug("Job '%s': failed to close SQLite session store: %s", job_id, e)
|
||||
|
||||
|
||||
def tick(verbose: bool = True) -> int:
    """
    Check and run all due jobs.

    Uses a file lock so only one tick runs at a time, even if the gateway's
    in-process ticker and a standalone daemon or manual tick overlap.

    For every due job the run output is saved, the result (or a failure
    notice) is delivered, and the run is recorded with ``mark_job_run``.
    An error while processing one job is logged and recorded as a failed
    run; the remaining jobs are still processed.

    Args:
        verbose: Whether to log status messages

    Returns:
        Number of jobs executed (0 if another tick is already running)
    """
    _LOCK_DIR.mkdir(parents=True, exist_ok=True)

    # Cross-platform file locking: fcntl on Unix, msvcrt on Windows
    lock_fd = None
    try:
        lock_fd = open(_LOCK_FILE, "w")
        if fcntl:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        elif msvcrt:
            msvcrt.locking(lock_fd.fileno(), msvcrt.LK_NBLCK, 1)
    except (OSError, IOError):
        logger.debug("Tick skipped — another instance holds the lock")
        if lock_fd is not None:
            lock_fd.close()
        return 0

    try:
        due_jobs = get_due_jobs()

        if verbose and not due_jobs:
            logger.info("%s - No jobs due", _hermes_now().strftime('%H:%M:%S'))
            return 0

        if verbose:
            logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))

        executed = 0
        for job in due_jobs:
            try:
                success, output, final_response, error = run_job(job)

                output_file = save_job_output(job["id"], output)
                if verbose:
                    logger.info("Output saved to: %s", output_file)

                # Deliver the final response to the origin/target chat
                deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
                if deliver_content:
                    try:
                        _deliver_result(job, deliver_content)
                    except Exception as de:
                        logger.error("Delivery failed for job %s: %s", job["id"], de)

                mark_job_run(job["id"], success, error)
                executed += 1

            except Exception as e:
                logger.error("Error processing job %s: %s", job['id'], e)
                mark_job_run(job["id"], False, str(e))

        return executed
    finally:
        # Unlock errors are ignored on both platforms, and the descriptor
        # is closed no matter what so it cannot leak.
        try:
            if fcntl:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
            elif msvcrt:
                msvcrt.locking(lock_fd.fileno(), msvcrt.LK_UNLCK, 1)
        except (OSError, IOError):
            pass
        finally:
            lock_fd.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed directly as a script: perform one scheduler pass with
    # status logging turned on.
    tick(verbose=True)
|
||||
5
datagen-config-examples/example_browser_tasks.jsonl
Normal file
5
datagen-config-examples/example_browser_tasks.jsonl
Normal file
@@ -0,0 +1,5 @@
|
||||
{"prompt": "Go to https://news.ycombinator.com and find the top 5 posts on the front page. For each post, get the title, URL, points, and number of comments. Return the results as a formatted summary."}
|
||||
{"prompt": "Navigate to https://en.wikipedia.org/wiki/Hermes and extract the first paragraph of the article, the image caption, and the list of items in the infobox. Summarize what you find."}
|
||||
{"prompt": "Go to https://github.com/trending and find the top 3 trending repositories today. For each repo, get the name, description, language, and star count. Write the results to a file called trending_repos.md."}
|
||||
{"prompt": "Visit https://httpbin.org/forms/post and fill out the form with sample data (customer name: Jane Doe, size: Medium, topping: Bacon, delivery time: 12:00). Submit the form and report what the response page shows."}
|
||||
{"prompt": "Navigate to https://books.toscrape.com, browse to the Travel category, find the highest-rated book, and extract its title, price, availability, and description."}
|
||||
65
datagen-config-examples/run_browser_tasks.sh
Executable file
65
datagen-config-examples/run_browser_tasks.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
|
||||
# =============================================================================
|
||||
# Example: Browser-Focused Data Generation
|
||||
# =============================================================================
|
||||
#
|
||||
# Generates tool-calling trajectories for browser automation tasks.
|
||||
# The agent navigates websites, fills forms, extracts information, etc.
|
||||
#
|
||||
# Distribution: browser 97%, web 20%, vision 12%, terminal 15%
|
||||
#
|
||||
# Prerequisites:
|
||||
# - OPENROUTER_API_KEY in ~/.hermes/.env
|
||||
# - BROWSERBASE_API_KEY in ~/.hermes/.env (for browser tools)
|
||||
# - A dataset JSONL file with one {"prompt": "..."} per line
|
||||
#
|
||||
# Usage:
|
||||
# cd ~/.hermes/hermes-agent
|
||||
# bash datagen-config-examples/run_browser_tasks.sh
|
||||
#
|
||||
# Output: data/browser_tasks_example/trajectories.jsonl
|
||||
# =============================================================================
|
||||
|
||||
# Stop on the first failing command. pipefail makes the "python | tee"
# pipeline below report batch_runner.py's exit status instead of tee's,
# so a failed run no longer prints the success message or exits 0.
set -euo pipefail

mkdir -p logs

# One timestamped log file per invocation.
LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging to: $LOG_FILE"

# Point to the example dataset in this directory
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

python batch_runner.py \
  --dataset_file="$SCRIPT_DIR/example_browser_tasks.jsonl" \
  --batch_size=5 \
  --run_name="browser_tasks_example" \
  --distribution="browser_tasks" \
  --model="anthropic/claude-sonnet-4" \
  --base_url="https://openrouter.ai/api/v1" \
  --num_workers=3 \
  --max_turns=30 \
  --ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.

IMPORTANT GUIDELINES:

1. SEARCHING: Do NOT search directly on Google via the browser — they block automated searches. Use the web_search tool first to find URLs, then navigate to them with browser tools.

2. COOKIE/PRIVACY DIALOGS: After navigating to a page, check for cookie consent or privacy popups. Dismiss them by clicking Accept/Close/OK before interacting with other elements. Take a fresh browser_snapshot afterward.

3. HANDLING TIMEOUTS: If an action times out, the element may be blocked by an overlay. Take a new snapshot and look for dialogs to dismiss. If none, try an alternative approach or report the issue.

4. GENERAL: Use browser tools to click, fill forms, and extract information. Use terminal for local file operations. Verify your actions and handle errors gracefully." \
  2>&1 | tee "$LOG_FILE"

echo "✅ Done. Log: $LOG_FILE"
|
||||
|
||||
# =============================================================================
|
||||
# Common options you can add:
|
||||
#
|
||||
# --resume Resume from checkpoint if interrupted
|
||||
# --verbose Enable detailed logging
|
||||
# --max_tokens=63000 Set max response tokens
|
||||
# --reasoning_disabled Disable model thinking/reasoning tokens
|
||||
# --providers_allowed="anthropic,google" Restrict to specific providers
|
||||
# --prefill_messages_file="configs/prefill.json" Few-shot priming
|
||||
# =============================================================================
|
||||
101
datagen-config-examples/trajectory_compression.yaml
Normal file
101
datagen-config-examples/trajectory_compression.yaml
Normal file
@@ -0,0 +1,101 @@
|
||||
# Trajectory Compression Configuration
|
||||
#
|
||||
# Post-processes completed agent trajectories to fit within a target token budget.
|
||||
# Compression preserves head/tail turns and summarizes middle content only as needed.
|
||||
|
||||
# Tokenizer settings for accurate token counting
|
||||
tokenizer:
|
||||
# HuggingFace tokenizer name
|
||||
name: "moonshotai/Kimi-K2-Thinking"
|
||||
|
||||
# Trust remote code (required for some tokenizers)
|
||||
trust_remote_code: true
|
||||
|
||||
# Compression targets and behavior
|
||||
compression:
|
||||
# Target maximum tokens for compressed trajectory
|
||||
target_max_tokens: 29000
|
||||
|
||||
# Target size for summary (in tokens)
|
||||
# This is factored into calculations when determining what to compress
|
||||
summary_target_tokens: 750
|
||||
|
||||
# Protected turns that should NEVER be compressed
|
||||
protected_turns:
|
||||
# Always protect the first system message (tool definitions)
|
||||
first_system: true
|
||||
|
||||
# Always protect the first human message (original request)
|
||||
first_human: true
|
||||
|
||||
# Always protect the first gpt message (initial response/tool_call)
|
||||
first_gpt: true
|
||||
|
||||
# Always protect the first tool response (result of first action)
|
||||
first_tool: true
|
||||
|
||||
# Always protect the last 2 complete turn pairs (gpt+tool or gpt only)
|
||||
# This ensures the model's final actions and conclusions are preserved
|
||||
last_n_turns: 4
|
||||
|
||||
# LLM settings for generating summaries (OpenRouter only)
|
||||
summarization:
|
||||
# Model to use for summarization (should be fast and cheap)
|
||||
# Using OpenRouter model path format
|
||||
model: "google/gemini-3-flash-preview"
|
||||
|
||||
# OpenRouter API settings
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
|
||||
# Environment variable containing OpenRouter API key
|
||||
api_key_env: "OPENROUTER_API_KEY"
|
||||
|
||||
# Temperature for summarization (lower = more deterministic)
|
||||
temperature: 0.3
|
||||
|
||||
# Max retries for API failures
|
||||
max_retries: 3
|
||||
|
||||
# Delay between retries (seconds)
|
||||
retry_delay: 2
|
||||
|
||||
# Output settings
|
||||
output:
|
||||
# Add notice to system message about potential summarization
|
||||
add_summary_notice: true
|
||||
|
||||
# Text to append to system message
|
||||
summary_notice_text: "\n\nSome of the conversation may be summarized to preserve context."
|
||||
|
||||
# Output directory suffix (appended to input directory name)
|
||||
output_suffix: "_compressed"
|
||||
|
||||
# Processing settings
|
||||
processing:
|
||||
# Number of parallel workers for batch processing
|
||||
num_workers: 4
|
||||
|
||||
# Maximum concurrent API calls for summarization (async parallelism)
|
||||
max_concurrent_requests: 50
|
||||
|
||||
# Skip trajectories that are already under target length
|
||||
skip_under_target: true
|
||||
|
||||
# If true, save trajectories even if compression can't get under target
|
||||
# (will compress as much as possible)
|
||||
save_over_limit: true
|
||||
|
||||
# Timeout per trajectory in seconds (skip if takes longer)
|
||||
# Helps avoid hanging on problematic entries
|
||||
per_trajectory_timeout: 300 # 5 minutes
|
||||
|
||||
# Metrics to track
|
||||
metrics:
|
||||
# Log detailed compression statistics
|
||||
enabled: true
|
||||
|
||||
# Save per-trajectory metrics in output
|
||||
per_trajectory: false
|
||||
|
||||
# Metrics file name (saved in output directory)
|
||||
output_file: "compression_metrics.json"
|
||||
46
datagen-config-examples/web_research.yaml
Normal file
46
datagen-config-examples/web_research.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
# datagen-config-examples/web_research.yaml
|
||||
#
|
||||
# Batch data generation config for WebResearchEnv.
|
||||
# Generates tool-calling trajectories for multi-step web research tasks.
|
||||
#
|
||||
# Usage:
|
||||
# python batch_runner.py \
|
||||
# --config datagen-config-examples/web_research.yaml \
|
||||
# --run_name web_research_v1
|
||||
|
||||
environment: web-research
|
||||
|
||||
# Toolsets available to the agent during data generation
|
||||
toolsets:
|
||||
- web
|
||||
- file
|
||||
|
||||
# How many parallel workers to use
|
||||
num_workers: 4
|
||||
|
||||
# Questions per batch
|
||||
batch_size: 20
|
||||
|
||||
# Total trajectories to generate (comment out to run full dataset)
|
||||
max_items: 500
|
||||
|
||||
# Model to use for generation (override with --model flag)
|
||||
model: openrouter/nousresearch/hermes-3-llama-3.1-405b
|
||||
|
||||
# System prompt additions (ephemeral — not saved to trajectories)
|
||||
ephemeral_system_prompt: |
|
||||
You are a highly capable research agent. When asked a factual question,
|
||||
always use web_search to find current, accurate information before answering.
|
||||
Cite at least 2 sources. Be concise and accurate.
|
||||
|
||||
# Output directory
|
||||
output_dir: data/web_research_v1
|
||||
|
||||
# Trajectory compression settings (for fitting into training token budgets)
|
||||
compression:
|
||||
enabled: true
|
||||
target_max_tokens: 16000
|
||||
|
||||
# Eval settings
|
||||
eval_every: 100 # Run eval every N trajectories
|
||||
eval_size: 25 # Number of held-out questions per eval run
|
||||
229
docs/acp-setup.md
Normal file
229
docs/acp-setup.md
Normal file
@@ -0,0 +1,229 @@
|
||||
# Hermes Agent — ACP (Agent Client Protocol) Setup Guide
|
||||
|
||||
Hermes Agent supports the **Agent Client Protocol (ACP)**, allowing it to run as
|
||||
a coding agent inside your editor. ACP lets your IDE send tasks to Hermes, and
|
||||
Hermes responds with file edits, terminal commands, and explanations — all shown
|
||||
natively in the editor UI.
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Hermes Agent installed and configured (`hermes setup` completed)
|
||||
- An API key / provider set up in `~/.hermes/.env` or via `hermes login`
|
||||
- Python 3.11+
|
||||
|
||||
Install the ACP extra:
|
||||
|
||||
```bash
|
||||
pip install -e ".[acp]"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## VS Code Setup
|
||||
|
||||
### 1. Install the ACP Client extension
|
||||
|
||||
Open VS Code and install **ACP Client** from the marketplace:
|
||||
|
||||
- Press `Ctrl+Shift+X` (or `Cmd+Shift+X` on macOS)
|
||||
- Search for **"ACP Client"**
|
||||
- Click **Install**
|
||||
|
||||
Or install from the command line:
|
||||
|
||||
```bash
|
||||
code --install-extension anysphere.acp-client
|
||||
```
|
||||
|
||||
### 2. Configure settings.json
|
||||
|
||||
Open your VS Code settings (`Ctrl+,` → click the `{}` icon for JSON) and add:
|
||||
|
||||
```json
|
||||
{
|
||||
"acpClient.agents": [
|
||||
{
|
||||
"name": "hermes-agent",
|
||||
"registryDir": "/path/to/hermes-agent/acp_registry"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Replace `/path/to/hermes-agent` with the actual path to your Hermes Agent
|
||||
installation (e.g. `~/.hermes/hermes-agent`).
|
||||
|
||||
Alternatively, if `hermes` is on your PATH, the ACP Client can discover it
|
||||
automatically via the registry directory.
|
||||
|
||||
### 3. Restart VS Code
|
||||
|
||||
After configuring, restart VS Code. You should see **Hermes Agent** appear in
|
||||
the ACP agent picker in the chat/agent panel.
|
||||
|
||||
---
|
||||
|
||||
## Zed Setup
|
||||
|
||||
Zed has built-in ACP support.
|
||||
|
||||
### 1. Configure Zed settings
|
||||
|
||||
Open Zed settings (`Cmd+,` on macOS or `Ctrl+,` on Linux) and add to your
|
||||
`settings.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"acp": {
|
||||
"agents": [
|
||||
{
|
||||
"name": "hermes-agent",
|
||||
"registry_dir": "/path/to/hermes-agent/acp_registry"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Restart Zed
|
||||
|
||||
Hermes Agent will appear in the agent panel. Select it and start a conversation.
|
||||
|
||||
---
|
||||
|
||||
## JetBrains Setup (IntelliJ, PyCharm, WebStorm, etc.)
|
||||
|
||||
### 1. Install the ACP plugin
|
||||
|
||||
- Open **Settings** → **Plugins** → **Marketplace**
|
||||
- Search for **"ACP"** or **"Agent Client Protocol"**
|
||||
- Install and restart the IDE
|
||||
|
||||
### 2. Configure the agent
|
||||
|
||||
- Open **Settings** → **Tools** → **ACP Agents**
|
||||
- Click **+** to add a new agent
|
||||
- Set the registry directory to your `acp_registry/` folder:
|
||||
`/path/to/hermes-agent/acp_registry`
|
||||
- Click **OK**
|
||||
|
||||
### 3. Use the agent
|
||||
|
||||
Open the ACP panel (usually in the right sidebar) and select **Hermes Agent**.
|
||||
|
||||
---
|
||||
|
||||
## What You Will See
|
||||
|
||||
Once connected, your editor provides a native interface to Hermes Agent:
|
||||
|
||||
### Chat Panel
|
||||
A conversational interface where you can describe tasks, ask questions, and
|
||||
give instructions. Hermes responds with explanations and actions.
|
||||
|
||||
### File Diffs
|
||||
When Hermes edits files, you see standard diffs in the editor. You can:
|
||||
- **Accept** individual changes
|
||||
- **Reject** changes you don't want
|
||||
- **Review** the full diff before applying
|
||||
|
||||
### Terminal Commands
|
||||
When Hermes needs to run shell commands (builds, tests, installs), the editor
|
||||
shows them in an integrated terminal. Depending on your settings:
|
||||
- Commands may run automatically
|
||||
- Or you may be prompted to **approve** each command
|
||||
|
||||
### Approval Flow
|
||||
For potentially destructive operations, the editor will prompt you for
|
||||
approval before Hermes proceeds. This includes:
|
||||
- File deletions
|
||||
- Shell commands
|
||||
- Git operations
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
Hermes Agent under ACP uses the **same configuration** as the CLI:
|
||||
|
||||
- **API keys / providers**: `~/.hermes/.env`
|
||||
- **Agent config**: `~/.hermes/config.yaml`
|
||||
- **Skills**: `~/.hermes/skills/`
|
||||
- **Sessions**: `~/.hermes/state.db`
|
||||
|
||||
You can run `hermes setup` to configure providers, or edit `~/.hermes/.env`
|
||||
directly.
|
||||
|
||||
### Changing the model
|
||||
|
||||
Edit `~/.hermes/config.yaml`:
|
||||
|
||||
```yaml
|
||||
model: openrouter/nousresearch/hermes-3-llama-3.1-70b
|
||||
```
|
||||
|
||||
Or set the `HERMES_MODEL` environment variable.
|
||||
|
||||
### Toolsets
|
||||
|
||||
ACP sessions use the curated `hermes-acp` toolset by default. It is designed for editor workflows and intentionally excludes things like messaging delivery, cronjob management, and audio-first UX features.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Agent doesn't appear in the editor
|
||||
|
||||
1. **Check the registry path** — make sure the `acp_registry/` directory path
|
||||
in your editor settings is correct and contains `agent.json`.
|
||||
2. **Check `hermes` is on PATH** — run `which hermes` in a terminal. If not
|
||||
found, you may need to activate your virtualenv or add it to PATH.
|
||||
3. **Restart the editor** after changing settings.
|
||||
|
||||
### Agent starts but errors immediately
|
||||
|
||||
1. Run `hermes doctor` to check your configuration.
|
||||
2. Check that you have a valid API key: `hermes status`
|
||||
3. Try running `hermes acp` directly in a terminal to see error output.
|
||||
|
||||
### "Module not found" errors
|
||||
|
||||
Make sure you installed the ACP extra:
|
||||
|
||||
```bash
|
||||
pip install -e ".[acp]"
|
||||
```
|
||||
|
||||
### Slow responses
|
||||
|
||||
- ACP streams responses, so you should see incremental output. If the agent
|
||||
appears stuck, check your network connection and API provider status.
|
||||
- Some providers have rate limits. Try switching to a different model/provider.
|
||||
|
||||
### Permission denied for terminal commands
|
||||
|
||||
If the editor blocks terminal commands, check your ACP Client extension
|
||||
settings for auto-approval or manual-approval preferences.
|
||||
|
||||
### Logs
|
||||
|
||||
Hermes logs are written to stderr when running in ACP mode. Check:
|
||||
- VS Code: **Output** panel → select **ACP Client** or **Hermes Agent**
|
||||
- Zed: **View** → **Toggle Terminal** and check the process output
|
||||
- JetBrains: **Event Log** or the ACP tool window
|
||||
|
||||
You can also enable verbose logging:
|
||||
|
||||
```bash
|
||||
HERMES_LOG_LEVEL=DEBUG hermes acp
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [ACP Specification](https://agentclientprotocol.com)
|
||||
- [Hermes Agent Documentation](https://github.com/NousResearch/hermes-agent)
|
||||
- Run `hermes --help` for all CLI options
|
||||
698
docs/honcho-integration-spec.html
Normal file
698
docs/honcho-integration-spec.html
Normal file
@@ -0,0 +1,698 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>honcho-integration-spec</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0b0e14;
|
||||
--bg-surface: #11151c;
|
||||
--bg-elevated: #181d27;
|
||||
--bg-code: #0d1018;
|
||||
--fg: #c9d1d9;
|
||||
--fg-bright: #e6edf3;
|
||||
--fg-muted: #6e7681;
|
||||
--fg-subtle: #484f58;
|
||||
--accent: #7eb8f6;
|
||||
--accent-dim: #3d6ea5;
|
||||
--accent-glow: rgba(126, 184, 246, 0.08);
|
||||
--green: #7ee6a8;
|
||||
--green-dim: #2ea04f;
|
||||
--orange: #e6a855;
|
||||
--red: #f47067;
|
||||
--purple: #bc8cff;
|
||||
--cyan: #56d4dd;
|
||||
--border: #21262d;
|
||||
--border-subtle: #161b22;
|
||||
--radius: 6px;
|
||||
--font-sans: 'New York', ui-serif, 'Iowan Old Style', 'Apple Garamond', Baskerville, 'Times New Roman', 'Noto Emoji', serif;
|
||||
--font-mono: 'Departure Mono', 'Noto Emoji', monospace;
|
||||
}
|
||||
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
html { scroll-behavior: smooth; scroll-padding-top: 2rem; }
|
||||
body {
|
||||
font-family: var(--font-sans);
|
||||
background: var(--bg);
|
||||
color: var(--fg);
|
||||
line-height: 1.7;
|
||||
font-size: 15px;
|
||||
-webkit-font-smoothing: antialiased;
|
||||
}
|
||||
|
||||
.container { max-width: 860px; margin: 0 auto; padding: 3rem 2rem 6rem; }
|
||||
|
||||
.hero {
|
||||
text-align: center;
|
||||
padding: 4rem 0 3rem;
|
||||
border-bottom: 1px solid var(--border);
|
||||
margin-bottom: 3rem;
|
||||
}
|
||||
.hero h1 { font-family: var(--font-mono); font-size: 2.2rem; font-weight: 700; color: var(--fg-bright); letter-spacing: -0.03em; margin-bottom: 0.5rem; }
|
||||
.hero h1 span { color: var(--accent); }
|
||||
.hero .subtitle { font-family: var(--font-sans); color: var(--fg-muted); font-size: 0.92rem; max-width: 560px; margin: 0 auto; line-height: 1.6; }
|
||||
.hero .meta { margin-top: 1.5rem; display: flex; justify-content: center; gap: 1.5rem; flex-wrap: wrap; }
|
||||
.hero .meta span { font-size: 0.8rem; color: var(--fg-subtle); font-family: var(--font-mono); }
|
||||
|
||||
.toc { background: var(--bg-surface); border: 1px solid var(--border); border-radius: var(--radius); padding: 1.5rem 2rem; margin-bottom: 3rem; }
|
||||
.toc h2 { font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.1em; color: var(--fg-muted); margin-bottom: 1rem; }
|
||||
.toc ol { list-style: none; counter-reset: toc; columns: 2; column-gap: 2rem; }
|
||||
.toc li { counter-increment: toc; break-inside: avoid; margin-bottom: 0.35rem; }
|
||||
.toc li::before { content: counter(toc, decimal-leading-zero) " "; color: var(--fg-subtle); font-family: var(--font-mono); font-size: 0.75rem; margin-right: 0.25rem; }
|
||||
.toc a { font-family: var(--font-mono); color: var(--fg); text-decoration: none; font-size: 0.82rem; transition: color 0.15s; }
|
||||
.toc a:hover { color: var(--accent); }
|
||||
|
||||
section { margin-bottom: 4rem; }
|
||||
section + section { padding-top: 1rem; }
|
||||
|
||||
h2 { font-family: var(--font-mono); font-size: 1.3rem; font-weight: 700; color: var(--fg-bright); letter-spacing: -0.01em; margin-bottom: 1.25rem; padding-bottom: 0.5rem; border-bottom: 1px solid var(--border); }
|
||||
h3 { font-family: var(--font-mono); font-size: 1rem; font-weight: 600; color: var(--fg-bright); margin-top: 2rem; margin-bottom: 0.75rem; }
|
||||
h4 { font-family: var(--font-mono); font-size: 0.9rem; font-weight: 600; color: var(--accent); margin-top: 1.5rem; margin-bottom: 0.5rem; }
|
||||
|
||||
p { margin-bottom: 1rem; font-size: 0.95rem; line-height: 1.75; }
|
||||
strong { color: var(--fg-bright); font-weight: 600; }
|
||||
a { color: var(--accent); text-decoration: none; }
|
||||
a:hover { text-decoration: underline; }
|
||||
|
||||
ul, ol { margin-bottom: 1rem; padding-left: 1.5rem; font-size: 0.93rem; line-height: 1.7; }
|
||||
li { margin-bottom: 0.35rem; }
|
||||
li::marker { color: var(--fg-subtle); }
|
||||
|
||||
.table-wrap { overflow-x: auto; margin-bottom: 1.5rem; }
|
||||
table { width: 100%; border-collapse: collapse; font-size: 0.88rem; }
|
||||
th, td { text-align: left; padding: 0.6rem 1rem; border-bottom: 1px solid var(--border-subtle); }
|
||||
th { font-family: var(--font-mono); font-size: 0.72rem; text-transform: uppercase; letter-spacing: 0.06em; color: var(--fg-muted); background: var(--bg-surface); border-bottom-color: var(--border); white-space: nowrap; }
|
||||
td { font-family: var(--font-sans); font-size: 0.88rem; color: var(--fg); }
|
||||
tr:hover td { background: var(--accent-glow); }
|
||||
td code { background: var(--bg-elevated); padding: 0.15em 0.4em; border-radius: 3px; font-family: var(--font-mono); font-size: 0.82em; color: var(--cyan); }
|
||||
|
||||
pre { background: var(--bg-code); border: 1px solid var(--border); border-radius: var(--radius); padding: 1.25rem 1.5rem; overflow-x: auto; margin-bottom: 1.5rem; font-family: var(--font-mono); font-size: 0.82rem; line-height: 1.65; color: var(--fg); }
|
||||
pre code { background: none; padding: 0; color: inherit; font-size: inherit; }
|
||||
code { font-family: var(--font-mono); font-size: 0.85em; }
|
||||
p code, li code { background: var(--bg-elevated); padding: 0.15em 0.4em; border-radius: 3px; color: var(--cyan); font-size: 0.85em; }
|
||||
|
||||
.kw { color: var(--purple); }
|
||||
.str { color: var(--green); }
|
||||
.cm { color: var(--fg-subtle); font-style: italic; }
|
||||
.num { color: var(--orange); }
|
||||
.key { color: var(--accent); }
|
||||
|
||||
.mermaid { margin: 1.5rem 0 2rem; text-align: center; }
|
||||
.mermaid svg { max-width: 100%; height: auto; }
|
||||
|
||||
.callout { font-family: var(--font-sans); background: var(--bg-surface); border-left: 3px solid var(--accent-dim); border-radius: 0 var(--radius) var(--radius) 0; padding: 1rem 1.25rem; margin-bottom: 1.5rem; font-size: 0.88rem; color: var(--fg-muted); line-height: 1.6; }
|
||||
.callout strong { font-family: var(--font-mono); color: var(--fg-bright); }
|
||||
.callout.success { border-left-color: var(--green-dim); }
|
||||
.callout.warn { border-left-color: var(--orange); }
|
||||
|
||||
.badge { display: inline-block; font-family: var(--font-mono); font-size: 0.65rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.2em 0.6em; border-radius: 3px; vertical-align: middle; margin-left: 0.4rem; }
|
||||
.badge-done { background: var(--green-dim); color: #fff; }
|
||||
.badge-wip { background: var(--orange); color: #0b0e14; }
|
||||
.badge-todo { background: var(--fg-subtle); color: var(--fg); }
|
||||
|
||||
.checklist { list-style: none; padding-left: 0; }
|
||||
.checklist li { padding-left: 1.5rem; position: relative; margin-bottom: 0.5rem; }
|
||||
.checklist li::before { position: absolute; left: 0; font-family: var(--font-mono); font-size: 0.85rem; }
|
||||
.checklist li.done { color: var(--fg-muted); }
|
||||
.checklist li.done::before { content: "\2713"; color: var(--green); }
|
||||
.checklist li.todo::before { content: "\25CB"; color: var(--fg-subtle); }
|
||||
.checklist li.wip::before { content: "\25D4"; color: var(--orange); }
|
||||
|
||||
.compare { display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-bottom: 2rem; }
|
||||
.compare-card { background: var(--bg-surface); border: 1px solid var(--border); border-radius: var(--radius); padding: 1.25rem; }
|
||||
.compare-card h4 { margin-top: 0; font-size: 0.82rem; }
|
||||
.compare-card.after { border-color: var(--accent-dim); }
|
||||
.compare-card ul { font-family: var(--font-mono); padding-left: 1.25rem; font-size: 0.8rem; }
|
||||
|
||||
hr { border: none; border-top: 1px solid var(--border); margin: 3rem 0; }
|
||||
|
||||
.progress-bar { position: fixed; top: 0; left: 0; height: 2px; background: var(--accent); z-index: 999; transition: width 0.1s linear; }
|
||||
|
||||
@media (max-width: 640px) {
|
||||
.container { padding: 2rem 1rem 4rem; }
|
||||
.hero h1 { font-size: 1.6rem; }
|
||||
.toc ol { columns: 1; }
|
||||
.compare { grid-template-columns: 1fr; }
|
||||
table { font-size: 0.8rem; }
|
||||
th, td { padding: 0.4rem 0.6rem; }
|
||||
}
|
||||
</style>
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||
<link href="https://fonts.googleapis.com/css2?family=Noto+Emoji&display=swap" rel="stylesheet">
|
||||
<style>
|
||||
@font-face {
|
||||
font-family: 'Departure Mono';
|
||||
src: url('https://cdn.jsdelivr.net/gh/rektdeckard/departure-mono@latest/fonts/DepartureMono-Regular.woff2') format('woff2');
|
||||
font-weight: normal;
|
||||
font-style: normal;
|
||||
font-display: swap;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="progress-bar" id="progress"></div>
|
||||
|
||||
<div class="container">
|
||||
|
||||
<header class="hero">
|
||||
<h1>honcho<span>-integration-spec</span></h1>
|
||||
<p class="subtitle">Comparison of Hermes Agent vs. openclaw-honcho — and a porting spec for bringing Hermes patterns into other Honcho integrations.</p>
|
||||
<div class="meta">
|
||||
<span>hermes-agent / openclaw-honcho</span>
|
||||
<span>Python + TypeScript</span>
|
||||
<span>2026-03-09</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<nav class="toc">
|
||||
<h2>Contents</h2>
|
||||
<ol>
|
||||
<li><a href="#overview">Overview</a></li>
|
||||
<li><a href="#architecture">Architecture comparison</a></li>
|
||||
<li><a href="#diff-table">Diff table</a></li>
|
||||
<li><a href="#patterns">Hermes patterns to port</a></li>
|
||||
<li><a href="#spec-async">Spec: async prefetch</a></li>
|
||||
<li><a href="#spec-reasoning">Spec: dynamic reasoning level</a></li>
|
||||
<li><a href="#spec-modes">Spec: per-peer memory modes</a></li>
|
||||
<li><a href="#spec-identity">Spec: AI peer identity formation</a></li>
|
||||
<li><a href="#spec-sessions">Spec: session naming strategies</a></li>
|
||||
<li><a href="#spec-cli">Spec: CLI surface injection</a></li>
|
||||
<li><a href="#openclaw-checklist">openclaw-honcho checklist</a></li>
|
||||
<li><a href="#nanobot-checklist">nanobot-honcho checklist</a></li>
|
||||
</ol>
|
||||
</nav>
|
||||
|
||||
<!-- OVERVIEW -->
|
||||
<section id="overview">
|
||||
<h2>Overview</h2>
|
||||
|
||||
<p>Two independent Honcho integrations have been built for two different agent runtimes: <strong>Hermes Agent</strong> (Python, baked into the runner) and <strong>openclaw-honcho</strong> (TypeScript plugin via hook/tool API). Both use the same Honcho peer paradigm — dual peer model, <code>session.context()</code>, <code>peer.chat()</code> — but they made different tradeoffs at every layer.</p>
|
||||
|
||||
<p>This document maps those tradeoffs and defines a porting spec: a set of Hermes-originated patterns, each stated as an integration-agnostic interface, that any Honcho integration can adopt regardless of runtime or language.</p>
|
||||
|
||||
<div class="callout">
|
||||
<strong>Scope</strong> Both integrations work correctly today. This spec is about the delta — patterns in Hermes that are worth propagating and patterns in openclaw-honcho that Hermes should eventually adopt. The spec is additive, not prescriptive.
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- ARCHITECTURE -->
|
||||
<section id="architecture">
|
||||
<h2>Architecture comparison</h2>
|
||||
|
||||
<h3>Hermes: baked-in runner</h3>
|
||||
<p>Honcho is initialised directly inside <code>AIAgent.__init__</code>. There is no plugin boundary. Session management, context injection, async prefetch, and CLI surface are all first-class concerns of the runner. Context is injected once per session (baked into <code>_cached_system_prompt</code>) and never re-fetched mid-session — this maximises prefix cache hits at the LLM provider.</p>
|
||||
|
||||
<div class="mermaid">
|
||||
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#1f3150', 'primaryTextColor': '#c9d1d9', 'primaryBorderColor': '#3d6ea5', 'lineColor': '#3d6ea5', 'secondaryColor': '#162030', 'tertiaryColor': '#11151c' }}}%%
|
||||
flowchart TD
|
||||
U["user message"] --> P["_honcho_prefetch()<br/>(reads cache — no HTTP)"]
|
||||
P --> SP["_build_system_prompt()<br/>(first turn only, cached)"]
|
||||
SP --> LLM["LLM call"]
|
||||
LLM --> R["response"]
|
||||
R --> FP["_honcho_fire_prefetch()<br/>(daemon threads, turn end)"]
|
||||
FP --> C1["prefetch_context() thread"]
|
||||
FP --> C2["prefetch_dialectic() thread"]
|
||||
C1 --> CACHE["_context_cache / _dialectic_cache"]
|
||||
C2 --> CACHE
|
||||
|
||||
style U fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style P fill:#1f3150,stroke:#3d6ea5,color:#c9d1d9
|
||||
style SP fill:#1f3150,stroke:#3d6ea5,color:#c9d1d9
|
||||
style LLM fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style R fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style FP fill:#2a1a40,stroke:#bc8cff,color:#c9d1d9
|
||||
style C1 fill:#2a1a40,stroke:#bc8cff,color:#c9d1d9
|
||||
style C2 fill:#2a1a40,stroke:#bc8cff,color:#c9d1d9
|
||||
style CACHE fill:#11151c,stroke:#484f58,color:#6e7681
|
||||
</div>
|
||||
|
||||
<h3>openclaw-honcho: hook-based plugin</h3>
|
||||
<p>The plugin registers hooks against OpenClaw's event bus. Context is fetched synchronously inside <code>before_prompt_build</code> on every turn. Message capture happens in <code>agent_end</code>. The multi-agent hierarchy is tracked via <code>subagent_spawned</code>. This model is correct but every turn pays a blocking Honcho round-trip before the LLM call can begin.</p>
|
||||
|
||||
<div class="mermaid">
|
||||
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#1f3150', 'primaryTextColor': '#c9d1d9', 'primaryBorderColor': '#3d6ea5', 'lineColor': '#3d6ea5', 'secondaryColor': '#162030', 'tertiaryColor': '#11151c' }}}%%
|
||||
flowchart TD
|
||||
U2["user message"] --> BPB["before_prompt_build<br/>(BLOCKING HTTP — every turn)"]
|
||||
BPB --> CTX["session.context()"]
|
||||
CTX --> SP2["system prompt assembled"]
|
||||
SP2 --> LLM2["LLM call"]
|
||||
LLM2 --> R2["response"]
|
||||
R2 --> AE["agent_end hook"]
|
||||
AE --> SAVE["session.addMessages()<br/>session.setMetadata()"]
|
||||
|
||||
style U2 fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style BPB fill:#3a1515,stroke:#f47067,color:#c9d1d9
|
||||
style CTX fill:#3a1515,stroke:#f47067,color:#c9d1d9
|
||||
style SP2 fill:#1f3150,stroke:#3d6ea5,color:#c9d1d9
|
||||
style LLM2 fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style R2 fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style AE fill:#162030,stroke:#3d6ea5,color:#c9d1d9
|
||||
style SAVE fill:#11151c,stroke:#484f58,color:#6e7681
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- DIFF TABLE -->
|
||||
<section id="diff-table">
|
||||
<h2>Diff table</h2>
|
||||
|
||||
<div class="table-wrap">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Dimension</th>
|
||||
<th>Hermes Agent</th>
|
||||
<th>openclaw-honcho</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><strong>Context injection timing</strong></td>
|
||||
<td>Once per session (cached). Zero HTTP on response path after turn 1.</td>
|
||||
<td>Every turn, blocking. Fresh context per turn but adds latency.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Prefetch strategy</strong></td>
|
||||
<td>Daemon threads fire at turn end; consumed next turn from cache.</td>
|
||||
<td>None. Blocking call at prompt-build time.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Dialectic (peer.chat)</strong></td>
|
||||
<td>Prefetched async; result injected into system prompt next turn.</td>
|
||||
<td>On-demand via <code>honcho_recall</code> / <code>honcho_analyze</code> tools.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Reasoning level</strong></td>
|
||||
<td>Dynamic: scales with message length. Floor = config default. Cap = "high".</td>
|
||||
<td>Fixed per tool: recall=minimal, analyze=medium.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Memory modes</strong></td>
|
||||
<td><code>user_memory_mode</code> / <code>agent_memory_mode</code>: hybrid / honcho / local.</td>
|
||||
<td>None. Always writes to Honcho.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Write frequency</strong></td>
|
||||
<td>async (background queue), turn, session, N turns.</td>
|
||||
<td>After every agent_end (no control).</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>AI peer identity</strong></td>
|
||||
<td><code>observe_me=True</code>, <code>seed_ai_identity()</code>, <code>get_ai_representation()</code>, SOUL.md → AI peer.</td>
|
||||
<td>Agent files uploaded to agent peer at setup. No ongoing self-observation seeding.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Context scope</strong></td>
|
||||
<td>User peer + AI peer representation, both injected.</td>
|
||||
<td>User peer (owner) representation + conversation summary. <code>peerPerspective</code> on context call.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Session naming</strong></td>
|
||||
<td>per-directory / global / manual map / title-based.</td>
|
||||
<td>Derived from platform session key.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Multi-agent</strong></td>
|
||||
<td>Single-agent only.</td>
|
||||
<td>Parent observer hierarchy via <code>subagent_spawned</code>.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Tool surface</strong></td>
|
||||
<td>Single <code>query_user_context</code> tool (on-demand dialectic).</td>
|
||||
<td>6 tools: session, profile, search, context (fast) + recall, analyze (LLM).</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Platform metadata</strong></td>
|
||||
<td>Not stripped.</td>
|
||||
<td>Explicitly stripped before Honcho storage.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Message dedup</strong></td>
|
||||
<td>None (sends on every save cycle).</td>
|
||||
<td><code>lastSavedIndex</code> in session metadata prevents re-sending.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>CLI surface in prompt</strong></td>
|
||||
<td>Management commands injected into system prompt. Agent knows its own CLI.</td>
|
||||
<td>Not injected.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>AI peer name in identity</strong></td>
|
||||
<td>Replaces "Hermes Agent" in DEFAULT_AGENT_IDENTITY when configured.</td>
|
||||
<td>Not implemented.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>QMD / local file search</strong></td>
|
||||
<td>Not implemented.</td>
|
||||
<td>Passthrough tools when QMD backend configured.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Workspace metadata</strong></td>
|
||||
<td>Not implemented.</td>
|
||||
<td><code>agentPeerMap</code> in workspace metadata tracks agent→peer ID.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- PATTERNS -->
|
||||
<section id="patterns">
|
||||
<h2>Hermes patterns to port</h2>
|
||||
|
||||
<p>Six patterns from Hermes are worth adopting in any Honcho integration. They are described below as integration-agnostic interfaces — the implementation will differ per runtime, but the contract is the same.</p>
|
||||
|
||||
<div class="compare">
|
||||
<div class="compare-card">
|
||||
<h4>Patterns Hermes contributes</h4>
|
||||
<ul>
|
||||
<li>Async prefetch (zero-latency)</li>
|
||||
<li>Dynamic reasoning level</li>
|
||||
<li>Per-peer memory modes</li>
|
||||
<li>AI peer identity formation</li>
|
||||
<li>Session naming strategies</li>
|
||||
<li>CLI surface injection</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="compare-card after">
|
||||
<h4>Patterns openclaw contributes back</h4>
|
||||
<ul>
|
||||
<li>lastSavedIndex dedup</li>
|
||||
<li>Platform metadata stripping</li>
|
||||
<li>Multi-agent observer hierarchy</li>
|
||||
<li>peerPerspective on context()</li>
|
||||
<li>Tiered tool surface (fast/LLM)</li>
|
||||
<li>Workspace agentPeerMap</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: ASYNC PREFETCH -->
|
||||
<section id="spec-async">
|
||||
<h2>Spec: async prefetch</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>Calling <code>session.context()</code> and <code>peer.chat()</code> synchronously before each LLM call adds 200–800ms of Honcho round-trip latency to every turn. Users experience this as the agent "thinking slowly."</p>
|
||||
|
||||
<h3>Pattern</h3>
|
||||
<p>Fire both calls as non-blocking background work at the <strong>end</strong> of each turn. Store results in a per-session cache keyed by session ID. At the <strong>start</strong> of the next turn, pop from cache — the HTTP is already done. First turn is cold (empty cache); all subsequent turns are zero-latency on the response path.</p>
|
||||
|
||||
<h3>Interface contract</h3>
|
||||
<pre><code><span class="cm">// TypeScript (openclaw / nanobot plugin shape)</span>
|
||||
|
||||
<span class="kw">interface</span> <span class="key">AsyncPrefetch</span> {
|
||||
<span class="cm">// Fire context + dialectic fetches at turn end. Non-blocking.</span>
|
||||
firePrefetch(sessionId: <span class="str">string</span>, userMessage: <span class="str">string</span>): <span class="kw">void</span>;
|
||||
|
||||
<span class="cm">// Pop cached results at turn start. Each returns null if the cache is cold.</span>
|
||||
popContextResult(sessionId: <span class="str">string</span>): ContextResult | <span class="kw">null</span>;
|
||||
popDialecticResult(sessionId: <span class="str">string</span>): <span class="str">string</span> | <span class="kw">null</span>;
|
||||
}
|
||||
|
||||
<span class="kw">type</span> <span class="key">ContextResult</span> = {
|
||||
representation: <span class="str">string</span>;
|
||||
card: <span class="str">string</span>[];
|
||||
aiRepresentation?: <span class="str">string</span>; <span class="cm">// AI peer context if enabled</span>
|
||||
summary?: <span class="str">string</span>; <span class="cm">// conversation summary if fetched</span>
|
||||
};</code></pre>
|
||||
|
||||
<h3>Implementation notes</h3>
|
||||
<ul>
|
||||
<li>Python: <code>threading.Thread(daemon=True)</code>. Write to <code>dict[session_id, result]</code> — GIL makes this safe for simple writes.</li>
|
||||
<li>TypeScript: <code>Promise</code> stored in <code>Map&lt;string, Promise&lt;ContextResult&gt;&gt;</code>. At pop time, read the result only if the promise has already settled; if it has not resolved yet, skip (return null) — do not block.</li>
|
||||
<li>The pop is destructive: clears the cache entry after reading so stale data never accumulates.</li>
|
||||
<li>Prefetch should also fire on first turn (even though it won't be consumed until turn 2) — this ensures turn 2 is never cold.</li>
|
||||
</ul>
|
||||
|
||||
<h3>openclaw-honcho adoption</h3>
|
||||
<p>Move <code>session.context()</code> from <code>before_prompt_build</code> to a post-<code>agent_end</code> background task. Store result in <code>state.contextCache</code>. In <code>before_prompt_build</code>, read from cache instead of calling Honcho. If cache is empty (turn 1), inject nothing — the prompt is still valid without Honcho context on the first turn.</p>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: DYNAMIC REASONING LEVEL -->
|
||||
<section id="spec-reasoning">
|
||||
<h2>Spec: dynamic reasoning level</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>Honcho's dialectic endpoint supports reasoning levels from <code>minimal</code> to <code>max</code>. A fixed level per tool wastes budget on simple queries and under-serves complex ones.</p>
|
||||
|
||||
<h3>Pattern</h3>
|
||||
<p>Select the reasoning level dynamically based on the user's message. Use the configured default as a floor. Bump by message length. Cap auto-selection at <code>high</code> — never select <code>max</code> automatically.</p>
|
||||
|
||||
<h3>Interface contract</h3>
|
||||
<pre><code><span class="cm">// Shared helper — identical logic in any language</span>
|
||||
|
||||
<span class="kw">const</span> LEVELS = [<span class="str">"minimal"</span>, <span class="str">"low"</span>, <span class="str">"medium"</span>, <span class="str">"high"</span>, <span class="str">"max"</span>];
|
||||
|
||||
<span class="kw">function</span> <span class="key">dynamicReasoningLevel</span>(
|
||||
query: <span class="str">string</span>,
|
||||
configDefault: <span class="str">string</span> = <span class="str">"low"</span>
|
||||
): <span class="str">string</span> {
|
||||
<span class="kw">const</span> baseIdx = Math.max(<span class="num">0</span>, LEVELS.indexOf(configDefault));
|
||||
<span class="kw">const</span> n = query.length;
|
||||
<span class="kw">const</span> bump = n < <span class="num">120</span> ? <span class="num">0</span> : n < <span class="num">400</span> ? <span class="num">1</span> : <span class="num">2</span>;
|
||||
<span class="kw">return</span> LEVELS[Math.min(baseIdx + bump, <span class="num">3</span>)]; <span class="cm">// cap at "high" (idx 3)</span>
|
||||
}</code></pre>
|
||||
|
||||
<h3>Config key</h3>
|
||||
<p>Add a <code>dialecticReasoningLevel</code> config field (string, default <code>"low"</code>). This sets the floor. Users can raise or lower it. The dynamic bump always applies on top.</p>
|
||||
|
||||
<h3>openclaw-honcho adoption</h3>
|
||||
<p>Apply in <code>honcho_recall</code> and <code>honcho_analyze</code>: replace the fixed <code>reasoningLevel</code> with the dynamic selector. <code>honcho_recall</code> should use floor <code>"minimal"</code> and <code>honcho_analyze</code> floor <code>"medium"</code> — both still bump with message length.</p>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: PER-PEER MEMORY MODES -->
|
||||
<section id="spec-modes">
|
||||
<h2>Spec: per-peer memory modes</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>Users want independent control over whether user context and agent context are written locally, to Honcho, or both. A single <code>memoryMode</code> shorthand is not granular enough.</p>
|
||||
|
||||
<h3>Pattern</h3>
|
||||
<p>Three modes per peer: <code>hybrid</code> (write both local + Honcho), <code>honcho</code> (Honcho only, disable local files), <code>local</code> (local files only, skip Honcho sync for this peer). Two orthogonal axes: user peer and agent peer.</p>
|
||||
|
||||
<h3>Config schema</h3>
|
||||
<pre><code><span class="cm">// ~/.openclaw/openclaw.json (or ~/.nanobot/config.json)</span>
|
||||
{
|
||||
<span class="str">"plugins"</span>: {
|
||||
<span class="str">"openclaw-honcho"</span>: {
|
||||
<span class="str">"config"</span>: {
|
||||
<span class="str">"apiKey"</span>: <span class="str">"..."</span>,
|
||||
<span class="str">"memoryMode"</span>: <span class="str">"hybrid"</span>, <span class="cm">// shorthand: both peers</span>
|
||||
<span class="str">"userMemoryMode"</span>: <span class="str">"honcho"</span>, <span class="cm">// override for user peer</span>
|
||||
<span class="str">"agentMemoryMode"</span>: <span class="str">"hybrid"</span> <span class="cm">// override for agent peer</span>
|
||||
}
|
||||
}
|
||||
}
|
||||
}</code></pre>
|
||||
|
||||
<h3>Resolution order</h3>
|
||||
<ol>
|
||||
<li>Per-peer field (<code>userMemoryMode</code> / <code>agentMemoryMode</code>) — wins if present.</li>
|
||||
<li>Shorthand <code>memoryMode</code> — applies to both peers as default.</li>
|
||||
<li>Hardcoded default: <code>"hybrid"</code>.</li>
|
||||
</ol>
|
||||
|
||||
<h3>Effect on Honcho sync</h3>
|
||||
<ul>
|
||||
<li><code>userMemoryMode=local</code>: skip adding user peer messages to Honcho.</li>
|
||||
<li><code>agentMemoryMode=local</code>: skip adding assistant peer messages to Honcho.</li>
|
||||
<li>Both local: skip <code>session.addMessages()</code> entirely.</li>
|
||||
<li><code>userMemoryMode=honcho</code>: disable local USER.md writes.</li>
|
||||
<li><code>agentMemoryMode=honcho</code>: disable local MEMORY.md / SOUL.md writes.</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: AI PEER IDENTITY -->
|
||||
<section id="spec-identity">
|
||||
<h2>Spec: AI peer identity formation</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>Honcho builds the user's representation organically by observing what the user says. The same mechanism exists for the AI peer — but only if <code>observe_me=True</code> is set for the agent peer. Without it, the agent peer accumulates nothing and Honcho's AI-side model never forms.</p>
|
||||
|
||||
<p>Additionally, existing persona files (SOUL.md, IDENTITY.md) should seed the AI peer's Honcho representation at first activation, rather than waiting for it to emerge from scratch.</p>
|
||||
|
||||
<h3>Part A: observe_me=True for agent peer</h3>
|
||||
<pre><code><span class="cm">// TypeScript — in session.addPeers() call</span>
|
||||
<span class="kw">await</span> session.addPeers([
|
||||
[ownerPeer.id, { observeMe: <span class="kw">true</span>, observeOthers: <span class="kw">false</span> }],
|
||||
[agentPeer.id, { observeMe: <span class="kw">true</span>, observeOthers: <span class="kw">true</span> }], <span class="cm">// observeMe was false</span>
|
||||
]);</code></pre>
|
||||
|
||||
<p>This is a one-line change but foundational. Without it, Honcho's AI peer representation stays empty regardless of what the agent says.</p>
|
||||
|
||||
<h3>Part B: seedAiIdentity()</h3>
|
||||
<pre><code><span class="kw">async function</span> <span class="key">seedAiIdentity</span>(
|
||||
session: HonchoSession,
|
||||
agentPeer: Peer,
|
||||
content: <span class="str">string</span>,
|
||||
source: <span class="str">string</span>
|
||||
): Promise<<span class="kw">boolean</span>> {
|
||||
<span class="kw">const</span> wrapped = [
|
||||
<span class="str">`<ai_identity_seed>`</span>,
|
||||
<span class="str">`<source>${source}</source>`</span>,
|
||||
<span class="str">``</span>,
|
||||
content.trim(),
|
||||
<span class="str">`</ai_identity_seed>`</span>,
|
||||
].join(<span class="str">"\n"</span>);
|
||||
|
||||
<span class="kw">await</span> agentPeer.addMessage(<span class="str">"assistant"</span>, wrapped);
|
||||
<span class="kw">return true</span>;
|
||||
}</code></pre>
|
||||
|
||||
<h3>Part C: migrate agent files at setup</h3>
|
||||
<p>During <code>openclaw honcho setup</code>, upload agent-self files (SOUL.md, IDENTITY.md, AGENTS.md, BOOTSTRAP.md) to the agent peer using <code>seedAiIdentity()</code> instead of <code>session.uploadFile()</code>. This routes the content through Honcho's observation pipeline rather than the file store.</p>
|
||||
|
||||
<h3>Part D: AI peer name in identity</h3>
|
||||
<p>When the agent has a configured name (non-default), inject it into the agent's self-identity prefix. In OpenClaw this means adding to the injected system prompt section:</p>
|
||||
<pre><code><span class="cm">// In context hook return value</span>
|
||||
<span class="kw">return</span> {
|
||||
systemPrompt: [
|
||||
agentName ? <span class="str">`You are ${agentName}.`</span> : <span class="str">""</span>,
|
||||
<span class="str">"## User Memory Context"</span>,
|
||||
...sections,
|
||||
].filter(Boolean).join(<span class="str">"\n\n"</span>)
|
||||
};</code></pre>
|
||||
|
||||
<h3>CLI surface: honcho identity subcommand</h3>
|
||||
<pre><code>openclaw honcho identity <file> <span class="cm"># seed from file</span>
|
||||
openclaw honcho identity --show <span class="cm"># show current AI peer representation</span></code></pre>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: SESSION NAMING -->
|
||||
<section id="spec-sessions">
|
||||
<h2>Spec: session naming strategies</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>When Honcho is used across multiple projects or directories, a single global session means every project shares the same context. Per-directory sessions provide isolation without requiring users to name sessions manually.</p>
|
||||
|
||||
<h3>Strategies</h3>
|
||||
<div class="table-wrap">
|
||||
<table>
|
||||
<thead><tr><th>Strategy</th><th>Session key</th><th>When to use</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td><code>per-directory</code></td><td>basename of CWD</td><td>Default. Each project gets its own session.</td></tr>
|
||||
<tr><td><code>global</code></td><td>fixed string <code>"global"</code></td><td>Single cross-project session.</td></tr>
|
||||
<tr><td>manual map</td><td>user-configured per path</td><td><code>sessions</code> config map overrides directory basename.</td></tr>
|
||||
<tr><td>title-based</td><td>sanitized session title</td><td>When agent supports named sessions; title set mid-conversation.</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<h3>Config schema</h3>
|
||||
<pre><code>{
|
||||
<span class="str">"sessionStrategy"</span>: <span class="str">"per-directory"</span>, <span class="cm">// "per-directory" | "global"</span>
|
||||
<span class="str">"sessionPeerPrefix"</span>: <span class="kw">false</span>, <span class="cm">// prepend peer name to session key</span>
|
||||
<span class="str">"sessions"</span>: { <span class="cm">// manual overrides</span>
|
||||
<span class="str">"/home/user/projects/foo"</span>: <span class="str">"foo-project"</span>
|
||||
}
|
||||
}</code></pre>
|
||||
|
||||
<h3>CLI surface</h3>
|
||||
<pre><code>openclaw honcho sessions <span class="cm"># list all mappings</span>
|
||||
openclaw honcho map <name> <span class="cm"># map cwd to session name</span>
|
||||
openclaw honcho map <span class="cm"># no-arg = list mappings</span></code></pre>
|
||||
|
||||
<p>Resolution order: manual map wins → session title → directory basename → platform key.</p>
|
||||
</section>
|
||||
|
||||
<!-- SPEC: CLI SURFACE INJECTION -->
|
||||
<section id="spec-cli">
|
||||
<h2>Spec: CLI surface injection</h2>
|
||||
|
||||
<h3>Problem</h3>
|
||||
<p>When a user asks "how do I change my memory settings?" or "what Honcho commands are available?" the agent either hallucinates or says it doesn't know. The agent should know its own management interface.</p>
|
||||
|
||||
<h3>Pattern</h3>
|
||||
<p>When Honcho is active, append a compact command reference to the system prompt. The agent can cite these commands directly instead of guessing.</p>
|
||||
|
||||
<pre><code><span class="cm">// In context hook, append to systemPrompt</span>
|
||||
<span class="kw">const</span> honchoSection = [
|
||||
<span class="str">"# Honcho memory integration"</span>,
|
||||
<span class="str">`Active. Session: ${sessionKey}. Mode: ${mode}.`</span>,
|
||||
<span class="str">"Management commands:"</span>,
|
||||
<span class="str">" openclaw honcho status — show config + connection"</span>,
|
||||
<span class="str">" openclaw honcho mode [hybrid|honcho|local] — show or set memory mode"</span>,
|
||||
<span class="str">" openclaw honcho sessions — list session mappings"</span>,
|
||||
<span class="str">" openclaw honcho map <name> — map directory to session"</span>,
|
||||
<span class="str">" openclaw honcho identity [file] [--show] — seed or show AI identity"</span>,
|
||||
<span class="str">" openclaw honcho setup — full interactive wizard"</span>,
|
||||
].join(<span class="str">"\n"</span>);</code></pre>
|
||||
|
||||
<div class="callout warn">
|
||||
<strong>Keep it compact.</strong> This section is injected every turn. Keep it under 300 chars of context. List commands, not explanations — the agent can explain them on request.
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- OPENCLAW CHECKLIST -->
|
||||
<section id="openclaw-checklist">
|
||||
<h2>openclaw-honcho checklist</h2>
|
||||
|
||||
<p>Ordered by impact. Each item maps to a spec section above.</p>
|
||||
|
||||
<ul class="checklist">
|
||||
<li class="todo"><strong>Async prefetch</strong> — move <code>session.context()</code> out of <code>before_prompt_build</code> into post-<code>agent_end</code> background Promise. Pop from cache at prompt build. (<a href="#spec-async">spec</a>)</li>
|
||||
<li class="todo"><strong>observe_me=True for agent peer</strong> — one-line change in <code>session.addPeers()</code> config for agent peer. (<a href="#spec-identity">spec</a>)</li>
|
||||
<li class="todo"><strong>Dynamic reasoning level</strong> — add <code>dynamicReasoningLevel()</code> helper; apply in <code>honcho_recall</code> and <code>honcho_analyze</code>. Add <code>dialecticReasoningLevel</code> to config schema. (<a href="#spec-reasoning">spec</a>)</li>
|
||||
<li class="todo"><strong>Per-peer memory modes</strong> — add <code>userMemoryMode</code> / <code>agentMemoryMode</code> to config; gate Honcho sync and local writes accordingly. (<a href="#spec-modes">spec</a>)</li>
|
||||
<li class="todo"><strong>seedAiIdentity()</strong> — add helper; apply during setup migration for SOUL.md / IDENTITY.md instead of <code>session.uploadFile()</code>. (<a href="#spec-identity">spec</a>)</li>
|
||||
<li class="todo"><strong>Session naming strategies</strong> — add <code>sessionStrategy</code>, <code>sessions</code> map, <code>sessionPeerPrefix</code> to config; implement resolution function. (<a href="#spec-sessions">spec</a>)</li>
|
||||
<li class="todo"><strong>CLI surface injection</strong> — append command reference to <code>before_prompt_build</code> return value when Honcho is active. (<a href="#spec-cli">spec</a>)</li>
|
||||
<li class="todo"><strong>honcho identity subcommand</strong> — add <code>openclaw honcho identity</code> CLI command. (<a href="#spec-identity">spec</a>)</li>
|
||||
<li class="todo"><strong>AI peer name injection</strong> — if <code>aiPeer</code> name configured, prepend to injected system prompt. (<a href="#spec-identity">spec</a>)</li>
|
||||
<li class="todo"><strong>honcho mode / honcho sessions / honcho map</strong> — CLI parity with Hermes. (<a href="#spec-sessions">spec</a>)</li>
|
||||
</ul>
|
||||
|
||||
<div class="callout success">
|
||||
<strong>Already done in openclaw-honcho (do not re-implement):</strong> lastSavedIndex dedup, platform metadata stripping, multi-agent parent observer hierarchy, peerPerspective on context(), tiered tool surface (fast/LLM), workspace agentPeerMap, QMD passthrough, self-hosted Honcho support.
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- NANOBOT CHECKLIST -->
|
||||
<section id="nanobot-checklist">
|
||||
<h2>nanobot-honcho checklist</h2>
|
||||
|
||||
<p>nanobot-honcho is a greenfield integration. Start from openclaw-honcho's architecture (hook-based, dual peer) and apply all Hermes patterns from day one rather than retrofitting. Priority order:</p>
|
||||
|
||||
<h3>Phase 1 — core correctness</h3>
|
||||
<ul class="checklist">
|
||||
<li class="todo">Dual peer model (owner + agent peer), both with <code>observe_me=True</code></li>
|
||||
<li class="todo">Message capture at turn end with <code>lastSavedIndex</code> dedup</li>
|
||||
<li class="todo">Platform metadata stripping before Honcho storage</li>
|
||||
<li class="todo">Async prefetch from day one — do not implement blocking context injection</li>
|
||||
<li class="todo">Legacy file migration at first activation (USER.md → owner peer, SOUL.md → <code>seedAiIdentity()</code>)</li>
|
||||
</ul>
|
||||
|
||||
<h3>Phase 2 — configuration</h3>
|
||||
<ul class="checklist">
|
||||
<li class="todo">Config schema: <code>apiKey</code>, <code>workspaceId</code>, <code>baseUrl</code>, <code>memoryMode</code>, <code>userMemoryMode</code>, <code>agentMemoryMode</code>, <code>dialecticReasoningLevel</code>, <code>sessionStrategy</code>, <code>sessions</code></li>
|
||||
<li class="todo">Per-peer memory mode gating</li>
|
||||
<li class="todo">Dynamic reasoning level</li>
|
||||
<li class="todo">Session naming strategies</li>
|
||||
</ul>
|
||||
|
||||
<h3>Phase 3 — tools and CLI</h3>
|
||||
<ul class="checklist">
|
||||
<li class="todo">Tool surface: <code>honcho_profile</code>, <code>honcho_recall</code>, <code>honcho_analyze</code>, <code>honcho_search</code>, <code>honcho_context</code></li>
|
||||
<li class="todo">CLI: <code>setup</code>, <code>status</code>, <code>sessions</code>, <code>map</code>, <code>mode</code>, <code>identity</code></li>
|
||||
<li class="todo">CLI surface injection into system prompt</li>
|
||||
<li class="todo">AI peer name wired into agent identity</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<script type="module">
|
||||
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs';
|
||||
mermaid.initialize({ startOnLoad: true, securityLevel: 'loose', fontFamily: 'Departure Mono, Noto Emoji, monospace' });
|
||||
</script>
|
||||
<script>
|
||||
window.addEventListener('scroll', () => {
|
||||
const bar = document.getElementById('progress');
|
||||
const max = document.documentElement.scrollHeight - window.innerHeight;
|
||||
bar.style.width = (max > 0 ? (window.scrollY / max) * 100 : 0) + '%';
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
377
docs/honcho-integration-spec.md
Normal file
377
docs/honcho-integration-spec.md
Normal file
@@ -0,0 +1,377 @@
|
||||
# honcho-integration-spec
|
||||
|
||||
Comparison of Hermes Agent vs. openclaw-honcho — and a porting spec for bringing Hermes patterns into other Honcho integrations.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Two independent Honcho integrations have been built for two different agent runtimes: **Hermes Agent** (Python, baked into the runner) and **openclaw-honcho** (TypeScript plugin via hook/tool API). Both use the same Honcho peer paradigm — dual peer model, `session.context()`, `peer.chat()` — but they made different tradeoffs at every layer.
|
||||
|
||||
This document maps those tradeoffs and defines a porting spec: a set of Hermes-originated patterns, each stated as an integration-agnostic interface, that any Honcho integration can adopt regardless of runtime or language.
|
||||
|
||||
> **Scope** Both integrations work correctly today. This spec is about the delta — patterns in Hermes that are worth propagating and patterns in openclaw-honcho that Hermes should eventually adopt. The spec is additive, not prescriptive.
|
||||
|
||||
---
|
||||
|
||||
## Architecture comparison
|
||||
|
||||
### Hermes: baked-in runner
|
||||
|
||||
Honcho is initialised directly inside `AIAgent.__init__`. There is no plugin boundary. Session management, context injection, async prefetch, and CLI surface are all first-class concerns of the runner. Context is injected once per session (baked into `_cached_system_prompt`) and never re-fetched mid-session — this maximises prefix cache hits at the LLM provider.
|
||||
|
||||
Turn flow:
|
||||
|
||||
```
|
||||
user message
|
||||
→ _honcho_prefetch() (reads cache — no HTTP)
|
||||
→ _build_system_prompt() (first turn only, cached)
|
||||
→ LLM call
|
||||
→ response
|
||||
→ _honcho_fire_prefetch() (daemon threads, turn end)
|
||||
→ prefetch_context() thread ──┐
|
||||
→ prefetch_dialectic() thread ─┴→ _context_cache / _dialectic_cache
|
||||
```
|
||||
|
||||
### openclaw-honcho: hook-based plugin
|
||||
|
||||
The plugin registers hooks against OpenClaw's event bus. Context is fetched synchronously inside `before_prompt_build` on every turn. Message capture happens in `agent_end`. The multi-agent hierarchy is tracked via `subagent_spawned`. This model is correct but every turn pays a blocking Honcho round-trip before the LLM call can begin.
|
||||
|
||||
Turn flow:
|
||||
|
||||
```
|
||||
user message
|
||||
→ before_prompt_build (BLOCKING HTTP — every turn)
|
||||
→ session.context()
|
||||
→ system prompt assembled
|
||||
→ LLM call
|
||||
→ response
|
||||
→ agent_end hook
|
||||
→ session.addMessages()
|
||||
→ session.setMetadata()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Diff table
|
||||
|
||||
| Dimension | Hermes Agent | openclaw-honcho |
|
||||
|---|---|---|
|
||||
| **Context injection timing** | Once per session (cached). Zero HTTP on response path after turn 1. | Every turn, blocking. Fresh context per turn but adds latency. |
|
||||
| **Prefetch strategy** | Daemon threads fire at turn end; consumed next turn from cache. | None. Blocking call at prompt-build time. |
|
||||
| **Dialectic (peer.chat)** | Prefetched async; result injected into system prompt next turn. | On-demand via `honcho_recall` / `honcho_analyze` tools. |
|
||||
| **Reasoning level** | Dynamic: scales with message length. Floor = config default. Cap = "high". | Fixed per tool: recall=minimal, analyze=medium. |
|
||||
| **Memory modes** | `user_memory_mode` / `agent_memory_mode`: hybrid / honcho / local. | None. Always writes to Honcho. |
|
||||
| **Write frequency** | async (background queue), turn, session, N turns. | After every agent_end (no control). |
|
||||
| **AI peer identity** | `observe_me=True`, `seed_ai_identity()`, `get_ai_representation()`, SOUL.md → AI peer. | Agent files uploaded to agent peer at setup. No ongoing self-observation. |
|
||||
| **Context scope** | User peer + AI peer representation, both injected. | User peer (owner) representation + conversation summary. `peerPerspective` on context call. |
|
||||
| **Session naming** | per-directory / global / manual map / title-based. | Derived from platform session key. |
|
||||
| **Multi-agent** | Single-agent only. | Parent observer hierarchy via `subagent_spawned`. |
|
||||
| **Tool surface** | Single `query_user_context` tool (on-demand dialectic). | 6 tools: session, profile, search, context (fast) + recall, analyze (LLM). |
|
||||
| **Platform metadata** | Not stripped. | Explicitly stripped before Honcho storage. |
|
||||
| **Message dedup** | None. | `lastSavedIndex` in session metadata prevents re-sending. |
|
||||
| **CLI surface in prompt** | Management commands injected into system prompt. Agent knows its own CLI. | Not injected. |
|
||||
| **AI peer name in identity** | Replaces "Hermes Agent" in DEFAULT_AGENT_IDENTITY when configured. | Not implemented. |
|
||||
| **QMD / local file search** | Not implemented. | Passthrough tools when QMD backend configured. |
|
||||
| **Workspace metadata** | Not implemented. | `agentPeerMap` in workspace metadata tracks agent→peer ID. |
|
||||
|
||||
---
|
||||
|
||||
## Patterns
|
||||
|
||||
Six patterns from Hermes are worth adopting in any Honcho integration. Each is described as an integration-agnostic interface.
|
||||
|
||||
**Hermes contributes:**
|
||||
- Async prefetch (zero-latency)
|
||||
- Dynamic reasoning level
|
||||
- Per-peer memory modes
|
||||
- AI peer identity formation
|
||||
- Session naming strategies
|
||||
- CLI surface injection
|
||||
|
||||
**openclaw-honcho contributes back (Hermes should adopt):**
|
||||
- `lastSavedIndex` dedup
|
||||
- Platform metadata stripping
|
||||
- Multi-agent observer hierarchy
|
||||
- `peerPerspective` on `context()`
|
||||
- Tiered tool surface (fast/LLM)
|
||||
- Workspace `agentPeerMap`
|
||||
|
||||
---
|
||||
|
||||
## Spec: async prefetch
|
||||
|
||||
### Problem
|
||||
|
||||
Calling `session.context()` and `peer.chat()` synchronously before each LLM call adds 200–800ms of Honcho round-trip latency to every turn.
|
||||
|
||||
### Pattern
|
||||
|
||||
Fire both calls as non-blocking background work at the **end** of each turn. Store results in a per-session cache keyed by session ID. At the **start** of the next turn, pop from cache — the HTTP is already done. First turn is cold (empty cache); all subsequent turns are zero-latency on the response path.
|
||||
|
||||
### Interface contract
|
||||
|
||||
```typescript
|
||||
interface AsyncPrefetch {
|
||||
// Fire context + dialectic fetches at turn end. Non-blocking.
|
||||
firePrefetch(sessionId: string, userMessage: string): void;
|
||||
|
||||
// Pop cached results at turn start. Returns null if the cache is cold.
|
||||
popContextResult(sessionId: string): ContextResult | null;
|
||||
popDialecticResult(sessionId: string): string | null;
|
||||
}
|
||||
|
||||
type ContextResult = {
|
||||
representation: string;
|
||||
card: string[];
|
||||
aiRepresentation?: string; // AI peer context if enabled
|
||||
summary?: string; // conversation summary if fetched
|
||||
};
|
||||
```
|
||||
|
||||
### Implementation notes
|
||||
|
||||
- **Python:** `threading.Thread(daemon=True)`. Write to `dict[session_id, result]` — GIL makes this safe for simple writes.
|
||||
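- **TypeScript:** `Promise` stored in `Map<string, Promise<ContextResult>>`. At pop time, use the result only if the promise has already settled (e.g. record the value in a `.then()` callback); if it has not resolved yet, return null — do not block.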
|
||||
- The pop is destructive: clears the cache entry after reading so stale data never accumulates.
|
||||
- Prefetch should also fire on first turn (even though it won't be consumed until turn 2).
|
||||
|
||||
### openclaw-honcho adoption
|
||||
|
||||
Move `session.context()` from `before_prompt_build` to a post-`agent_end` background task. Store result in `state.contextCache`. In `before_prompt_build`, read from cache instead of calling Honcho. If cache is empty (turn 1), inject nothing — the prompt is still valid without Honcho context on the first turn.
|
||||
|
||||
---
|
||||
|
||||
## Spec: dynamic reasoning level
|
||||
|
||||
### Problem
|
||||
|
||||
Honcho's dialectic endpoint supports reasoning levels from `minimal` to `max`. A fixed level per tool wastes budget on simple queries and under-serves complex ones.
|
||||
|
||||
### Pattern
|
||||
|
||||
Select the reasoning level dynamically based on the user's message. Use the configured default as a floor. Bump by message length. Cap auto-selection at `high` — never select `max` automatically.
|
||||
|
||||
### Logic
|
||||
|
||||
```
|
||||
< 120 chars → default (typically "low")
|
||||
120–399 chars → one level above default (cap at "high")
|
||||
≥ 400 chars → two levels above default (cap at "high")
|
||||
```
|
||||
|
||||
### Config key
|
||||
|
||||
Add `dialecticReasoningLevel` (string, default `"low"`). This sets the floor. The dynamic bump always applies on top.
|
||||
|
||||
### openclaw-honcho adoption
|
||||
|
||||
Apply in `honcho_recall` and `honcho_analyze`: replace fixed `reasoningLevel` with the dynamic selector. `honcho_recall` uses floor `"minimal"`, `honcho_analyze` uses floor `"medium"` — both still bump with message length.
|
||||
|
||||
---
|
||||
|
||||
## Spec: per-peer memory modes
|
||||
|
||||
### Problem
|
||||
|
||||
Users want independent control over whether user context and agent context are written locally, to Honcho, or both.
|
||||
|
||||
### Modes
|
||||
|
||||
| Mode | Effect |
|
||||
|---|---|
|
||||
| `hybrid` | Write to both local files and Honcho (default) |
|
||||
| `honcho` | Honcho only — disable corresponding local file writes |
|
||||
| `local` | Local files only — skip Honcho sync for this peer |
|
||||
|
||||
### Config schema
|
||||
|
||||
```json
|
||||
{
|
||||
"memoryMode": "hybrid",
|
||||
"userMemoryMode": "honcho",
|
||||
"agentMemoryMode": "hybrid"
|
||||
}
|
||||
```
|
||||
|
||||
Resolution order: per-peer field wins → shorthand `memoryMode` → default `"hybrid"`.
|
||||
|
||||
### Effect on Honcho sync
|
||||
|
||||
- `userMemoryMode=local`: skip adding user peer messages to Honcho
|
||||
- `agentMemoryMode=local`: skip adding assistant peer messages to Honcho
|
||||
- Both local: skip `session.addMessages()` entirely
|
||||
- `userMemoryMode=honcho`: disable local USER.md writes
|
||||
- `agentMemoryMode=honcho`: disable local MEMORY.md / SOUL.md writes
|
||||
|
||||
---
|
||||
|
||||
## Spec: AI peer identity formation
|
||||
|
||||
### Problem
|
||||
|
||||
Honcho builds the user's representation organically by observing what the user says. The same mechanism exists for the AI peer — but only if `observe_me=True` is set for the agent peer. Without it, the agent peer accumulates nothing.
|
||||
|
||||
Additionally, existing persona files (SOUL.md, IDENTITY.md) should seed the AI peer's Honcho representation at first activation.
|
||||
|
||||
### Part A: observe_me=True for agent peer
|
||||
|
||||
```typescript
|
||||
await session.addPeers([
|
||||
[ownerPeer.id, { observeMe: true, observeOthers: false }],
|
||||
[agentPeer.id, { observeMe: true, observeOthers: true }], // observeMe was false
|
||||
]);
|
||||
```
|
||||
|
||||
One-line change. Foundational. Without it, the AI peer representation stays empty regardless of what the agent says.
|
||||
|
||||
### Part B: seedAiIdentity()
|
||||
|
||||
```typescript
|
||||
async function seedAiIdentity(
|
||||
agentPeer: Peer,
|
||||
content: string,
|
||||
source: string
|
||||
): Promise<boolean> {
|
||||
const wrapped = [
|
||||
`<ai_identity_seed>`,
|
||||
`<source>${source}</source>`,
|
||||
``,
|
||||
content.trim(),
|
||||
`</ai_identity_seed>`,
|
||||
].join("\n");
|
||||
|
||||
await agentPeer.addMessage("assistant", wrapped);
|
||||
return true;
|
||||
}
|
||||
```
|
||||
|
||||
### Part C: migrate agent files at setup
|
||||
|
||||
During `honcho setup`, upload agent-self files (SOUL.md, IDENTITY.md, AGENTS.md) to the agent peer via `seedAiIdentity()` instead of `session.uploadFile()`. This routes content through Honcho's observation pipeline.
|
||||
|
||||
### Part D: AI peer name in identity
|
||||
|
||||
When the agent has a configured name, prepend it to the injected system prompt:
|
||||
|
||||
```typescript
|
||||
const namePrefix = agentName ? `You are ${agentName}.\n\n` : "";
|
||||
return { systemPrompt: namePrefix + "## User Memory Context\n\n" + sections };
|
||||
```
|
||||
|
||||
### CLI surface
|
||||
|
||||
```
|
||||
honcho identity <file> # seed from file
|
||||
honcho identity --show # show current AI peer representation
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Spec: session naming strategies
|
||||
|
||||
### Problem
|
||||
|
||||
A single global session means every project shares the same Honcho context. Per-directory sessions provide isolation without requiring users to name sessions manually.
|
||||
|
||||
### Strategies
|
||||
|
||||
| Strategy | Session key | When to use |
|
||||
|---|---|---|
|
||||
| `per-directory` | basename of CWD | Default. Each project gets its own session. |
|
||||
| `global` | fixed string `"global"` | Single cross-project session. |
|
||||
| manual map | user-configured per path | `sessions` config map overrides directory basename. |
|
||||
| title-based | sanitized session title | When agent supports named sessions set mid-conversation. |
|
||||
|
||||
### Config schema
|
||||
|
||||
```json
|
||||
{
|
||||
"sessionStrategy": "per-directory",
|
||||
"sessionPeerPrefix": false,
|
||||
"sessions": {
|
||||
"/home/user/projects/foo": "foo-project"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### CLI surface
|
||||
|
||||
```
|
||||
honcho sessions # list all mappings
|
||||
honcho map <name> # map cwd to session name
|
||||
honcho map # no-arg = list mappings
|
||||
```
|
||||
|
||||
Resolution order: manual map → session title → directory basename → platform key.
|
||||
|
||||
---
|
||||
|
||||
## Spec: CLI surface injection
|
||||
|
||||
### Problem
|
||||
|
||||
When a user asks "how do I change my memory settings?" the agent either hallucinates or says it doesn't know. The agent should know its own management interface.
|
||||
|
||||
### Pattern
|
||||
|
||||
When Honcho is active, append a compact command reference to the system prompt. Keep it under 300 chars.
|
||||
|
||||
```
|
||||
# Honcho memory integration
|
||||
Active. Session: {sessionKey}. Mode: {mode}.
|
||||
Management commands:
|
||||
honcho status — show config + connection
|
||||
honcho mode [hybrid|honcho|local] — show or set memory mode
|
||||
honcho sessions — list session mappings
|
||||
honcho map <name> — map directory to session
|
||||
honcho identity [file] [--show] — seed or show AI identity
|
||||
honcho setup — full interactive wizard
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## openclaw-honcho checklist
|
||||
|
||||
Ordered by impact:
|
||||
|
||||
- [ ] **Async prefetch** — move `session.context()` out of `before_prompt_build` into post-`agent_end` background Promise
|
||||
- [ ] **observe_me=True for agent peer** — one-line change in `session.addPeers()`
|
||||
- [ ] **Dynamic reasoning level** — add helper; apply in `honcho_recall` and `honcho_analyze`; add `dialecticReasoningLevel` to config
|
||||
- [ ] **Per-peer memory modes** — add `userMemoryMode` / `agentMemoryMode` to config; gate Honcho sync and local writes
|
||||
- [ ] **seedAiIdentity()** — add helper; use during setup migration for SOUL.md / IDENTITY.md
|
||||
- [ ] **Session naming strategies** — add `sessionStrategy`, `sessions` map, `sessionPeerPrefix`
|
||||
- [ ] **CLI surface injection** — append command reference to `before_prompt_build` return value
|
||||
- [ ] **honcho identity subcommand** — seed from file or `--show` current representation
|
||||
- [ ] **AI peer name injection** — if `aiPeer` name configured, prepend to injected system prompt
|
||||
- [ ] **honcho mode / sessions / map** — CLI parity with Hermes
|
||||
|
||||
Already done in openclaw-honcho (do not re-implement): `lastSavedIndex` dedup, platform metadata stripping, multi-agent parent observer, `peerPerspective` on `context()`, tiered tool surface, workspace `agentPeerMap`, QMD passthrough, self-hosted Honcho.
|
||||
|
||||
---
|
||||
|
||||
## nanobot-honcho checklist
|
||||
|
||||
Greenfield integration. Start from openclaw-honcho's architecture and apply all Hermes patterns from day one.
|
||||
|
||||
### Phase 1 — core correctness
|
||||
|
||||
- [ ] Dual peer model (owner + agent peer), both with `observe_me=True`
|
||||
- [ ] Message capture at turn end with `lastSavedIndex` dedup
|
||||
- [ ] Platform metadata stripping before Honcho storage
|
||||
- [ ] Async prefetch from day one — do not implement blocking context injection
|
||||
- [ ] Legacy file migration at first activation (USER.md → owner peer, SOUL.md → `seedAiIdentity()`)
|
||||
|
||||
### Phase 2 — configuration
|
||||
|
||||
- [ ] Config schema: `apiKey`, `workspaceId`, `baseUrl`, `memoryMode`, `userMemoryMode`, `agentMemoryMode`, `dialecticReasoningLevel`, `sessionStrategy`, `sessions`
|
||||
- [ ] Per-peer memory mode gating
|
||||
- [ ] Dynamic reasoning level
|
||||
- [ ] Session naming strategies
|
||||
|
||||
### Phase 3 — tools and CLI
|
||||
|
||||
- [ ] Tool surface: `honcho_profile`, `honcho_recall`, `honcho_analyze`, `honcho_search`, `honcho_context`
|
||||
- [ ] CLI: `setup`, `status`, `sessions`, `map`, `mode`, `identity`
|
||||
- [ ] CLI surface injection into system prompt
|
||||
- [ ] AI peer name wired into agent identity
|
||||
110
docs/migration/openclaw.md
Normal file
110
docs/migration/openclaw.md
Normal file
@@ -0,0 +1,110 @@
|
||||
# Migrating from OpenClaw to Hermes Agent
|
||||
|
||||
This guide covers how to import your OpenClaw settings, memories, skills, and API keys into Hermes Agent.
|
||||
|
||||
## Three Ways to Migrate
|
||||
|
||||
### 1. Automatic (during first-time setup)
|
||||
|
||||
When you run `hermes setup` for the first time and Hermes detects `~/.openclaw`, it automatically offers to import your OpenClaw data before configuration begins. Just accept the prompt and everything is handled for you.
|
||||
|
||||
### 2. CLI Command (quick, scriptable)
|
||||
|
||||
```bash
|
||||
hermes claw migrate # Full migration with confirmation prompt
|
||||
hermes claw migrate --dry-run # Preview what would happen
|
||||
hermes claw migrate --preset user-data # Migrate without API keys/secrets
|
||||
hermes claw migrate --yes # Skip confirmation prompt
|
||||
```
|
||||
|
||||
**All options:**
|
||||
|
||||
| Flag | Description |
|
||||
|------|-------------|
|
||||
| `--source PATH` | Path to OpenClaw directory (default: `~/.openclaw`) |
|
||||
| `--dry-run` | Preview only — no files are modified |
|
||||
| `--preset {user-data,full}` | Migration preset (default: `full`). `user-data` excludes secrets |
|
||||
| `--overwrite` | Overwrite existing files (default: skip conflicts) |
|
||||
| `--migrate-secrets` | Include allowlisted secrets (auto-enabled with `full` preset) |
|
||||
| `--workspace-target PATH` | Copy workspace instructions (AGENTS.md) to this absolute path |
|
||||
| `--skill-conflict {skip,overwrite,rename}` | How to handle skill name conflicts (default: `skip`) |
|
||||
| `--yes`, `-y` | Skip confirmation prompts |
|
||||
|
||||
### 3. Agent-Guided (interactive, with previews)
|
||||
|
||||
Ask the agent to run the migration for you:
|
||||
|
||||
```
|
||||
> Migrate my OpenClaw setup to Hermes
|
||||
```
|
||||
|
||||
The agent will use the `openclaw-migration` skill to:
|
||||
1. Run a dry-run first to preview changes
|
||||
2. Ask about conflict resolution (SOUL.md, skills, etc.)
|
||||
3. Let you choose between `user-data` and `full` presets
|
||||
4. Execute the migration with your choices
|
||||
5. Print a detailed summary of what was migrated
|
||||
|
||||
## What Gets Migrated
|
||||
|
||||
### `user-data` preset
|
||||
| Item | Source | Destination |
|
||||
|------|--------|-------------|
|
||||
| SOUL.md | `~/.openclaw/workspace/SOUL.md` | `~/.hermes/SOUL.md` |
|
||||
| Memory entries | `~/.openclaw/workspace/MEMORY.md` | `~/.hermes/memories/MEMORY.md` |
|
||||
| User profile | `~/.openclaw/workspace/USER.md` | `~/.hermes/memories/USER.md` |
|
||||
| Skills | `~/.openclaw/workspace/skills/` | `~/.hermes/skills/openclaw-imports/` |
|
||||
| Command allowlist | `~/.openclaw/workspace/exec_approval_patterns.yaml` | Merged into `~/.hermes/config.yaml` |
|
||||
| Messaging settings | `~/.openclaw/config.yaml` (TELEGRAM_ALLOWED_USERS, MESSAGING_CWD) | `~/.hermes/.env` |
|
||||
| TTS assets | `~/.openclaw/workspace/tts/` | `~/.hermes/tts/` |
|
||||
|
||||
### `full` preset (adds to `user-data`)
|
||||
| Item | Source | Destination |
|
||||
|------|--------|-------------|
|
||||
| Telegram bot token | `~/.openclaw/config.yaml` | `~/.hermes/.env` |
|
||||
| OpenRouter API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| OpenAI API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| Anthropic API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| ElevenLabs API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
|
||||
Only allowlisted secrets (such as those in the table above) are ever imported. Other credentials are skipped and reported.
|
||||
|
||||
## Conflict Handling
|
||||
|
||||
By default, the migration **will not overwrite** existing Hermes data:
|
||||
|
||||
- **SOUL.md** — skipped if one already exists in `~/.hermes/`
|
||||
- **Memory entries** — skipped if memories already exist (to avoid duplicates)
|
||||
- **Skills** — skipped if a skill with the same name already exists
|
||||
- **API keys** — skipped if the key is already set in `~/.hermes/.env`
|
||||
|
||||
To overwrite conflicts, use `--overwrite`. The migration creates backups before overwriting.
|
||||
|
||||
For skills, you can also use `--skill-conflict rename` to import conflicting skills under a new name (e.g., `skill-name-imported`).
|
||||
|
||||
## Migration Report
|
||||
|
||||
Every migration (including dry runs) produces a report showing:
|
||||
- **Migrated items** — what was successfully imported
|
||||
- **Conflicts** — items skipped because they already exist
|
||||
- **Skipped items** — items not found in the source
|
||||
- **Errors** — items that failed to import
|
||||
|
||||
For execute runs, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "OpenClaw directory not found"
|
||||
The migration looks for `~/.openclaw` by default. If your OpenClaw is installed elsewhere, use `--source`:
|
||||
```bash
|
||||
hermes claw migrate --source /path/to/.openclaw
|
||||
```
|
||||
|
||||
### "Migration script not found"
|
||||
The migration script ships with Hermes Agent. If you installed via pip (not git clone), the `optional-skills/` directory may not be present. Install the skill from the Skills Hub:
|
||||
```bash
|
||||
hermes skills install openclaw-migration
|
||||
```
|
||||
|
||||
### Memory overflow
|
||||
If your OpenClaw MEMORY.md or USER.md exceeds Hermes' character limits, excess entries are exported to an overflow file in the migration report directory. You can manually review and add the most important ones.
|
||||
89
docs/skins/example-skin.yaml
Normal file
89
docs/skins/example-skin.yaml
Normal file
@@ -0,0 +1,89 @@
|
||||
# ============================================================================
|
||||
# Hermes Agent — Example Skin Template
|
||||
# ============================================================================
|
||||
#
|
||||
# Copy this file to ~/.hermes/skins/<name>.yaml to create a custom skin.
|
||||
# All fields except `name` are optional — missing values inherit from the default skin.
|
||||
# Activate with: /skin <name> or display.skin: <name> in config.yaml
|
||||
#
|
||||
# See hermes_cli/skin_engine.py for the full schema reference.
|
||||
# ============================================================================
|
||||
|
||||
# Required: unique skin name (used in /skin command and config)
|
||||
name: example
|
||||
description: An example custom skin — copy and modify this template
|
||||
|
||||
# ── Colors ──────────────────────────────────────────────────────────────────
|
||||
# Hex color values for Rich markup. These control the CLI's visual palette.
|
||||
colors:
|
||||
# Banner panel (the startup welcome box)
|
||||
banner_border: "#CD7F32" # Panel border
|
||||
banner_title: "#FFD700" # Panel title text
|
||||
banner_accent: "#FFBF00" # Section headers (Available Tools, Skills, etc.)
|
||||
banner_dim: "#B8860B" # Dim/muted text (separators, model info)
|
||||
banner_text: "#FFF8DC" # Body text (tool names, skill names)
|
||||
|
||||
# UI elements
|
||||
ui_accent: "#FFBF00" # General accent color
|
||||
ui_label: "#4dd0e1" # Labels
|
||||
ui_ok: "#4caf50" # Success indicators
|
||||
ui_error: "#ef5350" # Error indicators
|
||||
ui_warn: "#ffa726" # Warning indicators
|
||||
|
||||
# Input area
|
||||
prompt: "#FFF8DC" # Prompt text color
|
||||
input_rule: "#CD7F32" # Horizontal rule around input
|
||||
|
||||
# Response box
|
||||
response_border: "#FFD700" # Response box border (ANSI color)
|
||||
|
||||
# Session display
|
||||
session_label: "#DAA520" # Session label
|
||||
session_border: "#8B8682" # Session ID dim color
|
||||
|
||||
# ── Spinner ─────────────────────────────────────────────────────────────────
|
||||
# Customize the animated spinner shown during API calls and tool execution.
|
||||
spinner:
|
||||
# Faces shown while waiting for the API response
|
||||
waiting_faces:
|
||||
- "(。◕‿◕。)"
|
||||
- "(◕‿◕✿)"
|
||||
- "٩(◕‿◕。)۶"
|
||||
|
||||
# Faces shown during extended thinking/reasoning
|
||||
thinking_faces:
|
||||
- "(。•́︿•̀。)"
|
||||
- "(◔_◔)"
|
||||
- "(¬‿¬)"
|
||||
|
||||
# Verbs used in spinner messages (e.g., "pondering your request...")
|
||||
thinking_verbs:
|
||||
- "pondering"
|
||||
- "contemplating"
|
||||
- "musing"
|
||||
- "ruminating"
|
||||
|
||||
# Optional: left/right decorations around the spinner
|
||||
# Each entry is a [left, right] pair. Omit entirely for no wings.
|
||||
# wings:
|
||||
# - ["⟪⚔", "⚔⟫"]
|
||||
# - ["⟪▲", "▲⟫"]
|
||||
|
||||
# ── Branding ────────────────────────────────────────────────────────────────
|
||||
# Text strings used throughout the CLI interface.
|
||||
branding:
|
||||
agent_name: "Hermes Agent" # Banner title, about display
|
||||
welcome: "Welcome! Type your message or /help for commands."
|
||||
goodbye: "Goodbye! ⚕" # Exit message
|
||||
response_label: " ⚕ Hermes " # Response box header label
|
||||
prompt_symbol: "❯ " # Input prompt symbol
|
||||
help_header: "(^_^)? Available Commands" # /help header text
|
||||
|
||||
# ── Tool Output ─────────────────────────────────────────────────────────────
|
||||
# Character used as the prefix for tool output lines.
|
||||
# Default is "┊" (thin dotted vertical line). Some alternatives:
|
||||
# "╎" (light triple dash vertical)
|
||||
# "▏" (left one-eighth block)
|
||||
# "│" (box drawing light vertical)
|
||||
# "┃" (box drawing heavy vertical)
|
||||
tool_prefix: "┊"
|
||||
334
environments/README.md
Normal file
334
environments/README.md
Normal file
@@ -0,0 +1,334 @@
|
||||
# Hermes-Agent Atropos Environments
|
||||
|
||||
This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
Atropos Framework
|
||||
┌───────────────────────┐
|
||||
│ BaseEnv │ (atroposlib)
|
||||
│ - Server management │
|
||||
│ - Worker scheduling │
|
||||
│ - Wandb logging │
|
||||
│ - CLI (serve/process/ │
|
||||
│ evaluate) │
|
||||
└───────────┬───────────┘
|
||||
│ inherits
|
||||
┌───────────┴───────────┐
|
||||
│ HermesAgentBaseEnv │ hermes_base_env.py
|
||||
│ - Terminal backend │
|
||||
│ - Tool resolution │
|
||||
│ - Agent loop │
|
||||
│ - ToolContext │
|
||||
│ - Async patches │
|
||||
└───────────┬───────────┘
|
||||
│ inherits
|
||||
┌─────────────────┼─────────────────┐
|
||||
│ │ │
|
||||
TerminalTestEnv HermesSweEnv TerminalBench2EvalEnv
|
||||
(stack testing) (SWE training) (TB2 benchmark eval)
|
||||
```
|
||||
|
||||
### Inheritance Chain
|
||||
|
||||
**BaseEnv** (from `atroposlib`) is the Atropos base class. It provides:
|
||||
- Server management (OpenAI-compatible API servers, VLLM, SGLang)
|
||||
- Worker scheduling for parallel rollouts
|
||||
- Wandb integration for metrics and rollout logging
|
||||
- CLI interface with three subcommands: `serve`, `process`, `evaluate`
|
||||
- `evaluate_log()` for saving eval results to JSON + samples.jsonl
|
||||
|
||||
**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics:
|
||||
- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, modal, daytona, ssh, singularity)
|
||||
- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` which queries `tools/registry.py`)
|
||||
- Implements `collect_trajectory()` which runs the full agent loop and computes rewards
|
||||
- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer)
|
||||
- Applies monkey patches for async-safe tool operation at import time
|
||||
|
||||
Concrete environments inherit from `HermesAgentBaseEnv` and implement:
|
||||
- `setup()` -- Load dataset, initialize state
|
||||
- `get_next_item()` -- Return the next item for rollout
|
||||
- `format_prompt()` -- Convert a dataset item into the user message
|
||||
- `compute_reward()` -- Score the rollout using ToolContext
|
||||
- `evaluate()` -- Periodic evaluation logic
|
||||
|
||||
## Core Components
|
||||
|
||||
### Agent Loop (`agent_loop.py`)
|
||||
|
||||
`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`:
|
||||
|
||||
1. Send messages + tools to the API via `server.chat_completion()`
|
||||
2. If the response contains `tool_calls`, execute each one via `handle_function_call()` (which delegates to `tools/registry.py`'s `dispatch()`)
|
||||
3. Append tool results to the conversation and go back to step 1
|
||||
4. If the response has no tool_calls, the agent is done
|
||||
|
||||
Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker) don't deadlock inside Atropos's event loop.
|
||||
|
||||
Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2).
|
||||
|
||||
### Tool Context (`tool_context.py`)
|
||||
|
||||
`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved.
|
||||
|
||||
```python
|
||||
async def compute_reward(self, item, result, ctx: ToolContext):
|
||||
# Run tests in the model's terminal sandbox
|
||||
test = ctx.terminal("pytest -v")
|
||||
if test["exit_code"] == 0:
|
||||
return 1.0
|
||||
|
||||
# Check if a file was created
|
||||
content = ctx.read_file("/workspace/solution.py")
|
||||
if content.get("content"):
|
||||
return 0.5
|
||||
|
||||
# Download files locally for verification (binary-safe)
|
||||
ctx.download_file("/remote/output.bin", "/local/output.bin")
|
||||
|
||||
return 0.0
|
||||
```
|
||||
|
||||
Available methods:
|
||||
- **Terminal**: `terminal(command, timeout)` -- run shell commands
|
||||
- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)`
|
||||
- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox
|
||||
- **Web**: `web_search(query)`, `web_extract(urls)`
|
||||
- **Browser**: `browser_navigate(url)`, `browser_snapshot()`
|
||||
- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name
|
||||
- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`)
|
||||
|
||||
### Patches (`patches.py`)
|
||||
|
||||
**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., mini-swe-agent's Modal backend via SWE-ReX). This crashes when called from inside Atropos's event loop because `asyncio.run()` raises `RuntimeError` when another event loop is already running in the same thread.
|
||||
|
||||
**Solution**: `patches.py` monkey-patches `SwerexModalEnvironment` to use a dedicated background thread (`_AsyncWorker`) with its own event loop. The calling code sees the same sync interface, but internally the async work happens on a separate thread that doesn't conflict with Atropos's loop.
|
||||
|
||||
What gets patched:
|
||||
- `SwerexModalEnvironment.__init__` -- creates Modal deployment on a background thread
|
||||
- `SwerexModalEnvironment.execute` -- runs commands on the same background thread
|
||||
- `SwerexModalEnvironment.stop` -- stops deployment on the background thread
|
||||
|
||||
The patches are:
|
||||
- **Idempotent** -- calling `apply_patches()` multiple times is safe
|
||||
- **Transparent** -- same interface and behavior, only the internal async execution changes
|
||||
- **Universal** -- works identically in normal CLI use (no running event loop)
|
||||
|
||||
Applied automatically at import time by `hermes_base_env.py`.
|
||||
|
||||
### Tool Call Parsers (`tool_call_parsers/`)
|
||||
|
||||
Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing.
|
||||
|
||||
Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types.
|
||||
|
||||
Available parsers:
|
||||
- `hermes` -- Hermes/ChatML `<tool_call>` XML format
|
||||
- `mistral` -- Mistral `[TOOL_CALLS]` format
|
||||
- `llama3_json` -- Llama 3 JSON tool calling
|
||||
- `qwen` -- Qwen tool calling format
|
||||
- `qwen3_coder` -- Qwen3 Coder format
|
||||
- `deepseek_v3` -- DeepSeek V3 format
|
||||
- `deepseek_v3_1` -- DeepSeek V3.1 format
|
||||
- `kimi_k2` -- Kimi K2 format
|
||||
- `longcat` -- Longcat format
|
||||
- `glm45` / `glm47` -- GLM model formats
|
||||
|
||||
Usage:
|
||||
```python
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
content, tool_calls = parser.parse(raw_model_output)
|
||||
```
|
||||
|
||||
In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively.
|
||||
|
||||
## Two-Phase Operation
|
||||
|
||||
### Phase 1: OpenAI Server (Evaluation / SFT Data Generation)
|
||||
|
||||
Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`.
|
||||
|
||||
- Good for: evaluation, SFT data generation, testing
|
||||
- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands
|
||||
- Placeholder tokens are created for the Atropos pipeline
|
||||
|
||||
### Phase 2: VLLM ManagedServer (Full RL Training)
|
||||
|
||||
Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
|
||||
|
||||
- Good for: full RL training with GRPO/PPO
|
||||
- Run with: `serve` subcommand
|
||||
- Real tokens, masks, and logprobs flow through the pipeline
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
environments/
|
||||
├── README.md # This file
|
||||
├── __init__.py # Package exports
|
||||
├── hermes_base_env.py # Abstract base (HermesAgentBaseEnv)
|
||||
├── agent_loop.py # Multi-turn agent engine (HermesAgentLoop)
|
||||
├── tool_context.py # Per-rollout tool access for reward functions
|
||||
├── patches.py # Async-safety patches for Modal backend
|
||||
│
|
||||
├── tool_call_parsers/ # Phase 2 client-side parsers
|
||||
│ ├── __init__.py # Registry + base class
|
||||
│ ├── hermes_parser.py
|
||||
│ ├── mistral_parser.py
|
||||
│ ├── llama_parser.py
|
||||
│ ├── qwen_parser.py
|
||||
│ ├── qwen3_coder_parser.py
|
||||
│ ├── deepseek_v3_parser.py
|
||||
│ ├── deepseek_v3_1_parser.py
|
||||
│ ├── kimi_k2_parser.py
|
||||
│ ├── longcat_parser.py
|
||||
│ ├── glm45_parser.py
|
||||
│ └── glm47_parser.py
|
||||
│
|
||||
├── terminal_test_env/ # Stack validation environment
|
||||
│ └── terminal_test_env.py
|
||||
│
|
||||
├── hermes_swe_env/ # SWE-bench style training environment
|
||||
│ └── hermes_swe_env.py
|
||||
│
|
||||
└── benchmarks/ # Evaluation benchmarks
|
||||
├── terminalbench_2/ # 89 terminal tasks, Modal sandboxes
|
||||
│ └── terminalbench2_env.py
|
||||
├── tblite/ # 100 calibrated tasks (fast TB2 proxy)
|
||||
│ └── tblite_env.py
|
||||
└── yc_bench/ # Long-horizon strategic benchmark
|
||||
└── yc_bench_env.py
|
||||
```
|
||||
|
||||
## Concrete Environments
|
||||
|
||||
### TerminalTestEnv (`terminal_test_env/`)
|
||||
|
||||
A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks that the file's content matches the expected value.
|
||||
|
||||
```bash
|
||||
# Serve mode (needs run-api)
|
||||
run-api
|
||||
python environments/terminal_test_env/terminal_test_env.py serve
|
||||
|
||||
# Process mode (no run-api, saves to JSONL)
|
||||
python environments/terminal_test_env/terminal_test_env.py process \
|
||||
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||
```
|
||||
|
||||
### HermesSweEnv (`hermes_swe_env/`)
|
||||
|
||||
SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox.
|
||||
|
||||
```bash
|
||||
python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||
--openai.model_name YourModel \
|
||||
--env.dataset_name bigcode/humanevalpack \
|
||||
--env.terminal_backend modal
|
||||
```
|
||||
|
||||
### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`)
|
||||
|
||||
**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness.
|
||||
|
||||
Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.):
|
||||
- Run via `evaluate` subcommand (no `run-api` needed)
|
||||
- `setup()` loads the dataset, `evaluate()` runs all tasks
|
||||
- `rollout_and_score_eval()` handles per-task agent loop + test verification
|
||||
- Downloads verifier output locally for reliable reward checking (Harbor pattern)
|
||||
|
||||
```bash
|
||||
# Run full benchmark
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6
|
||||
|
||||
# Run subset of tasks
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6 \
|
||||
--env.task_filter fix-git,git-multibranch
|
||||
|
||||
# Skip specific tasks
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6 \
|
||||
--env.skip_tasks heavy-task,slow-task
|
||||
```
|
||||
|
||||
## Creating a New Environment
|
||||
|
||||
### Training Environment
|
||||
|
||||
1. Create a new directory under `environments/`
|
||||
2. Create your env file inheriting from `HermesAgentBaseEnv`
|
||||
3. Implement the four abstract methods + `evaluate()`
|
||||
|
||||
```python
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
|
||||
class MyEnvConfig(HermesAgentEnvConfig):
|
||||
pass # Add custom fields as needed
|
||||
|
||||
class MyEnv(HermesAgentBaseEnv):
|
||||
name = "my-env"
|
||||
env_config_cls = MyEnvConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls):
|
||||
env_config = MyEnvConfig(
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
terminal_backend="modal",
|
||||
# ... other config
|
||||
)
|
||||
server_configs = [APIServerConfig(...)]
|
||||
return env_config, server_configs
|
||||
|
||||
async def setup(self):
|
||||
self.dataset = load_dataset(...)
|
||||
self.iter = 0
|
||||
|
||||
async def get_next_item(self):
|
||||
item = self.dataset[self.iter % len(self.dataset)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item):
|
||||
return item["instruction"]
|
||||
|
||||
async def compute_reward(self, item, result, ctx):
|
||||
# ctx gives you full tool access to the rollout's sandbox
|
||||
test = ctx.terminal("pytest -v")
|
||||
return 1.0 if test["exit_code"] == 0 else 0.0
|
||||
|
||||
async def evaluate(self, *args, **kwargs):
|
||||
# Periodic evaluation logic
|
||||
...
|
||||
|
||||
if __name__ == "__main__":
|
||||
MyEnv.cli()
|
||||
```
|
||||
|
||||
### Eval-Only Environment (Benchmark)
|
||||
|
||||
For eval benchmarks, follow the pattern in `terminalbench2_env.py`:
|
||||
1. Create under `environments/benchmarks/your-benchmark/`
|
||||
2. Inherit from `HermesAgentBaseEnv`
|
||||
3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1`
|
||||
4. Stub the training methods (`collect_trajectories`, `score`)
|
||||
5. Implement `rollout_and_score_eval()` and `evaluate()`
|
||||
6. Run with `evaluate` subcommand
|
||||
|
||||
## Key Config Fields
|
||||
|
||||
| Field | Description | Default |
|
||||
|-------|-------------|---------|
|
||||
| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) |
|
||||
| `disabled_toolsets` | Toolsets to disable | `None` |
|
||||
| `distribution` | Probabilistic toolset distribution name | `None` |
|
||||
| `max_agent_turns` | Max LLM calls per rollout | `30` |
|
||||
| `agent_temperature` | Sampling temperature | `1.0` |
|
||||
| `terminal_backend` | `local`, `docker`, `modal`, `daytona`, `ssh`, `singularity` | `local` |
|
||||
| `system_prompt` | System message for the agent | `None` |
|
||||
| `tool_call_parser` | Parser name for Phase 2 | `hermes` |
|
||||
| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` |
|
||||
36
environments/__init__.py
Normal file
36
environments/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Hermes-Agent Atropos Environments
|
||||
|
||||
Provides a layered integration between hermes-agent's tool-calling capabilities
|
||||
and the Atropos RL training framework.
|
||||
|
||||
Core layers:
|
||||
- agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
|
||||
- tool_context: Per-rollout tool access handle for reward/verification functions
|
||||
- hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
|
||||
- tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
|
||||
|
||||
Concrete environments:
|
||||
- terminal_test_env/: Simple file-creation tasks for testing the stack
|
||||
- hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
|
||||
|
||||
Benchmarks (eval-only):
|
||||
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
|
||||
"""
|
||||
|
||||
# Names this package re-exports when their defining modules import cleanly.
_PUBLIC_NAMES = (
    "AgentResult",
    "HermesAgentLoop",
    "ToolContext",
    "HermesAgentBaseEnv",
    "HermesAgentEnvConfig",
)

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from environments.tool_context import ToolContext
    from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
except ImportError:
    # atroposlib not installed — environments are unavailable but
    # submodules like tool_call_parsers can still be imported directly.
    pass

# Advertise only the names that were actually bound above. A static list would
# name missing attributes after a failed import, which makes
# ``from environments import *`` raise AttributeError instead of degrading
# gracefully like a plain ``import environments`` does.
__all__ = [name for name in _PUBLIC_NAMES if name in globals()]
|
||||
500
environments/agent_loop.py
Normal file
500
environments/agent_loop.py
Normal file
@@ -0,0 +1,500 @@
|
||||
"""
|
||||
HermesAgentLoop -- Reusable Multi-Turn Agent Engine
|
||||
|
||||
Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
|
||||
Works with any server that returns ChatCompletion objects with tool_calls:
|
||||
- Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
|
||||
- Phase 2: ManagedServer with client-side tool call parser
|
||||
|
||||
The loop passes tools= and checks response.choices[0].message.tool_calls,
|
||||
identical to hermes-agent's run_agent.py. Tool execution is dispatched via
|
||||
handle_function_call() from model_tools.py.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
from model_tools import handle_function_call
|
||||
|
||||
# Thread pool for running sync tool calls that internally use asyncio.run()
|
||||
# (e.g., mini-swe-agent's modal/docker/daytona backends). Running them in a separate
|
||||
# thread gives them a clean event loop so they don't deadlock inside Atropos's loop.
|
||||
# Size must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all
|
||||
# making tool calls). Too small = thread pool starvation, tasks queue for minutes.
|
||||
# Resized at runtime by HermesAgentBaseEnv.__init__ via resize_tool_pool().
|
||||
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=128)
|
||||
|
||||
|
||||
def resize_tool_pool(max_workers: int):
    """
    Swap the module-wide tool executor for a fresh pool of ``max_workers`` threads.

    Called by HermesAgentBaseEnv.__init__ based on config.tool_pool_size.
    Safe to call before any tasks are submitted.

    Args:
        max_workers: Thread count for the replacement ThreadPoolExecutor.
    """
    global _tool_executor
    replacement = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    # Publish the new pool first, then retire the previous one without blocking.
    previous, _tool_executor = _tool_executor, replacement
    previous.shutdown(wait=False)
    logger.info("Tool thread pool resized to %d workers", max_workers)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ToolError:
    """
    One tool execution failure captured while the agent loop was running.

    Attributes:
        turn: Which turn the error occurred on.
        tool_name: Which tool was called.
        arguments: The arguments passed (truncated).
        error: The error message.
        tool_result: The raw result returned to the model.
    """

    turn: int
    tool_name: str
    arguments: str
    error: str
    tool_result: str
|
||||
|
||||
|
||||
@dataclass
class AgentResult:
    """
    Outcome of a single agent-loop run.

    Attributes:
        messages: Full conversation history in OpenAI message format.
        managed_state: ManagedServer.get_state() if available (Phase 2),
            None otherwise.
        turns_used: How many LLM calls were made.
        finished_naturally: True if the model stopped calling tools naturally
            (as opposed to hitting max_turns).
        reasoning_per_turn: Extracted reasoning content per turn
            (from PR #297 helpers).
        tool_errors: Tool errors encountered during the loop.
    """

    messages: List[Dict[str, Any]]
    managed_state: Optional[Dict[str, Any]] = None
    turns_used: int = 0
    finished_naturally: bool = False
    reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
    tool_errors: List[ToolError] = field(default_factory=list)
|
||||
|
||||
|
||||
def _extract_reasoning_from_message(message) -> Optional[str]:
|
||||
"""
|
||||
Extract reasoning content from a ChatCompletion message.
|
||||
|
||||
Handles multiple provider formats:
|
||||
1. message.reasoning_content field (some providers)
|
||||
2. message.reasoning field (some providers)
|
||||
3. message.reasoning_details[].text (OpenRouter style)
|
||||
|
||||
Note: <think> block extraction from content is NOT done here -- that's
|
||||
handled by the response already in Phase 1 (server does it) or by
|
||||
ManagedServer's patch in Phase 2.
|
||||
|
||||
Args:
|
||||
message: The assistant message from ChatCompletion response
|
||||
|
||||
Returns:
|
||||
Extracted reasoning text, or None if not found
|
||||
"""
|
||||
# Check reasoning_content field (common across providers)
|
||||
if hasattr(message, "reasoning_content") and message.reasoning_content:
|
||||
return message.reasoning_content
|
||||
|
||||
# Check reasoning field
|
||||
if hasattr(message, "reasoning") and message.reasoning:
|
||||
return message.reasoning
|
||||
|
||||
# Check reasoning_details (OpenRouter style)
|
||||
if hasattr(message, "reasoning_details") and message.reasoning_details:
|
||||
for detail in message.reasoning_details:
|
||||
if hasattr(detail, "text") and detail.text:
|
||||
return detail.text
|
||||
if isinstance(detail, dict) and detail.get("text"):
|
||||
return detail["text"]
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class HermesAgentLoop:
    """
    Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.

    Same pattern as run_agent.py:
    - Pass tools= to the API
    - Check response.choices[0].message.tool_calls
    - Dispatch via handle_function_call()

    Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
    or ManagedServer with a parser. The server determines how tool_calls get
    populated on the response.
    """

    # Tool calls that take longer than this many seconds are logged as warnings.
    _SLOW_TOOL_SECONDS = 30

    def __init__(
        self,
        server,
        tool_schemas: List[Dict[str, Any]],
        valid_tool_names: Set[str],
        max_turns: int = 30,
        task_id: Optional[str] = None,
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        extra_body: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the agent loop.

        Args:
            server: Server object with chat_completion() method (OpenAIServer,
                ManagedServer, ServerManager, etc.)
            tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
            valid_tool_names: Set of tool names the model is allowed to call
            max_turns: Maximum number of LLM calls before stopping
            task_id: Unique ID for terminal/browser session isolation; a random
                UUID is generated when omitted
            temperature: Sampling temperature for generation
            max_tokens: Max tokens per generation (None for server default)
            extra_body: Extra parameters passed to the OpenAI client's create() call.
                Used for OpenRouter provider preferences, transforms, etc.
                e.g. {"provider": {"ignore": ["DeepInfra"]}}
        """
        self.server = server
        self.tool_schemas = tool_schemas
        self.valid_tool_names = valid_tool_names
        self.max_turns = max_turns
        self.task_id = task_id or str(uuid.uuid4())
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.extra_body = extra_body

    async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
        """
        Execute the full agent loop using standard OpenAI tool calling.

        Each turn makes one chat_completion() call. If the response carries
        tool calls, every call is executed and its result appended as a
        ``tool`` message; otherwise the assistant message is appended and the
        loop ends.

        Args:
            messages: Initial conversation messages (system + user).
                Modified in-place as the conversation progresses.

        Returns:
            AgentResult with full conversation history, managed state, and
            metadata. ``finished_naturally`` is True only when the model
            answered without requesting tools; API failures, empty responses
            and hitting ``max_turns`` all yield False.
        """
        reasoning_per_turn: List[Optional[str]] = []
        tool_errors: List[ToolError] = []

        # Per-loop TodoStore for the todo tool (ephemeral, dies with the loop)
        from tools.todo_tool import TodoStore

        todo_store = TodoStore()

        # First user message gives tools (e.g. browser_snapshot) task context
        user_task = self._first_user_task(messages)

        import time as _time

        def _result(turns_used: int, finished_naturally: bool) -> AgentResult:
            # All exit paths share the same bookkeeping; only these two vary.
            return AgentResult(
                messages=messages,
                managed_state=self._get_managed_state(),
                turns_used=turns_used,
                finished_naturally=finished_naturally,
                reasoning_per_turn=reasoning_per_turn,
                tool_errors=tool_errors,
            )

        for turn in range(self.max_turns):
            turn_no = turn + 1
            turn_start = _time.monotonic()
            chat_kwargs = self._build_chat_kwargs(messages)

            # Make the API call -- standard OpenAI spec
            api_start = _time.monotonic()
            try:
                response = await self.server.chat_completion(**chat_kwargs)
            except Exception as e:
                api_elapsed = _time.monotonic() - api_start
                logger.error(
                    "API call failed on turn %d (%.1fs): %s", turn_no, api_elapsed, e
                )
                return _result(turn_no, False)

            api_elapsed = _time.monotonic() - api_start

            if not response or not response.choices:
                logger.warning(
                    "Empty response on turn %d (api=%.1fs)", turn_no, api_elapsed
                )
                return _result(turn_no, False)

            assistant_msg = response.choices[0].message

            # Extract reasoning content from the response (all provider formats)
            reasoning = _extract_reasoning_from_message(assistant_msg)
            reasoning_per_turn.append(reasoning)

            # Recover tool calls left as raw <tool_call> tags in the content.
            self._apply_fallback_parser(assistant_msg)

            if not assistant_msg.tool_calls:
                # No tool calls -- model is done
                msg_dict: Dict[str, Any] = {
                    "role": "assistant",
                    "content": assistant_msg.content or "",
                }
                if reasoning:
                    msg_dict["reasoning_content"] = reasoning
                messages.append(msg_dict)

                turn_elapsed = _time.monotonic() - turn_start
                logger.info(
                    "[%s] turn %d: api=%.1fs, no tools (finished), turn_total=%.1fs",
                    self.task_id[:8], turn_no, api_elapsed, turn_elapsed,
                )
                return _result(turn_no, True)

            # Normalize once so the ids recorded on the assistant message are
            # exactly the ids echoed back in the tool responses.
            tool_calls = [
                self._normalize_tool_call(tc) for tc in assistant_msg.tool_calls
            ]

            msg_dict = {
                "role": "assistant",
                "content": assistant_msg.content or "",
                "tool_calls": tool_calls,
            }
            # Preserve reasoning_content for multi-turn chat template handling
            # (e.g., Kimi-K2's template renders <think> blocks differently
            # for history vs. the latest turn based on this field)
            if reasoning:
                msg_dict["reasoning_content"] = reasoning
            messages.append(msg_dict)

            # Execute each tool call via hermes-agent's dispatch
            for call in tool_calls:
                tool_result = await self._run_tool_call(
                    call, turn_no, todo_store, user_task, tool_errors
                )
                messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": call["id"],
                        "content": tool_result,
                    }
                )

            turn_elapsed = _time.monotonic() - turn_start
            logger.info(
                "[%s] turn %d: api=%.1fs, %d tools, turn_total=%.1fs",
                self.task_id[:8], turn_no, api_elapsed,
                len(tool_calls), turn_elapsed,
            )

        # Hit max turns without the model stopping
        logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
        return _result(self.max_turns, False)

    @staticmethod
    def _first_user_task(messages: List[Dict[str, Any]]) -> Optional[str]:
        """Return the first non-empty string user message, capped at 500 chars."""
        for msg in messages:
            if msg.get("role") != "user":
                continue
            content = msg.get("content", "")
            if isinstance(content, str) and content.strip():
                return content.strip()[:500]  # Cap to avoid huge strings
        return None

    def _build_chat_kwargs(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Assemble the keyword arguments for one chat_completion() call."""
        chat_kwargs: Dict[str, Any] = {
            "messages": messages,
            "n": 1,
            "temperature": self.temperature,
        }
        # Only pass tools if we have them
        if self.tool_schemas:
            chat_kwargs["tools"] = self.tool_schemas
        # Only pass max_tokens if explicitly set
        if self.max_tokens is not None:
            chat_kwargs["max_tokens"] = self.max_tokens
        # Inject extra_body for provider-specific params (e.g., OpenRouter
        # provider preferences like banned/preferred providers, transforms)
        if self.extra_body:
            chat_kwargs["extra_body"] = self.extra_body
        return chat_kwargs

    def _apply_fallback_parser(self, assistant_msg) -> None:
        """
        Parse raw ``<tool_call>`` tags out of the content when the response has
        no structured tool_calls, updating ``assistant_msg`` in place.

        Best-effort: any failure is logged and the message is left as-is, so
        the turn is then treated as having no tool calls.
        """
        content = assistant_msg.content
        if (
            assistant_msg.tool_calls
            or not content
            or not self.tool_schemas
            or "<tool_call>" not in content
        ):
            return

        try:
            from environments.tool_call_parsers import get_parser

            parsed_content, parsed_calls = get_parser("hermes").parse(content)
            if parsed_calls:
                assistant_msg.tool_calls = parsed_calls
                if parsed_content is not None:
                    assistant_msg.content = parsed_content
                logger.debug(
                    "Fallback parser extracted %d tool calls from raw content",
                    len(parsed_calls),
                )
        except Exception as e:
            # Fall through to no tool calls, but leave a trace: the model
            # tried to call a tool and the call is about to be dropped.
            logger.warning(
                "Fallback tool-call parser failed (%s: %s); treating response "
                "as having no tool calls",
                type(e).__name__, e,
            )

    @staticmethod
    def _normalize_tool_call(tc) -> Dict[str, Any]:
        """
        Convert a tool call (attribute-style object or plain dict) into an
        OpenAI-format dict.

        A missing or empty id is replaced by a generated ``call_<hex>`` id so
        the assistant message and the matching tool response always agree.
        """
        if isinstance(tc, dict):
            fn = tc.get("function") or {}
            call_id = tc.get("id")
            name = fn.get("name", tc.get("name", ""))
            arguments = fn.get("arguments", tc.get("arguments", "{}"))
        else:
            call_id = tc.id
            name = tc.function.name
            arguments = tc.function.arguments
        return {
            "id": call_id or f"call_{uuid.uuid4().hex[:8]}",
            "type": "function",
            "function": {"name": name, "arguments": arguments},
        }

    @staticmethod
    def _args_preview(raw: Any, limit: int = 200) -> str:
        """Return a truncated string form of raw tool arguments for logs/errors."""
        text = raw if isinstance(raw, str) else str(raw)
        return text[:limit]

    @classmethod
    def _parse_tool_args(cls, tool_name: str, raw: Any) -> Dict[str, Any]:
        """
        Decode tool-call arguments into a dict.

        Accepts an already-decoded dict or a JSON string. Anything that does
        not decode to a JSON object is logged and replaced by an empty dict.
        """
        if isinstance(raw, dict):
            return raw
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers None and other non-string payloads.
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        logger.warning(
            "Invalid JSON in tool call arguments for '%s': %s",
            tool_name, cls._args_preview(raw),
        )
        return {}

    async def _run_tool_call(
        self,
        call: Dict[str, Any],
        turn_no: int,
        todo_store,
        user_task: Optional[str],
        tool_errors: List[ToolError],
    ) -> str:
        """
        Execute one normalized tool call and return its result string.

        Failures never propagate: unknown tools and raised exceptions are
        turned into a JSON ``{"error": ...}`` result and recorded in
        ``tool_errors``. A result that itself reports an ``error`` together
        with a negative ``exit_code`` is recorded as well.
        """
        import time as _time

        tool_name = call["function"]["name"]
        tool_args_raw = call["function"]["arguments"]
        args_preview = self._args_preview(tool_args_raw)

        # Validate tool name
        if tool_name not in self.valid_tool_names:
            tool_result = json.dumps(
                {
                    "error": f"Unknown tool '{tool_name}'. "
                    f"Available tools: {sorted(self.valid_tool_names)}"
                }
            )
            tool_errors.append(ToolError(
                turn=turn_no, tool_name=tool_name,
                arguments=args_preview,
                error=f"Unknown tool '{tool_name}'",
                tool_result=tool_result,
            ))
            logger.warning(
                "Model called unknown tool '%s' on turn %d",
                tool_name, turn_no,
            )
            return tool_result

        args = self._parse_tool_args(tool_name, tool_args_raw)

        try:
            if tool_name == "terminal":
                # str() keeps a malformed command from failing the log line.
                cmd_preview = str(args.get("command", ""))[:80]
                logger.info("[%s] $ %s", self.task_id[:8], cmd_preview)

            tool_submit_time = _time.monotonic()

            if tool_name == "todo":
                # Todo tool -- handle locally (needs per-loop TodoStore)
                from tools.todo_tool import todo_tool as _todo_tool

                tool_result = _todo_tool(
                    todos=args.get("todos"),
                    merge=args.get("merge", False),
                    store=todo_store,
                )
            elif tool_name == "memory":
                tool_result = json.dumps({"error": "Memory is not available in RL environments."})
            elif tool_name == "session_search":
                tool_result = json.dumps({"error": "Session search is not available in RL environments."})
            else:
                # Run tool calls in a thread pool so backends that
                # use asyncio.run() internally (modal, docker, daytona) get
                # a clean event loop instead of deadlocking.
                loop = asyncio.get_running_loop()
                task_id = self.task_id
                tool_result = await loop.run_in_executor(
                    _tool_executor,
                    lambda: handle_function_call(
                        tool_name, args, task_id=task_id,
                        user_task=user_task,
                    ),
                )

            tool_elapsed = _time.monotonic() - tool_submit_time

            # Log slow tools and thread pool stats for debugging
            if tool_elapsed > self._SLOW_TOOL_SECONDS:
                # _work_queue is a private executor attribute; read it
                # defensively so a diagnostics failure cannot replace a
                # successful tool result with an error.
                work_queue = getattr(_tool_executor, "_work_queue", None)
                pool_queued = work_queue.qsize() if work_queue is not None else -1
                logger.warning(
                    "[%s] turn %d: %s took %.1fs (pool queue=%d)",
                    self.task_id[:8], turn_no, tool_name,
                    tool_elapsed, pool_queued,
                )
        except Exception as e:
            tool_result = json.dumps(
                {"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
            )
            tool_errors.append(ToolError(
                turn=turn_no, tool_name=tool_name,
                arguments=args_preview,
                error=f"{type(e).__name__}: {str(e)}",
                tool_result=tool_result,
            ))
            logger.error(
                "Tool '%s' execution failed on turn %d: %s",
                tool_name, turn_no, e,
            )

        # Also check if the tool returned an error in its JSON result
        try:
            result_data = json.loads(tool_result)
        except (json.JSONDecodeError, TypeError):
            return tool_result

        if isinstance(result_data, dict):
            err = result_data.get("error")
            exit_code = result_data.get("exit_code")
            if err and isinstance(exit_code, (int, float)) and exit_code < 0:
                tool_errors.append(ToolError(
                    turn=turn_no, tool_name=tool_name,
                    arguments=args_preview,
                    error=str(err),
                    tool_result=tool_result[:500],
                ))

        return tool_result

    def _get_managed_state(self) -> Optional[Dict[str, Any]]:
        """
        Get ManagedServer state if the server supports it.

        Returns whatever ``self.server.get_state()`` returns, or None if the
        server has no get_state() (e.g., regular OpenAI server).
        """
        if hasattr(self.server, "get_state"):
            return self.server.get_state()
        return None
|
||||
1213
environments/agentic_opd_env.py
Normal file
1213
environments/agentic_opd_env.py
Normal file
File diff suppressed because it is too large
Load Diff
0
environments/benchmarks/__init__.py
Normal file
0
environments/benchmarks/__init__.py
Normal file
73
environments/benchmarks/tblite/README.md
Normal file
73
environments/benchmarks/tblite/README.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# OpenThoughts-TBLite Evaluation Environment
|
||||
|
||||
This environment evaluates terminal agents on the [OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) benchmark, a difficulty-calibrated subset of [Terminal-Bench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0).
|
||||
|
||||
## Source
|
||||
|
||||
OpenThoughts-TBLite was created by the [OpenThoughts](https://www.openthoughts.ai/) Agent team in collaboration with [Snorkel AI](https://snorkel.ai/) and [Bespoke Labs](https://bespokelabs.ai/). The original dataset and documentation live at:
|
||||
|
||||
- **Dataset (source):** [open-thoughts/OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite)
|
||||
- **GitHub:** [open-thoughts/OpenThoughts-TBLite](https://github.com/open-thoughts/OpenThoughts-TBLite)
|
||||
- **Blog post:** [openthoughts.ai/blog/openthoughts-tblite](https://www.openthoughts.ai/blog/openthoughts-tblite)
|
||||
|
||||
## Our Dataset
|
||||
|
||||
We converted the source into the same schema used by our Terminal-Bench 2.0 environment (pre-built Docker Hub images, base64-encoded test tarballs, etc.) and published it as:
|
||||
|
||||
- **Dataset (ours):** [NousResearch/openthoughts-tblite](https://huggingface.co/datasets/NousResearch/openthoughts-tblite)
|
||||
- **Docker images:** `nousresearch/tblite-<task-name>:latest` on Docker Hub (100 images)
|
||||
|
||||
The conversion script is at `scripts/prepare_tblite_dataset.py`.
|
||||
|
||||
## Why TBLite?
|
||||
|
||||
Terminal-Bench 2.0 is one of the strongest frontier evaluations for terminal agents, but when a model scores near the floor (e.g., Qwen 3 8B at <1%), many changes look identical in aggregate score. TBLite addresses this by calibrating task difficulty using Claude Haiku 4.5 as a reference:
|
||||
|
||||
| Difficulty | Pass Rate Range | Tasks |
|
||||
|------------|----------------|-------|
|
||||
| Easy | >= 70% | 40 |
|
||||
| Medium | 40-69% | 26 |
|
||||
| Hard | 10-39% | 26 |
|
||||
| Extreme | < 10% | 8 |
|
||||
|
||||
This gives enough solvable tasks to detect small improvements quickly, while preserving enough hard tasks to avoid saturation. The correlation between TBLite and TB2 scores is **r = 0.911**.
|
||||
|
||||
TBLite also runs 2.6-8x faster than the full TB2, making it practical for iteration loops.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run the full benchmark
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||
|
||||
# Filter to specific tasks
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
--env.task_filter "broken-python,pandas-etl"
|
||||
|
||||
# Use a different model
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
--openai.model_name "qwen/qwen3-30b"
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
`TBLiteEvalEnv` is a thin subclass of `TerminalBench2EvalEnv`. All evaluation logic (agent loop, Docker sandbox management, test verification, metrics) is inherited. Only the defaults differ:
|
||||
|
||||
| Setting | TB2 | TBLite |
|
||||
|----------------|----------------------------------|-----------------------------------------|
|
||||
| Dataset | `NousResearch/terminal-bench-2` | `NousResearch/openthoughts-tblite` |
|
||||
| Tasks | 89 | 100 |
|
||||
| Task timeout | 1800s (30 min) | 1200s (20 min) |
|
||||
| Wandb name | `terminal-bench-2` | `openthoughts-tblite` |
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@software{OpenThoughts-TBLite,
|
||||
author = {OpenThoughts-Agent team, Snorkel AI, Bespoke Labs},
|
||||
month = Feb,
|
||||
title = {{OpenThoughts-TBLite: A High-Signal Benchmark for Iterating on Terminal Agents}},
|
||||
howpublished = {https://www.openthoughts.ai/blog/openthoughts-tblite},
|
||||
year = {2026}
|
||||
}
|
||||
```
|
||||
0
environments/benchmarks/tblite/__init__.py
Normal file
0
environments/benchmarks/tblite/__init__.py
Normal file
39
environments/benchmarks/tblite/default.yaml
Normal file
39
environments/benchmarks/tblite/default.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Default Configuration
|
||||
#
|
||||
# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
|
||||
# terminal tasks, a faster proxy for Terminal-Bench 2.0).
|
||||
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||
# and OpenRouter for inference.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/default.yaml \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 100 parallel tasks
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200 # 20 min wall-clock per task (TBLite tasks are faster)
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "openthoughts-tblite"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
38
environments/benchmarks/tblite/local.yaml
Normal file
38
environments/benchmarks/tblite/local.yaml
Normal file
@@ -0,0 +1,38 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
|
||||
#
|
||||
# Runs tasks in Docker containers on the local machine.
|
||||
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local.yaml
|
||||
#
|
||||
# # Override concurrency:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local.yaml \
|
||||
# --env.eval_concurrency 4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "docker"
|
||||
terminal_timeout: 300
|
||||
tool_pool_size: 16
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200
|
||||
eval_concurrency: 8 # max 8 tasks at once
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: false
|
||||
wandb_name: "openthoughts-tblite-local"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-sonnet-4"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
40
environments/benchmarks/tblite/local_vllm.yaml
Normal file
40
environments/benchmarks/tblite/local_vllm.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
|
||||
#
|
||||
# Runs against a local vLLM server with Docker sandboxes.
|
||||
#
|
||||
# Start the vLLM server from the atropos directory:
|
||||
# python -m example_trainer.vllm_api_server \
|
||||
# --model Qwen/Qwen3-4B-Instruct-2507 \
|
||||
# --port 9001 \
|
||||
# --gpu-memory-utilization 0.8 \
|
||||
# --max-model-len=32000
|
||||
#
|
||||
# Then run:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local_vllm.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 16000
|
||||
agent_temperature: 0.6
|
||||
terminal_backend: "docker"
|
||||
terminal_timeout: 300
|
||||
tool_pool_size: 16
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200
|
||||
eval_concurrency: 8
|
||||
tool_call_parser: "hermes"
|
||||
system_prompt: "You are an expert terminal agent. You MUST use the provided tools to complete tasks. Use the terminal tool to run shell commands, read_file to read files, write_file to write files, search_files to search, and patch to edit files. Do NOT write out solutions as text - execute them using the tools. Always start by exploring the environment with terminal commands."
|
||||
tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
use_wandb: false
|
||||
wandb_name: "tblite-qwen3-4b-instruct"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
|
||||
|
||||
openai:
|
||||
base_url: "http://localhost:9001"
|
||||
model_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
server_type: "vllm"
|
||||
health_check: false
|
||||
42
environments/benchmarks/tblite/run_eval.sh
Executable file
42
environments/benchmarks/tblite/run_eval.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash

# OpenThoughts-TBLite Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/tblite/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --env.task_filter broken-python,pandas-etl
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

set -euo pipefail

# Locate the environment module and its config relative to this script, so
# the documented "run from repo root" invocation works (a bare
# `python tblite_env.py` only resolves when the cwd is this directory).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Logs and the evals/ directory are created relative to the current directory.
mkdir -p logs evals/openthoughts-tblite
LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"

echo "OpenThoughts-TBLite Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO

# Extra CLI arguments ("$@") come after --config so they can override it.
python "$SCRIPT_DIR/tblite_env.py" evaluate \
    --config "$SCRIPT_DIR/default.yaml" \
    "$@" \
    2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
# NOTE(review): default.yaml sets data_dir_to_save_evals to
# environments/benchmarks/evals/openthoughts-tblite -- confirm which path
# actually receives the results.
echo "Eval results: evals/openthoughts-tblite/"
|
||||
119
environments/benchmarks/tblite/tblite_env.py
Normal file
119
environments/benchmarks/tblite/tblite_env.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
OpenThoughts-TBLite Evaluation Environment
|
||||
|
||||
A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
|
||||
agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
|
||||
to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
|
||||
tasks vs TB2's 89 harder tasks).
|
||||
|
||||
TBLite tasks are a curated subset of TB2 with a difficulty distribution
|
||||
designed to give meaningful signal even for smaller models:
|
||||
- Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5
|
||||
- Medium (26 tasks): 40-69% pass rate
|
||||
- Hard (26 tasks): 10-39% pass rate
|
||||
- Extreme (8 tasks): < 10% pass rate
|
||||
|
||||
Usage:
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||
|
||||
# Filter to specific tasks:
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \\
|
||||
--env.task_filter "broken-python,pandas-etl"
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
# Four levels up from this file (tblite/ -> benchmarks/ -> environments/ ->
# repo root). Put it on sys.path so the absolute `environments.*` import below
# resolves when this file is executed directly as a script.
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from atroposlib.envs.base import EvalHandlingEnum
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
from environments.benchmarks.terminalbench_2.terminalbench2_env import (
|
||||
TerminalBench2EvalConfig,
|
||||
TerminalBench2EvalEnv,
|
||||
)
|
||||
|
||||
|
||||
class TBLiteEvalConfig(TerminalBench2EvalConfig):
    """Settings for the OpenThoughts-TBLite evaluation environment.

    Every field of the TB2 config is inherited unchanged. Two defaults are
    overridden here: the dataset to load and the per-task wall-clock limit.
    """

    # HuggingFace dataset id loaded by default.
    dataset_name: str = Field(
        default="NousResearch/openthoughts-tblite",
        description="HuggingFace dataset containing TBLite tasks.",
    )

    # Per-task wall-clock budget in seconds (1200 s = 20 minutes).
    task_timeout: int = Field(
        default=1200,
        description="Maximum wall-clock seconds per task. TBLite tasks are generally faster than TB2, so 20 minutes is usually sufficient.",
    )
|
||||
|
||||
|
||||
class TBLiteEvalEnv(TerminalBench2EvalEnv):
    """Evaluation environment for the OpenThoughts-TBLite benchmark.

    A subclass of TerminalBench2EvalEnv that overrides only the environment
    name, the config class and the default configuration; no evaluation
    method is redefined here.
    """

    name = "openthoughts-tblite"
    env_config_cls = TBLiteEvalConfig

    @classmethod
    def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
        """Build the default environment config and inference server list.

        Returns:
            A ``(env_config, server_configs)`` tuple: the TBLite config and a
            one-element list holding an OpenRouter server whose API key is
            read from ``OPENROUTER_API_KEY`` (empty string when unset).
        """
        # Inference endpoint used when no other server is configured.
        openrouter_server = APIServerConfig(
            base_url="https://openrouter.ai/api/v1",
            model_name="anthropic/claude-sonnet-4",
            server_type="openai",
            api_key=os.getenv("OPENROUTER_API_KEY", ""),
            health_check=False,
        )

        env_config = TBLiteEvalConfig(
            # Toolsets exposed to the agent
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # Agent loop / generation settings
            max_agent_turns=60,
            max_token_length=16000,
            agent_temperature=0.6,
            system_prompt=None,
            # Terminal sandbox
            terminal_backend="modal",
            terminal_timeout=300,
            # Test-suite timeout
            test_timeout=180,
            # 100 tasks in parallel
            tool_pool_size=128,
            # Eval-only run settings
            eval_handling=EvalHandlingEnum.STOP_TRAIN,
            group_size=1,
            steps_per_eval=1,
            total_steps=1,
            # Tokenizer and logging
            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
            use_wandb=True,
            wandb_name="openthoughts-tblite",
            ensure_scores_are_not_same=False,
        )

        return env_config, [openrouter_server]
|
||||
|
||||
|
||||
# Script entry point: hand control to the environment's CLI (the usage
# examples in this module invoke it with the `evaluate` subcommand).
if __name__ == "__main__":
    TBLiteEvalEnv.cli()
|
||||
0
environments/benchmarks/terminalbench_2/__init__.py
Normal file
0
environments/benchmarks/terminalbench_2/__init__.py
Normal file
42
environments/benchmarks/terminalbench_2/default.yaml
Normal file
42
environments/benchmarks/terminalbench_2/default.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Terminal-Bench 2.0 Evaluation -- Default Configuration
|
||||
#
|
||||
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
|
||||
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||
# and OpenRouter for inference.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
# --config environments/benchmarks/terminalbench_2/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
# --config environments/benchmarks/terminalbench_2/default.yaml \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||
dataset_name: "NousResearch/terminal-bench-2"
|
||||
test_timeout: 600
|
||||
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "terminal-bench-2"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
||||
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
|
||||
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
|
||||
# are created simultaneously inside thread pool workers via asyncio.run().
|
||||
max_concurrent_tasks: 8
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
42
environments/benchmarks/terminalbench_2/run_eval.sh
Executable file
42
environments/benchmarks/terminalbench_2/run_eval.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash

# Terminal-Bench 2.0 Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
#       --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
#       --env.task_filter fix-git,git-multibranch
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

set -euo pipefail

# Locate the environment module and its config relative to this script, so
# the documented "run from repo root" invocation works (a bare
# `python terminalbench2_env.py` only resolves when the cwd is this directory).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Logs and the evals/ directory are created relative to the current directory.
mkdir -p logs evals/terminal-bench-2
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"

echo "Terminal-Bench 2.0 Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO

# Extra CLI arguments ("$@") come after --config so they can override it.
python "$SCRIPT_DIR/terminalbench2_env.py" evaluate \
    --config "$SCRIPT_DIR/default.yaml" \
    "$@" \
    2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
# NOTE(review): default.yaml sets data_dir_to_save_evals to
# environments/benchmarks/evals/terminal-bench-2 -- confirm which path
# actually receives the results.
echo "Eval results: evals/terminal-bench-2/"
|
||||
515
environments/benchmarks/terminalbench_2/terminalbench2_env.py
Normal file
515
environments/benchmarks/terminalbench_2/terminalbench2_env.py
Normal file
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
TerminalBench2Env -- Terminal-Bench 2.0 Evaluation Environment
|
||||
|
||||
Evaluates agentic LLMs on challenging terminal tasks from Terminal-Bench 2.0.
|
||||
Each task provides a unique Docker environment (pre-built on Docker Hub), a natural
|
||||
language instruction, and a test suite for verification. The agent uses terminal +
|
||||
file tools to complete the task, then the test suite runs inside the same sandbox.
|
||||
|
||||
This is an eval-only environment (not a training environment). It is designed to
|
||||
be run via the `evaluate` subcommand:
|
||||
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \\
|
||||
--env.dataset_name NousResearch/terminal-bench-2
|
||||
|
||||
The evaluate flow:
|
||||
1. setup() -- Loads the TB2 dataset from HuggingFace
|
||||
2. evaluate() -- Iterates over all tasks, running each through:
|
||||
a. rollout_and_score_eval() -- Per-task agent loop + test verification
|
||||
- Resolves Docker image (pre-built Hub image or Dockerfile fallback)
|
||||
- Registers per-task Modal sandbox via register_task_env_overrides()
|
||||
- Runs the HermesAgentLoop (terminal + file tools)
|
||||
- Uploads test suite and runs test.sh in the same sandbox
|
||||
- Returns binary pass/fail result
|
||||
b. Aggregates per-task, per-category, and overall pass rates
|
||||
c. Logs results via evaluate_log() and wandb
|
||||
|
||||
Key features:
|
||||
- Per-task Modal sandboxes using pre-built Docker Hub images
|
||||
- Binary reward: 1.0 if all tests pass, 0.0 otherwise
|
||||
- Concurrency-controlled parallel evaluation via asyncio.Semaphore
|
||||
- Per-task, per-category, and aggregate pass rate tracking
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Ensure repo root is on sys.path for imports
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from atroposlib.envs.base import EvalHandlingEnum
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.tool_context import ToolContext
|
||||
from tools.terminal_tool import (
|
||||
register_task_env_overrides,
|
||||
clear_task_env_overrides,
|
||||
cleanup_vm,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
class TerminalBench2EvalConfig(HermesAgentEnvConfig):
    """
    Settings for the Terminal-Bench 2.0 evaluation environment.

    Adds TB2-specific fields on top of HermesAgentEnvConfig: the dataset to
    load, the test-suite timeout, the image build strategy, task selection,
    a per-task wall-clock limit, and concurrency limits.
    """

    # --- Dataset ---
    dataset_name: str = Field(
        description="HuggingFace dataset containing TB2 tasks.",
        default="NousResearch/terminal-bench-2",
    )

    # --- Test execution ---
    test_timeout: int = Field(
        description="Timeout in seconds for running the test suite after agent completes.",
        default=180,
    )

    # --- Image strategy ---
    force_build: bool = Field(
        description=(
            "If True, always build from Dockerfile (ignore docker_image). "
            "Useful for testing custom Dockerfiles."
        ),
        default=False,
    )

    # --- Task filtering (both fields are comma-separated strings from the CLI) ---
    task_filter: Optional[str] = Field(
        description=(
            "Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
            "If not set, all tasks are run."
        ),
        default=None,
    )
    skip_tasks: Optional[str] = Field(
        description="Comma-separated task names to skip on top of the default skip list.",
        default=None,
    )

    # --- Per-task wall-clock timeout ---
    task_timeout: int = Field(
        description=(
            "Maximum wall-clock seconds per task (agent loop + verification). "
            "Tasks exceeding this are scored as FAIL. Default 30 minutes."
        ),
        default=1800,
    )

    # --- Concurrency control ---
    max_concurrent_tasks: int = Field(
        description=(
            "Maximum number of tasks to run concurrently. "
            "Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
            "Modal has internal limits and creating too many sandboxes simultaneously "
            "causes blocking calls to deadlock inside the thread pool."
        ),
        default=8,
    )

    # --- Eval concurrency ---
    eval_concurrency: int = Field(
        description=(
            "Maximum number of tasks to evaluate in parallel. "
            "0 means unlimited (all tasks run concurrently). "
            "Set to 8 for local backends to avoid overwhelming the machine."
        ),
        default=0,
    )
|
||||
|
||||
|
||||
# Task names that are dropped from the run (and therefore from scoring) when
# the terminal backend is Modal, because they cannot run properly there.
MODAL_INCOMPATIBLE_TASKS = {
    # Needs KVM/hardware virtualization
    "qemu-startup",
    "qemu-alpine-ssh",
    # Password brute-force -- too slow for cloud sandbox timeouts
    "crack-7z-hash",
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tar extraction helper
|
||||
# =============================================================================
|
||||
|
||||
def _extract_base64_tar(b64_data: str, target_dir: Path):
|
||||
"""Extract a base64-encoded tar.gz archive into target_dir."""
|
||||
if not b64_data:
|
||||
return
|
||||
raw = base64.b64decode(b64_data)
|
||||
buf = io.BytesIO(raw)
|
||||
with tarfile.open(fileobj=buf, mode="r:gz") as tar:
|
||||
tar.extractall(path=str(target_dir))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Environment
|
||||
# =============================================================================
|
||||
|
||||
class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
Terminal-Bench 2.0 evaluation environment (eval-only, no training).
|
||||
|
||||
Inherits from HermesAgentBaseEnv for:
|
||||
- Terminal backend setup (os.environ["TERMINAL_ENV"])
|
||||
- Tool resolution via _resolve_tools_for_group()
|
||||
- Monkey patches for async-safe tool operation
|
||||
- Wandb trajectory formatting
|
||||
|
||||
The evaluate flow (triggered by `environment.py evaluate`):
|
||||
1. setup() -- Load dataset from HuggingFace
|
||||
2. evaluate() -- Run all tasks through rollout_and_score_eval()
|
||||
|
||||
Each task in rollout_and_score_eval():
|
||||
1. Resolve Docker image (pre-built Hub image or Dockerfile fallback)
|
||||
2. Register per-task Modal sandbox override
|
||||
3. Run HermesAgentLoop with terminal + file tools
|
||||
4. Upload test suite and execute test.sh in the same sandbox
|
||||
5. Check /logs/verifier/reward.txt for pass/fail
|
||||
6. Clean up sandbox, overrides, and temp files
|
||||
"""
|
||||
|
||||
name = "terminal-bench-2"
|
||||
env_config_cls = TerminalBench2EvalConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[TerminalBench2EvalConfig, List[APIServerConfig]]:
|
||||
"""
|
||||
Default configuration for Terminal-Bench 2.0 evaluation.
|
||||
|
||||
Uses eval-only settings:
|
||||
- eval_handling=STOP_TRAIN so the eval flow runs cleanly
|
||||
- steps_per_eval=1, total_steps=1 so eval triggers immediately
|
||||
- group_size=1 (one rollout per group, each task is expensive)
|
||||
|
||||
Uses Modal terminal backend (cloud-isolated sandbox per task) and
|
||||
OpenRouter with Claude for inference.
|
||||
"""
|
||||
env_config = TerminalBench2EvalConfig(
|
||||
# Terminal + file tools only (the agent interacts via shell commands)
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
disabled_toolsets=None,
|
||||
distribution=None,
|
||||
|
||||
# Agent settings -- TB2 tasks are complex, need many turns
|
||||
max_agent_turns=60,
|
||||
max_token_length=***
|
||||
agent_temperature=0.6,
|
||||
system_prompt=None,
|
||||
|
||||
# Modal backend for per-task cloud-isolated sandboxes
|
||||
terminal_backend="modal",
|
||||
terminal_timeout=300, # 5 min per command (builds, pip install, etc.)
|
||||
|
||||
# Test execution timeout (TB2 test scripts can install deps like pytest)
|
||||
test_timeout=180,
|
||||
|
||||
# 89 tasks run in parallel, each needs a thread for tool calls
|
||||
tool_pool_size=128,
|
||||
|
||||
# --- Eval-only Atropos settings ---
|
||||
# These settings make the env work as an eval-only environment:
|
||||
# - STOP_TRAIN: pauses training during eval (standard for eval envs)
|
||||
# - steps_per_eval=1, total_steps=1: eval triggers immediately
|
||||
# - group_size=1: one rollout per group (each task is expensive)
|
||||
eval_handling=EvalHandlingEnum.STOP_TRAIN,
|
||||
group_size=1,
|
||||
steps_per_eval=1,
|
||||
total_steps=1,
|
||||
|
||||
tokenizer_name="NousRe...1-8B",
|
||||
use_wandb=True,
|
||||
wandb_name="terminal-bench-2",
|
||||
ensure_scores_are_not_same=False, # Binary rewards may all be 0 or 1
|
||||
)
|
||||
|
||||
# OpenRouter with Claude -- API key loaded from .env
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-sonnet-4",
|
||||
server_type="openai",
|
||||
api_key=os.get...EY", ""),
|
||||
health_check=False,
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
# =========================================================================
|
||||
# Setup -- load dataset
|
||||
# =========================================================================
|
||||
|
||||
async def setup(self):
    """
    Load the Terminal-Bench 2.0 dataset from HuggingFace and prepare the run.

    Steps:
      1. Set the terminal lifetime to task_timeout + 120s (config field and
         TERMINAL_LIFETIME_SECONDS env var).
      2. Load the "train" split of config.dataset_name.
      3. Apply task_filter, then drop Modal-incompatible tasks (Modal backend
         only) and any user-supplied skip_tasks.
      4. Build the per-category index and reset metric tracking.
      5. Open a timestamped JSONL file next to this module (logs/) that
         _save_result() appends to. The handle is intentionally left open;
         this method does not close it.
    """
    import datetime
    import threading

    from datasets import load_dataset

    # Auto-set terminal_lifetime to task_timeout + 120s so sandboxes
    # never get killed during an active task, but still get cleaned up
    # promptly after the task times out.
    lifetime = self.config.task_timeout + 120
    self.config.terminal_lifetime = lifetime
    os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
    print(f" Terminal lifetime auto-set to {lifetime}s (task_timeout + 120s)")

    print(f"Loading TB2 dataset from: {self.config.dataset_name}")
    ds = load_dataset(self.config.dataset_name, split="train")

    # Apply task filters (comma-separated strings from CLI)
    tasks = list(ds)
    if self.config.task_filter:
        allowed = {name.strip() for name in self.config.task_filter.split(",")}
        tasks = [t for t in tasks if t["task_name"] in allowed]
        print(f" Filtered to {len(tasks)} tasks: {sorted(allowed)}")

    # Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
    # plus any user-specified skip_tasks
    skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
    if self.config.skip_tasks:
        skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
    if skip:
        # Report only tasks that are actually removed here, not every skip
        # name that exists somewhere in the dataset (task_filter may already
        # have excluded some of them).
        removed = [t["task_name"] for t in tasks if t["task_name"] in skip]
        tasks = [t for t in tasks if t["task_name"] not in skip]
        if removed:
            print(f" Skipped {len(removed)} incompatible tasks: {sorted(set(removed))}")

    self.all_eval_items = tasks
    self.iter = 0

    # Build category index for per-category metrics
    self.category_index: Dict[str, List[int]] = defaultdict(list)
    for i, task in enumerate(self.all_eval_items):
        self.category_index[task.get("category", "unknown")].append(i)

    # Reward tracking for wandb logging
    self.eval_metrics: List[Tuple[str, float]] = []

    # Streaming JSONL writer -- saves each task's full conversation
    # immediately on completion so data is preserved even on Ctrl+C.
    # Timestamped filename so each run produces a unique file.
    log_dir = os.path.join(os.path.dirname(__file__), "logs")
    os.makedirs(log_dir, exist_ok=True)
    run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
    # Explicit UTF-8: _save_result() writes ensure_ascii=False JSON, which a
    # locale-default encoding (e.g. cp1252) cannot always represent.
    self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
    self._streaming_lock = threading.Lock()
    print(f" Streaming results to: {self._streaming_path}")

    print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
    for cat, indices in sorted(self.category_index.items()):
        print(f" {cat}: {len(indices)} tasks")
|
||||
|
||||
def _save_result(self, result: Dict[str, Any]):
|
||||
"""Write a single task result to the streaming JSONL file immediately."""
|
||||
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
|
||||
return
|
||||
with self._streaming_lock:
|
||||
self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
|
||||
self._streaming_file.flush()
|
||||
|
||||
# =========================================================================
|
||||
# Training pipeline stubs -- NOT used in eval-only mode
|
||||
# =========================================================================
|
||||
# These satisfy the abstract method requirements from HermesAgentBaseEnv.
|
||||
# The evaluate subcommand calls setup() -> evaluate() directly, bypassing
|
||||
# the training pipeline entirely.
|
||||
|
||||
async def get_next_item(self):
    """Hand out eval items round-robin (stub -- not used in eval-only mode)."""
    position = self.iter % len(self.all_eval_items)
    self.iter += 1
    return self.all_eval_items[position]
|
||||
|
||||
def format_prompt(self, item: Dict[str, Any]) -> str:
    """Use the task's "instruction" field, unchanged, as the user prompt."""
    instruction = item["instruction"]
    return instruction
|
||||
|
||||
async def compute_reward(self, item, result, ctx) -> float:
    """Always return 0.0 (stub -- real verification lives in rollout_and_score_eval)."""
    placeholder_reward = 0.0
    return placeholder_reward
|
||||
|
||||
async def collect_trajectories(self, item):
    """Return an empty (None, []) pair (stub -- not used in eval-only mode)."""
    no_group, no_backlog = None, []
    return no_group, no_backlog
|
||||
|
||||
async def score(self, rollout_group_data):
    """Return None for any input (stub -- not used in eval-only mode)."""
    nothing_scored = None
    return nothing_scored
|
||||
|
||||
# =========================================================================
|
||||
# Docker image resolution
|
||||
# =========================================================================
|
||||
|
||||
def _resolve_task_image(
|
||||
self, item: Dict[str, Any], task_name: str
|
||||
) -> Tuple[str, Optional[Path]]:
|
||||
"""
|
||||
Resolve the Docker image for a task, with fallback to Dockerfile.
|
||||
|
||||
Strategy (mirrors Harbor's approach):
|
||||
1. If force_build=True, always build from Dockerfile in environment_tar
|
||||
2. If docker_image is available, use the pre-built Docker Hub image (fast)
|
||||
3. Otherwise, extract Dockerfile from environment_tar and build (slow)
|
||||
|
||||
Returns:
|
||||
(modal_image, temp_dir) -- modal_image is a Docker Hub name or a
|
||||
Dockerfile path. temp_dir is set if we extracted files that need
|
||||
cleanup later.
|
||||
"""
|
||||
docker_image = item.get("docker_image", "")
|
||||
environment_tar = item.get("environment_tar", "")
|
||||
|
||||
# Fast path: use pre-built Docker Hub image
|
||||
if docker_image and not self.config.force_build:
|
||||
logger.info("Task %s: using pre-built image %s", task_name, docker_image)
|
||||
return docker_image, None
|
||||
|
||||
# Slow path: extract Dockerfile from environment_tar and build
|
||||
if environment_tar:
|
||||
task_dir = Path(tempfile.mkdtemp(prefix=f"tb2-{task_name}-"))
|
||||
_extract_base64_tar(environment_tar, task_dir)
|
||||
dockerfile_path = task_dir / "Dockerfile"
|
||||
if dockerfile_path.exists():
|
||||
logger.info(
|
||||
"Task %s: building from Dockerfile (force_build=%s, docker_image=%s)",
|
||||
task_name, self.config.force_build, bool(docker_image),
|
||||
)
|
||||
return str(dockerfile_path), task_dir
|
||||
|
||||
# Neither available -- fall back to Hub image if force_build was True
|
||||
if docker_image:
|
||||
logger.warning(
|
||||
"Task %s: force_build=True but no environment_tar, "
|
||||
"falling back to docker_image %s", task_name, docker_image,
|
||||
)
|
||||
return docker_image, None
|
||||
|
||||
return "", None
|
||||
|
||||
# =========================================================================
|
||||
# Per-task evaluation -- agent loop + test verification
|
||||
# =========================================================================
|
||||
|
||||
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
|
||||
"""
|
||||
Evaluate a single TB2 task: run the agent loop, then verify with tests.
|
||||
|
||||
This is the core evaluation method. For each task it:
|
||||
1. Resolves the Docker image and registers the Modal sandbox override
|
||||
2. Runs HermesAgentLoop with terminal + file tools
|
||||
3. Uploads the test suite into the sandbox
|
||||
4. Executes test.sh and checks the result
|
||||
5. Cleans up the sandbox and temp files
|
||||
|
||||
Args:
|
||||
eval_item: A single TB2 task dict from the dataset
|
||||
|
||||
Returns:
|
||||
Dict with 'passed' (bool), 'reward' (float), 'task_name' (str),
|
||||
'category' (str), and optional debug info
|
||||
"""
|
||||
task_name = eval_item.get("task_name", "unknown")
|
||||
category = eval_item.get("category", "unknown")
|
||||
task_id = str(uuid.uuid4())
|
||||
task_dir = None # Set if we extract a Dockerfile (needs cleanup)
|
||||
|
||||
from tqdm import tqdm
|
||||
tqdm.write(f" [START] {task_name} (task_id={task_id[:8]})")
|
||||
task_start = time.time()
|
||||
|
||||
try:
|
||||
# --- 1. Resolve Docker image ---
|
||||
modal_image, task_dir = self._resolve_task_image(eval_item, task_name)
|
||||
if not modal_image:
|
||||
logger.error("Task %s: no docker_image or environment_tar, skipping", task_name)
|
||||
return {
|
||||
"passed": False, "reward": 0.0,
|
||||
"task_name": task_name, "category": category,
|
||||
"error": "no_image",
|
||||
}
|
||||
|
||||
# --- 2. Register per-task image override ---
|
||||
# Set both modal_image and docker_image so the task image is used
|
||||
# regardless of which backend is configured.
|
||||
register_task_env_overrides(task_id, {
|
||||
"modal_image": modal_image,
|
||||
"docker_image": modal_image,
|
||||
"cwd": "/app",
|
||||
})
|
||||
logger.info(
|
||||
"Task %s: registered image override for task_id %s",
|
||||
task_name, task_id[:8],
|
||||
)
|
||||
|
||||
# --- 3. Resolve tools and build messages ---
|
||||
tools, valid_names = self._resolve_tools_for_group()
|
||||
|
||||
messages: List[Dict[str, Any]] = []
|
||||
if self.config.system_prompt:
|
||||
messages.append({"role": "system", "content": self.config.system_prompt})
|
||||
messages.append({"role": "user", "content": self.format_prompt(eval_item)})
|
||||
|
||||
# --- 4. Run agent loop ---
|
||||
# Use ManagedServer (Phase 2) for vLLM/SGLang backends to get
|
||||
# token-level tracking via /generate. Falls back to direct
|
||||
# ServerManager (Phase 1) for OpenAI endpoints.
|
||||
if self._use_managed_server():
|
||||
async with self.server.managed_server(
|
||||
tokenizer=self.tokenizer,
|
||||
preserve_think_blocks=bool(self.config.thinking_mode),
|
||||
) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
else:
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
# --- 5. Verify -- run test suite in the agent's sandbox ---
|
||||
# Skip verification if the agent produced no meaningful output
|
||||
only_system_and_user = all(
|
||||
msg.get("role") in ("system", "user") for msg in result.messages
|
||||
)
|
||||
if result.turns_used == 0 or only_system_and_user:
|
||||
logger.warning(
|
||||
"Task %s: agent produced no output (turns=%d). Reward=0.",
|
||||
task_name, result.turns_used,
|
||||
)
|
||||
reward = 0.0
|
||||
else:
|
||||
# Run tests in a thread so the blocking ctx.terminal() calls
|
||||
115
environments/benchmarks/yc_bench/README.md
Normal file
115
environments/benchmarks/yc_bench/README.md
Normal file
@@ -0,0 +1,115 @@
|
||||
# YC-Bench: Long-Horizon Agent Benchmark
|
||||
|
||||
[YC-Bench](https://github.com/collinear-ai/yc-bench) by [Collinear AI](https://collinear.ai/) is a deterministic, long-horizon benchmark that tests LLM agents' ability to act as a tech startup CEO. The agent manages a simulated company over 1-3 years, making compounding decisions about resource allocation, cash flow, task management, and prestige specialisation across 4 skill domains.
|
||||
|
||||
Unlike TerminalBench2 (which evaluates per-task coding ability with binary pass/fail), YC-Bench measures **long-term strategic coherence** — whether an agent can maintain consistent strategy, manage compounding consequences, and adapt plans over hundreds of turns.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
# Install yc-bench (optional dependency)
|
||||
pip install "hermes-agent[yc-bench]"
|
||||
|
||||
# Or install from source
|
||||
git clone https://github.com/collinear-ai/yc-bench
|
||||
cd yc-bench && pip install -e .
|
||||
|
||||
# Verify
|
||||
yc-bench --help
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
# From the repo root:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh
|
||||
|
||||
# Or directly:
|
||||
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
--config environments/benchmarks/yc_bench/default.yaml
|
||||
|
||||
# Override model:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
--openai.model_name anthropic/claude-opus-4-20250514
|
||||
|
||||
# Quick single-preset test:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
--env.presets '["fast_test"]' --env.seeds '[1]'
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
HermesAgentLoop (our agent)
|
||||
-> terminal tool -> subprocess("yc-bench company status") -> JSON output
|
||||
-> terminal tool -> subprocess("yc-bench task accept --task-id X") -> JSON
|
||||
-> terminal tool -> subprocess("yc-bench sim resume") -> JSON (advance time)
|
||||
-> ... (100-500 turns per run)
|
||||
```
|
||||
|
||||
The environment initialises the simulation via `yc-bench sim init` (NOT `yc-bench run`, which would start yc-bench's own built-in agent loop). Our `HermesAgentLoop` then drives all interaction through CLI commands.
|
||||
|
||||
### Simulation Mechanics
|
||||
|
||||
- **4 skill domains**: research, inference, data_environment, training
|
||||
- **Prestige system** (1.0-10.0): Gates access to higher-paying tasks
|
||||
- **Employee management**: Junior/Mid/Senior with domain-specific skill rates
|
||||
- **Throughput splitting**: `effective_rate = base_rate / N` active tasks per employee
|
||||
- **Financial pressure**: Monthly payroll, bankruptcy = game over
|
||||
- **Deterministic**: SHA256-based RNG — same seed + preset = same world
|
||||
|
||||
### Difficulty Presets
|
||||
|
||||
| Preset | Employees | Tasks | Focus |
|
||||
|-----------|-----------|-------|-------|
|
||||
| tutorial | 3 | 50 | Basic loop mechanics |
|
||||
| easy | 5 | 100 | Throughput awareness |
|
||||
| **medium**| 5 | 150 | Prestige climbing + domain specialisation |
|
||||
| **hard** | 7 | 200 | Precise ETA reasoning |
|
||||
| nightmare | 8 | 300 | Sustained perfection under payroll pressure |
|
||||
| fast_test | (varies) | (varies) | Quick validation (~50 turns) |
|
||||
|
||||
Default eval runs **fast_test + medium + hard** × 3 seeds = 9 runs.
|
||||
|
||||
### Scoring
|
||||
|
||||
```
|
||||
composite = 0.5 × survival + 0.5 × normalised_funds
|
||||
```
|
||||
|
||||
- **Survival** (binary): Did the company avoid bankruptcy?
|
||||
- **Normalised funds** (0.0-1.0): Log-scale relative to initial $250K capital
|
||||
|
||||
## Configuration
|
||||
|
||||
Key fields in `default.yaml`:
|
||||
|
||||
| Field | Default | Description |
|
||||
|-------|---------|-------------|
|
||||
| `presets` | `["fast_test", "medium", "hard"]` | Which presets to evaluate |
|
||||
| `seeds` | `[1, 2, 3]` | RNG seeds per preset |
|
||||
| `max_agent_turns` | 200 | Max LLM calls per run |
|
||||
| `run_timeout` | 3600 | Wall-clock timeout per run (seconds) |
|
||||
| `survival_weight` | 0.5 | Weight of survival in composite score |
|
||||
| `funds_weight` | 0.5 | Weight of normalised funds in composite |
|
||||
| `horizon_years` | null | Override horizon (null = auto from preset) |
|
||||
|
||||
## Cost & Time Estimates
|
||||
|
||||
Each run is 100-500 LLM turns. Approximate costs per run at typical API rates:
|
||||
|
||||
| Preset | Turns | Time | Est. Cost |
|
||||
|--------|-------|------|-----------|
|
||||
| fast_test | ~50 | 5-10 min | $1-5 |
|
||||
| medium | ~200 | 20-40 min | $5-15 |
|
||||
| hard | ~300 | 30-60 min | $10-25 |
|
||||
|
||||
Full default eval (9 runs): ~3-6 hours, $50-200 depending on model.
|
||||
|
||||
## References
|
||||
|
||||
- [collinear-ai/yc-bench](https://github.com/collinear-ai/yc-bench) — Official repository
|
||||
- [Collinear AI](https://collinear.ai/) — Company behind yc-bench
|
||||
- [TerminalBench2](../terminalbench_2/) — Per-task coding benchmark (complementary)
|
||||
0
environments/benchmarks/yc_bench/__init__.py
Normal file
0
environments/benchmarks/yc_bench/__init__.py
Normal file
43
environments/benchmarks/yc_bench/default.yaml
Normal file
43
environments/benchmarks/yc_bench/default.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
# YC-Bench Evaluation -- Default Configuration
|
||||
#
|
||||
# Long-horizon agent benchmark: agent plays CEO of an AI startup over
|
||||
# a simulated 1-3 year run, interacting via yc-bench CLI subcommands.
|
||||
#
|
||||
# Requires: pip install "hermes-agent[yc-bench]"
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
# --config environments/benchmarks/yc_bench/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
# --config environments/benchmarks/yc_bench/default.yaml \
|
||||
# --openai.model_name anthropic/claude-opus-4-20250514
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal"]
|
||||
max_agent_turns: 200
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.0
|
||||
terminal_backend: "local"
|
||||
terminal_timeout: 60
|
||||
presets: ["fast_test", "medium", "hard"]
|
||||
seeds: [1, 2, 3]
|
||||
run_timeout: 3600 # 60 min wall-clock per run, auto-FAIL if exceeded
|
||||
survival_weight: 0.5 # weight of binary survival in composite score
|
||||
funds_weight: 0.5 # weight of normalised final funds in composite score
|
||||
db_dir: "/tmp/yc_bench_dbs"
|
||||
company_name: "BenchCo"
|
||||
start_date: "01/01/2025" # MM/DD/YYYY (yc-bench convention)
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "yc-bench"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/yc-bench"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-sonnet-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
34
environments/benchmarks/yc_bench/run_eval.sh
Executable file
34
environments/benchmarks/yc_bench/run_eval.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash

# YC-Bench Evaluation
#
# Requires: pip install "hermes-agent[yc-bench]"
#
# Run from repo root:
#   bash environments/benchmarks/yc_bench/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#     --openai.model_name anthropic/claude-opus-4-20250514
#
# Run a single preset:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#     --env.presets '["fast_test"]' --env.seeds '[1]'

set -euo pipefail

# Paths are relative to the repo root (the expected working directory).
BENCH_DIR="environments/benchmarks/yc_bench"

mkdir -p logs evals/yc-bench
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"

echo "YC-Bench Evaluation"
echo "Log: $LOG_FILE"
echo ""

# Unbuffered output so the log fills in real time; the caller's LOGLEVEL
# wins, INFO is only the default.
export PYTHONUNBUFFERED=1
export LOGLEVEL="${LOGLEVEL:-INFO}"

# Extra CLI arguments are forwarded verbatim; all output is mirrored to the log.
python "$BENCH_DIR/yc_bench_env.py" evaluate \
  --config "$BENCH_DIR/default.yaml" \
  "$@" \
  2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
|
||||
847
environments/benchmarks/yc_bench/yc_bench_env.py
Normal file
847
environments/benchmarks/yc_bench/yc_bench_env.py
Normal file
@@ -0,0 +1,847 @@
|
||||
"""
|
||||
YCBenchEvalEnv -- YC-Bench Long-Horizon Agent Benchmark Environment
|
||||
|
||||
Evaluates agentic LLMs on YC-Bench: a deterministic, long-horizon benchmark
|
||||
where the agent acts as CEO of an AI startup over a simulated 1-3 year run.
|
||||
The agent manages cash flow, employees, tasks, and prestige across 4 domains,
|
||||
interacting exclusively via CLI subprocess calls against a SQLite-backed
|
||||
discrete-event simulation.
|
||||
|
||||
Unlike TerminalBench2 (per-task binary pass/fail), YC-Bench measures sustained
|
||||
multi-turn strategic coherence -- whether an agent can manage compounding
|
||||
decisions over hundreds of turns without going bankrupt.
|
||||
|
||||
This is an eval-only environment. Run via:
|
||||
|
||||
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \\
|
||||
--config environments/benchmarks/yc_bench/default.yaml
|
||||
|
||||
The evaluate flow:
|
||||
1. setup() -- Verifies yc-bench installed, builds eval matrix (preset x seed)
|
||||
2. evaluate() -- Iterates over all runs sequentially through:
|
||||
a. rollout_and_score_eval() -- Per-run agent loop
|
||||
- Initialises a fresh yc-bench simulation via `sim init` (NOT `run`)
|
||||
- Runs HermesAgentLoop with terminal tool only
|
||||
- Reads final SQLite DB to extract score
|
||||
- Returns survival (0/1) + normalised funds score
|
||||
b. Aggregates per-preset and overall metrics
|
||||
c. Logs results via evaluate_log() and wandb
|
||||
|
||||
Key features:
|
||||
- CLI-only interface: agent calls yc-bench subcommands via terminal tool
|
||||
- Deterministic: same seed + preset = same world (SHA256-based RNG)
|
||||
- Multi-dimensional scoring: survival + normalised final funds
|
||||
- Per-preset difficulty breakdown in results
|
||||
- Isolated SQLite DB per run (no cross-run state leakage)
|
||||
|
||||
Requires: pip install "hermes-agent[yc-bench]"
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from atroposlib.envs.base import EvalHandlingEnum
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
from environments.agent_loop import HermesAgentLoop
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# System prompt
|
||||
# =============================================================================
|
||||
|
||||
YC_BENCH_SYSTEM_PROMPT = """\
|
||||
You are the autonomous CEO of an early-stage AI startup in a deterministic
|
||||
business simulation. You manage the company exclusively through the `yc-bench`
|
||||
CLI tool. Your primary goal is to **survive** until the simulation horizon ends
|
||||
without going bankrupt, while **maximising final funds**.
|
||||
|
||||
## Simulation Mechanics
|
||||
|
||||
- **Funds**: You start with $250,000 seed capital. Revenue comes from completing
|
||||
tasks. Rewards scale with your prestige: `base × (1 + scale × (prestige − 1))`.
|
||||
- **Domains**: There are 4 skill domains: **research**, **inference**,
|
||||
**data_environment**, and **training**. Each has its own prestige level
|
||||
(1.0-10.0). Higher prestige unlocks better-paying tasks.
|
||||
- **Employees**: You have employees (Junior/Mid/Senior) with domain-specific
|
||||
skill rates. **Throughput splits**: `effective_rate = base_rate / N` where N
|
||||
is the number of active tasks assigned to that employee. Focus beats breadth.
|
||||
- **Payroll**: Deducted automatically on the first business day of each month.
|
||||
Running out of funds = bankruptcy = game over.
|
||||
- **Time**: The simulation runs on business days (Mon-Fri), 09:00-18:00.
|
||||
Time only advances when you call `yc-bench sim resume`.
|
||||
|
||||
## Task Lifecycle
|
||||
|
||||
1. Browse market tasks with `market browse`
|
||||
2. Accept a task with `task accept` (this sets its deadline)
|
||||
3. Assign employees with `task assign`
|
||||
4. Dispatch with `task dispatch` to start work
|
||||
5. Call `sim resume` to advance time and let employees make progress
|
||||
6. Tasks complete when all domain requirements are fulfilled
|
||||
|
||||
**Penalties for failure vary by difficulty preset.** Completing a task on time
|
||||
earns full reward + prestige gain. Missing a deadline or cancelling a task
|
||||
incurs prestige penalties -- cancelling is always more costly than letting a
|
||||
task fail, so cancel only as a last resort.
|
||||
|
||||
## CLI Commands
|
||||
|
||||
### Observe
|
||||
- `yc-bench company status` -- funds, prestige, runway
|
||||
- `yc-bench employee list` -- skills, salary, active tasks
|
||||
- `yc-bench market browse [--domain D] [--required-prestige-lte N]` -- available tasks
|
||||
- `yc-bench task list [--status active|planned]` -- your tasks
|
||||
- `yc-bench task inspect --task-id UUID` -- progress, deadline, assignments
|
||||
- `yc-bench finance ledger [--category monthly_payroll|task_reward]` -- transaction history
|
||||
- `yc-bench report monthly` -- monthly P&L
|
||||
|
||||
### Act
|
||||
- `yc-bench task accept --task-id UUID` -- accept from market
|
||||
- `yc-bench task assign --task-id UUID --employee-id UUID` -- assign employee
|
||||
- `yc-bench task dispatch --task-id UUID` -- start work (needs >=1 assignment)
|
||||
- `yc-bench task cancel --task-id UUID --reason "text"` -- cancel (prestige penalty)
|
||||
- `yc-bench sim resume` -- advance simulation clock
|
||||
|
||||
### Memory (persists across context truncation)
|
||||
- `yc-bench scratchpad read` -- read your persistent notes
|
||||
- `yc-bench scratchpad write --content "text"` -- overwrite notes
|
||||
- `yc-bench scratchpad append --content "text"` -- append to notes
|
||||
- `yc-bench scratchpad clear` -- clear notes
|
||||
|
||||
## Strategy Guidelines
|
||||
|
||||
1. **Specialise in 2-3 domains** to climb the prestige ladder faster and unlock
|
||||
high-reward tasks. Don't spread thin across all 4 domains early on.
|
||||
2. **Focus employees** -- assigning one employee to many tasks halves their
|
||||
throughput per additional task. Keep assignments concentrated.
|
||||
3. **Use the scratchpad** to track your strategy, upcoming deadlines, and
|
||||
employee assignments. This persists even if conversation context is truncated.
|
||||
4. **Monitor runway** -- always know how many months of payroll you can cover.
|
||||
Accept high-reward tasks before payroll dates.
|
||||
5. **Don't over-accept** -- taking too many tasks and missing deadlines cascades
|
||||
into prestige loss, locking you out of profitable contracts.
|
||||
6. Use `finance ledger` and `report monthly` to track revenue trends.
|
||||
|
||||
## Your Turn
|
||||
|
||||
Each turn:
|
||||
1. Call `yc-bench company status` and `yc-bench task list` to orient yourself.
|
||||
2. Check for completed tasks and pending deadlines.
|
||||
3. Browse market for profitable tasks within your prestige level.
|
||||
4. Accept, assign, and dispatch tasks strategically.
|
||||
5. Call `yc-bench sim resume` to advance time.
|
||||
6. Repeat until the simulation ends.
|
||||
|
||||
Think step by step before acting."""
|
||||
|
||||
# Seed capital every simulated company starts with, expressed in cents
# ($250,000). Used as the baseline when normalising final funds.
INITIAL_FUNDS_CENTS = 250_000 * 100

# Simulation horizon, in years, for each known preset. Consulted only when
# the config does not set an explicit horizon.
_PRESET_HORIZONS = dict(
    tutorial=1,
    easy=1,
    medium=1,
    hard=1,
    nightmare=1,
    fast_test=1,
    default=3,
    high_reward=1,
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
class YCBenchEvalConfig(HermesAgentEnvConfig):
    """
    Configuration for the YC-Bench evaluation environment.

    Extends HermesAgentEnvConfig with YC-Bench-specific settings for
    preset selection, seed control, scoring, and simulation parameters.
    One evaluation run is performed for every (preset, seed) combination.
    """

    # --- Eval matrix: setup() crosses every preset with every seed ---
    presets: List[str] = Field(
        default=["fast_test", "medium", "hard"],
        description="YC-Bench preset names to evaluate.",
    )
    seeds: List[int] = Field(
        default=[1, 2, 3],
        description="Random seeds -- each preset x seed = one run.",
    )
    # Applied per run through asyncio.wait_for in _run_with_timeout().
    run_timeout: int = Field(
        default=3600,
        description="Maximum wall-clock seconds per run. Default 60 minutes.",
    )
    # --- Composite score weights, forwarded to _compute_composite_score() ---
    survival_weight: float = Field(
        default=0.5,
        description="Weight of survival (0/1) in composite score.",
    )
    funds_weight: float = Field(
        default=0.5,
        description="Weight of normalised final funds in composite score.",
    )
    # Created by setup(); each run writes its own yc_bench_<run_key>.db here.
    db_dir: str = Field(
        default="/tmp/yc_bench_dbs",
        description="Directory for per-run SQLite databases.",
    )
    # Read with `horizon_years or <preset lookup>`, so 0 behaves like None.
    horizon_years: Optional[int] = Field(
        default=None,
        description=(
            "Simulation horizon in years. If None (default), inferred from "
            "preset name (1 year for most, 3 for 'default')."
        ),
    )
    # --- Values passed straight to `yc-bench sim init` ---
    company_name: str = Field(
        default="BenchCo",
        description="Name of the simulated company.",
    )
    start_date: str = Field(
        default="01/01/2025",
        description="Simulation start date in MM/DD/YYYY format (yc-bench convention).",
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Scoring helpers
|
||||
# =============================================================================
|
||||
|
||||
def _read_final_score(db_path: str) -> Dict[str, Any]:
    """
    Extract the end-of-run state from a YC-Bench SQLite database.

    Args:
        db_path: Filesystem path of the run's SQLite file.

    Returns:
        Dict with ``final_funds_cents``, ``survived`` and ``terminal_reason``.
        A missing file yields ``terminal_reason="db_missing"``; any failure
        while reading yields ``"db_error: <exception>"``. In both cases the
        funds are reported as 0 and ``survived`` is False.

    Note: yc-bench table names are plural -- 'companies' not 'company',
    'sim_events' not 'simulation_log'.
    """

    def _outcome(funds: Any, survived: bool, reason: str) -> Dict[str, Any]:
        # Single place that fixes the shape of the returned record.
        return {
            "final_funds_cents": funds,
            "survived": survived,
            "terminal_reason": reason,
        }

    if not os.path.exists(db_path):
        logger.warning("DB not found at %s", db_path)
        return _outcome(0, False, "db_missing")

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Final funds live in the 'companies' table; no row counts as 0.
        company_row = cursor.execute(
            "SELECT funds_cents FROM companies LIMIT 1"
        ).fetchone()
        funds = company_row[0] if company_row else 0

        # The most recent terminal event, if any, names how the run ended.
        terminal_reason = "unknown"
        try:
            terminal_row = cursor.execute(
                "SELECT event_type FROM sim_events "
                "WHERE event_type IN ('bankruptcy', 'horizon_end') "
                "ORDER BY scheduled_at DESC LIMIT 1"
            ).fetchone()
        except sqlite3.OperationalError:
            # Table may not exist if simulation didn't progress
            terminal_row = None
        if terminal_row:
            terminal_reason = terminal_row[0]

        went_bankrupt = terminal_reason == "bankruptcy"
        return _outcome(funds, funds >= 0 and not went_bankrupt, terminal_reason)

    except Exception as e:
        logger.error("Failed to read DB %s: %s", db_path, e)
        return _outcome(0, False, f"db_error: {e}")
    finally:
        if conn:
            conn.close()
|
||||
|
||||
|
||||
def _compute_composite_score(
    final_funds_cents: int,
    survived: bool,
    survival_weight: float = 0.5,
    funds_weight: float = 0.5,
    initial_funds_cents: int = INITIAL_FUNDS_CENTS,
) -> float:
    """
    Blend survival and final funds into one score.

    Score = survival_weight * survival_score
          + funds_weight * normalised_funds_score

    ``survival_score`` is 1.0 when the run survived, else 0.0.
    ``normalised_funds_score`` is log-scaled against the starting capital and
    capped at 1.0:
      - funds <= 0:        0.0
      - funds == initial: ~0.15
      - funds == 10x:     ~0.52
      - funds >= 100x:     1.0
    """
    # Ratio of final to initial funds at which the funds score saturates.
    saturation_ratio = 100.0

    if final_funds_cents <= 0:
        funds_score = 0.0
    else:
        # max(..., 1) guards against a zero or negative baseline.
        growth = final_funds_cents / max(initial_funds_cents, 1)
        scaled = math.log1p(growth) / math.log1p(saturation_ratio)
        funds_score = min(scaled, 1.0)

    survival_score = 1.0 if survived else 0.0
    return survival_weight * survival_score + funds_weight * funds_score
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Environment
|
||||
# =============================================================================
|
||||
|
||||
class YCBenchEvalEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
YC-Bench long-horizon agent benchmark environment (eval-only).
|
||||
|
||||
Each eval item is a (preset, seed) pair. The environment initialises the
|
||||
simulation via ``yc-bench sim init`` (NOT ``yc-bench run`` which would start
|
||||
a competing built-in agent loop). The HermesAgentLoop then drives the
|
||||
interaction by calling individual yc-bench CLI commands via the terminal tool.
|
||||
|
||||
After the agent loop ends, the SQLite DB is read to extract the final score.
|
||||
|
||||
Scoring:
|
||||
composite = survival_weight * survival + funds_weight * normalised_funds
(both weights default to 0.5)
|
||||
"""
|
||||
|
||||
name = "yc-bench"
|
||||
env_config_cls = YCBenchEvalConfig
|
||||
|
||||
@classmethod
def config_init(cls) -> Tuple[YCBenchEvalConfig, List[APIServerConfig]]:
    """
    Build the default environment config and API server list.

    Returns:
        ``(env_config, server_configs)``: a YCBenchEvalConfig restricted to
        the terminal toolset, and a one-element list holding an
        OpenRouter-backed APIServerConfig whose key is read from the
        OPENROUTER_API_KEY environment variable (empty string if unset).
    """
    # Agent loop settings: terminal toolset only, temperature 0.0.
    agent_settings = dict(
        enabled_toolsets=["terminal"],
        disabled_toolsets=None,
        distribution=None,
        max_agent_turns=200,
        max_token_length=32000,
        agent_temperature=0.0,
        system_prompt=YC_BENCH_SYSTEM_PROMPT,
        terminal_backend="local",
        terminal_timeout=60,
    )

    # YC-Bench specific settings (eval matrix, timeout, scoring, storage).
    benchmark_settings = dict(
        presets=["fast_test", "medium", "hard"],
        seeds=[1, 2, 3],
        run_timeout=3600,
        survival_weight=0.5,
        funds_weight=0.5,
        db_dir="/tmp/yc_bench_dbs",
    )

    # Remaining run / logging settings.
    run_settings = dict(
        eval_handling=EvalHandlingEnum.STOP_TRAIN,
        group_size=1,
        steps_per_eval=1,
        total_steps=1,
        tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
        use_wandb=True,
        wandb_name="yc-bench",
        ensure_scores_are_not_same=False,
    )

    env_config = YCBenchEvalConfig(
        **agent_settings, **benchmark_settings, **run_settings
    )

    default_server = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name="anthropic/claude-sonnet-4.6",
        server_type="openai",
        api_key=os.getenv("OPENROUTER_API_KEY", ""),
        health_check=False,
    )

    return env_config, [default_server]
|
||||
|
||||
# =========================================================================
|
||||
# Setup
|
||||
# =========================================================================
|
||||
|
||||
async def setup(self):
    """
    Verify yc-bench is installed and build the eval matrix.

    Side effects:
      - populates ``self.all_eval_items`` with one dict per (preset, seed)
      - resets ``self.iter`` and ``self.eval_metrics``
      - creates ``config.db_dir`` and a ``logs`` directory next to this file
      - opens a timestamped JSONL file for streaming per-run results

    Raises:
        RuntimeError: if ``yc-bench --help`` cannot be executed, times out,
            or exits with a non-zero status.
    """
    install_hint = (
        "yc-bench CLI not found. Install with:\n"
        ' pip install "hermes-agent[yc-bench]"\n'
        "Or: git clone https://github.com/collinear-ai/yc-bench "
        "&& cd yc-bench && pip install -e ."
    )

    # Verify yc-bench CLI is available
    try:
        result = subprocess.run(
            ["yc-bench", "--help"], capture_output=True, text=True, timeout=10
        )
    except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
        # Keep the underlying cause attached for debugging.
        raise RuntimeError(install_hint) from exc
    if result.returncode != 0:
        raise RuntimeError(install_hint)
    print("yc-bench CLI verified.")

    # Build eval matrix: preset x seed
    self.all_eval_items = [
        {"preset": preset, "seed": seed}
        for preset in self.config.presets
        for seed in self.config.seeds
    ]
    self.iter = 0

    os.makedirs(self.config.db_dir, exist_ok=True)
    self.eval_metrics: List[Tuple[str, float]] = []

    # Streaming JSONL log for crash-safe result persistence
    log_dir = os.path.join(os.path.dirname(__file__), "logs")
    os.makedirs(log_dir, exist_ok=True)
    run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
    # Explicit UTF-8: _save_result() writes JSON with ensure_ascii=False, and
    # the recorded messages contain non-ASCII characters, which a non-UTF-8
    # locale default encoding cannot represent.
    self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
    self._streaming_lock = threading.Lock()

    print(f"\nYC-Bench eval matrix: {len(self.all_eval_items)} runs")
    for item in self.all_eval_items:
        print(f" preset={item['preset']!r} seed={item['seed']}")
    print(f"Streaming results to: {self._streaming_path}\n")
|
||||
|
||||
def _save_result(self, result: Dict[str, Any]):
|
||||
"""Write a single run result to the streaming JSONL file immediately."""
|
||||
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
|
||||
return
|
||||
with self._streaming_lock:
|
||||
self._streaming_file.write(
|
||||
json.dumps(result, ensure_ascii=False, default=str) + "\n"
|
||||
)
|
||||
self._streaming_file.flush()
|
||||
|
||||
# =========================================================================
|
||||
# Training pipeline stubs (eval-only -- not used)
|
||||
# =========================================================================
|
||||
|
||||
async def get_next_item(self):
|
||||
item = self.all_eval_items[self.iter % len(self.all_eval_items)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item: Dict[str, Any]) -> str:
|
||||
preset = item["preset"]
|
||||
seed = item["seed"]
|
||||
return (
|
||||
f"A new YC-Bench simulation has been initialized "
|
||||
f"(preset='{preset}', seed={seed}).\n"
|
||||
f"Your company '{self.config.company_name}' is ready.\n\n"
|
||||
"Begin by calling:\n"
|
||||
"1. `yc-bench company status` -- see your starting funds and prestige\n"
|
||||
"2. `yc-bench employee list` -- see your team and their skills\n"
|
||||
"3. `yc-bench market browse --required-prestige-lte 1` -- find tasks "
|
||||
"you can take\n\n"
|
||||
"Then accept 2-3 tasks, assign employees, dispatch them, and call "
|
||||
"`yc-bench sim resume` to advance time. Repeat this loop until the "
|
||||
"simulation ends (horizon reached or bankruptcy)."
|
||||
)
|
||||
|
||||
async def compute_reward(self, item, result, ctx) -> float:
    """Training-pipeline stub: this eval-only environment always returns 0.0."""
    return 0.0
|
||||
|
||||
async def collect_trajectories(self, item):
    """Training-pipeline stub: returns ``(None, [])`` without running a rollout."""
    return None, []
|
||||
|
||||
async def score(self, rollout_group_data):
    """Training-pipeline stub: ignores its input and returns None."""
    return None
|
||||
|
||||
# =========================================================================
|
||||
# Per-run evaluation
|
||||
# =========================================================================
|
||||
|
||||
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
    """
    Evaluate a single (preset, seed) run.

    1. Sets DATABASE_URL and YC_BENCH_EXPERIMENT env vars
    2. Initialises the simulation via ``yc-bench sim init`` (NOT ``run``)
    3. Runs HermesAgentLoop with terminal tool
    4. Reads SQLite DB to compute final score
    5. Returns result dict with survival, funds, and composite score

    Args:
        eval_item: Dict with ``"preset"`` and ``"seed"`` keys.

    Returns:
        The per-run result record, which is also appended to the streaming
        JSONL log. Any ``Exception`` raised during init, the agent loop or
        scoring is logged and converted into a zero-score record carrying an
        ``"error"`` key instead of propagating.
    """
    preset = eval_item["preset"]
    seed = eval_item["seed"]
    run_id = str(uuid.uuid4())[:8]
    run_key = f"{preset}_seed{seed}_{run_id}"

    from tqdm import tqdm

    tqdm.write(f" [START] preset={preset!r} seed={seed} (run_id={run_id})")
    run_start = time.time()

    # Isolated DB per run -- prevents cross-run state leakage.
    # The path is made absolute because the URL is consumed by yc-bench
    # subprocesses; a relative path would be resolved against each
    # subprocess's working directory and could point at a different file
    # than the one scored below.
    db_path = os.path.abspath(
        os.path.join(self.config.db_dir, f"yc_bench_{run_key}.db")
    )
    os.environ["DATABASE_URL"] = f"sqlite:///{db_path}"
    os.environ["YC_BENCH_EXPERIMENT"] = preset

    # Determine horizon: explicit config override > preset lookup > default 1
    horizon = self.config.horizon_years or _PRESET_HORIZONS.get(preset, 1)

    try:
        # ----------------------------------------------------------
        # Step 1: Initialise the simulation via CLI
        # IMPORTANT: We use `sim init`, NOT `yc-bench run`.
        # `yc-bench run` starts yc-bench's own LLM agent loop (via
        # LiteLLM), which would compete with our HermesAgentLoop.
        # `sim init` just sets up the world and returns.
        # ----------------------------------------------------------
        init_cmd = [
            "yc-bench", "sim", "init",
            "--seed", str(seed),
            "--start-date", self.config.start_date,
            "--company-name", self.config.company_name,
            "--horizon-years", str(horizon),
        ]
        init_result = subprocess.run(
            init_cmd, capture_output=True, text=True, timeout=30,
        )
        if init_result.returncode != 0:
            error_msg = (init_result.stderr or init_result.stdout).strip()
            raise RuntimeError(f"yc-bench sim init failed: {error_msg}")

        tqdm.write(f" Simulation initialized (horizon={horizon}yr)")

        # ----------------------------------------------------------
        # Step 2: Run the HermesAgentLoop
        # ----------------------------------------------------------
        tools, valid_names = self._resolve_tools_for_group()

        messages: List[Dict[str, Any]] = [
            {"role": "system", "content": YC_BENCH_SYSTEM_PROMPT},
            {"role": "user", "content": self.format_prompt(eval_item)},
        ]

        agent = HermesAgentLoop(
            server=self.server,
            tool_schemas=tools,
            valid_tool_names=valid_names,
            max_turns=self.config.max_agent_turns,
            task_id=run_id,
            temperature=self.config.agent_temperature,
            max_tokens=self.config.max_token_length,
            extra_body=self.config.extra_body,
        )
        result = await agent.run(messages)

        # ----------------------------------------------------------
        # Step 3: Read final score from the simulation DB
        # ----------------------------------------------------------
        score_data = _read_final_score(db_path)
        final_funds = score_data["final_funds_cents"]
        survived = score_data["survived"]
        terminal_reason = score_data["terminal_reason"]

        composite = _compute_composite_score(
            final_funds_cents=final_funds,
            survived=survived,
            survival_weight=self.config.survival_weight,
            funds_weight=self.config.funds_weight,
        )

        elapsed = time.time() - run_start
        status = "SURVIVED" if survived else "BANKRUPT"
        if final_funds >= 0:
            funds_str = f"${final_funds / 100:,.0f}"
        else:
            funds_str = f"-${abs(final_funds) / 100:,.0f}"

        tqdm.write(
            f" [{status}] preset={preset!r} seed={seed} "
            f"funds={funds_str} score={composite:.3f} "
            f"turns={result.turns_used} ({elapsed:.0f}s)"
        )

        out = {
            "preset": preset,
            "seed": seed,
            "survived": survived,
            "final_funds_cents": final_funds,
            "final_funds_usd": final_funds / 100,
            "terminal_reason": terminal_reason,
            "composite_score": composite,
            "turns_used": result.turns_used,
            "finished_naturally": result.finished_naturally,
            "elapsed_seconds": elapsed,
            "db_path": db_path,
            "messages": result.messages,
        }
        self._save_result(out)
        return out

    except Exception as e:
        elapsed = time.time() - run_start
        logger.error("Run %s failed: %s", run_key, e, exc_info=True)
        tqdm.write(
            f" [ERROR] preset={preset!r} seed={seed}: {e} ({elapsed:.0f}s)"
        )
        out = {
            "preset": preset,
            "seed": seed,
            "survived": False,
            "final_funds_cents": 0,
            "final_funds_usd": 0.0,
            "terminal_reason": f"error: {e}",
            "composite_score": 0.0,
            "turns_used": 0,
            "error": str(e),
            "elapsed_seconds": elapsed,
            # Same key as the success record, so a failed run's (possibly
            # partial) database can still be located for debugging.
            "db_path": db_path,
        }
        self._save_result(out)
        return out
|
||||
|
||||
# =========================================================================
|
||||
# Evaluate
|
||||
# =========================================================================
|
||||
|
||||
async def _run_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||
"""Wrap a single rollout with a wall-clock timeout."""
|
||||
preset = item["preset"]
|
||||
seed = item["seed"]
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
self.rollout_and_score_eval(item),
|
||||
timeout=self.config.run_timeout,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
from tqdm import tqdm
|
||||
tqdm.write(
|
||||
f" [TIMEOUT] preset={preset!r} seed={seed} "
|
||||
f"(exceeded {self.config.run_timeout}s)"
|
||||
)
|
||||
out = {
|
||||
"preset": preset,
|
||||
"seed": seed,
|
||||
"survived": False,
|
||||
"final_funds_cents": 0,
|
||||
"final_funds_usd": 0.0,
|
||||
"terminal_reason": f"timeout ({self.config.run_timeout}s)",
|
||||
"composite_score": 0.0,
|
||||
"turns_used": 0,
|
||||
"error": "timeout",
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
async def evaluate(self, *args, **kwargs) -> None:
    """
    Run YC-Bench evaluation over all (preset, seed) combinations.

    Runs sequentially -- each run is 100-500 turns, parallelising would
    be prohibitively expensive and cause env var conflicts.

    Flow:
      1. Route root logging through tqdm so log lines do not break the
         progress bar (this replaces the root logger's handlers).
      2. Execute every eval item via ``_run_with_timeout``.
      3. Aggregate overall and per-preset survival rate / composite score
         into ``self.eval_metrics`` and print a summary.
      4. Log via ``evaluate_log`` and release resources.

    On KeyboardInterrupt / task cancellation the method cleans up terminal
    environments, closes the streaming file and returns without computing
    metrics. The streaming file is also closed when no results exist.
    """
    start_time = time.time()
    from tqdm import tqdm

    def _close_stream() -> bool:
        """Close the streaming JSONL file if it is open; report whether it was."""
        if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
            self._streaming_file.close()
            return True
        return False

    # --- tqdm-compatible logging handler (TB2 pattern) ---
    class _TqdmHandler(logging.Handler):
        def emit(self, record):
            try:
                tqdm.write(self.format(record))
            except Exception:
                self.handleError(record)

    root = logging.getLogger()
    handler = _TqdmHandler()
    handler.setFormatter(
        logging.Formatter("%(levelname)s %(name)s: %(message)s")
    )
    root.handlers = [handler]
    for noisy in ("httpx", "openai"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    # --- Print config summary ---
    print(f"\n{'='*60}")
    print("Starting YC-Bench Evaluation")
    print(f"{'='*60}")
    print(f" Presets: {self.config.presets}")
    print(f" Seeds: {self.config.seeds}")
    print(f" Total runs: {len(self.all_eval_items)}")
    print(f" Max turns/run: {self.config.max_agent_turns}")
    print(f" Run timeout: {self.config.run_timeout}s")
    print(f"{'='*60}\n")

    results = []
    pbar = tqdm(
        total=len(self.all_eval_items), desc="YC-Bench", dynamic_ncols=True
    )

    try:
        for item in self.all_eval_items:
            result = await self._run_with_timeout(item)
            results.append(result)
            survived_count = sum(1 for r in results if r.get("survived"))
            pbar.set_postfix_str(
                f"survived={survived_count}/{len(results)}"
            )
            pbar.update(1)

    except (KeyboardInterrupt, asyncio.CancelledError):
        tqdm.write("\n[INTERRUPTED] Stopping evaluation...")
        pbar.close()
        try:
            from tools.terminal_tool import cleanup_all_environments
            cleanup_all_environments()
        except Exception:
            # Best-effort cleanup; the terminal tool may be unavailable.
            pass
        _close_stream()
        return

    pbar.close()
    end_time = time.time()

    # --- Compute metrics ---
    valid = [r for r in results if r is not None]
    if not valid:
        print("Warning: No valid results.")
        # Nothing will be written any more; do not leak the open file.
        _close_stream()
        return

    total = len(valid)
    survived_total = sum(1 for r in valid if r.get("survived"))
    survival_rate = survived_total / total
    avg_score = sum(r.get("composite_score", 0) for r in valid) / total

    preset_results: Dict[str, List[Dict]] = defaultdict(list)
    for r in valid:
        preset_results[r["preset"]].append(r)

    eval_metrics = {
        "eval/survival_rate": survival_rate,
        "eval/avg_composite_score": avg_score,
        "eval/total_runs": total,
        "eval/survived_runs": survived_total,
        "eval/evaluation_time_seconds": end_time - start_time,
    }

    # Aggregate each preset once; reused for both metrics and the printout.
    # Every group has at least one run, so the divisions are safe.
    preset_stats = []
    for preset, items in sorted(preset_results.items()):
        ps = sum(1 for r in items if r.get("survived"))
        pt = len(items)
        pa = sum(r.get("composite_score", 0) for r in items) / pt
        preset_stats.append((preset, items, ps, pt, pa))

        key = preset.replace("-", "_")
        eval_metrics[f"eval/survival_rate_{key}"] = ps / pt
        eval_metrics[f"eval/avg_score_{key}"] = pa

    self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]

    # --- Print summary ---
    print(f"\n{'='*60}")
    print("YC-Bench Evaluation Results")
    print(f"{'='*60}")
    print(
        f"Overall survival rate: {survival_rate:.1%} "
        f"({survived_total}/{total})"
    )
    print(f"Average composite score: {avg_score:.4f}")
    print(f"Evaluation time: {end_time - start_time:.1f}s")

    print("\nPer-preset breakdown:")
    for preset, items, ps, pt, pa in preset_stats:
        print(f" {preset}: {ps}/{pt} survived avg_score={pa:.4f}")
        for r in items:
            status = "SURVIVED" if r.get("survived") else "BANKRUPT"
            funds = r.get("final_funds_usd", 0)
            print(
                f" seed={r['seed']} [{status}] "
                f"${funds:,.0f} "
                f"score={r.get('composite_score', 0):.3f}"
            )

    print(f"{'='*60}\n")

    # --- Log results ---
    # Full message transcripts stay in the streaming JSONL only.
    samples = [
        {k: v for k, v in r.items() if k != "messages"} for r in valid
    ]

    try:
        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
            generation_parameters={
                "temperature": self.config.agent_temperature,
                "max_tokens": self.config.max_token_length,
                "max_agent_turns": self.config.max_agent_turns,
            },
        )
    except Exception as e:
        print(f"Error logging results: {e}")

    # --- Cleanup (TB2 pattern) ---
    if _close_stream():
        print(f"Results saved to: {self._streaming_path}")

    try:
        from tools.terminal_tool import cleanup_all_environments
        cleanup_all_environments()
    except Exception:
        # Best-effort cleanup; the terminal tool may be unavailable.
        pass

    try:
        from environments.agent_loop import _tool_executor
        _tool_executor.shutdown(wait=False, cancel_futures=True)
    except Exception:
        # Best-effort: the executor may be missing or already shut down.
        pass
|
||||
|
||||
# =========================================================================
|
||||
# Wandb logging
|
||||
# =========================================================================
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """
    Log YC-Bench-specific metrics to wandb.

    Merges the (name, value) pairs stored in ``self.eval_metrics`` into
    ``wandb_metrics`` (a new dict when None is given), clears the stored
    pairs, and delegates to the parent implementation.
    """
    payload = {} if wandb_metrics is None else wandb_metrics
    payload.update(self.eval_metrics)
    self.eval_metrics = []
    await super().wandb_log(payload)
|
||||
|
||||
|
||||
# Script entry point: hand control to the environment's CLI (for example the
# `evaluate` subcommand shown in the module docstring).
if __name__ == "__main__":
    YCBenchEvalEnv.cli()
|
||||
670
environments/hermes_base_env.py
Normal file
670
environments/hermes_base_env.py
Normal file
@@ -0,0 +1,670 @@
|
||||
"""
|
||||
HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
|
||||
|
||||
Provides the Atropos integration plumbing that all hermes-agent environments share:
|
||||
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
|
||||
- Per-group toolset/distribution resolution
|
||||
- Agent loop orchestration via HermesAgentLoop
|
||||
- ToolContext creation for reward functions
|
||||
- ScoredDataGroup construction from ManagedServer state
|
||||
|
||||
Subclasses only need to implement:
|
||||
setup() -- Load dataset, initialize state
|
||||
get_next_item() -- Return the next item from the dataset
|
||||
format_prompt() -- Convert a dataset item into the user message
|
||||
compute_reward() -- Score the rollout (has full ToolContext access)
|
||||
evaluate() -- Periodic evaluation
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
# Ensure the hermes-agent repo root is on sys.path so that imports like
|
||||
# `from model_tools import ...` and `from environments.X import ...` work
|
||||
# regardless of where the script is invoked from.
|
||||
_repo_root = Path(__file__).resolve().parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import Field
|
||||
|
||||
# Load API keys from hermes-agent/.env so all environments can access them
|
||||
_env_path = _repo_root / ".env"
|
||||
if _env_path.exists():
|
||||
load_dotenv(dotenv_path=_env_path)
|
||||
|
||||
# Apply monkey patches for async-safe tool operation inside Atropos's event loop.
|
||||
# This patches SwerexModalEnvironment to use a background thread instead of
|
||||
# asyncio.run(), which would deadlock inside Atropos. Safe for normal CLI too.
|
||||
from environments.patches import apply_patches
|
||||
apply_patches()
|
||||
|
||||
from atroposlib.envs.base import (
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
ScoredDataGroup,
|
||||
ScoredDataItem,
|
||||
)
|
||||
from atroposlib.envs.server_handling.server_manager import (
|
||||
APIServerConfig,
|
||||
ServerBaseline,
|
||||
ServerManager,
|
||||
)
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
# Import hermes-agent toolset infrastructure
|
||||
from model_tools import get_tool_definitions
|
||||
from toolset_distributions import sample_toolsets_from_distribution
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HermesAgentEnvConfig(BaseEnvConfig):
    """
    Configuration for hermes-agent Atropos environments.

    Extends BaseEnvConfig with agent-specific settings for toolsets,
    terminal backend, dataset loading, and tool call parsing.

    Every field declared here has a default, so subclasses and YAML/CLI
    configs only need to override what differs.
    """

    # --- Toolset configuration ---
    # Mutually exclusive: use either enabled_toolsets OR distribution.
    # NOTE(review): no validator in this class enforces the exclusivity; in
    # HermesAgentBaseEnv._resolve_tools_for_group a truthy `distribution`
    # is checked first and `enabled_toolsets` is then ignored.
    enabled_toolsets: Optional[List[str]] = Field(
        default=None,
        description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
        "If None and distribution is also None, all available toolsets are enabled.",
    )
    disabled_toolsets: Optional[List[str]] = Field(
        default=None,
        description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
    )
    distribution: Optional[str] = Field(
        default=None,
        description="Name of a toolset distribution from toolset_distributions.py "
        "(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
        "Mutually exclusive with enabled_toolsets.",
    )

    # --- Agent loop configuration ---
    max_agent_turns: int = Field(
        default=30,
        description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
    )
    system_prompt: Optional[str] = Field(
        default=None,
        description="System prompt for the agent. Tools are handled via the tools= parameter, "
        "not embedded in the prompt text.",
    )
    agent_temperature: float = Field(
        default=1.0,
        description="Sampling temperature for agent generation during rollouts.",
    )

    # --- Terminal backend ---
    # These three values are exported as environment variables
    # (TERMINAL_ENV, TERMINAL_TIMEOUT, TERMINAL_LIFETIME_SECONDS) by
    # HermesAgentBaseEnv.__init__.
    terminal_backend: str = Field(
        default="local",
        description="Terminal backend: 'local', 'docker', 'modal', 'daytona', 'ssh', 'singularity'. "
        "Modal or Daytona recommended for production RL (cloud isolation per rollout).",
    )
    terminal_timeout: int = Field(
        default=120,
        description="Per-command timeout in seconds for terminal tool calls. "
        "Commands exceeding this are killed. Increase for tasks with long-running "
        "commands (compilation, pip install, etc.).",
    )
    terminal_lifetime: int = Field(
        default=3600,
        description="Sandbox inactivity lifetime in seconds. The cleanup thread kills "
        "sandboxes that have been idle longer than this. Must be longer than "
        "the longest gap between tool calls (e.g., waiting for LLM response).",
    )

    # --- Dataset ---
    dataset_name: Optional[str] = Field(
        default=None,
        description="HuggingFace dataset name. Optional if tasks are defined inline.",
    )
    dataset_split: str = Field(
        default="train",
        description="Dataset split to use.",
    )
    prompt_field: str = Field(
        default="prompt",
        description="Which field in the dataset contains the prompt.",
    )

    # --- Thread pool ---
    # Passed to environments.agent_loop.resize_tool_pool() in
    # HermesAgentBaseEnv.__init__.
    tool_pool_size: int = Field(
        default=128,
        description="Thread pool size for tool execution. Each concurrent task needs a "
        "thread for tool calls. Must be large enough for parallel evaluation. "
        "Too small = thread pool starvation.",
    )

    # --- Phase 2: Tool call parsing ---
    tool_call_parser: str = Field(
        default="hermes",
        description="Tool call parser name for Phase 2 (VLLM server type). "
        "Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
        "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
    )

    # --- Provider-specific parameters ---
    # Passed as extra_body to the OpenAI client's chat.completions.create() call.
    # Useful for OpenRouter provider preferences, transforms, route settings, etc.
    # Example YAML:
    #   extra_body:
    #     provider:
    #       ignore: ["DeepInfra", "Fireworks"]
    #       order: ["Together"]
    #     transforms: ["middle-out"]
    extra_body: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Extra body parameters passed to the OpenAI client's "
        "chat.completions.create(). Used for OpenRouter provider preferences, "
        "transforms, and other provider-specific settings.",
    )
|
||||
|
||||
|
||||
class HermesAgentBaseEnv(BaseEnv):
|
||||
"""
|
||||
Abstract base environment for hermes-agent Atropos integration.
|
||||
|
||||
Handles two modes of operation:
|
||||
- Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
|
||||
The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
|
||||
and reasoning extraction natively. DummyManagedServer provides placeholder
|
||||
tokens. Good for SFT data gen, verifier testing, evaluation.
|
||||
|
||||
- Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
|
||||
via /generate. Client-side tool call parser reconstructs structured tool_calls
|
||||
from raw output. Full RL training capability.
|
||||
|
||||
Subclasses must implement:
|
||||
setup() -- Load dataset, initialize state
|
||||
get_next_item() -- Return the next item to roll out
|
||||
format_prompt() -- Convert a dataset item into the user message string
|
||||
compute_reward() -- Score the rollout using ToolContext
|
||||
evaluate() -- Periodic evaluation
|
||||
"""
|
||||
|
||||
name: Optional[str] = "hermes-agent"
|
||||
env_config_cls = HermesAgentEnvConfig
|
||||
|
||||
def __init__(
    self,
    config: HermesAgentEnvConfig,
    server_configs: Union[ServerBaseline, List[APIServerConfig]],
    slurm=False,
    testing=False,
):
    """
    Initialize the base environment and apply process-wide tool settings.

    After delegating to ``BaseEnv.__init__`` this exports the terminal
    settings as environment variables, resizes the shared tool thread pool,
    installs the tool-call parser name on the server manager (when it has a
    ``tool_parser`` attribute), and initializes per-instance bookkeeping.

    Args:
        config: Environment configuration (terminal, toolset, parser fields).
        server_configs: Inference server configuration, forwarded to BaseEnv.
        slurm: Forwarded unchanged to BaseEnv.
        testing: Forwarded unchanged to BaseEnv.
    """
    super().__init__(config, server_configs, slurm, testing)

    # Set terminal environment variables so hermes tools pick them up.
    # These can all be overridden per-environment via config fields instead
    # of requiring users to set shell env vars.
    # NOTE(review): os.environ is process-global, so the last environment
    # constructed in a process wins. The nesting below (only TERMINAL_ENV is
    # conditional) was reconstructed from a whitespace-stripped source --
    # confirm against the repository.
    if config.terminal_backend:
        os.environ["TERMINAL_ENV"] = config.terminal_backend
    os.environ["TERMINAL_TIMEOUT"] = str(config.terminal_timeout)
    os.environ["TERMINAL_LIFETIME_SECONDS"] = str(config.terminal_lifetime)
    print(
        f"🖥️ Terminal: backend={config.terminal_backend}, "
        f"timeout={config.terminal_timeout}s, lifetime={config.terminal_lifetime}s"
    )

    # Resize the agent loop's thread pool for tool execution.
    # This must be large enough for the number of concurrent tasks
    # (e.g., 89 parallel TB2 eval tasks each need a thread for tool calls).
    # Imported here rather than at module top, as in the original.
    from environments.agent_loop import resize_tool_pool
    resize_tool_pool(config.tool_pool_size)

    # Set tool_parser on the ServerManager so ManagedServer uses it
    # for bidirectional tool call translation (raw text ↔ OpenAI tool_calls).
    if hasattr(self.server, 'tool_parser'):
        self.server.tool_parser = config.tool_call_parser
        print(f"🔧 Tool parser: {config.tool_call_parser}")

    # Current group's resolved tools (set in collect_trajectories)
    self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None

    # Tool error tracking for wandb logging; filled by collect_trajectory
    # and drained by wandb_log.
    self._tool_error_buffer: List[Dict[str, Any]] = []
|
||||
|
||||
# =========================================================================
|
||||
# Toolset resolution (per-group)
|
||||
# =========================================================================
|
||||
|
||||
def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
    """
    Work out which tool schemas a rollout group may use.

    A configured ``distribution`` is sampled; otherwise the explicit
    ``enabled_toolsets`` list is used (``None`` meaning every available
    toolset, which is logged as a warning). ``disabled_toolsets`` is passed
    along as a filter in both cases.

    Returns:
        ``(tool_schemas, valid_tool_names)`` where the second element is the
        set of function names found in the schemas.
    """
    cfg = self.config

    if not cfg.distribution:
        # Explicit list; None means "all available".
        selected_toolsets = cfg.enabled_toolsets
        if selected_toolsets is None:
            logger.warning(
                "enabled_toolsets is None -- loading ALL tools including messaging. "
                "Set explicit enabled_toolsets for RL training."
            )
    else:
        selected_toolsets = sample_toolsets_from_distribution(cfg.distribution)
        logger.info("Sampled toolsets from '%s': %s", cfg.distribution, selected_toolsets)

    tool_schemas = get_tool_definitions(
        enabled_toolsets=selected_toolsets,
        disabled_toolsets=cfg.disabled_toolsets,
        quiet_mode=True,
    )

    tool_names = set()
    if tool_schemas:
        tool_names = {schema["function"]["name"] for schema in tool_schemas}

    logger.info("Resolved %d tools for group: %s", len(tool_names), sorted(tool_names))
    return tool_schemas, tool_names
|
||||
|
||||
# =========================================================================
|
||||
# Server mode detection
|
||||
# =========================================================================
|
||||
|
||||
def _use_managed_server(self) -> bool:
    """
    Decide between ManagedServer (Phase 2) and direct server (Phase 1) mode.

    Only the first configured server is inspected. With no servers, or when
    that first server is an ``OpenAIServer``, direct mode is used and this
    returns False; any other server class selects managed mode.

    Returns:
        True for Phase 2 (ManagedServer), False for Phase 1 (direct server).
    """
    if not self.server.servers:
        return False

    first_server = self.server.servers[0]

    # Imported lazily, as in the original, right before the type check.
    from atroposlib.envs.server_handling.openai_server import OpenAIServer

    is_plain_openai = isinstance(first_server, OpenAIServer)
    return not is_plain_openai
|
||||
|
||||
# =========================================================================
|
||||
# Core Atropos integration
|
||||
# =========================================================================
|
||||
|
||||
async def collect_trajectories(
    self, item: Item
) -> Tuple[
    Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
    List[Item],
]:
    """
    Resolve the group's toolset once, then run the stock group collection.

    The resolved ``(tool_schemas, valid_tool_names)`` pair is stored on
    ``self._current_group_tools`` so that every ``collect_trajectory()``
    call made by the parent implementation for this group sees the same
    tools.

    NOTE(review): the pair lives on the instance, so if two groups were ever
    collected concurrently the later call would overwrite the earlier one's
    tools -- confirm groups do not overlap.
    """
    group_tools = self._resolve_tools_for_group()
    self._current_group_tools = group_tools

    # The parent implementation fans out to collect_trajectory().
    group_result = await super().collect_trajectories(item)
    return group_result
|
||||
|
||||
# =========================================================================
|
||||
# Wandb rollout display -- format trajectories nicely
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
    """
    Format a conversation's messages into a readable trajectory string
    for wandb rollout tables. Shows tool calls, tool results, and reasoning
    in a structured way instead of raw token decoding.

    Tolerates the shapes chat APIs actually produce: ``content`` may be
    ``None`` (typical for assistant messages that only carry tool calls),
    ``tool_calls`` may be ``None`` or missing, and tool-call ``arguments``
    may be an already-parsed object instead of a JSON string.

    Args:
        messages: Chat messages (dicts with at least a ``role`` key).

    Returns:
        The rendered sections joined by blank lines. Messages with an
        unrecognized role are omitted.
    """

    def _as_text(value: Any) -> str:
        # Normalize None / non-string payloads so len() and slicing are safe.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        try:
            return json.dumps(value, ensure_ascii=False)
        except (TypeError, ValueError):
            return str(value)

    def _clip(text: str, limit: int) -> str:
        # Truncate long text for display, marking the cut with an ellipsis.
        return text[:limit] + "..." if len(text) > limit else text

    parts: List[str] = []
    for msg in messages:
        role = msg.get("role", "unknown")
        content = _as_text(msg.get("content"))

        if role == "system":
            parts.append(f"[SYSTEM]\n{content}")

        elif role == "user":
            parts.append(f"[USER]\n{content}")

        elif role == "assistant":
            # Show (truncated) reasoning if present
            reasoning = _as_text(msg.get("reasoning_content"))
            if reasoning:
                parts.append(f"[ASSISTANT thinking]\n{_clip(reasoning, 300)}")

            # Show content
            if content:
                parts.append(f"[ASSISTANT]\n{content}")

            # Show tool calls; `or []` covers an explicit tool_calls=None
            for tc in msg.get("tool_calls") or []:
                func = tc.get("function") or {}
                name = func.get("name", "?")
                args = _as_text(func.get("arguments", "{}"))
                parts.append(f"[TOOL CALL] {name}({_clip(args, 200)})")

        elif role == "tool":
            parts.append(f"[TOOL RESULT] {_clip(content, 500)}")

    return "\n\n".join(parts)
|
||||
|
||||
async def add_rollouts_for_wandb(
    self,
    scored_data,
    item=None,
):
    """
    Record one group of rollouts for the wandb table, preferring the
    structured message view over raw token decoding.

    For each kept rollout the text is, in order of preference: the formatted
    ``messages`` entry, the tokenizer-decoded ``tokens`` entry, or the
    literal ``"(no data)"``. The oldest stored group is dropped once more
    than ``config.num_rollouts_to_keep`` groups are held.

    Args:
        scored_data: Mapping with ``scores`` and optionally ``messages`` /
            ``tokens`` lists indexed by rollout.
        item: Unused here; accepted for interface compatibility.
    """
    keep = self.config.num_rollouts_per_group_for_logging
    if keep == -1:
        # -1 means "log the whole group".
        keep = self.config.group_size

    scores = scored_data.get("scores", [])
    all_messages = scored_data.get("messages")
    all_tokens = scored_data.get("tokens")

    rows = []
    for idx in range(min(keep, len(scores))):
        conversation = None
        if all_messages and idx < len(all_messages):
            conversation = all_messages[idx]

        if conversation:
            rendered = self._format_trajectory_for_display(conversation)
        elif all_tokens and idx < len(all_tokens):
            rendered = self.tokenizer.decode(all_tokens[idx])
        else:
            rendered = "(no data)"

        rows.append((rendered, scores[idx]))

    self.rollouts_for_wandb.append(rows)
    if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
        self.rollouts_for_wandb.pop(0)
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """
    Add tool-error statistics to the metrics and forward them to the parent.

    When errors are buffered, their count and a newline-joined summary are
    added, each summary line is printed, and the buffer is reset. Otherwise
    the count is reported as 0.

    Args:
        wandb_metrics: Metrics dict to extend in place; a new dict is used
            when None.
    """
    if wandb_metrics is None:
        wandb_metrics = {}

    buffered = self._tool_error_buffer
    if not buffered:
        wandb_metrics["train/tool_errors_count"] = 0
    else:
        wandb_metrics["train/tool_errors_count"] = len(buffered)

        # Details go in as one string (tables can crash wandb on tmp cleanup).
        error_summaries = [
            f"[turn {err['turn']}] {err['tool']}({err['args'][:80]}) -> {err['error'][:150]}"
            for err in buffered
        ]
        wandb_metrics["train/tool_error_details"] = "\n".join(error_summaries)

        # Echo to stdout for immediate visibility.
        for summary in error_summaries:
            print(f" Tool Error: {summary}")

        self._tool_error_buffer = []

    await super().wandb_log(wandb_metrics)
|
||||
|
||||
async def collect_trajectory(
    self, item: Item
) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
    """
    Run a single rollout: agent loop + reward computation.

    This is called group_size times in parallel by collect_trajectories().
    Each call gets its own task_id for terminal/browser session isolation.

    Args:
        item: The dataset item to roll out; passed to format_prompt() and
            compute_reward().

    Returns:
        ``(scored_item, [])`` where ``scored_item`` is a dict with
        ``tokens``, ``masks``, ``scores`` and ``messages`` (plus
        ``advantages`` / ``ref_logprobs`` placeholders when the final
        managed node carries logprobs). ``tokens`` and ``masks`` always have
        the same length.
    """
    task_id = str(uuid.uuid4())

    # Get group-level tools (resolved once in collect_trajectories)
    if self._current_group_tools is None:
        # Fallback: resolve per-trajectory if called outside collect_trajectories
        tools, valid_names = self._resolve_tools_for_group()
    else:
        tools, valid_names = self._current_group_tools

    # Build initial messages
    messages: List[Dict[str, Any]] = []
    if self.config.system_prompt:
        messages.append({"role": "system", "content": self.config.system_prompt})
    messages.append({"role": "user", "content": self.format_prompt(item)})

    async def _run_agent(server) -> "AgentResult":
        # Single place that builds the loop so all three code paths below
        # are guaranteed to use identical settings.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=tools,
            valid_tool_names=valid_names,
            max_turns=self.config.max_agent_turns,
            task_id=task_id,
            temperature=self.config.agent_temperature,
            max_tokens=self.config.max_token_length,
            extra_body=self.config.extra_body,
        )
        return await agent.run(messages)

    # Run the agent loop
    result: AgentResult
    if self._use_managed_server():
        # Phase 2: ManagedServer with ToolCallTranslator -- exact tokens + logprobs
        # tool_parser is set on ServerManager in __init__ and passed through
        # to ManagedServer, which uses ToolCallTranslator for bidirectional
        # translation between raw text and OpenAI tool_calls.
        try:
            async with self.server.managed_server(
                tokenizer=self.tokenizer,
                preserve_think_blocks=bool(self.config.thinking_mode),
            ) as managed:
                result = await _run_agent(managed)
        except NotImplementedError:
            # DummyManagedServer not allowed -- fall back to Phase 1
            logger.warning(
                "ManagedServer not available (OpenAI server?). "
                "Falling back to direct server mode."
            )
            result = await _run_agent(self.server)
    else:
        # Phase 1: OpenAI server -- native tool_calls, placeholder tokens
        result = await _run_agent(self.server)

    # Skip reward computation if the agent loop produced no meaningful work
    # (e.g., API call failed on turn 1). No point spinning up a Modal sandbox
    # just to verify files that were never created.
    only_system_and_user = all(
        msg.get("role") in ("system", "user") for msg in result.messages
    )
    if result.turns_used == 0 or only_system_and_user:
        logger.warning(
            "Agent loop produced no output (turns=%d, msgs=%d). Skipping reward.",
            result.turns_used, len(result.messages),
        )
        reward = 0.0
    else:
        # Compute reward using ToolContext (gives verifier full tool access)
        ctx = ToolContext(task_id)
        try:
            reward = await self.compute_reward(item, result, ctx)
        except Exception as e:
            # Best-effort by design: a broken verifier scores 0.0 instead of
            # killing the group. logger.exception keeps the traceback so the
            # failure stays diagnosable.
            logger.exception("compute_reward failed: %s", e)
            reward = 0.0
        finally:
            ctx.cleanup()

    # Track tool errors for wandb logging
    if result.tool_errors:
        for err in result.tool_errors:
            self._tool_error_buffer.append({
                "turn": err.turn,
                "tool": err.tool_name,
                "args": err.arguments[:150],
                "error": err.error[:300],
                "result": err.tool_result[:300],
            })

    # Build ScoredDataItem from ManagedServer state
    # Phase 2: real tokens/masks/logprobs from SequenceNodes
    # Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
    nodes = (result.managed_state or {}).get("nodes", [])

    if nodes:
        # Phase 2 (or DummyManagedServer): use actual node data
        node = nodes[-1]  # Final sequence node = full trajectory
        scored_item: Dict[str, Any] = {
            "tokens": node.tokens,
            "masks": node.masked_tokens,
            "scores": reward,
        }

        # Include logprobs if available (Phase 2)
        if hasattr(node, "logprobs") and node.logprobs:
            scored_item["advantages"] = None  # Computed by trainer
            scored_item["ref_logprobs"] = None
    else:
        # Phase 1 with no managed state: create placeholder tokens
        # so the data pipeline doesn't break. These are NOT suitable
        # for training but allow process mode (SFT data gen) to work.
        # Tokenize the full conversation to get approximate tokens.
        full_text = "\n".join(
            msg.get("content", "") for msg in result.messages if msg.get("content")
        )
        if self.tokenizer:
            tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
        else:
            tokens = list(range(min(len(full_text) // 4, 128)))

        # Mask first token as prompt. With no tokens at all the mask must be
        # empty too: `[-100] + tokens[1:]` alone would yield a length-1 mask
        # for a length-0 token list.
        masks = [-100] + tokens[1:] if tokens else []

        scored_item = {
            "tokens": tokens,
            "masks": masks,
            "scores": reward,
        }

    # Always include messages for wandb rollout display and data logging
    scored_item["messages"] = result.messages

    return scored_item, []
|
||||
|
||||
# =========================================================================
|
||||
# Abstract methods -- subclasses must implement
|
||||
# =========================================================================
|
||||
|
||||
@abstractmethod
async def setup(self):
    """
    Load dataset, initialize state.

    Called once when the environment starts. Typical implementation:
        self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
        self.iter = 0

    Raises:
        NotImplementedError: Always, in this base implementation;
            subclasses must override.
    """
    raise NotImplementedError
|
||||
|
||||
@abstractmethod
async def get_next_item(self) -> Item:
    """
    Return the next item from the dataset for rollout.

    Called by the base env's main loop to get items for workers.
    Should cycle through the dataset.

    Returns:
        The next dataset item; it is later handed to format_prompt() and
        compute_reward() by collect_trajectory().

    Raises:
        NotImplementedError: Always, in this base implementation;
            subclasses must override.
    """
    raise NotImplementedError
|
||||
|
||||
@abstractmethod
def format_prompt(self, item: Item) -> str:
    """
    Convert a dataset item into the user message for the agent.

    collect_trajectory() uses the returned string as the ``content`` of the
    single ``user`` message that starts the conversation. Note that this
    method is synchronous, unlike the other abstract hooks.

    Args:
        item: Dataset item (dict, tuple, etc.)

    Returns:
        The prompt string to send to the agent

    Raises:
        NotImplementedError: Always, in this base implementation;
            subclasses must override.
    """
    raise NotImplementedError
|
||||
|
||||
@abstractmethod
async def compute_reward(
    self, item: Item, result: AgentResult, ctx: ToolContext
) -> float:
    """
    Score the rollout. Has full access to:
    - item: the original dataset item (ground truth, test commands, etc.)
    - result: AgentResult with full messages, turn count, reasoning, etc.
    - ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
      browser, vision...) scoped to this rollout's sandbox. Nothing
      is off-limits.

    collect_trajectory() only calls this when the agent produced output.
    Any exception raised here is caught there and scored as 0.0, and
    ``ctx.cleanup()`` is invoked by the caller afterwards, so
    implementations do not need to clean up the context themselves.

    Args:
        item: The dataset item that was rolled out
        result: The agent's rollout result
        ctx: ToolContext with full tool access for verification

    Returns:
        Reward float (typically 0.0 to 1.0, but any float is valid)

    Raises:
        NotImplementedError: Always, in this base implementation;
            subclasses must override.
    """
    raise NotImplementedError
|
||||
|
||||
@abstractmethod
async def evaluate(self, *args, **kwargs):
    """
    Periodic evaluation. Called every steps_per_eval steps.

    Typical implementation runs the agent on a held-out eval set
    and logs metrics via wandb/evaluate_log.

    Args:
        *args: Accepted for compatibility with the caller; unused here.
        **kwargs: Accepted for compatibility with the caller; unused here.

    Raises:
        NotImplementedError: Always, in this base implementation;
            subclasses must override.
    """
    raise NotImplementedError
|
||||
0
environments/hermes_swe_env/__init__.py
Normal file
0
environments/hermes_swe_env/__init__.py
Normal file
34
environments/hermes_swe_env/default.yaml
Normal file
34
environments/hermes_swe_env/default.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# SWE Environment -- Default Configuration
|
||||
#
|
||||
# SWE-bench style tasks with Modal sandboxes for cloud isolation.
|
||||
# Uses terminal + file + web toolsets.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||
# --config environments/hermes_swe_env/default.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file", "web"]
|
||||
max_agent_turns: 30
|
||||
max_token_length: 4096
|
||||
group_size: 4
|
||||
terminal_backend: "modal"
|
||||
tool_call_parser: "hermes"
|
||||
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
dataset_name: "bigcode/humanevalpack"
|
||||
dataset_split: "test"
|
||||
prompt_field: "prompt"
|
||||
steps_per_eval: 50
|
||||
total_steps: 500
|
||||
use_wandb: true
|
||||
wandb_name: "hermes-swe"
|
||||
system_prompt: >
|
||||
You are a skilled software engineer. You have access to a terminal,
|
||||
file tools, and web search. Use these tools to complete the coding task.
|
||||
Write clean, working code and verify it runs correctly before finishing.
|
||||
|
||||
openai:
|
||||
base_url: "http://localhost:8000/v1"
|
||||
model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
server_type: "openai"
|
||||
api_key: ""
|
||||
229
environments/hermes_swe_env/hermes_swe_env.py
Normal file
229
environments/hermes_swe_env/hermes_swe_env.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
|
||||
|
||||
A concrete environment for software engineering tasks where the model writes code
|
||||
and the reward function runs tests to verify correctness. Uses Modal terminal
|
||||
backend for cloud-isolated sandboxes per rollout.
|
||||
|
||||
The reward function uses ToolContext.terminal() to run test commands in the same
|
||||
Modal sandbox the model used during its agentic loop. All filesystem state from
|
||||
the model's tool calls is preserved for verification.
|
||||
|
||||
Usage:
|
||||
# Phase 1: OpenAI server type
|
||||
vllm serve YourModel --tool-parser hermes
|
||||
run-api
|
||||
python environments/hermes_swe_env/hermes_swe_env.py serve \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel \\
|
||||
--openai.server_type openai \\
|
||||
--env.dataset_name bigcode/humanevalpack \\
|
||||
--env.terminal_backend modal
|
||||
|
||||
# Phase 2: VLLM server type (full RL training)
|
||||
python environments/hermes_swe_env/hermes_swe_env.py serve \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel \\
|
||||
--openai.server_type vllm \\
|
||||
--env.tool_call_parser hermes \\
|
||||
--env.terminal_backend modal
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Ensure repo root is on sys.path for imports
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from atroposlib.envs.base import ScoredDataGroup
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HermesSweEnvConfig(HermesAgentEnvConfig):
    """Config with defaults for SWE-bench style tasks.

    Declares no fields of its own; it exists so the SWE environment has a
    distinct config type. The SWE-specific values are supplied where the
    config is instantiated (HermesSweEnv.config_init), not as class-level
    defaults here.
    """

    pass  # Inherits all fields, overrides defaults in config_init
|
||||
|
||||
|
||||
class HermesSweEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
SWE-bench style environment using Modal terminal backend.
|
||||
|
||||
The model gets a coding task, uses terminal + file + web tools to solve it,
|
||||
and the reward function runs tests in the same Modal sandbox to verify.
|
||||
|
||||
Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
|
||||
and customize format_prompt() and compute_reward() as needed.
|
||||
"""
|
||||
|
||||
name = "hermes-swe"
|
||||
env_config_cls = HermesSweEnvConfig
|
||||
|
||||
@classmethod
def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
    """
    Build the default SWE environment config and its server list.

    The environment uses the Modal terminal backend with the terminal, file
    and web toolsets; the single server entry points at a local
    OpenAI-compatible endpoint (Phase 1).

    Returns:
        ``(env_config, server_configs)``.
    """
    # The same model id serves as tokenizer name and served model name.
    model_id = "NousResearch/DeepHermes-3-Llama-3-3B-Preview"

    swe_system_prompt = (
        "You are a skilled software engineer. You have access to a terminal, "
        "file tools, and web search. Use these tools to complete the coding task. "
        "Write clean, working code and verify it runs correctly before finishing."
    )

    env_settings = dict(
        # Toolsets: terminal for running code, file for reading/writing, web for docs
        enabled_toolsets=["terminal", "file", "web"],
        disabled_toolsets=None,
        distribution=None,
        # Agent settings
        max_agent_turns=30,
        max_token_length=4096,
        agent_temperature=1.0,
        system_prompt=swe_system_prompt,
        # Modal backend for cloud-isolated sandboxes
        terminal_backend="modal",
        # Dataset -- override via CLI for your specific SWE dataset
        dataset_name="bigcode/humanevalpack",
        dataset_split="test",
        prompt_field="prompt",
        # Atropos settings
        group_size=4,
        tokenizer_name=model_id,
        tool_call_parser="hermes",
        steps_per_eval=50,
        total_steps=500,
        use_wandb=True,
        wandb_name="hermes-swe",
    )
    env_config = HermesSweEnvConfig(**env_settings)

    local_server = APIServerConfig(
        base_url="http://localhost:8000/v1",
        model_name=model_id,
        server_type="openai",  # Phase 1; switch to "vllm" for Phase 2
        api_key="",
    )

    return env_config, [local_server]
|
||||
|
||||
async def setup(self):
|
||||
"""Load the SWE dataset."""
|
||||
if self.config.dataset_name:
|
||||
self.dataset = load_dataset(
|
||||
self.config.dataset_name, split=self.config.dataset_split
|
||||
)
|
||||
else:
|
||||
# Placeholder if no dataset specified
|
||||
self.dataset = []
|
||||
self.iter = 0
|
||||
self.reward_buffer: List[float] = []
|
||||
|
||||
async def get_next_item(self) -> Dict[str, Any]:
    """
    Return the next dataset record, wrapping around at the end.

    Raises:
        ValueError: If no dataset is loaded (or it is empty).
    """
    records = self.dataset
    if not records:
        raise ValueError("No dataset loaded. Set dataset_name in config.")

    # Look the record up first so the cursor only advances on success.
    current = records[self.iter % len(records)]
    self.iter += 1
    return current
|
||||
|
||||
def format_prompt(self, item: Dict[str, Any]) -> str:
    """
    Build the user prompt for one SWE task.

    Starts from the field named by ``config.prompt_field`` (empty string if
    absent). If the item carries test information under ``test``, else
    ``test_code``, else ``tests``, it is appended under a
    "Tests to pass:" heading. Key presence decides the lookup order, so an
    empty ``test`` value does not fall through to the other keys.

    Override this in subclasses for different dataset formats.
    """
    # Resolve the fallback chain innermost-first: tests -> test_code -> test.
    tests = item.get("tests", "")
    tests = item.get("test_code", tests)
    tests = item.get("test", tests)

    prompt = item.get(self.config.prompt_field, "")
    if tests:
        prompt += f"\n\nTests to pass:\n{tests}"
    return prompt
|
||||
|
||||
async def compute_reward(
    self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
) -> float:
    """
    Score a rollout by running the item's tests through ``ctx.terminal``.

    Scoring:
        1.0 -- the item's test code ran with exit code 0
        0.1 -- tests missing or failing, but the ``find`` check lists at
               least one ``*.py`` file under /workspace newer than
               /tmp/.start_marker
        0.0 -- otherwise

    The chosen reward is also appended to ``self.reward_buffer``.

    Args:
        item: Dataset row; the first present key of 'test', 'test_code'
            or 'tests' supplies the test code.
        result: Agent rollout result (unused by this default scorer).
        ctx: Tool context whose ``terminal(command, timeout=...)`` returns
            a dict with 'exit_code' and, optionally, 'output'.

    Returns:
        The reward as a float.

    Override this in subclasses for more sophisticated reward logic.
    """
    import shlex  # local import keeps this block self-contained

    test_code = item.get("test", item.get("test_code", item.get("tests", "")))
    reward = 0.0

    if test_code:
        # Quote the whole program as ONE shell word. Interpolating it
        # between double quotes lets any '"', '$', backtick or backslash
        # in the test break -- or inject into -- the command line.
        program = shlex.quote(str(test_code))
        test_result = ctx.terminal(
            f"cd /workspace && python3 -c {program}", timeout=60
        )
        if test_result["exit_code"] == 0:
            reward = 1.0

    if reward != 1.0:
        # Partial credit: check if the model created any Python files
        file_check = ctx.terminal(
            "find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5"
        )
        listing = file_check.get("output") or ""  # tolerate a None output
        if file_check["exit_code"] == 0 and listing.strip():
            reward = 0.1

    self.reward_buffer.append(reward)
    return reward
|
||||
|
||||
async def evaluate(self, *args, **kwargs):
    """
    Placeholder evaluation hook.

    Performs no rollouts: it records a start and end timestamp and reports
    a single ``eval/placeholder`` metric of 0.0 through ``evaluate_log``.

    Override for dataset-specific evaluation logic.
    """
    began = time.time()
    finished = time.time()

    await self.evaluate_log(
        metrics={"eval/placeholder": 0.0},
        start_time=began,
        end_time=finished,
    )
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
    """Add SWE reward statistics to the metrics, then defer to the parent.

    When rewards have been buffered, ``train/avg_reward`` (mean reward) and
    ``train/pass_rate`` (share of rewards equal to 1.0) are written into
    the metrics dict and the buffer is reset.
    """
    metrics = {} if wandb_metrics is None else wandb_metrics

    rewards = self.reward_buffer
    if rewards:
        count = len(rewards)
        metrics["train/avg_reward"] = sum(rewards) / count
        metrics["train/pass_rate"] = rewards.count(1.0) / count
        self.reward_buffer = []

    await super().wandb_log(metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
HermesSweEnv.cli()
|
||||
204
environments/patches.py
Normal file
204
environments/patches.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
Monkey patches for making hermes-agent tools work inside async frameworks (Atropos).
|
||||
|
||||
Problem:
|
||||
Some tools use asyncio.run() internally (e.g., mini-swe-agent's Modal backend,
|
||||
web_extract). This crashes when called from inside Atropos's event loop because
|
||||
asyncio.run() can't be nested.
|
||||
|
||||
Solution:
|
||||
Replace the problematic methods with versions that use a dedicated background
|
||||
thread with its own event loop. The calling code sees the same sync interface --
|
||||
call a function, get a result -- but internally the async work happens on a
|
||||
separate thread that doesn't conflict with Atropos's loop.
|
||||
|
||||
These patches are safe for normal CLI use too: when there's no running event
|
||||
loop, the behavior is identical (the background thread approach works regardless).
|
||||
|
||||
What gets patched:
|
||||
- SwerexModalEnvironment.__init__ -- creates Modal deployment on a background thread
|
||||
- SwerexModalEnvironment.execute -- runs commands on the same background thread
|
||||
- SwerexModalEnvironment.stop -- stops deployment on the background thread
|
||||
|
||||
Usage:
|
||||
Call apply_patches() once at import time (done automatically by hermes_base_env.py).
|
||||
This is idempotent -- calling it multiple times is safe.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_patches_applied = False
|
||||
|
||||
|
||||
class _AsyncWorker:
|
||||
"""
|
||||
A dedicated background thread with its own event loop.
|
||||
|
||||
Allows sync code to submit async coroutines and block for results,
|
||||
even when called from inside another running event loop. Used to
|
||||
bridge sync tool interfaces with async backends (Modal, SWE-ReX).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loop: asyncio.AbstractEventLoop = None
|
||||
self._thread: threading.Thread = None
|
||||
self._started = threading.Event()
|
||||
|
||||
def start(self):
|
||||
"""Start the background event loop thread."""
|
||||
self._thread = threading.Thread(target=self._run_loop, daemon=True)
|
||||
self._thread.start()
|
||||
self._started.wait(timeout=30)
|
||||
|
||||
def _run_loop(self):
|
||||
"""Background thread entry point -- runs the event loop forever."""
|
||||
self._loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(self._loop)
|
||||
self._started.set()
|
||||
self._loop.run_forever()
|
||||
|
||||
def run_coroutine(self, coro, timeout=600):
|
||||
"""
|
||||
Submit a coroutine to the background loop and block until it completes.
|
||||
|
||||
Safe to call from any thread, including threads that already have
|
||||
a running event loop.
|
||||
"""
|
||||
if self._loop is None or self._loop.is_closed():
|
||||
raise RuntimeError("AsyncWorker loop is not running")
|
||||
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
|
||||
return future.result(timeout=timeout)
|
||||
|
||||
def stop(self):
|
||||
"""Stop the background event loop and join the thread."""
|
||||
if self._loop and self._loop.is_running():
|
||||
self._loop.call_soon_threadsafe(self._loop.stop)
|
||||
if self._thread:
|
||||
self._thread.join(timeout=10)
|
||||
|
||||
|
||||
def _patch_swerex_modal():
    """
    Monkey patch SwerexModalEnvironment to use a background thread event loop
    instead of asyncio.run(). This makes it safe to call from inside Atropos's
    async event loop.

    Replaces ``__init__``, ``execute`` and ``stop`` on the class. The patched
    methods keep the same signatures; the async work is run on a per-instance
    ``_AsyncWorker``. If mini-swe-agent or swe-rex cannot be imported, the
    function logs at debug level and returns without patching anything.
    """
    try:
        from minisweagent.environments.extra.swerex_modal import (
            SwerexModalEnvironment,
            SwerexModalEnvironmentConfig,
        )
        from swerex.deployment.modal import ModalDeployment
        from swerex.runtime.abstract import Command as RexCommand
    except ImportError:
        # mini-swe-agent or swe-rex not installed -- nothing to patch
        logger.debug("mini-swe-agent Modal backend not available, skipping patch")
        return

    def _patched_init(self, **kwargs):
        """Patched __init__: creates Modal deployment on a background thread."""
        self.config = SwerexModalEnvironmentConfig(**kwargs)

        # Start a dedicated event loop thread for all Modal async operations
        self._worker = _AsyncWorker()
        self._worker.start()

        try:
            # Pre-build a modal.Image with pip fix for Modal's legacy image builder.
            # Modal requires `python -m pip` to work during image build, but some
            # task images (e.g., TBLite's broken-python) have intentionally broken pip.
            # Fix: remove stale pip dist-info and reinstall via ensurepip before Modal
            # tries to use it. This is a no-op for images where pip already works.
            import modal as _modal

            image_spec = self.config.image
            if isinstance(image_spec, str):
                image_spec = _modal.Image.from_registry(
                    image_spec,
                    setup_dockerfile_commands=[
                        "RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
                        "python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
                    ],
                )

            # Create AND start the deployment entirely on the worker's loop/thread
            # so all gRPC channels and async state are bound to that loop
            async def _create_and_start():
                deployment = ModalDeployment(
                    image=image_spec,
                    startup_timeout=self.config.startup_timeout,
                    runtime_timeout=self.config.runtime_timeout,
                    deployment_timeout=self.config.deployment_timeout,
                    install_pipx=self.config.install_pipx,
                    modal_sandbox_kwargs=self.config.modal_sandbox_kwargs,
                )
                await deployment.start()
                return deployment

            self.deployment = self._worker.run_coroutine(_create_and_start())
        except BaseException:
            # Construction failed, so the caller never gets an object to
            # stop(); shut the worker thread down here instead of leaking it.
            self._worker.stop()
            raise

    def _patched_execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
        """Patched execute: runs commands on the background thread's loop.

        Returns a dict with 'output' (the command's stdout; stderr is merged
        into it) and 'returncode' (the exit code).
        """
        async def _do_execute():
            return await self.deployment.runtime.execute(
                RexCommand(
                    command=command,
                    shell=True,
                    check=False,
                    # Falsy arguments fall back to the configured defaults.
                    cwd=cwd or self.config.cwd,
                    timeout=timeout or self.config.timeout,
                    merge_output_streams=True,
                    env=self.config.env if self.config.env else None,
                )
            )

        output = self._worker.run_coroutine(_do_execute())
        return {
            "output": output.stdout,
            "returncode": output.exit_code,
        }

    def _patched_stop(self):
        """Patched stop: stops deployment on the background thread, then stops the thread."""
        try:
            self._worker.run_coroutine(
                asyncio.wait_for(self.deployment.stop(), timeout=10),
                timeout=15,
            )
        except Exception as exc:
            # Best effort: teardown must not raise, but a deployment that
            # failed to stop should be visible in the logs.
            logger.warning("Failed to stop Modal deployment cleanly: %s", exc)
        finally:
            self._worker.stop()

    # Apply the patches
    SwerexModalEnvironment.__init__ = _patched_init
    SwerexModalEnvironment.execute = _patched_execute
    SwerexModalEnvironment.stop = _patched_stop

    logger.debug("Patched SwerexModalEnvironment for async-safe operation")
|
||||
|
||||
|
||||
def apply_patches():
    """
    Install every monkey patch required for Atropos compatibility.

    Idempotent: a module-level flag records that the patches were applied,
    so repeated calls return without doing anything.
    """
    global _patches_applied

    if not _patches_applied:
        _patch_swerex_modal()
        _patches_applied = True
|
||||
0
environments/terminal_test_env/__init__.py
Normal file
0
environments/terminal_test_env/__init__.py
Normal file
34
environments/terminal_test_env/default.yaml
Normal file
34
environments/terminal_test_env/default.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Terminal Test Environment -- Default Configuration
|
||||
#
|
||||
# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
|
||||
# Uses Modal terminal backend and OpenRouter (Claude) for inference.
|
||||
# API keys loaded from ~/hermes-agent/.env
|
||||
#
|
||||
# Usage:
|
||||
# run-api
|
||||
# python environments/terminal_test_env/terminal_test_env.py serve \
|
||||
# --config environments/terminal_test_env/default.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 10
|
||||
max_token_length: 2048
|
||||
group_size: 3
|
||||
total_steps: 3
|
||||
steps_per_eval: 3
|
||||
terminal_backend: "modal"
|
||||
tool_call_parser: "hermes"
|
||||
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
ensure_scores_are_not_same: false
|
||||
use_wandb: false
|
||||
system_prompt: >
|
||||
You are a helpful assistant with access to a terminal and file tools.
|
||||
Complete the user's request by using the available tools.
|
||||
Be precise and follow instructions exactly.
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
292
environments/terminal_test_env/terminal_test_env.py
Normal file
292
environments/terminal_test_env/terminal_test_env.py
Normal file
@@ -0,0 +1,292 @@
|
||||
"""
|
||||
TerminalTestEnv -- Simple Test Environment for Validating the Stack
|
||||
|
||||
A self-contained environment with inline tasks (no external dataset needed).
|
||||
Each task asks the model to create a file at a known path with specific content.
|
||||
The reward verifier cats the file and checks if the content matches.
|
||||
|
||||
Enables only terminal + file toolsets. Uses Modal terminal backend with
|
||||
OpenRouter (Claude) by default.
|
||||
|
||||
Training tasks (3):
|
||||
1. Create ~/greeting.txt with "Hello from Hermes Agent"
|
||||
2. Create ~/count.txt with numbers 1-5, one per line
|
||||
3. Create ~/answer.txt with the result of 123 + 456
|
||||
|
||||
Eval task (1):
|
||||
1. Create ~/result.txt with the result of 6 * 7
|
||||
|
||||
Usage:
|
||||
# Start Atropos API server
|
||||
run-api
|
||||
|
||||
# Run environment (uses OpenRouter + Modal by default)
|
||||
python environments/terminal_test_env/terminal_test_env.py serve
|
||||
|
||||
# Process mode (no run-api needed, saves to JSONL)
|
||||
python environments/terminal_test_env/terminal_test_env.py process \\
|
||||
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Ensure repo root is on sys.path for imports
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from atroposlib.envs.base import ScoredDataGroup
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Inline task definitions -- no external dataset needed
|
||||
# =============================================================================
|
||||
|
||||
# Each task names the prompt shown to the model, the file the verifier
# reads back, and the content that file is expected to hold.
TRAIN_TASKS = [
    dict(
        prompt="Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
        verify_path="~/greeting.txt",
        expected_content="Hello from Hermes Agent",
    ),
    dict(
        prompt="Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
        verify_path="~/count.txt",
        expected_content="1\n2\n3\n4\n5",
    ),
    dict(
        prompt="Create a file at ~/answer.txt containing the result of 123 + 456",
        verify_path="~/answer.txt",
        expected_content="579",
    ),
]

# Task used only by evaluate().
EVAL_TASKS = [
    dict(
        prompt="Create a file at ~/result.txt containing the result of 6 * 7",
        verify_path="~/result.txt",
        expected_content="42",
    ),
]
|
||||
|
||||
|
||||
class TerminalTestEnvConfig(HermesAgentEnvConfig):
    """Configuration type for the terminal test environment.

    Declares no fields of its own: everything is inherited from
    ``HermesAgentEnvConfig``, and the test-specific default values are
    supplied by ``config_init``.
    """
|
||||
|
||||
|
||||
class TerminalTestEnv(HermesAgentBaseEnv):
    """
    Simple test environment with inline file-creation tasks.

    All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
    The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
    against the expected string. Same verifier logic for all tasks.

    This environment is designed to validate the full stack end-to-end:
    - Agent loop executes tool calls (terminal/file)
    - ToolContext provides terminal access to the reward function
    - Reward function verifies file content via cat
    - Scored data flows through the Atropos pipeline
    """

    name = "terminal-test"
    env_config_cls = TerminalTestEnvConfig

    @classmethod
    def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
        """
        Default configuration for the terminal test environment.

        Uses the Modal terminal backend and an OpenRouter-hosted Claude
        model for inference. The API key is read from the
        OPENROUTER_API_KEY environment variable (empty string if unset).

        Returns:
            Tuple of (environment config, list of API server configs).
        """
        env_config = TerminalTestEnvConfig(
            # Terminal + file tools only
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings
            max_agent_turns=10,  # Simple tasks, don't need many turns
            max_token_length=16000,
            agent_temperature=1.0,
            system_prompt=(
                "You are a helpful assistant with access to a terminal and file tools. "
                "Complete the user's request by using the available tools. "
                "Be precise and follow instructions exactly."
            ),
            # Modal terminal backend for cloud-isolated sandboxes per rollout
            terminal_backend="modal",
            # Atropos settings
            group_size=3,  # 3 rollouts per group
            tokenizer_name="NousResearch/q-30b-t-h45-e1",
            tool_call_parser="hermes",
            steps_per_eval=3,  # Eval after all 3 steps
            total_steps=3,  # 3 groups total (1 group per step)
            use_wandb=True,
            wandb_name="terminal-test",
            ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
            # No external dataset
            dataset_name=None,
        )

        # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
        server_configs = [
            APIServerConfig(
                base_url="https://openrouter.ai/api/v1",
                model_name="anthropic/claude-opus-4.6",
                server_type="openai",
                api_key=os.getenv("OPENROUTER_API_KEY", ""),
                health_check=False,  # OpenRouter doesn't have a /health endpoint
            )
        ]

        return env_config, server_configs

    async def setup(self):
        """Initialize inline task lists, the task cursor and the reward log."""
        # Copy the module-level lists so per-instance changes stay local.
        self.train_tasks = list(TRAIN_TASKS)
        self.eval_tasks = list(EVAL_TASKS)
        self.iter = 0
        # Track reward stats for wandb logging
        self.reward_buffer: List[float] = []

    async def get_next_item(self) -> Dict[str, str]:
        """Cycle through training tasks, wrapping around at the end."""
        item = self.train_tasks[self.iter % len(self.train_tasks)]
        self.iter += 1
        return item

    def format_prompt(self, item: Dict[str, str]) -> str:
        """The prompt is directly in the task item."""
        return item["prompt"]

    async def compute_reward(
        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Verify by cat-ing the expected file path and checking content matches.
        Same verifier for all tasks -- they all write a file at a known path.

        Both the file content and the expected content are stripped of
        surrounding whitespace before comparison. The score is also
        appended to ``self.reward_buffer``.

        Scoring:
            1.0 = exact match
            0.5 = expected content is present but has extra stuff
            0.0 = file doesn't exist or content doesn't match
        """
        # The path is left unquoted on purpose: it starts with '~', which
        # the shell only expands when unquoted.
        verify_result = ctx.terminal(f"cat {item['verify_path']}")

        if verify_result["exit_code"] != 0:
            # File doesn't exist or can't be read
            score = 0.0
        else:
            actual = (verify_result.get("output") or "").strip()  # tolerate None
            expected = item["expected_content"].strip()
            if actual == expected:
                score = 1.0
            elif expected in actual:
                # Partial credit: expected content is present but has extra stuff
                score = 0.5
            else:
                score = 0.0

        self.reward_buffer.append(score)
        return score

    async def evaluate(self, *args, **kwargs):
        """
        Collect one single-turn completion per eval task and log the samples.

        For speed this does NOT run the agent loop or any tools, and the
        responses are not scored, so no accuracy is reported: the only
        metric logged is ``eval/num_samples``. Each sample records the
        prompt, the model response (or an ``ERROR: ...`` string when the
        request failed) and the expected content.
        """
        start_time = time.time()
        samples = []

        for eval_item in self.eval_tasks:
            try:
                # For eval, we do a simple single-turn completion (not full agent loop)
                # to keep eval fast. The agent loop is tested via training.
                completion = await self.server.chat_completion(
                    messages=[
                        {"role": "system", "content": self.config.system_prompt or ""},
                        {"role": "user", "content": eval_item["prompt"]},
                    ],
                    n=1,
                    max_tokens=self.config.max_token_length,
                    temperature=0.0,
                    split="eval",
                )
                if completion.choices:
                    # content may be None (e.g. a tool-call-only reply).
                    response_content = completion.choices[0].message.content or ""
                else:
                    response_content = ""
            except Exception as e:
                # One failed request must not abort the whole evaluation.
                logger.error("Eval failed for item: %s", e)
                response_content = f"ERROR: {e}"

            samples.append(
                {
                    "prompt": eval_item["prompt"],
                    "response": response_content,
                    "expected": eval_item["expected_content"],
                }
            )

        end_time = time.time()

        eval_metrics = {
            "eval/num_samples": len(self.eval_tasks),
        }

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """Log training metrics including reward stats and accuracy.

        When rewards have been buffered, adds the mean reward, the share of
        exact matches (1.0), the share of partial matches (0.5) and the
        rollout count, then resets the buffer before calling the parent.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            total = len(self.reward_buffer)
            correct = sum(1 for r in self.reward_buffer if r == 1.0)
            partial = sum(1 for r in self.reward_buffer if r == 0.5)

            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
            wandb_metrics["train/accuracy"] = correct / total
            wandb_metrics["train/partial_match_rate"] = partial / total
            wandb_metrics["train/total_rollouts"] = total
            self.reward_buffer = []

        await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
TerminalTestEnv.cli()
|
||||
120
environments/tool_call_parsers/__init__.py
Normal file
120
environments/tool_call_parsers/__init__.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""
|
||||
Tool Call Parser Registry
|
||||
|
||||
Client-side parsers that extract structured tool_calls from raw model output text.
|
||||
Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
|
||||
raw text without tool call parsing.
|
||||
|
||||
Each parser is a standalone reimplementation of the corresponding VLLM parser's
|
||||
non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
|
||||
(re, json, uuid) and openai types.
|
||||
|
||||
Usage:
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
content, tool_calls = parser.parse(raw_model_output)
|
||||
# content = text with tool call markup stripped
|
||||
# tool_calls = list of ChatCompletionMessageToolCall objects, or None
|
||||
"""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type alias for parser return value
|
||||
ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
|
||||
|
||||
|
||||
class ToolCallParser(ABC):
    """
    Abstract interface implemented by every tool call parser.

    A concrete parser understands one model family's raw output format
    and turns it into structured tool calls.
    """

    @abstractmethod
    def parse(self, text: str) -> ParseResult:
        """
        Extract tool calls from raw model output.

        Args:
            text: Raw decoded text from the model's completion

        Returns:
            A ``(content, tool_calls)`` pair:
            - content is the text with tool call markup removed (the
              message 'content' field), or None when the output consisted
              entirely of tool calls
            - tool_calls is a list of ChatCompletionMessageToolCall
              objects, or None when no tool calls were found
        """
        raise NotImplementedError
|
||||
|
||||
|
||||
# Global parser registry: name -> parser class
|
||||
PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}


def register_parser(name: str):
    """
    Build a class decorator that records a parser class under ``name``.

    The decorated class is stored in ``PARSER_REGISTRY`` (replacing any
    earlier entry with the same name) and returned unchanged, so the
    decorator can be stacked to register several aliases.

    Usage:
        @register_parser("hermes")
        class HermesToolCallParser(ToolCallParser):
            ...
    """

    def _register(parser_cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
        PARSER_REGISTRY.update({name: parser_cls})
        return parser_cls

    return _register
|
||||
|
||||
|
||||
def get_parser(name: str) -> ToolCallParser:
    """
    Look up a registered parser class and return a new instance of it.

    Args:
        name: Parser name (e.g., "hermes", "mistral", "llama3_json")

    Returns:
        Instantiated parser

    Raises:
        KeyError: If parser name is not found in registry; the message
            lists the available parser names in sorted order.
    """
    if name in PARSER_REGISTRY:
        parser_cls = PARSER_REGISTRY[name]
        return parser_cls()

    available = sorted(PARSER_REGISTRY.keys())
    raise KeyError(
        f"Tool call parser '{name}' not found. Available parsers: {available}"
    )
|
||||
|
||||
|
||||
def list_parsers() -> List[str]:
    """Return the names of all registered parsers in sorted order."""
    names = list(PARSER_REGISTRY)
    names.sort()
    return names
|
||||
|
||||
|
||||
# Import all parser modules to trigger registration via @register_parser decorators
|
||||
# Each module registers itself when imported
|
||||
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401
|
||||
72
environments/tool_call_parsers/deepseek_v3_1_parser.py
Normal file
72
environments/tool_call_parsers/deepseek_v3_1_parser.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
DeepSeek V3.1 tool call parser.
|
||||
|
||||
Similar to V3 but with a slightly different format:
|
||||
<|tool▁call▁begin|>function_name<|tool▁sep|>arguments<|tool▁call▁end|>
|
||||
|
||||
Note: V3 has type+name before the separator, V3.1 has name before and args after.
|
||||
|
||||
Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("deepseek_v3_1")
@register_parser("deepseek_v31")
class DeepSeekV31ToolCallParser(ToolCallParser):
    """
    Parser for DeepSeek V3.1 tool calls.

    Slightly different regex than V3: function_name comes before the separator,
    arguments come after (no type field, no json code block wrapper).

    Token delimiters are matched with either the ASCII bar '|' or the
    fullwidth bar U+FF5C. A bare '|' inside a regular expression means
    alternation, so the delimiters must never be written unescaped in the
    pattern -- doing so stops the pattern from matching a whole tool call.
    NOTE(review): DeepSeek's tokenizer is understood to emit the fullwidth
    form; confirm against the model's chat template.
    """

    # Marker that opens the tool-calls section (fullwidth-bar spelling).
    START_TOKEN = "<\uff5ctool▁calls▁begin\uff5c>"

    # One delimiter bar: ASCII '|' or fullwidth U+FF5C.
    _BAR = r"[|\uff5c]"

    # Locates the section start regardless of which bar variant is used.
    _START_RE = re.compile(rf"<{_BAR}tool▁calls▁begin{_BAR}>")

    # Regex captures: function_name, function_arguments
    PATTERN = re.compile(
        rf"<{_BAR}tool▁call▁begin{_BAR}>(?P<function_name>.*?)"
        rf"<{_BAR}tool▁sep{_BAR}>(?P<function_arguments>.*?)"
        rf"<{_BAR}tool▁call▁end{_BAR}>",
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract every tool call from ``text``.

        Args:
            text: Raw decoded text from the model's completion

        Returns:
            ``(text, None)`` when there is no tool-calls section, no
            complete tool call, or building the call objects fails.
            Otherwise ``(content, tool_calls)``, where content is the
            stripped text before the section start (None if empty).
        """
        section = self._START_RE.search(text)
        if section is None:
            return text, None

        try:
            tool_calls: List[ChatCompletionMessageToolCall] = [
                ChatCompletionMessageToolCall(
                    id=f"call_{uuid.uuid4().hex[:8]}",
                    type="function",
                    function=Function(
                        name=match.group("function_name").strip(),
                        arguments=match.group("function_arguments").strip(),
                    ),
                )
                for match in self.PATTERN.finditer(text)
            ]
        except Exception:
            # Best effort: malformed output is handed back as plain text,
            # but the failure is recorded for debugging.
            import logging

            logging.getLogger(__name__).debug(
                "Error parsing DeepSeek V3.1 tool calls", exc_info=True
            )
            return text, None

        if not tool_calls:
            return text, None

        content = text[: section.start()].strip()
        return content if content else None, tool_calls
|
||||
89
environments/tool_call_parsers/deepseek_v3_parser.py
Normal file
89
environments/tool_call_parsers/deepseek_v3_parser.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""
|
||||
DeepSeek V3 tool call parser.
|
||||
|
||||
Format uses special unicode tokens:
|
||||
<|tool▁calls▁begin|>
|
||||
<|tool▁call▁begin|>type<|tool▁sep|>function_name
|
||||
```json
|
||||
{"arg": "value"}
|
||||
```
|
||||
<|tool▁call▁end|>
|
||||
<|tool▁calls▁end|>
|
||||
|
||||
Fixes Issue #989: Support for multiple simultaneous tool calls.
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
import logging
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@register_parser("deepseek_v3")
class DeepSeekV3ToolCallParser(ToolCallParser):
    """
    Parser for DeepSeek V3 tool calls.

    Extracts the function name and JSON arguments from each
    ``tool call begin ... tool call end`` block (the type field is matched
    but not used). All blocks are captured, so outputs containing several
    tool calls yield several entries.

    Token delimiters are matched with either the ASCII bar '|' or the
    fullwidth bar U+FF5C. A bare '|' inside a regular expression means
    alternation, so the delimiters must never be written unescaped in the
    pattern -- doing so makes every match lack the named groups.
    NOTE(review): DeepSeek's tokenizer is understood to emit the fullwidth
    form; confirm against the model's chat template.
    """

    # Marker that opens the tool-calls section (fullwidth-bar spelling).
    START_TOKEN = "<\uff5ctool▁calls▁begin\uff5c>"

    # One delimiter bar: ASCII '|' or fullwidth U+FF5C.
    _BAR = r"[|\uff5c]"

    # Locates the section start regardless of which bar variant is used.
    _START_RE = re.compile(rf"<{_BAR}tool▁calls▁begin{_BAR}>")

    # \s* instead of literal \n for increased robustness against
    # variations in model formatting (Issue #989).
    PATTERN = re.compile(
        rf"<{_BAR}tool▁call▁begin{_BAR}>(?P<type>.*?)"
        rf"<{_BAR}tool▁sep{_BAR}>(?P<function_name>.*?)"
        r"\s*```json\s*(?P<function_arguments>.*?)\s*```\s*"
        rf"<{_BAR}tool▁call▁end{_BAR}>",
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Parses the input text and extracts all available tool calls.

        Args:
            text: Raw decoded text from the model's completion

        Returns:
            ``(text, None)`` when there is no tool-calls section, no
            complete tool call, or building the call objects fails.
            Otherwise ``(content, tool_calls)``, where content is the
            stripped text before the section start (None if empty).
        """
        section = self._START_RE.search(text)
        if section is None:
            return text, None

        try:
            # finditer captures ALL tool calls in the sequence
            tool_calls: List[ChatCompletionMessageToolCall] = [
                ChatCompletionMessageToolCall(
                    id=f"call_{uuid.uuid4().hex[:8]}",
                    type="function",
                    function=Function(
                        name=match.group("function_name").strip(),
                        arguments=match.group("function_arguments").strip(),
                    ),
                )
                for match in self.PATTERN.finditer(text)
            ]
        except Exception as e:
            logger.error("Error parsing DeepSeek V3 tool calls: %s", e)
            return text, None

        if not tool_calls:
            return text, None

        # Content is text before the first tool call block
        content = text[: section.start()].strip()
        return content if content else None, tool_calls
|
||||
109
environments/tool_call_parsers/glm45_parser.py
Normal file
109
environments/tool_call_parsers/glm45_parser.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
GLM 4.5 (GLM-4-MoE) tool call parser.
|
||||
|
||||
Format uses custom arg_key/arg_value tags rather than standard JSON:
|
||||
<tool_call>function_name
|
||||
<arg_key>param1</arg_key><arg_value>value1</arg_value>
|
||||
<arg_key>param2</arg_key><arg_value>value2</arg_value>
|
||||
</tool_call>
|
||||
|
||||
Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
|
||||
|
||||
Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
def _deserialize_value(value: str) -> Any:
|
||||
"""
|
||||
Try to deserialize a string value to its native Python type.
|
||||
Attempts json.loads, then ast.literal_eval, then returns raw string.
|
||||
"""
|
||||
try:
|
||||
return json.loads(value)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
try:
|
||||
return ast.literal_eval(value)
|
||||
except (ValueError, SyntaxError, TypeError):
|
||||
pass
|
||||
|
||||
return value
|
||||
|
||||
|
||||
@register_parser("glm45")
class Glm45ToolCallParser(ToolCallParser):
    """
    Parser for GLM 4.5 (GLM-4-MoE) tool calls.

    Each call is a <tool_call>...</tool_call> block whose first line is the
    function name, followed by <arg_key>/<arg_value> pairs instead of a JSON
    argument object.
    """

    START_TOKEN = "<tool_call>"

    # One complete <tool_call> block.
    FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
    # Group 1: function name (rest of the first line); group 2: everything after it.
    FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
    # Group 1: argument key; group 2: argument value text.
    FUNC_ARG_REGEX = re.compile(
        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract GLM-style tool calls from ``text``.

        Returns ``(content, tool_calls)`` where ``content`` is the stripped
        text before the first START_TOKEN (``None`` when empty), or
        ``(text, None)`` when no call is found or any error occurs.
        """
        if self.START_TOKEN not in text:
            return text, None

        try:
            tool_calls: List[ChatCompletionMessageToolCall] = []

            for block in self.FUNC_CALL_REGEX.findall(text):
                detail = self.FUNC_DETAIL_REGEX.search(block)
                if not detail:
                    # Block does not have the expected name/arguments layout.
                    continue

                func_name = detail.group(1).strip()
                args_section = detail.group(2)

                # The arguments group may be empty or missing (subclasses
                # can install a regex where it is optional).
                raw_pairs = (
                    self.FUNC_ARG_REGEX.findall(args_section) if args_section else []
                )
                # Later duplicates of a key overwrite earlier ones.
                arg_dict: Dict[str, Any] = {
                    raw_key.strip(): _deserialize_value(raw_value.strip())
                    for raw_key, raw_value in raw_pairs
                }

                function = Function(
                    name=func_name,
                    arguments=json.dumps(arg_dict, ensure_ascii=False),
                )
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=f"call_{uuid.uuid4().hex[:8]}",
                        type="function",
                        function=function,
                    )
                )

            if not tool_calls:
                return text, None

            leading = text[: text.find(self.START_TOKEN)].strip()
            return leading if leading else None, tool_calls

        except Exception:
            return text, None
|
||||
35
environments/tool_call_parsers/glm47_parser.py
Normal file
35
environments/tool_call_parsers/glm47_parser.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
GLM 4.7 tool call parser.
|
||||
|
||||
Same as GLM 4.5 but with slightly different regex patterns.
|
||||
The tool_call tags may wrap differently and arg parsing handles
|
||||
newlines between key/value pairs.
|
||||
|
||||
Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, register_parser
|
||||
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
|
||||
|
||||
|
||||
@register_parser("glm47")
class Glm47ToolCallParser(Glm45ToolCallParser):
    """
    Parser for GLM 4.7 tool calls.

    Reuses the GLM 4.5 parsing logic and only replaces two of its regular
    expressions on the instance.
    """

    def __init__(self):
        super().__init__()

        # Detail regex: group 1 is the text after <tool_call> up to the first
        # <arg_key> (or up to the closing tag when there is none); group 2 is
        # the optional remainder starting at that <arg_key>.
        detail_pattern = r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>"

        # Argument regex: between </arg_key> and <arg_value> accept any run of
        # whitespace and/or literal backslash-n sequences.
        arg_pattern = (
            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>"
        )

        self.FUNC_DETAIL_REGEX = re.compile(detail_pattern, re.DOTALL)
        self.FUNC_ARG_REGEX = re.compile(arg_pattern, re.DOTALL)
|
||||
73
environments/tool_call_parsers/hermes_parser.py
Normal file
73
environments/tool_call_parsers/hermes_parser.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""
|
||||
Hermes tool call parser.
|
||||
|
||||
Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
|
||||
Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
    """
    Parser for Hermes-format tool calls.

    A call is a <tool_call>...</tool_call> element whose body is a JSON
    object with a "name" key and an optional "arguments" key. A <tool_call>
    that is never closed (truncated generation) is accepted as well.
    """

    # First alternative: properly closed tag (group 1).
    # Second alternative: unclosed tag running to end of text (group 2).
    PATTERN = re.compile(
        r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract Hermes tool calls from ``text``.

        Returns ``(content, tool_calls)`` where ``content`` is the stripped
        text before the first <tool_call> (``None`` when empty), or
        ``(text, None)`` when nothing is found or any call fails to decode.
        """
        if "<tool_call>" not in text:
            return text, None

        try:
            tool_calls: List[ChatCompletionMessageToolCall] = []

            # findall yields (closed_body, unclosed_body); exactly one of the
            # two is non-empty for a given match.
            for closed_body, unclosed_body in self.PATTERN.findall(text):
                payload = closed_body or unclosed_body
                if not payload.strip():
                    continue

                call = json.loads(payload)
                serialized_args = json.dumps(
                    call.get("arguments", {}), ensure_ascii=False
                )
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=f"call_{uuid.uuid4().hex[:8]}",
                        type="function",
                        function=Function(
                            name=call["name"],
                            arguments=serialized_args,
                        ),
                    )
                )

            if not tool_calls:
                return text, None

            # Everything ahead of the first tag is ordinary message content.
            leading = text[: text.find("<tool_call>")].strip()
            return leading if leading else None, tool_calls

        except Exception:
            return text, None
|
||||
93
environments/tool_call_parsers/kimi_k2_parser.py
Normal file
93
environments/tool_call_parsers/kimi_k2_parser.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Kimi K2 tool call parser.
|
||||
|
||||
Format:
|
||||
<|tool_calls_section_begin|>
|
||||
<|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
|
||||
<|tool_calls_section_end|>
|
||||
|
||||
The function_id format is typically "functions.func_name:index" or "func_name:index".
|
||||
|
||||
Based on VLLM's KimiK2ToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("kimi_k2")
class KimiK2ToolCallParser(ToolCallParser):
    """
    Parser for Kimi K2 tool calls.

    A section-begin token introduces the tool calls; each call is wrapped in
    its own begin/end tokens. The call id carries the function name: the part
    before the first colon, after its last dot.
    """

    # Both the plural and the singular spelling of the section token are accepted.
    START_TOKENS = [
        "<|tool_calls_section_begin|>",
        "<|tool_call_section_begin|>",
    ]

    # Named groups: tool_call_id (e.g. "functions.get_weather:0") and
    # function_arguments (text up to the call-end token, never spanning
    # another call-begin token).
    PATTERN = re.compile(
        r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
        r"<\|tool_call_argument_begin\|>\s*"
        r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
        r"<\|tool_call_end\|>",
        re.DOTALL,
    )

    def parse(self, text: str) -> ParseResult:
        """
        Extract Kimi K2 tool calls from ``text``.

        Returns ``(content, tool_calls)`` where ``content`` is the stripped
        text before the earliest section-begin token (``None`` when empty),
        or ``(text, None)`` when no call is found or an error occurs.
        """
        if not any(token in text for token in self.START_TOKENS):
            return text, None

        try:
            tool_calls: List[ChatCompletionMessageToolCall] = []

            for found in self.PATTERN.finditer(text):
                call_id = found.group("tool_call_id")
                raw_arguments = found.group("function_arguments")

                # "functions.get_weather:0" -> "get_weather"
                function_name = call_id.partition(":")[0].rpartition(".")[2]

                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=call_id,  # original id format is kept as-is
                        type="function",
                        function=Function(
                            name=function_name,
                            arguments=raw_arguments.strip(),
                        ),
                    )
                )

            if not tool_calls:
                return text, None

            # Cut the content at whichever section token appears first.
            positions = (text.find(token) for token in self.START_TOKENS)
            section_start = min(
                (pos for pos in positions if pos >= 0), default=len(text)
            )

            leading = text[:section_start].strip()
            return leading if leading else None, tool_calls

        except Exception:
            return text, None
|
||||
96
environments/tool_call_parsers/llama_parser.py
Normal file
96
environments/tool_call_parsers/llama_parser.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""
|
||||
Llama 3.x / 4 tool call parser.
|
||||
|
||||
Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
|
||||
May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
|
||||
by content or semicolons.
|
||||
|
||||
Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("llama3_json")
@register_parser("llama4_json")
class LlamaToolCallParser(ToolCallParser):
    """
    Parser for Llama 3.x and 4 JSON-format tool calls.

    Scans the text for JSON objects that contain "name" plus "arguments"
    (or "parameters") and turns each one into a tool call. Objects are
    decoded with json.JSONDecoder.raw_decode, so they may be embedded in
    arbitrary surrounding text and may directly follow one another.
    """

    BOT_TOKEN = "<|python_tag|>"

    # Every "{" is a candidate start of a JSON object.
    JSON_START = re.compile(r"\{")

    def parse(self, text: str) -> ParseResult:
        """
        Extract Llama JSON tool calls from ``text``.

        Returns:
            ``(content, tool_calls)`` where ``content`` is the stripped text
            before the first tool call (or before BOT_TOKEN when that token
            precedes it), ``None`` when empty. ``(text, None)`` when no tool
            call is found or an unexpected error occurs.
        """
        # Quick check: need either the bot token or a JSON brace.
        if self.BOT_TOKEN not in text and "{" not in text:
            return text, None

        try:
            decoder = json.JSONDecoder()
            tool_calls: List[ChatCompletionMessageToolCall] = []
            first_call_start = -1  # offset of the first brace that produced a call
            consumed_until = 0  # exclusive end offset of the last decoded object

            for match in self.JSON_START.finditer(text):
                start = match.start()
                # Skip braces inside an object that was already decoded. The
                # end offset is exclusive, so a brace located exactly at it
                # opens a new, adjacent object and must NOT be skipped.
                if start < consumed_until:
                    continue

                try:
                    # Decoding in place avoids copying the tail of the text
                    # for every candidate brace; the returned end offset is
                    # absolute.
                    obj, consumed_until = decoder.raw_decode(text, start)

                    # Must have "name" and either "arguments" or "parameters".
                    name = obj.get("name")
                    args = obj.get("arguments", obj.get("parameters"))
                    if not name or args is None:
                        continue

                    # Normalize arguments to a JSON string.
                    if not isinstance(args, str):
                        args = json.dumps(args, ensure_ascii=False)

                    tool_calls.append(
                        ChatCompletionMessageToolCall(
                            id=f"call_{uuid.uuid4().hex[:8]}",
                            type="function",
                            function=Function(name=name, arguments=args),
                        )
                    )
                    if first_call_start < 0:
                        first_call_start = start
                except (json.JSONDecodeError, KeyError, ValueError):
                    # Not valid JSON here, or not a usable call: try the next brace.
                    continue

            if not tool_calls:
                return text, None

            # Content ends where the first real tool call starts, not at the
            # first brace in the text (which may belong to ordinary prose).
            # A bot token ahead of that call belongs to the call as well.
            content_end = first_call_start
            bot_index = text.find(self.BOT_TOKEN)
            if 0 <= bot_index < content_end:
                content_end = bot_index

            content = text[:content_end].strip()
            return content if content else None, tool_calls

        except Exception:
            return text, None
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user