# Mirror of https://github.com/NousResearch/hermes-agent.git
# Synced 2026-04-28 15:01:34 +08:00
#!/usr/bin/env python3
"""Build the Hermes Skills Index — a centralized JSON catalog of all skills.

This script crawls every skill source (skills.sh, GitHub taps, official,
clawhub, lobehub, claude-marketplace) and writes a JSON index with resolved
GitHub paths. The index is served as a static file on the docs site so that
`hermes skills search/install` can use it without hitting the GitHub API.

Usage:
    # Local (uses gh CLI or GITHUB_TOKEN for auth)
    python scripts/build_skills_index.py

    # CI (set GITHUB_TOKEN as secret)
    GITHUB_TOKEN=ghp_... python scripts/build_skills_index.py

Output: website/static/api/skills-index.json
"""
|
import json
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
|
|||
|
|
# Allow importing from repo root
|
|||
|
|
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|||
|
|
sys.path.insert(0, REPO_ROOT)
|
|||
|
|
|
|||
|
|
# Ensure HERMES_HOME is set (needed by tools/skills_hub.py imports)
|
|||
|
|
os.environ.setdefault("HERMES_HOME", os.path.join(os.path.expanduser("~"), ".hermes"))
|
|||
|
|
|
|||
|
|
from tools.skills_hub import (
|
|||
|
|
GitHubAuth,
|
|||
|
|
GitHubSource,
|
|||
|
|
SkillsShSource,
|
|||
|
|
OptionalSkillSource,
|
|||
|
|
WellKnownSkillSource,
|
|||
|
|
ClawHubSource,
|
|||
|
|
ClaudeMarketplaceSource,
|
|||
|
|
LobeHubSource,
|
|||
|
|
SkillMeta,
|
|||
|
|
)
|
|||
|
|
import httpx
|
|||
|
|
|
|||
|
|
# Destination of the generated index; served as a static file by the docs site.
OUTPUT_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")
# Schema version embedded in the index; bump when the JSON layout changes.
INDEX_VERSION = 1
|
|
|||
|
|
def _meta_to_dict(meta: SkillMeta) -> dict:
|
|||
|
|
"""Convert a SkillMeta to a serializable dict."""
|
|||
|
|
return {
|
|||
|
|
"name": meta.name,
|
|||
|
|
"description": meta.description,
|
|||
|
|
"source": meta.source,
|
|||
|
|
"identifier": meta.identifier,
|
|||
|
|
"trust_level": meta.trust_level,
|
|||
|
|
"repo": meta.repo or "",
|
|||
|
|
"path": meta.path or "",
|
|||
|
|
"tags": meta.tags or [],
|
|||
|
|
"extra": meta.extra or {},
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def crawl_source(source, source_name: str, limit: int) -> list:
|
|||
|
|
"""Crawl a single source and return skill dicts."""
|
|||
|
|
print(f" Crawling {source_name}...", flush=True)
|
|||
|
|
start = time.time()
|
|||
|
|
try:
|
|||
|
|
results = source.search("", limit=limit)
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f" Error crawling {source_name}: {e}", file=sys.stderr)
|
|||
|
|
return []
|
|||
|
|
skills = [_meta_to_dict(m) for m in results]
|
|||
|
|
elapsed = time.time() - start
|
|||
|
|
print(f" {source_name}: {len(skills)} skills ({elapsed:.1f}s)", flush=True)
|
|||
|
|
return skills
|
|||
|
|
|
|||
|
|
|
|||
|
|
def crawl_skills_sh(source: SkillsShSource) -> list:
    """Crawl skills.sh by fanning out over a battery of popular queries.

    skills.sh exposes only a search endpoint, so broad coverage is
    approximated by running many common-topic queries and de-duplicating
    the hits by identifier (first occurrence wins).
    """
    print(" Crawling skills.sh (popular queries)...", flush=True)
    t0 = time.time()

    queries = [
        "",  # featured
        "react", "python", "web", "api", "database", "docker",
        "testing", "scraping", "design", "typescript", "git",
        "aws", "security", "data", "ml", "ai", "devops",
        "frontend", "backend", "mobile", "cli", "documentation",
        "kubernetes", "terraform", "rust", "go", "java",
    ]

    unique: dict[str, dict] = {}
    for term in queries:
        try:
            for meta in source.search(term, limit=50):
                entry = _meta_to_dict(meta)
                # First hit for an identifier wins; later queries can't overwrite.
                unique.setdefault(entry["identifier"], entry)
        except Exception as exc:
            print(f" Warning: skills.sh search '{term}' failed: {exc}",
                  file=sys.stderr)

    took = time.time() - t0
    print(f" skills.sh: {len(unique)} unique skills ({took:.1f}s)",
          flush=True)
    return list(unique.values())
|
|||
|
|
|
|||
|
|
def _fetch_repo_tree(repo: str, auth: GitHubAuth) -> list:
    """Return the recursive git tree entries for *repo*, or [] on any failure.

    Costs two GitHub API calls: one to discover the default branch, one for
    the recursive tree listing. A truncated tree (very large repo) is
    treated as a failure, since it could silently omit SKILL.md files.
    """
    headers = auth.get_headers()
    try:
        meta_resp = httpx.get(
            f"https://api.github.com/repos/{repo}",
            headers=headers, timeout=15, follow_redirects=True,
        )
        if meta_resp.status_code != 200:
            return []
        default_branch = meta_resp.json().get("default_branch", "main")

        tree_resp = httpx.get(
            f"https://api.github.com/repos/{repo}/git/trees/{default_branch}",
            params={"recursive": "1"},
            headers=headers, timeout=30, follow_redirects=True,
        )
        if tree_resp.status_code != 200:
            return []
        payload = tree_resp.json()
        if payload.get("truncated"):
            # Incomplete listing could hide skills — treat as unusable.
            return []
        return payload.get("tree", [])
    except Exception:
        # Resolution is best-effort: swallow network/JSON errors and skip.
        return []
|
|||
|
|
|
|||
|
|
def batch_resolve_paths(skills: list, auth: GitHubAuth) -> list:
    """Resolve GitHub paths for skills.sh entries using batch tree lookups.

    Instead of resolving each skill individually (N×M API calls), we:
    1. Group skills by repo
    2. Fetch one tree per repo (2 API calls per repo)
    3. Find all SKILL.md files in the tree
    4. Match skills to their resolved paths

    Mutates matched entries in place (adds "resolved_github_id") and
    returns the same `skills` list that was passed in.
    """
    # Filter to skills.sh entries that need resolution
    skills_sh = [s for s in skills if s["source"] in ("skills.sh", "skills-sh")]
    if not skills_sh:
        return skills

    print(f" Resolving paths for {len(skills_sh)} skills.sh entries...",
          flush=True)
    start = time.time()

    # Group by repo
    by_repo: dict[str, list] = defaultdict(list)
    for s in skills_sh:
        repo = s.get("repo", "")
        if repo:
            # Entries without a repo can never be resolved; skip them.
            by_repo[repo].append(s)

    print(f" {len(by_repo)} unique repos to scan", flush=True)

    resolved_count = 0

    # Fetch trees in parallel (up to 6 concurrent)
    def _resolve_repo(repo: str, entries: list):
        # Resolve all entries of one repo against its git tree.
        # Returns the number of entries that got a resolved_github_id.
        tree = _fetch_repo_tree(repo, auth)
        if not tree:
            # Tree fetch failed (or repo is empty/truncated) — nothing resolvable.
            return 0

        # Find all SKILL.md paths in this repo
        skill_paths = {}  # skill_dir_name -> full_path
        for item in tree:
            if item.get("type") != "blob":
                continue
            path = item.get("path", "")
            if path.endswith("/SKILL.md"):
                # Index by the lowercased directory name containing SKILL.md.
                skill_dir = path[: -len("/SKILL.md")]
                dir_name = skill_dir.split("/")[-1]
                skill_paths[dir_name.lower()] = f"{repo}/{skill_dir}"

                # Also check SKILL.md frontmatter name if we can match by path
                # For now, just index by directory name
            elif path == "SKILL.md":
                # Root-level SKILL.md
                # NOTE(review): the "_root_" key can never equal a skill
                # name/path/token in the exact-match loop below, so root-level
                # skills resolve only via the fuzzy pass (if at all) — confirm
                # this is intended.
                skill_paths["_root_"] = f"{repo}"

        count = 0
        for entry in entries:
            # Try to match the skill's name/path to a tree entry
            skill_name = entry.get("name", "").lower()
            skill_path = entry.get("path", "").lower()
            identifier = entry.get("identifier", "")

            # Extract the skill token from the identifier
            # e.g. "skills-sh/d4vinci/scrapling/scrapling-official" -> "scrapling-official"
            parts = identifier.replace("skills-sh/", "").replace("skills.sh/", "")
            skill_token = parts.split("/")[-1].lower() if "/" in parts else ""

            # Try matching in order of likelihood
            for candidate in [skill_token, skill_name, skill_path]:
                if not candidate:
                    continue
                matched = skill_paths.get(candidate)
                if matched:
                    # Mutates the shared dict from `skills` — callers see this.
                    entry["resolved_github_id"] = matched
                    count += 1
                    break
            else:
                # No exact hit for any candidate (for/else: loop not broken).
                # Try fuzzy: skill_token with common transformations
                for tree_name, tree_path in skill_paths.items():
                    if (skill_token and (
                        tree_name.replace("-", "") == skill_token.replace("-", "")
                        or skill_token in tree_name
                        or tree_name in skill_token
                    )):
                        entry["resolved_github_id"] = tree_path
                        count += 1
                        break

        return count

    with ThreadPoolExecutor(max_workers=6) as pool:
        futures = {
            pool.submit(_resolve_repo, repo, entries): repo
            for repo, entries in by_repo.items()
        }
        for future in as_completed(futures):
            try:
                resolved_count += future.result()
            except Exception as e:
                # One repo failing must not abort resolution of the others.
                repo = futures[future]
                print(f" Warning: {repo}: {e}", file=sys.stderr)

    elapsed = time.time() - start
    print(f" Resolved {resolved_count}/{len(skills_sh)} paths ({elapsed:.1f}s)",
          flush=True)
    return skills
|
|||
|
|
|
|||
|
|
def main():
    """Crawl all skill sources, merge/dedupe, and write the JSON index.

    Side effects: prints progress to stdout/stderr and writes the compact
    index JSON to OUTPUT_PATH (creating parent directories as needed).
    """
    print("Building Hermes Skills Index...", flush=True)
    overall_start = time.time()

    auth = GitHubAuth()
    print(f"GitHub auth: {auth.auth_method()}")
    if auth.auth_method() == "anonymous":
        print("WARNING: No GitHub authentication — rate limit is 60/hr. "
              "Set GITHUB_TOKEN for better results.", file=sys.stderr)

    # skills.sh is crawled separately (query fan-out) from the other sources.
    skills_sh_source = SkillsShSource(auth=auth)
    sources = {
        "official": OptionalSkillSource(),
        "well-known": WellKnownSkillSource(),
        "github": GitHubSource(auth=auth),
        "clawhub": ClawHubSource(),
        "claude-marketplace": ClaudeMarketplaceSource(auth=auth),
        "lobehub": LobeHubSource(),
    }

    all_skills: list[dict] = []

    # Crawl skills.sh
    all_skills.extend(crawl_skills_sh(skills_sh_source))

    # Crawl other sources in parallel
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = {}
        for name, source in sources.items():
            futures[pool.submit(crawl_source, source, name, 500)] = name
        for future in as_completed(futures):
            try:
                all_skills.extend(future.result())
            except Exception as e:
                print(f" Error: {e}", file=sys.stderr)

    # Batch resolve GitHub paths for skills.sh entries
    all_skills = batch_resolve_paths(all_skills, auth)

    # Deduplicate by identifier
    # (first occurrence wins; skills.sh was crawled first, so it has priority)
    seen: dict[str, dict] = {}
    for skill in all_skills:
        key = skill["identifier"]
        if key not in seen:
            seen[key] = skill
    deduped = list(seen.values())

    # Sort
    # Stable sort: by source priority first, then alphabetically by name.
    source_order = {"official": 0, "skills-sh": 1, "skills.sh": 1,
                    "github": 2, "well-known": 3, "clawhub": 4,
                    "claude-marketplace": 5, "lobehub": 6}
    deduped.sort(key=lambda s: (source_order.get(s["source"], 99), s["name"]))

    # Build index
    index = {
        "version": INDEX_VERSION,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "skill_count": len(deduped),
        "skills": deduped,
    }

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        # Compact separators keep the served static file small.
        json.dump(index, f, separators=(",", ":"), ensure_ascii=False)

    elapsed = time.time() - overall_start
    file_size = os.path.getsize(OUTPUT_PATH)
    print(f"\nDone! {len(deduped)} skills indexed in {elapsed:.0f}s")
    print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)")

    # Per-source summary, largest source first, with resolution counts.
    from collections import Counter
    by_source = Counter(s["source"] for s in deduped)
    for src, count in sorted(by_source.items(), key=lambda x: -x[1]):
        resolved = sum(1 for s in deduped
                       if s["source"] == src and s.get("resolved_github_id"))
        extra = f" ({resolved} resolved)" if resolved else ""
        print(f" {src}: {count}{extra}")
|
|||
|
|
|
|||
|
|
# Script entry point (no-op when imported as a module).
if __name__ == "__main__":
    main()