Add file manipulation tools and enhance setup scripts

- Introduced file manipulation capabilities in `model_tools.py`, including functions for reading, writing, patching, and searching files.
- Added a new `file` toolset in `toolsets.py` and updated distributions to include file tools.
- Enhanced `setup-hermes.sh` and `install.sh` scripts to check for and optionally install `ripgrep` for faster file searching.
- Implemented a new `file_operations.py` module to encapsulate file operations using shell commands.
- Updated `doctor.py` and `install.ps1` to check for `ripgrep` and provide installation guidance if not found.
- Added fuzzy matching and patch parsing capabilities to improve file manipulation accuracy and flexibility.
This commit is contained in:
teknium1
2026-02-05 03:49:46 -08:00
parent 5c3105b437
commit 533c064269
12 changed files with 2549 additions and 13 deletions

View File

@@ -33,6 +33,9 @@ from typing import Dict, Any, List, Optional, Tuple
from tools.web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
from tools.terminal_tool import terminal_tool, check_terminal_requirements, TERMINAL_TOOL_DESCRIPTION, cleanup_vm
# File manipulation tools (read, write, patch, search)
from tools.file_tools import read_file_tool, write_file_tool, patch_tool, search_tool
from tools import check_file_requirements
# Hecate/MorphCloud terminal tool (cloud VMs) - available as alternative backend
from tools.terminal_hecate import terminal_hecate_tool, check_hecate_requirements, TERMINAL_HECATE_DESCRIPTION
from tools.vision_tools import vision_analyze_tool, check_vision_requirements
@@ -155,6 +158,13 @@ TOOLSET_REQUIREMENTS = {
"rl_list_runs", "rl_test_inference",
],
},
"file": {
"name": "File Operations (read, write, patch, search)",
"env_vars": [], # Uses terminal backend, no additional requirements
"check_fn": check_file_requirements,
"setup_url": None,
"tools": ["read_file", "write_file", "patch", "search"],
},
}
@@ -675,6 +685,163 @@ def get_rl_tool_definitions() -> List[Dict[str, Any]]:
]
def get_file_tool_definitions() -> List[Dict[str, Any]]:
    """
    Build the OpenAI-format tool definitions for the file manipulation tools.

    Four tools are described, in this order: ``read_file``, ``write_file``,
    ``patch`` and ``search``. File tools operate via the terminal backend and
    support any environment (local, docker, singularity, ssh, modal).

    Returns:
        List[Dict]: One function-tool definition per file tool, compatible
        with the OpenAI API. A fresh structure is built on every call.
    """
    # --- read_file: paginated reads -----------------------------------------
    read_file_properties = {
        "path": {
            "type": "string",
            "description": "Path to the file to read (absolute or relative)"
        },
        "offset": {
            "type": "integer",
            "description": "Line number to start reading from (1-indexed, default: 1)",
            "default": 1,
            "minimum": 1
        },
        "limit": {
            "type": "integer",
            "description": "Maximum number of lines to read (default: 500, max: 2000)",
            "default": 500,
            "maximum": 2000
        }
    }

    # --- write_file: whole-file writes --------------------------------------
    write_file_properties = {
        "path": {
            "type": "string",
            "description": "Path to the file to write (will be created if doesn't exist)"
        },
        "content": {
            "type": "string",
            "description": "Content to write to the file"
        }
    }

    # --- patch: string replacement or V4A patches ---------------------------
    patch_properties = {
        "mode": {
            "type": "string",
            "enum": ["replace", "patch"],
            "description": "Edit mode: 'replace' for string replacement, 'patch' for V4A patch format",
            "default": "replace"
        },
        "path": {
            "type": "string",
            "description": "File path (required for 'replace' mode)"
        },
        "old_string": {
            "type": "string",
            "description": "Text to find and replace (required for 'replace' mode). Must be unique in file unless replace_all=true"
        },
        "new_string": {
            "type": "string",
            "description": "Replacement text (required for 'replace' mode)"
        },
        "replace_all": {
            "type": "boolean",
            "description": "Replace all occurrences instead of requiring unique match (default: false)",
            "default": False
        },
        "patch": {
            "type": "string",
            "description": "V4A format patch content (required for 'patch' mode). Format: *** Begin Patch / *** Update File: path / @@ context @@ / -removed / +added / *** End Patch"
        }
    }

    # --- search: content search or file-name search -------------------------
    search_properties = {
        "pattern": {
            "type": "string",
            "description": "For target='content': regex pattern to search for. For target='files': glob pattern (e.g., '*.py', '*config*')"
        },
        "target": {
            "type": "string",
            "enum": ["content", "files"],
            "description": "Search mode: 'content' searches inside files, 'files' searches for files by name",
            "default": "content"
        },
        "path": {
            "type": "string",
            "description": "Directory or file to search in (default: current directory)",
            "default": "."
        },
        "file_glob": {
            "type": "string",
            "description": "Filter files by pattern when target='content' (e.g., '*.py' to only search Python files)"
        },
        "limit": {
            "type": "integer",
            "description": "Maximum number of results (default: 50)",
            "default": 50
        },
        "offset": {
            "type": "integer",
            "description": "Skip first N results for pagination (default: 0)",
            "default": 0
        },
        "output_mode": {
            "type": "string",
            "enum": ["content", "files_only", "count"],
            "description": "For target='content': 'content' shows matches, 'files_only' shows file paths, 'count' shows match counts per file",
            "default": "content"
        },
        "context": {
            "type": "integer",
            "description": "Lines of context around matches (only for target='content', output_mode='content')",
            "default": 0
        }
    }

    return [
        _file_tool_schema(
            "read_file",
            "Read a file with pagination support. Returns content with line numbers in 'LINE_NUM|CONTENT' format. For binary files (images), returns base64-encoded data. If file not found, suggests similar filenames.",
            read_file_properties,
            ["path"],
        ),
        _file_tool_schema(
            "write_file",
            "Write content to a file. Creates parent directories automatically. Returns bytes written and lint check results for supported languages.",
            write_file_properties,
            ["path", "content"],
        ),
        _file_tool_schema(
            "patch",
            "Modify files using either simple string replacement or V4A patch format. Mode 'replace' does find-and-replace with fuzzy matching. Mode 'patch' applies multi-file changes using V4A format (*** Begin/End Patch). Auto-runs syntax checks on modified files.",
            patch_properties,
            ["mode"],
        ),
        _file_tool_schema(
            "search",
            "Search for content in files or search for files by name. Use target='content' to search inside files (like grep), or target='files' to find files by name pattern (like glob/find). Results sorted by modification time (newest first).",
            search_properties,
            ["pattern"],
        ),
    ]


def _file_tool_schema(
    name: str,
    description: str,
    properties: Dict[str, Any],
    required: List[str],
) -> Dict[str, Any]:
    """Wrap one tool's name, description and parameters in the function-tool envelope."""
    # Key order is kept stable so serialized output matches the hand-written form.
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required
            }
        }
    }
def get_all_tool_names() -> List[str]:
"""
Get the names of all available tools across all toolsets.
@@ -733,6 +900,12 @@ def get_all_tool_names() -> List[str]:
"rl_list_runs", "rl_test_inference"
])
# File manipulation tools (use terminal backend)
if check_file_requirements():
tool_names.extend([
"read_file", "write_file", "patch", "search"
])
return tool_names
@@ -782,6 +955,11 @@ def get_toolset_for_tool(tool_name: str) -> str:
"rl_stop_training": "rl_tools",
"rl_get_results": "rl_tools",
"rl_list_runs": "rl_tools",
# File manipulation tools
"read_file": "file_tools",
"write_file": "file_tools",
"patch": "file_tools",
"search": "file_tools",
}
return toolset_mapping.get(tool_name, "unknown")
@@ -864,6 +1042,11 @@ def get_tool_definitions(
for tool in get_rl_tool_definitions():
all_available_tools_map[tool["function"]["name"]] = tool
# File manipulation tools (use terminal backend)
if check_file_requirements():
for tool in get_file_tool_definitions():
all_available_tools_map[tool["function"]["name"]] = tool
# Determine which tools to include based on toolsets
tools_to_include = set()
@@ -899,7 +1082,8 @@ def get_tool_definitions(
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs", "rl_test_inference"
]
],
"file_tools": ["read_file", "write_file", "patch", "search"]
}
legacy_tools = legacy_map.get(toolset_name, [])
tools_to_include.update(legacy_tools)
@@ -951,7 +1135,8 @@ def get_tool_definitions(
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs", "rl_test_inference"
]
],
"file_tools": ["read_file", "write_file", "patch", "search"]
}
legacy_tools = legacy_map.get(toolset_name, [])
tools_to_include.difference_update(legacy_tools)
@@ -1338,6 +1523,70 @@ def handle_rl_function_call(
return json.dumps({"error": f"Unknown RL function: {function_name}"}, ensure_ascii=False)
def handle_file_function_call(
    function_name: str,
    function_args: Dict[str, Any],
    task_id: Optional[str] = None
) -> str:
    """
    Dispatch a file-tool call (read_file, write_file, patch, search).

    The matching tool function is invoked with values pulled from
    ``function_args``; any argument that is absent falls back to the default
    shown in the corresponding branch below. These tools use the terminal
    backend for all operations, supporting local, docker, singularity, ssh,
    and modal environments.

    Args:
        function_name (str): Name of the file function to call
        function_args (Dict): Arguments for the function
        task_id (str): Task identifier for environment isolation; a falsy
            value is replaced by "default"

    Returns:
        str: The tool's result, or a JSON string with an "error" key when
        ``function_name`` is not one of the four file tools
    """
    # A missing or empty task id shares the "default" environment.
    effective_task_id = task_id if task_id else "default"

    def arg(key: str, fallback: Any = None) -> Any:
        # Looked up lazily so an unknown function name never touches function_args.
        return function_args.get(key, fallback)

    if function_name == "read_file":
        return read_file_tool(
            path=arg("path", ""),
            offset=arg("offset", 1),
            limit=arg("limit", 500),
            task_id=effective_task_id,
        )

    if function_name == "write_file":
        return write_file_tool(
            path=arg("path", ""),
            content=arg("content", ""),
            task_id=effective_task_id,
        )

    if function_name == "patch":
        # path/old_string/new_string/patch are passed as None when absent.
        return patch_tool(
            mode=arg("mode", "replace"),
            path=arg("path"),
            old_string=arg("old_string"),
            new_string=arg("new_string"),
            replace_all=arg("replace_all", False),
            patch=arg("patch"),
            task_id=effective_task_id,
        )

    if function_name == "search":
        return search_tool(
            pattern=arg("pattern", ""),
            target=arg("target", "content"),
            path=arg("path", "."),
            file_glob=arg("file_glob"),
            limit=arg("limit", 50),
            offset=arg("offset", 0),
            output_mode=arg("output_mode", "content"),
            context=arg("context", 0),
            task_id=effective_task_id,
        )

    # Nothing matched: report the unknown name back to the caller as JSON.
    unknown = {"error": f"Unknown file function: {function_name}"}
    return json.dumps(unknown, ensure_ascii=False)
def handle_function_call(
function_name: str,
function_args: Dict[str, Any],
@@ -1411,6 +1660,10 @@ def handle_function_call(
]:
return handle_rl_function_call(function_name, function_args)
# Route file manipulation tools
elif function_name in ["read_file", "write_file", "patch", "search"]:
return handle_file_function_call(function_name, function_args, task_id)
else:
error_msg = f"Unknown function: {function_name}"
print(f"{error_msg}")
@@ -1482,6 +1735,12 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
"tools": ["schedule_cronjob", "list_cronjobs", "remove_cronjob"],
"description": "Schedule and manage automated tasks (cronjobs) - only available in interactive CLI mode",
"requirements": ["HERMES_INTERACTIVE=1 (set automatically by cli.py)"]
},
"file_tools": {
"available": check_file_requirements(),
"tools": ["read_file", "write_file", "patch", "search"],
"description": "File manipulation tools: read/write files, search content/files, patch with fuzzy matching",
"requirements": ["Terminal backend available (local/docker/ssh/singularity/modal)"]
}
}
@@ -1502,7 +1761,8 @@ def check_toolset_requirements() -> Dict[str, bool]:
"image_tools": check_image_generation_requirements(),
"skills_tools": check_skills_requirements(),
"browser_tools": check_browser_requirements(),
"cronjob_tools": check_cronjob_requirements()
"cronjob_tools": check_cronjob_requirements(),
"file_tools": check_file_requirements()
}
if __name__ == "__main__":