mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(patch): harden V4A patch parser and fuzzy match — 9 correctness bugs
- Bug 1: replace read_file(limit=10000) with read_file_raw in _apply_update, preventing silent truncation of files >2000 lines and corruption of lines >2000 chars; add read_file_raw to FileOperations abstract interface and ShellFileOperations
- Bug 2: split apply_v4a_operations into validate-then-apply phases; if any hunk fails validation, zero writes occur (was: continue after failure, leaving filesystem partially modified)
- Bug 3: parse_v4a_patch now returns an error for begin-marker-with-no-ops, empty file paths, and moves missing a destination (was: always returned error=None)
- Bug 4: raise strategy 7 (block anchor) single-candidate similarity threshold from 0.10 to 0.50, eliminating false-positive matches in repetitive code
- Bug 5: add _strategy_unicode_normalized (new strategy 7) with position mapping via _build_orig_to_norm_map; smart quotes and em-dashes in LLM-generated patches now match via strategies 1-6 before falling through to fuzzy strategies
- Bug 6: extend fuzzy_find_and_replace to return 4-tuple (content, count, error, strategy); update all 5 call sites across patch_parser.py, file_operations.py, and skill_manager_tool.py
- Bug 7: guard in _apply_update returns error when addition-only context hint is ambiguous (>1 occurrences); validation phase errors on both 0 and >1
- Bug 8: _apply_delete returns error (not silent success) on missing file
- Bug 9: _validate_operations checks source existence and destination absence for MOVE operations before any write occurs
This commit is contained in:
@@ -28,6 +28,7 @@ Usage:
|
||||
result = apply_v4a_operations(operations, file_ops)
|
||||
"""
|
||||
|
||||
import difflib
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Tuple, Any
|
||||
@@ -202,31 +203,162 @@ def parse_v4a_patch(patch_content: str) -> Tuple[List[PatchOperation], Optional[
|
||||
if current_hunk and current_hunk.lines:
|
||||
current_op.hunks.append(current_hunk)
|
||||
operations.append(current_op)
|
||||
|
||||
|
||||
# Validate the parsed result
|
||||
if not operations:
|
||||
# Empty patch is not an error — callers get [] and can decide
|
||||
return operations, None
|
||||
|
||||
parse_errors: List[str] = []
|
||||
for op in operations:
|
||||
if not op.file_path:
|
||||
parse_errors.append("Operation with empty file path")
|
||||
if op.operation == OperationType.UPDATE and not op.hunks:
|
||||
parse_errors.append(f"UPDATE {op.file_path!r}: no hunks found")
|
||||
if op.operation == OperationType.MOVE and not op.new_path:
|
||||
parse_errors.append(f"MOVE {op.file_path!r}: missing destination path (expected 'src -> dst')")
|
||||
|
||||
if parse_errors:
|
||||
return [], "Parse error: " + "; ".join(parse_errors)
|
||||
|
||||
return operations, None
|
||||
|
||||
|
||||
def apply_v4a_operations(operations: List[PatchOperation],
|
||||
file_ops: Any) -> 'PatchResult':
|
||||
def _count_occurrences(text: str, pattern: str) -> int:
|
||||
"""Count non-overlapping occurrences of *pattern* in *text*."""
|
||||
count = 0
|
||||
start = 0
|
||||
while True:
|
||||
pos = text.find(pattern, start)
|
||||
if pos == -1:
|
||||
break
|
||||
count += 1
|
||||
start = pos + 1
|
||||
return count
|
||||
|
||||
|
||||
def _validate_operations(
    operations: List['PatchOperation'],
    file_ops: Any,
) -> List[str]:
    """Validate all operations without writing any files.

    Returns a list of error strings; an empty list means all operations
    are valid and the apply phase can proceed safely.

    For UPDATE operations, hunks are simulated in order so that later
    hunks validate against post-earlier-hunk content (matching apply order).

    Args:
        operations: Parsed operations from parse_v4a_patch.
        file_ops: Object exposing read_file_raw(path) returning an object
            with ``.content`` and ``.error`` attributes.
    """
    # Deferred import: breaks the patch_parser ↔ fuzzy_match circular dependency
    from tools.fuzzy_match import fuzzy_find_and_replace

    errors: List[str] = []

    for op in operations:
        if op.operation == OperationType.UPDATE:
            read_result = file_ops.read_file_raw(op.file_path)
            if read_result.error:
                errors.append(f"{op.file_path}: {read_result.error}")
                continue

            simulated = read_result.content
            for hunk in op.hunks:
                search_lines = [l.content for l in hunk.lines if l.prefix in (' ', '-')]
                if not search_lines:
                    # Addition-only hunk: validate context hint uniqueness
                    if hunk.context_hint:
                        occurrences = _count_occurrences(simulated, hunk.context_hint)
                        if occurrences == 0:
                            errors.append(
                                f"{op.file_path}: addition-only hunk context hint "
                                f"'{hunk.context_hint}' not found"
                            )
                        elif occurrences > 1:
                            errors.append(
                                f"{op.file_path}: addition-only hunk context hint "
                                f"'{hunk.context_hint}' is ambiguous "
                                f"({occurrences} occurrences)"
                            )
                    continue

                search_pattern = '\n'.join(search_lines)
                replace_lines = [l.content for l in hunk.lines if l.prefix in (' ', '+')]
                replacement = '\n'.join(replace_lines)

                new_simulated, count, _strategy, match_error = fuzzy_find_and_replace(
                    simulated, search_pattern, replacement, replace_all=False
                )
                if count == 0:
                    label = f"'{hunk.context_hint}'" if hunk.context_hint else "(no hint)"
                    errors.append(
                        f"{op.file_path}: hunk {label} not found"
                        + (f" — {match_error}" if match_error else "")
                    )
                else:
                    # Advance simulation so subsequent hunks validate correctly.
                    # Reuse the result from the call above — no second fuzzy run.
                    simulated = new_simulated

        elif op.operation == OperationType.DELETE:
            read_result = file_ops.read_file_raw(op.file_path)
            if read_result.error:
                errors.append(f"{op.file_path}: file not found for deletion")

        elif op.operation == OperationType.MOVE:
            if not op.new_path:
                errors.append(f"{op.file_path}: MOVE operation missing destination path")
                continue
            src_result = file_ops.read_file_raw(op.file_path)
            if src_result.error:
                errors.append(f"{op.file_path}: source file not found for move")
            dst_result = file_ops.read_file_raw(op.new_path)
            if not dst_result.error:
                errors.append(
                    f"{op.new_path}: destination already exists — move would overwrite"
                )

        # ADD: parent directory creation handled by write_file; no pre-check needed.

    return errors
|
||||
|
||||
|
||||
def apply_v4a_operations(operations: List[PatchOperation],
|
||||
file_ops: Any) -> 'PatchResult':
|
||||
"""Apply V4A patch operations using a file operations interface.
|
||||
|
||||
Uses a two-phase validate-then-apply approach:
|
||||
- Phase 1: validate all operations against current file contents without
|
||||
writing anything. If any validation error is found, return immediately
|
||||
with no filesystem changes.
|
||||
- Phase 2: apply all operations. A failure here (e.g. a race between
|
||||
validation and apply) is reported with a note to run ``git diff``.
|
||||
|
||||
Args:
|
||||
operations: List of PatchOperation from parse_v4a_patch
|
||||
file_ops: Object with read_file, write_file methods
|
||||
|
||||
file_ops: Object with read_file_raw, write_file methods
|
||||
|
||||
Returns:
|
||||
PatchResult with results of all operations
|
||||
"""
|
||||
# Import here to avoid circular imports
|
||||
from tools.file_operations import PatchResult
|
||||
|
||||
|
||||
# ---- Phase 1: validate ----
|
||||
validation_errors = _validate_operations(operations, file_ops)
|
||||
if validation_errors:
|
||||
return PatchResult(
|
||||
success=False,
|
||||
error="Patch validation failed (no files were modified):\n"
|
||||
+ "\n".join(f" • {e}" for e in validation_errors),
|
||||
)
|
||||
|
||||
# ---- Phase 2: apply ----
|
||||
files_modified = []
|
||||
files_created = []
|
||||
files_deleted = []
|
||||
all_diffs = []
|
||||
errors = []
|
||||
|
||||
|
||||
for op in operations:
|
||||
try:
|
||||
if op.operation == OperationType.ADD:
|
||||
@@ -236,7 +368,7 @@ def apply_v4a_operations(operations: List[PatchOperation],
|
||||
all_diffs.append(result[1])
|
||||
else:
|
||||
errors.append(f"Failed to add {op.file_path}: {result[1]}")
|
||||
|
||||
|
||||
elif op.operation == OperationType.DELETE:
|
||||
result = _apply_delete(op, file_ops)
|
||||
if result[0]:
|
||||
@@ -244,7 +376,7 @@ def apply_v4a_operations(operations: List[PatchOperation],
|
||||
all_diffs.append(result[1])
|
||||
else:
|
||||
errors.append(f"Failed to delete {op.file_path}: {result[1]}")
|
||||
|
||||
|
||||
elif op.operation == OperationType.MOVE:
|
||||
result = _apply_move(op, file_ops)
|
||||
if result[0]:
|
||||
@@ -252,7 +384,7 @@ def apply_v4a_operations(operations: List[PatchOperation],
|
||||
all_diffs.append(result[1])
|
||||
else:
|
||||
errors.append(f"Failed to move {op.file_path}: {result[1]}")
|
||||
|
||||
|
||||
elif op.operation == OperationType.UPDATE:
|
||||
result = _apply_update(op, file_ops)
|
||||
if result[0]:
|
||||
@@ -260,19 +392,19 @@ def apply_v4a_operations(operations: List[PatchOperation],
|
||||
all_diffs.append(result[1])
|
||||
else:
|
||||
errors.append(f"Failed to update {op.file_path}: {result[1]}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
errors.append(f"Error processing {op.file_path}: {str(e)}")
|
||||
|
||||
|
||||
# Run lint on all modified/created files
|
||||
lint_results = {}
|
||||
for f in files_modified + files_created:
|
||||
if hasattr(file_ops, '_check_lint'):
|
||||
lint_result = file_ops._check_lint(f)
|
||||
lint_results[f] = lint_result.to_dict()
|
||||
|
||||
|
||||
combined_diff = '\n'.join(all_diffs)
|
||||
|
||||
|
||||
if errors:
|
||||
return PatchResult(
|
||||
success=False,
|
||||
@@ -281,16 +413,17 @@ def apply_v4a_operations(operations: List[PatchOperation],
|
||||
files_created=files_created,
|
||||
files_deleted=files_deleted,
|
||||
lint=lint_results if lint_results else None,
|
||||
error='; '.join(errors)
|
||||
error="Apply phase failed (state may be inconsistent — run `git diff` to assess):\n"
|
||||
+ "\n".join(f" • {e}" for e in errors),
|
||||
)
|
||||
|
||||
|
||||
return PatchResult(
|
||||
success=True,
|
||||
diff=combined_diff,
|
||||
files_modified=files_modified,
|
||||
files_created=files_created,
|
||||
files_deleted=files_deleted,
|
||||
lint=lint_results if lint_results else None
|
||||
lint=lint_results if lint_results else None,
|
||||
)
|
||||
|
||||
|
||||
@@ -317,68 +450,56 @@ def _apply_add(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
|
||||
def _apply_delete(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
"""Apply a delete file operation."""
|
||||
# Read file first for diff
|
||||
read_result = file_ops.read_file(op.file_path)
|
||||
|
||||
if read_result.error and "not found" in read_result.error.lower():
|
||||
# File doesn't exist, nothing to delete
|
||||
return True, f"# {op.file_path} already deleted or doesn't exist"
|
||||
|
||||
# Delete directly via shell command using the underlying environment
|
||||
rm_result = file_ops._exec(f"rm -f {file_ops._escape_shell_arg(op.file_path)}")
|
||||
|
||||
if rm_result.exit_code != 0:
|
||||
return False, rm_result.stdout
|
||||
|
||||
diff = f"--- a/{op.file_path}\n+++ /dev/null\n# File deleted"
|
||||
return True, diff
|
||||
# Read before deleting so we can produce a real unified diff.
|
||||
# Validation already confirmed existence; this guards against races.
|
||||
read_result = file_ops.read_file_raw(op.file_path)
|
||||
if read_result.error:
|
||||
return False, f"Cannot delete {op.file_path}: file not found"
|
||||
|
||||
result = file_ops.delete_file(op.file_path)
|
||||
if result.error:
|
||||
return False, result.error
|
||||
|
||||
removed_lines = read_result.content.splitlines(keepends=True)
|
||||
diff = ''.join(difflib.unified_diff(
|
||||
removed_lines, [],
|
||||
fromfile=f"a/{op.file_path}",
|
||||
tofile="/dev/null",
|
||||
))
|
||||
return True, diff or f"# Deleted: {op.file_path}"
|
||||
|
||||
|
||||
def _apply_move(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
"""Apply a move file operation."""
|
||||
# Use shell mv command
|
||||
mv_result = file_ops._exec(
|
||||
f"mv {file_ops._escape_shell_arg(op.file_path)} {file_ops._escape_shell_arg(op.new_path)}"
|
||||
)
|
||||
|
||||
if mv_result.exit_code != 0:
|
||||
return False, mv_result.stdout
|
||||
|
||||
result = file_ops.move_file(op.file_path, op.new_path)
|
||||
if result.error:
|
||||
return False, result.error
|
||||
|
||||
diff = f"# Moved: {op.file_path} -> {op.new_path}"
|
||||
return True, diff
|
||||
|
||||
|
||||
def _apply_update(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
"""Apply an update file operation."""
|
||||
# Read current content
|
||||
read_result = file_ops.read_file(op.file_path, limit=10000)
|
||||
|
||||
# Deferred import: breaks the patch_parser ↔ fuzzy_match circular dependency
|
||||
from tools.fuzzy_match import fuzzy_find_and_replace
|
||||
|
||||
# Read current content — raw so no line-number prefixes or per-line truncation
|
||||
read_result = file_ops.read_file_raw(op.file_path)
|
||||
|
||||
if read_result.error:
|
||||
return False, f"Cannot read file: {read_result.error}"
|
||||
|
||||
# Parse content (remove line numbers)
|
||||
current_lines = []
|
||||
for line in read_result.content.split('\n'):
|
||||
if re.match(r'^\s*\d+\|', line):
|
||||
# Line format: " 123|content"
|
||||
parts = line.split('|', 1)
|
||||
if len(parts) == 2:
|
||||
current_lines.append(parts[1])
|
||||
else:
|
||||
current_lines.append(line)
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
current_content = '\n'.join(current_lines)
|
||||
|
||||
|
||||
current_content = read_result.content
|
||||
|
||||
# Apply each hunk
|
||||
new_content = current_content
|
||||
|
||||
|
||||
for hunk in op.hunks:
|
||||
# Build search pattern from context and removed lines
|
||||
search_lines = []
|
||||
replace_lines = []
|
||||
|
||||
|
||||
for line in hunk.lines:
|
||||
if line.prefix == ' ':
|
||||
search_lines.append(line.content)
|
||||
@@ -387,17 +508,15 @@ def _apply_update(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
search_lines.append(line.content)
|
||||
elif line.prefix == '+':
|
||||
replace_lines.append(line.content)
|
||||
|
||||
|
||||
if search_lines:
|
||||
search_pattern = '\n'.join(search_lines)
|
||||
replacement = '\n'.join(replace_lines)
|
||||
|
||||
# Use fuzzy matching
|
||||
from tools.fuzzy_match import fuzzy_find_and_replace
|
||||
new_content, count, error = fuzzy_find_and_replace(
|
||||
|
||||
new_content, count, _strategy, error = fuzzy_find_and_replace(
|
||||
new_content, search_pattern, replacement, replace_all=False
|
||||
)
|
||||
|
||||
|
||||
if error and count == 0:
|
||||
# Try with context hint if available
|
||||
if hunk.context_hint:
|
||||
@@ -408,8 +527,8 @@ def _apply_update(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
window_start = max(0, hint_pos - 500)
|
||||
window_end = min(len(new_content), hint_pos + 2000)
|
||||
window = new_content[window_start:window_end]
|
||||
|
||||
window_new, count, error = fuzzy_find_and_replace(
|
||||
|
||||
window_new, count, _strategy, error = fuzzy_find_and_replace(
|
||||
window, search_pattern, replacement, replace_all=False
|
||||
)
|
||||
|
||||
@@ -424,16 +543,23 @@ def _apply_update(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
# Insert at the location indicated by the context hint, or at end of file.
|
||||
insert_text = '\n'.join(replace_lines)
|
||||
if hunk.context_hint:
|
||||
hint_pos = new_content.find(hunk.context_hint)
|
||||
if hint_pos != -1:
|
||||
occurrences = _count_occurrences(new_content, hunk.context_hint)
|
||||
if occurrences == 0:
|
||||
# Hint not found — append at end as a safe fallback
|
||||
new_content = new_content.rstrip('\n') + '\n' + insert_text + '\n'
|
||||
elif occurrences > 1:
|
||||
return False, (
|
||||
f"Addition-only hunk: context hint '{hunk.context_hint}' is ambiguous "
|
||||
f"({occurrences} occurrences) — provide a more unique hint"
|
||||
)
|
||||
else:
|
||||
hint_pos = new_content.find(hunk.context_hint)
|
||||
# Insert after the line containing the context hint
|
||||
eol = new_content.find('\n', hint_pos)
|
||||
if eol != -1:
|
||||
new_content = new_content[:eol + 1] + insert_text + '\n' + new_content[eol + 1:]
|
||||
else:
|
||||
new_content = new_content + '\n' + insert_text
|
||||
else:
|
||||
new_content = new_content.rstrip('\n') + '\n' + insert_text + '\n'
|
||||
else:
|
||||
new_content = new_content.rstrip('\n') + '\n' + insert_text + '\n'
|
||||
|
||||
@@ -443,7 +569,6 @@ def _apply_update(op: PatchOperation, file_ops: Any) -> Tuple[bool, str]:
|
||||
return False, write_result.error
|
||||
|
||||
# Generate diff
|
||||
import difflib
|
||||
diff_lines = difflib.unified_diff(
|
||||
current_content.splitlines(keepends=True),
|
||||
new_content.splitlines(keepends=True),
|
||||
|
||||
Reference in New Issue
Block a user