mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix: cache _contains_cjk, escape LIKE wildcards, add regression tests
On top of the CJK FTS5 bypass from #15509:
- Cache the _contains_cjk() result in a local variable to avoid redundant O(n) scans on every CJK query.
- Escape %, _ (and the escape character \ itself) in LIKE queries so that literal wildcards in user input are not treated as SQL wildcards (consistent with the other LIKE queries in hermes_state.py that use ESCAPE '\').
- Fix a misleading comment ('or CJK fallback' → accurate description).
- Add 3 regression tests:
  - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829)
  - test_cjk_like_dedup_no_duplicates
  - test_cjk_like_escapes_wildcards (new wildcard escaping)
This commit is contained in:
@@ -1336,7 +1336,8 @@ class SessionDB:
|
|||||||
# "大 AND 别 AND 山 AND 项 AND 目". This produces false positives
|
# "大 AND 别 AND 山 AND 项 AND 目". This produces false positives
|
||||||
# (all chars scattered in a message) and misses exact phrase matches.
|
# (all chars scattered in a message) and misses exact phrase matches.
|
||||||
# LIKE substring search is more accurate for CJK phrase matching.
|
# LIKE substring search is more accurate for CJK phrase matching.
|
||||||
if self._contains_cjk(query):
|
is_cjk = self._contains_cjk(query)
|
||||||
|
if is_cjk:
|
||||||
matches = []
|
matches = []
|
||||||
else:
|
else:
|
||||||
with self._lock:
|
with self._lock:
|
||||||
@@ -1348,11 +1349,15 @@ class SessionDB:
|
|||||||
else:
|
else:
|
||||||
matches = [dict(row) for row in cursor.fetchall()]
|
matches = [dict(row) for row in cursor.fetchall()]
|
||||||
|
|
||||||
# LIKE search for CJK queries (primary path) or CJK fallback
|
# LIKE substring search for CJK queries (primary path since FTS5
|
||||||
if not matches and self._contains_cjk(query):
|
# cannot do phrase matching with the unicode61 tokenizer).
|
||||||
|
if not matches and is_cjk:
|
||||||
raw_query = query.strip('"').strip()
|
raw_query = query.strip('"').strip()
|
||||||
like_where = ["m.content LIKE ?"]
|
# Escape LIKE wildcards so literal %, _ in the query
|
||||||
like_params: list = [f"%{raw_query}%"]
|
# are not treated as single/multi-char wildcards.
|
||||||
|
escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
||||||
|
like_where = ["m.content LIKE ? ESCAPE '\\'"]
|
||||||
|
like_params: list = [f"%{escaped}%"]
|
||||||
if source_filter is not None:
|
if source_filter is not None:
|
||||||
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||||
like_params.extend(source_filter)
|
like_params.extend(source_filter)
|
||||||
|
|||||||
@@ -743,6 +743,40 @@ class TestCJKSearchFallback:
|
|||||||
results = db.search_messages("Agent通信")
|
results = db.search_messages("Agent通信")
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
|
|
||||||
|
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
    """Every message containing the CJK query substring must be returned.

    Regression test for #15500 / #14829: the FTS5 unicode61 tokenizer drops
    certain CJK characters, so a multi-character query may come back with
    only part of the matching messages. The LIKE path therefore has to run
    for every CJK query.
    """
    # (session id, source, message text) — both messages contain "昨晚".
    seeded = [
        ("s1", "cli", "昨晚讨论了记忆系统"),
        ("s2", "telegram", "昨晚的会议纪要已发送"),
    ]
    # Create both sessions first, then add the messages, in that order.
    for sid, origin, _text in seeded:
        db.create_session(session_id=sid, source=origin)
    for sid, _origin, text in seeded:
        db.append_message(sid, role="user", content=text)

    hits = db.search_messages("昨晚")

    assert len(hits) == 2
    assert {hit["session_id"] for hit in hits} == {"s1", "s2"}
|
||||||
|
|
||||||
|
def test_cjk_like_dedup_no_duplicates(self, db):
    """A single matching message must show up exactly once in the results."""
    sid = "s1"
    db.create_session(session_id=sid, source="cli")
    db.append_message(sid, role="user", content="测试去重逻辑")

    hits = db.search_messages("测试")

    # One stored message, one hit — no duplicate rows.
    assert len(hits) == 1
|
||||||
|
|
||||||
|
def test_cjk_like_escapes_wildcards(self, db):
    """A literal % inside a CJK query must not behave as a LIKE wildcard.

    Only the % wildcard is exercised here.
    """
    for sid in ("s1", "s2"):
        db.create_session(session_id=sid, source="cli")
    # s1 contains the literal "100%完成"; s2 has "100" directly followed by
    # "完成", which an unescaped % (zero or more characters) would also match.
    db.append_message("s1", role="user", content="达成100%完成率")
    db.append_message("s2", role="user", content="达成100完成率是目标")

    hits = db.search_messages("100%完成")

    # With % taken literally, s1 is the only message that qualifies.
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["session_id"] == "s1"
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session search and listing
|
# Session search and listing
|
||||||
|
|||||||
Reference in New Issue
Block a user