From c93f1c861bc8b6c51f7bf930cdab3b356c42aff0 Mon Sep 17 00:00:00 2001 From: alt-glitch Date: Mon, 27 Apr 2026 06:44:41 +0530 Subject: [PATCH] fix: cache _contains_cjk, escape LIKE wildcards, add regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On top of the CJK FTS5 bypass from #15509: - Cache _contains_cjk() result in a local var to avoid redundant O(n) scans on every CJK query - Escape %, _ in LIKE queries so literal wildcards in user input are not treated as SQL wildcards (consistent with other LIKE queries in hermes_state.py that use ESCAPE '\') - Fix misleading comment ('or CJK fallback' → accurate description) - Add 3 regression tests: - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829) - test_cjk_like_dedup_no_duplicates - test_cjk_like_escapes_wildcards (new wildcard escaping) --- hermes_state.py | 15 ++++++++++----- tests/test_hermes_state.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index eb94987305..de968e5abc 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -1336,7 +1336,8 @@ class SessionDB: # "大 AND 别 AND 山 AND 项 AND 目". This produces false positives # (all chars scattered in a message) and misses exact phrase matches. # LIKE substring search is more accurate for CJK phrase matching. - if self._contains_cjk(query): + is_cjk = self._contains_cjk(query) + if is_cjk: matches = [] else: with self._lock: @@ -1348,11 +1349,15 @@ class SessionDB: else: matches = [dict(row) for row in cursor.fetchall()] - # LIKE search for CJK queries (primary path) or CJK fallback - if not matches and self._contains_cjk(query): + # LIKE substring search for CJK queries (primary path since FTS5 + # cannot do phrase matching with the unicode61 tokenizer). 
+ if not matches and is_cjk: raw_query = query.strip('"').strip() - like_where = ["m.content LIKE ?"] - like_params: list = [f"%{raw_query}%"] + # Escape LIKE wildcards so literal %, _ in the query + # are not treated as single/multi-char wildcards. + escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + like_where = ["m.content LIKE ? ESCAPE '\\'"] + like_params: list = [f"%{escaped}%"] if source_filter is not None: like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") like_params.extend(source_filter) diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index 868a28c530..559bac9cbb 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -743,6 +743,40 @@ class TestCJKSearchFallback: results = db.search_messages("Agent通信") assert len(results) == 1 + def test_cjk_partial_fts5_results_supplemented_by_like(self, db): + """When FTS5 returns *some* CJK results, LIKE must still find all matches. + + Regression test for #15500 / #14829: FTS5 unicode61 tokenizer drops + certain CJK characters, so multi-character queries may return partial + results. The LIKE path must always run for CJK queries. 
+ """ + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="telegram") + db.append_message("s1", role="user", content="昨晚讨论了记忆系统") + db.append_message("s2", role="user", content="昨晚的会议纪要已发送") + results = db.search_messages("昨晚") + assert len(results) == 2 + session_ids = {r["session_id"] for r in results} + assert session_ids == {"s1", "s2"} + + def test_cjk_like_dedup_no_duplicates(self, db): + """A CJK query that matches a single message returns it exactly once (no duplicate rows).""" + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="测试去重逻辑") + results = db.search_messages("测试") + assert len(results) == 1 + + def test_cjk_like_escapes_wildcards(self, db): + """LIKE wildcards (%, _) in CJK queries are treated as literals.""" + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="cli") + db.append_message("s1", role="user", content="达成100%完成率") + db.append_message("s2", role="user", content="达成100完成率是目标") + # The % in the query must be literal — should only match s1 + results = db.search_messages("100%完成") + assert len(results) == 1 + assert results[0]["session_id"] == "s1" + # ========================================================================= # Session search and listing