From c93f1c861bc8b6c51f7bf930cdab3b356c42aff0 Mon Sep 17 00:00:00 2001
From: alt-glitch <balyan.sid@gmail.com>
Date: Mon, 27 Apr 2026 06:44:41 +0530
Subject: [PATCH] fix: cache _contains_cjk, escape LIKE wildcards, add
 regression tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On top of the CJK FTS5 bypass from #15509:

- Cache _contains_cjk() result in a local var to avoid redundant O(n)
  scans on every CJK query
- Escape \, %, _ in LIKE queries so literal wildcards in user input are
  not treated as SQL wildcards (consistent with other LIKE queries in
  hermes_state.py that use ESCAPE '\')
- Fix misleading comment ('or CJK fallback' → accurate description)
- Add 3 regression tests:
  - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829)
  - test_cjk_like_dedup_no_duplicates
  - test_cjk_like_escapes_wildcards (new wildcard escaping)
---
 hermes_state.py            | 15 ++++++++++-----
 tests/test_hermes_state.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/hermes_state.py b/hermes_state.py
index eb94987305..de968e5abc 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1336,7 +1336,8 @@ class SessionDB:
         # "大 AND 别 AND 山 AND 项 AND 目".  This produces false positives
         # (all chars scattered in a message) and misses exact phrase matches.
         # LIKE substring search is more accurate for CJK phrase matching.
-        if self._contains_cjk(query):
+        is_cjk = self._contains_cjk(query)
+        if is_cjk:
             matches = []
         else:
             with self._lock:
@@ -1348,11 +1349,15 @@ class SessionDB:
                 else:
                     matches = [dict(row) for row in cursor.fetchall()]
 
-        # LIKE search for CJK queries (primary path) or CJK fallback
-        if not matches and self._contains_cjk(query):
+        # LIKE substring search for CJK queries (primary path since FTS5
+        # cannot do phrase matching with the unicode61 tokenizer).
+        if not matches and is_cjk:
             raw_query = query.strip('"').strip()
-            like_where = ["m.content LIKE ?"]
-            like_params: list = [f"%{raw_query}%"]
+            # Escape the LIKE metacharacters (\, %, _) so that any of them
+            # appearing in the query is matched literally, not as a wildcard.
+            escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+            like_where = ["m.content LIKE ? ESCAPE '\\'"]
+            like_params: list = [f"%{escaped}%"]
             if source_filter is not None:
                 like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
                 like_params.extend(source_filter)
diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py
index 868a28c530..559bac9cbb 100644
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -743,6 +743,40 @@ class TestCJKSearchFallback:
         results = db.search_messages("Agent通信")
         assert len(results) == 1
 
+    def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
+        """When FTS5 returns *some* CJK results, LIKE must still find all matches.
+
+        Regression test for #15500 / #14829: FTS5 unicode61 tokenizer drops
+        certain CJK characters, so multi-character queries may return partial
+        results.  The LIKE path must always run for CJK queries.
+        """
+        db.create_session(session_id="s1", source="cli")
+        db.create_session(session_id="s2", source="telegram")
+        db.append_message("s1", role="user", content="昨晚讨论了记忆系统")
+        db.append_message("s2", role="user", content="昨晚的会议纪要已发送")
+        results = db.search_messages("昨晚")
+        assert len(results) == 2
+        session_ids = {r["session_id"] for r in results}
+        assert session_ids == {"s1", "s2"}
+
+    def test_cjk_like_dedup_no_duplicates(self, db):
+        """A CJK query matching a single message returns it exactly once."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="测试去重逻辑")
+        results = db.search_messages("测试")
+        assert len(results) == 1
+
+    def test_cjk_like_escapes_wildcards(self, db):
+        """LIKE wildcards (%, _) in CJK queries are treated as literals."""
+        db.create_session(session_id="s1", source="cli")
+        db.create_session(session_id="s2", source="cli")
+        db.append_message("s1", role="user", content="达成100%完成率")
+        db.append_message("s2", role="user", content="达成100完成率是目标")
+        # The % in the query must be literal — should only match s1
+        results = db.search_messages("100%完成")
+        assert len(results) == 1
+        assert results[0]["session_id"] == "s1"
+
 
 # =========================================================================
 # Session search and listing