mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
feat: trigram FTS5 index for CJK search, replace LIKE fallback (#16651)
* fix: bypass FTS5 for CJK queries in session_search FTS5 default tokenizer splits CJK characters into individual tokens, so multi-character queries like "大别山项目" become AND of single chars. This produces few/no results compared to LIKE substring search. For CJK queries, skip FTS5 entirely and use LIKE for accurate phrase matching. Fixes NousResearch/hermes-agent#15500 * fix: cache _contains_cjk, escape LIKE wildcards, add regression tests On top of the CJK FTS5 bypass from #15509: - Cache _contains_cjk() result in a local var to avoid redundant O(n) scans on every CJK query - Escape %, _ in LIKE queries so literal wildcards in user input are not treated as SQL wildcards (consistent with other LIKE queries in hermes_state.py that use ESCAPE '\') - Fix misleading comment ('or CJK fallback' → accurate description) - Add 3 regression tests: - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829) - test_cjk_like_dedup_no_duplicates - test_cjk_like_escapes_wildcards (new wildcard escaping) * feat: trigram FTS5 index for CJK search, replace LIKE fallback Replace the LIKE '%query%' full-table-scan fallback for CJK queries with a proper trigram FTS5 index (messages_fts_trigram). The trigram tokenizer creates overlapping 3-byte sequences so substring matching works natively for any script — CJK, Thai, etc. For queries with 3+ CJK characters: uses the trigram FTS5 table with proper ranking, snippets, and indexed lookups. For shorter queries (1-2 CJK chars): falls back to LIKE since the trigram tokenizer needs ≥9 UTF-8 bytes (3 CJK chars) minimum. Schema v10 migration creates the trigram table and backfills existing messages. Triggers keep the index in sync on INSERT/UPDATE/DELETE. Builds on top of #16276 (bypass FTS5 for CJK, escape LIKE wildcards). --------- Co-authored-by: vominh1919 <vominh1919@gmail.com>
This commit is contained in:
210
hermes_state.py
210
hermes_state.py
@@ -31,7 +31,7 @@ T = TypeVar("T")
|
||||
|
||||
DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
||||
|
||||
SCHEMA_VERSION = 9
|
||||
SCHEMA_VERSION = 10
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS schema_version (
|
||||
@@ -119,6 +119,32 @@ CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN
|
||||
END;
|
||||
"""
|
||||
|
||||
# Trigram FTS5 table for CJK substring search. The default unicode61
|
||||
# tokenizer splits CJK characters into individual tokens, breaking phrase
|
||||
# matching. The trigram tokenizer creates overlapping 3-byte sequences so
|
||||
# substring queries work natively for any script (CJK, Thai, etc.).
|
||||
FTS_TRIGRAM_SQL = """
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5(
|
||||
content,
|
||||
content=messages,
|
||||
content_rowid=id,
|
||||
tokenize='trigram'
|
||||
);
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN
|
||||
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN
|
||||
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN
|
||||
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
|
||||
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
|
||||
END;
|
||||
"""
|
||||
|
||||
|
||||
class SessionDB:
|
||||
"""
|
||||
@@ -366,6 +392,18 @@ class SessionDB:
|
||||
except sqlite3.OperationalError:
|
||||
pass # Column already exists
|
||||
cursor.execute("UPDATE schema_version SET version = 9")
|
||||
if current_version < 10:
|
||||
# v10: trigram FTS5 table for CJK/substring search.
|
||||
# Created via FTS_TRIGRAM_SQL below; backfill existing messages.
|
||||
try:
|
||||
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
|
||||
except sqlite3.OperationalError:
|
||||
cursor.executescript(FTS_TRIGRAM_SQL)
|
||||
cursor.execute(
|
||||
"INSERT INTO messages_fts_trigram(rowid, content) "
|
||||
"SELECT id, content FROM messages WHERE content IS NOT NULL"
|
||||
)
|
||||
cursor.execute("UPDATE schema_version SET version = 10")
|
||||
|
||||
# Unique title index — always ensure it exists (safe to run after migrations
|
||||
# since the title column is guaranteed to exist at this point)
|
||||
@@ -383,6 +421,12 @@ class SessionDB:
|
||||
except sqlite3.OperationalError:
|
||||
cursor.executescript(FTS_SQL)
|
||||
|
||||
# Trigram FTS5 for CJK/substring search
|
||||
try:
|
||||
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
|
||||
except sqlite3.OperationalError:
|
||||
cursor.executescript(FTS_TRIGRAM_SQL)
|
||||
|
||||
self._conn.commit()
|
||||
|
||||
# =========================================================================
|
||||
@@ -1291,6 +1335,16 @@ class SessionDB:
|
||||
return sanitized.strip()
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _is_cjk_codepoint(cp: int) -> bool:
|
||||
return (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
|
||||
0x3400 <= cp <= 0x4DBF or # CJK Extension A
|
||||
0x20000 <= cp <= 0x2A6DF or # CJK Extension B
|
||||
0x3000 <= cp <= 0x303F or # CJK Symbols
|
||||
0x3040 <= cp <= 0x309F or # Hiragana
|
||||
0x30A0 <= cp <= 0x30FF or # Katakana
|
||||
0xAC00 <= cp <= 0xD7AF) # Hangul Syllables
|
||||
|
||||
@staticmethod
|
||||
def _contains_cjk(text: str) -> bool:
|
||||
"""Check if text contains CJK (Chinese, Japanese, Korean) characters."""
|
||||
@@ -1306,6 +1360,11 @@ class SessionDB:
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _count_cjk(cls, text: str) -> int:
|
||||
"""Count CJK characters in text."""
|
||||
return sum(1 for ch in text if cls._is_cjk_codepoint(ord(ch)))
|
||||
|
||||
def search_messages(
|
||||
self,
|
||||
query: str,
|
||||
@@ -1376,52 +1435,113 @@ class SessionDB:
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
|
||||
with self._lock:
|
||||
try:
|
||||
cursor = self._conn.execute(sql, params)
|
||||
except sqlite3.OperationalError:
|
||||
# FTS5 query syntax error despite sanitization — return empty
|
||||
# unless query contains CJK (fall back to LIKE below)
|
||||
if not self._contains_cjk(query):
|
||||
return []
|
||||
matches = []
|
||||
else:
|
||||
matches = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
|
||||
# characters individually, causing multi-character queries to fail.
|
||||
if not matches and self._contains_cjk(query):
|
||||
# CJK queries bypass the unicode61 FTS5 table. The default tokenizer
|
||||
# splits CJK characters into individual tokens, so "大别山项目" becomes
|
||||
# "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and
|
||||
# missing exact phrase matches.
|
||||
#
|
||||
# For queries with 3+ CJK characters, we use the trigram FTS5 table
|
||||
# (indexed substring matching with ranking and snippets). For shorter
|
||||
# CJK queries (1-2 chars), trigram can't match (it needs ≥9 UTF-8
|
||||
# bytes = 3 CJK chars), so we fall back to LIKE.
|
||||
is_cjk = self._contains_cjk(query)
|
||||
if is_cjk:
|
||||
raw_query = query.strip('"').strip()
|
||||
like_where = ["m.content LIKE ?"]
|
||||
like_params: list = [f"%{raw_query}%"]
|
||||
if source_filter is not None:
|
||||
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||
like_params.extend(source_filter)
|
||||
if exclude_sources is not None:
|
||||
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||
like_params.extend(exclude_sources)
|
||||
if role_filter:
|
||||
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||
like_params.extend(role_filter)
|
||||
like_sql = f"""
|
||||
SELECT m.id, m.session_id, m.role,
|
||||
substr(m.content,
|
||||
max(1, instr(m.content, ?) - 40),
|
||||
120) AS snippet,
|
||||
m.content, m.timestamp, m.tool_name,
|
||||
s.source, s.model, s.started_at AS session_started
|
||||
FROM messages m
|
||||
JOIN sessions s ON s.id = m.session_id
|
||||
WHERE {' AND '.join(like_where)}
|
||||
ORDER BY m.timestamp DESC
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
like_params.extend([limit, offset])
|
||||
# instr() parameter goes first in the bound list
|
||||
like_params = [raw_query] + like_params
|
||||
cjk_count = self._count_cjk(raw_query)
|
||||
|
||||
if cjk_count >= 3:
|
||||
# Trigram FTS5 path — quote each non-operator token to handle
|
||||
# FTS5 special chars (%, *, etc.) while preserving boolean
|
||||
# operators (AND, OR, NOT) for multi-term queries.
|
||||
tokens = raw_query.split()
|
||||
parts = []
|
||||
for tok in tokens:
|
||||
if tok.upper() in ("AND", "OR", "NOT"):
|
||||
parts.append(tok)
|
||||
else:
|
||||
parts.append('"' + tok.replace('"', '""') + '"')
|
||||
trigram_query = " ".join(parts)
|
||||
tri_where = ["messages_fts_trigram MATCH ?"]
|
||||
tri_params: list = [trigram_query]
|
||||
if source_filter is not None:
|
||||
tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||
tri_params.extend(source_filter)
|
||||
if exclude_sources is not None:
|
||||
tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||
tri_params.extend(exclude_sources)
|
||||
if role_filter:
|
||||
tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||
tri_params.extend(role_filter)
|
||||
tri_sql = f"""
|
||||
SELECT
|
||||
m.id,
|
||||
m.session_id,
|
||||
m.role,
|
||||
snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet,
|
||||
m.content,
|
||||
m.timestamp,
|
||||
m.tool_name,
|
||||
s.source,
|
||||
s.model,
|
||||
s.started_at AS session_started
|
||||
FROM messages_fts_trigram
|
||||
JOIN messages m ON m.id = messages_fts_trigram.rowid
|
||||
JOIN sessions s ON s.id = m.session_id
|
||||
WHERE {' AND '.join(tri_where)}
|
||||
ORDER BY rank
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
tri_params.extend([limit, offset])
|
||||
with self._lock:
|
||||
try:
|
||||
tri_cursor = self._conn.execute(tri_sql, tri_params)
|
||||
except sqlite3.OperationalError:
|
||||
matches = []
|
||||
else:
|
||||
matches = [dict(row) for row in tri_cursor.fetchall()]
|
||||
else:
|
||||
# Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars.
|
||||
# Fall back to LIKE substring search.
|
||||
escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
||||
like_where = ["m.content LIKE ? ESCAPE '\\'"]
|
||||
like_params: list = [f"%{escaped}%"]
|
||||
if source_filter is not None:
|
||||
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||
like_params.extend(source_filter)
|
||||
if exclude_sources is not None:
|
||||
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||
like_params.extend(exclude_sources)
|
||||
if role_filter:
|
||||
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||
like_params.extend(role_filter)
|
||||
like_sql = f"""
|
||||
SELECT m.id, m.session_id, m.role,
|
||||
substr(m.content,
|
||||
max(1, instr(m.content, ?) - 40),
|
||||
120) AS snippet,
|
||||
m.content, m.timestamp, m.tool_name,
|
||||
s.source, s.model, s.started_at AS session_started
|
||||
FROM messages m
|
||||
JOIN sessions s ON s.id = m.session_id
|
||||
WHERE {' AND '.join(like_where)}
|
||||
ORDER BY m.timestamp DESC
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
like_params.extend([limit, offset])
|
||||
# instr() parameter goes first in the bound list
|
||||
like_params = [raw_query] + like_params
|
||||
with self._lock:
|
||||
like_cursor = self._conn.execute(like_sql, like_params)
|
||||
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||
else:
|
||||
with self._lock:
|
||||
like_cursor = self._conn.execute(like_sql, like_params)
|
||||
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||
try:
|
||||
cursor = self._conn.execute(sql, params)
|
||||
except sqlite3.OperationalError:
|
||||
# FTS5 query syntax error despite sanitization — return empty
|
||||
return []
|
||||
else:
|
||||
matches = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# Add surrounding context (1 message before + after each match).
|
||||
# Done outside the lock so we don't hold it across N sequential queries.
|
||||
|
||||
@@ -772,6 +772,51 @@ class TestCJKSearchFallback:
|
||||
results = db.search_messages("Agent通信")
|
||||
assert len(results) == 1
|
||||
|
||||
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
|
||||
"""When FTS5 returns *some* CJK results, LIKE must still find all matches.
|
||||
|
||||
Regression test for #15500 / #14829: FTS5 unicode61 tokenizer drops
|
||||
certain CJK characters, so multi-character queries may return partial
|
||||
results. The LIKE path must always run for CJK queries.
|
||||
"""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.create_session(session_id="s2", source="telegram")
|
||||
db.append_message("s1", role="user", content="昨晚讨论了记忆系统")
|
||||
db.append_message("s2", role="user", content="昨晚的会议纪要已发送")
|
||||
results = db.search_messages("昨晚")
|
||||
assert len(results) == 2
|
||||
session_ids = {r["session_id"] for r in results}
|
||||
assert session_ids == {"s1", "s2"}
|
||||
|
||||
def test_cjk_like_dedup_no_duplicates(self, db):
|
||||
"""When FTS5 and LIKE both find the same message, no duplicates."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="测试去重逻辑")
|
||||
results = db.search_messages("测试")
|
||||
assert len(results) == 1
|
||||
|
||||
def test_cjk_like_escapes_wildcards(self, db):
|
||||
"""Special characters (%, _) in CJK queries are treated as literals."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.create_session(session_id="s2", source="cli")
|
||||
db.append_message("s1", role="user", content="达成100%完成率")
|
||||
db.append_message("s2", role="user", content="达成100完成率是目标")
|
||||
# The % in the query must be literal — should only match s1
|
||||
results = db.search_messages("100%完成")
|
||||
assert len(results) == 1
|
||||
assert results[0]["session_id"] == "s1"
|
||||
|
||||
def test_cjk_trigram_preserves_boolean_operators(self, db):
|
||||
"""Boolean operators (OR, AND, NOT) work in CJK trigram queries."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.create_session(session_id="s2", source="cli")
|
||||
db.append_message("s1", role="user", content="记忆系统很好用")
|
||||
db.append_message("s2", role="user", content="断裂连接需要修复")
|
||||
results = db.search_messages("记忆系统 OR 断裂连接")
|
||||
assert len(results) == 2
|
||||
session_ids = {r["session_id"] for r in results}
|
||||
assert session_ids == {"s1", "s2"}
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
@@ -1229,7 +1274,7 @@ class TestSchemaInit:
|
||||
def test_schema_version(self, db):
|
||||
cursor = db._conn.execute("SELECT version FROM schema_version")
|
||||
version = cursor.fetchone()[0]
|
||||
assert version == 9
|
||||
assert version == 10
|
||||
|
||||
def test_title_column_exists(self, db):
|
||||
"""Verify the title column was created in the sessions table."""
|
||||
@@ -1290,7 +1335,7 @@ class TestSchemaInit:
|
||||
|
||||
# Verify migration
|
||||
cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
|
||||
assert cursor.fetchone()[0] == 9
|
||||
assert cursor.fetchone()[0] == 10
|
||||
|
||||
# Verify title column exists and is NULL for existing sessions
|
||||
session = migrated_db.get_session("existing")
|
||||
|
||||
Reference in New Issue
Block a user