mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-29 23:41:35 +08:00
204 lines
6.5 KiB
Python
204 lines
6.5 KiB
Python
|
|
"""Holographic Reduced Representations (HRR) with phase encoding.
|
||
|
|
|
||
|
|
HRRs are a vector symbolic architecture for encoding compositional structure
|
||
|
|
into fixed-width distributed representations. This module uses *phase vectors*:
|
||
|
|
each concept is a vector of angles in [0, 2π). The algebraic operations are:
|
||
|
|
|
||
|
|
bind — circular convolution (phase addition) — associates two concepts
|
||
|
|
unbind — circular correlation (phase subtraction) — retrieves a bound value
|
||
|
|
bundle — superposition (circular mean) — merges multiple concepts
|
||
|
|
|
||
|
|
Phase encoding is numerically stable, avoids the magnitude collapse of
|
||
|
|
traditional complex-number HRRs, and maps cleanly to cosine similarity.
|
||
|
|
|
||
|
|
Atoms are generated deterministically from SHA-256 so representations are
|
||
|
|
identical across processes, machines, and language versions.
|
||
|
|
|
||
|
|
References:
|
||
|
|
Plate (1995) — Holographic Reduced Representations
|
||
|
|
Gayler (2004) — Vector Symbolic Architectures answer Jackendoff's challenges
|
||
|
|
"""
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
import logging
|
||
|
|
import struct
|
||
|
|
import math
|
||
|
|
|
||
|
|
# numpy is an optional dependency. Record whether it imported so that
# _require_numpy() can raise a clear error instead of a NameError on `np`.
try:
    import numpy as np
except ImportError:
    _HAS_NUMPY = False
else:
    _HAS_NUMPY = True

logger = logging.getLogger(__name__)

# One full turn in radians; phase vectors are reduced modulo this value.
_TWO_PI = math.pi * 2.0
|
||
|
|
|
||
|
|
|
||
|
|
def _require_numpy() -> None:
    """Fail fast when numpy could not be imported.

    Raises:
        RuntimeError: if the module-level numpy import did not succeed.
    """
    if _HAS_NUMPY:
        return
    raise RuntimeError("numpy is required for holographic operations")
|
||
|
|
|
||
|
|
|
||
|
|
def encode_atom(word: str, dim: int = 1024) -> "np.ndarray":
    """Map *word* to a reproducible phase vector of length *dim*.

    The phases come from SHA-256 (via hashlib) rather than numpy's RNG, so the
    same word always produces the same vector regardless of platform.

    Procedure:
        1. Hash f"{word}:{i}" for i = 0, 1, 2, ... until enough bytes exist.
        2. Read every 32-byte digest as 16 little-endian uint16 values.
        3. Keep the first ``dim`` values.
        4. Scale by 2π / 65536 so each value lands in [0, 2π).

    Returns:
        A float64 array of shape (dim,).

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()

    # 32 bytes per digest / 2 bytes per uint16.
    per_digest = 16
    digest_count = math.ceil(dim / per_digest)

    raw = [
        value
        for counter in range(digest_count)
        for value in struct.unpack(
            "<16H", hashlib.sha256(f"{word}:{counter}".encode()).digest()
        )
    ]

    scale = _TWO_PI / 65536.0
    return np.array(raw[:dim], dtype=np.float64) * scale
|
||
|
|
|
||
|
|
|
||
|
|
def bind(a: "np.ndarray", b: "np.ndarray") -> "np.ndarray":
    """Associate two phase vectors by adding their phases element-wise.

    In the phase representation this addition plays the role of circular
    convolution. The composite is quasi-orthogonal to (dissimilar from) both
    of its inputs.

    Returns:
        The summed phases, wrapped back into one turn with ``% 2π``.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()
    summed = a + b
    return summed % _TWO_PI
|
||
|
|
|
||
|
|
|
||
|
|
def unbind(memory: "np.ndarray", key: "np.ndarray") -> "np.ndarray":
    """Recover the value bound to *key* by subtracting phases element-wise.

    In the phase representation this subtraction plays the role of circular
    correlation and is the inverse of bind:

        unbind(bind(a, b), a) ≈ b    (up to superposition noise)

    Returns:
        The phase difference, wrapped back into one turn with ``% 2π``.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()
    difference = memory - key
    return difference % _TWO_PI
|
||
|
|
|
||
|
|
|
||
|
|
def bundle(*vectors: "np.ndarray") -> "np.ndarray":
    """Superpose phase vectors via the circular mean of complex exponentials.

    Each phase is mapped to a unit phasor exp(i·θ), the phasors are summed
    per element, and the angle of the sum is taken as the bundled phase. The
    result is similar to every input. Each extra input adds crosstalk noise:
    the signal-to-noise ratio scales roughly as sqrt(dim / n) for n bundled
    items, so capacity grows with the vector dimension.

    Args:
        *vectors: one or more phase vectors of identical shape.

    Returns:
        Phase vector with entries in [0, 2π).

    Raises:
        RuntimeError: if numpy is unavailable.
        ValueError: if called with no vectors. Summing nothing would silently
            produce a scalar 0.0 instead of a phase vector.
    """
    _require_numpy()
    if not vectors:
        raise ValueError("bundle() requires at least one vector")

    phasor_sum = np.sum([np.exp(1j * v) for v in vectors], axis=0)
    # np.angle yields values in (-π, π]; the modulo shifts them into [0, 2π).
    return np.angle(phasor_sum) % _TWO_PI
|
||
|
|
|
||
|
|
|
||
|
|
def similarity(a: "np.ndarray", b: "np.ndarray") -> float:
    """Mean cosine of the element-wise phase difference. Range [-1, 1].

    Identical vectors score 1.0, unrelated (random) vectors score near 0.0,
    and perfectly anti-correlated vectors score -1.0.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()
    phase_gap = a - b
    per_element = np.cos(phase_gap)
    return float(np.mean(per_element))
|
||
|
|
|
||
|
|
|
||
|
|
def encode_text(text: str, dim: int = 1024) -> "np.ndarray":
    """Encode *text* as a bag of words: the bundle of one atom per token.

    Tokenization lowercases the text, splits it on whitespace, and strips
    leading/trailing punctuation from each piece. Pieces that end up empty
    are dropped.

    Returns:
        The bundle of all token atom vectors, or
        encode_atom("__hrr_empty__", dim) when no tokens remain.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()

    punctuation = ".,!?;:\"'()[]{}"
    atoms = []
    for piece in text.lower().split():
        token = piece.strip(punctuation)
        if token:
            atoms.append(encode_atom(token, dim))

    if atoms:
        return bundle(*atoms)
    # Empty or punctuation-only text maps to a reserved sentinel atom.
    return encode_atom("__hrr_empty__", dim)
|
||
|
|
|
||
|
|
|
||
|
|
def encode_fact(content: str, entities: list[str], dim: int = 1024) -> "np.ndarray":
    """Encode a fact as a bundle of role-bound components.

    Role vectors are reserved atoms: "__hrr_role_content__" (ROLE_CONTENT)
    and "__hrr_role_entity__" (ROLE_ENTITY).

    Components:
        1. bind(encode_text(content, dim), ROLE_CONTENT)
        2. For each entity: bind(encode_atom(entity.lower(), dim), ROLE_ENTITY)
        3. All components are bundled into a single vector.

    Because binding is undone by unbinding with the same role, a filler can
    be recovered algebraically (other components contribute only noise):
        unbind(fact, ROLE_CONTENT) ≈ encode_text(content, dim)
        unbind(fact, ROLE_ENTITY)  ≈ bundle of the entity atoms

    Args:
        content: free text, encoded with encode_text.
        entities: entity names; each is lowercased and encoded as one atom.
        dim: length of the phase vectors.

    Returns:
        Phase vector of length ``dim`` representing the fact.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()

    role_content = encode_atom("__hrr_role_content__", dim)
    role_entity = encode_atom("__hrr_role_entity__", dim)

    # The content component is always present, so the bundle is never empty.
    components: list[np.ndarray] = [bind(encode_text(content, dim), role_content)]
    components.extend(
        bind(encode_atom(entity.lower(), dim), role_entity) for entity in entities
    )

    return bundle(*components)
|
||
|
|
|
||
|
|
|
||
|
|
def phases_to_bytes(phases: "np.ndarray") -> bytes:
    """Serialize a phase vector to raw float64 bytes (8 KB at dim=1024).

    The input is coerced to native float64 before serialization so the output
    always holds 8 bytes per element. Without the coercion a float32, integer
    or non-native-endian array would emit bytes that a float64 decode (as done
    by bytes_to_phases) misreads. float64 input produces the same bytes as a
    plain ``tobytes()`` call.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()
    return np.asarray(phases, dtype=np.float64).tobytes()
|
||
|
|
|
||
|
|
|
||
|
|
def bytes_to_phases(data: bytes) -> "np.ndarray":
    """Rebuild a phase vector from bytes. Inverse of phases_to_bytes.

    The buffer is interpreted as float64 values. np.frombuffer returns a
    read-only view backed by the bytes object, so the view is copied to give
    callers the mutable array they expect.

    Raises:
        RuntimeError: if numpy is unavailable.
    """
    _require_numpy()
    readonly_view = np.frombuffer(data, dtype=np.float64)
    return readonly_view.copy()
|
||
|
|
|
||
|
|
|
||
|
|
def snr_estimate(dim: int, n_items: int) -> float:
    """Estimate the signal-to-noise ratio of holographic storage.

    SNR = sqrt(dim / n_items) when n_items > 0, else inf.

    The SNR falls below 2.0 when n_items > dim / 4, meaning retrieval errors
    become likely; a warning is logged when that threshold is crossed.

    This is pure arithmetic, so it works without numpy.

    Args:
        dim: length of the phase vectors.
        n_items: number of items stored in one vector.

    Returns:
        The estimated SNR, or ``float("inf")`` when n_items <= 0.

    Raises:
        ValueError: if dim is negative (with n_items > 0).
    """
    if n_items <= 0:
        return float("inf")

    if dim < 0:
        # math.sqrt would raise an opaque "math domain error"; say why instead.
        raise ValueError(f"dim must be non-negative, got {dim}")

    snr = math.sqrt(dim / n_items)

    if snr < 2.0:
        logger.warning(
            "HRR storage near capacity: SNR=%.2f (dim=%d, n_items=%d). "
            "Retrieval accuracy may degrade. Consider increasing dim or reducing stored items.",
            snr,
            dim,
            n_items,
        )

    return snr
|