diff --git a/scripts/release.py b/scripts/release.py
index a5c19503b0..80c75be3cd 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -97,6 +97,7 @@ AUTHOR_MAP = {
     "mygamez@163.com": "zhongyueming1121",
     "hansnow@users.noreply.github.com": "hansnow",
     "134848055+UNLINEARITY@users.noreply.github.com": "UNLINEARITY",
+    "ben.burtenshaw@gmail.com": "burtenshaw",
     # contributors (manual mapping from git names)
     "ahmedsherif95@gmail.com": "asheriif",
     "liujinkun@bytedance.com": "liujinkun2025",
diff --git a/skills/mlops/inference/llama-cpp/SKILL.md b/skills/mlops/inference/llama-cpp/SKILL.md
index 26c9e19759..0844e4d5a4 100644
--- a/skills/mlops/inference/llama-cpp/SKILL.md
+++ b/skills/mlops/inference/llama-cpp/SKILL.md
@@ -1,9 +1,10 @@
 ---
 name: llama-cpp
-description: Run LLM inference with llama.cpp on CPU, Apple Silicon, AMD/Intel GPUs, or NVIDIA. Covers GGUF quant selection, Hugging Face Hub model search with `apps=llama.cpp`, hardware-aware quant recommendations from `?local-app=llama.cpp`, extracting available `.gguf` files from the Hugging Face tree API, and building the right `llama-cli` or `llama-server` command directly from Hub URLs.
-version: 2.1.1
+description: llama.cpp local GGUF inference + HF Hub model discovery.
+version: 2.1.2
 author: Orchestra Research
 license: MIT
+dependencies: [llama-cpp-python>=0.2.0]
 metadata:
   hermes:
     tags: [llama.cpp, GGUF, Quantization, Hugging Face Hub, CPU Inference, Apple Silicon, Edge Deployment, AMD GPUs, Intel GPUs, NVIDIA, URL-first]
@@ -96,7 +97,6 @@ llama-server \
 ```bash
 curl http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "Authorization: Bearer no-key" \
   -d '{
     "messages": [
       {"role": "user", "content": "Write a limerick about Python exceptions"}
@@ -104,6 +104,68 @@ curl http://localhost:8080/v1/chat/completions \
 }'
 ```
+
+## Python bindings (llama-cpp-python)
+
+`pip install llama-cpp-python` (CUDA: `CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir`; Metal: `CMAKE_ARGS="-DGGML_METAL=on" ...`).
+
+### Basic generation
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,  # 0 for CPU, 99 to offload everything
+    n_threads=8,
+)
+
+out = llm("What is machine learning?", max_tokens=256, temperature=0.7)
+print(out["choices"][0]["text"])
+```
+
+### Chat + streaming
+
+```python
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,
+    chat_format="llama-3",  # or "chatml", "mistral", etc.
+)
+
+resp = llm.create_chat_completion(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is Python?"},
+    ],
+    max_tokens=256,
+)
+print(resp["choices"][0]["message"]["content"])
+
+# Streaming
+for chunk in llm("Explain quantum computing:", max_tokens=256, stream=True):
+    print(chunk["choices"][0]["text"], end="", flush=True)
+```
+
+### Embeddings
+
+```python
+llm = Llama(model_path="./model-q4_k_m.gguf", embedding=True, n_gpu_layers=35)
+vec = llm.embed("This is a test sentence.")
+print(f"Embedding dimension: {len(vec)}")
+```
+
+You can also load a GGUF straight from the Hub:
+
+```python
+llm = Llama.from_pretrained(
+    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
+    filename="*Q4_K_M.gguf",
+    n_gpu_layers=35,
+)
+```
+
 ## Choosing a quant
 
 Use the Hub page first, generic heuristics second.