diff --git a/scripts/release.py b/scripts/release.py
index a5c19503b0..80c75be3cd 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -97,6 +97,7 @@ AUTHOR_MAP = {
     "mygamez@163.com": "zhongyueming1121",
     "hansnow@users.noreply.github.com": "hansnow",
     "134848055+UNLINEARITY@users.noreply.github.com": "UNLINEARITY",
+    "ben.burtenshaw@gmail.com": "burtenshaw",
     # contributors (manual mapping from git names)
     "ahmedsherif95@gmail.com": "asheriif",
     "liujinkun@bytedance.com": "liujinkun2025",
diff --git a/skills/mlops/inference/llama-cpp/SKILL.md b/skills/mlops/inference/llama-cpp/SKILL.md
index 26c9e19759..0844e4d5a4 100644
--- a/skills/mlops/inference/llama-cpp/SKILL.md
+++ b/skills/mlops/inference/llama-cpp/SKILL.md
@@ -1,9 +1,10 @@
 ---
 name: llama-cpp
-description: Run LLM inference with llama.cpp on CPU, Apple Silicon, AMD/Intel GPUs, or NVIDIA. Covers GGUF quant selection, Hugging Face Hub model search with `apps=llama.cpp`, hardware-aware quant recommendations from `?local-app=llama.cpp`, extracting available `.gguf` files from the Hugging Face tree API, and building the right `llama-cli` or `llama-server` command directly from Hub URLs.
-version: 2.1.1
+description: llama.cpp local GGUF inference + HF Hub model discovery.
+version: 2.1.2
 author: Orchestra Research
 license: MIT
+dependencies: [llama-cpp-python>=0.2.0]
 metadata:
   hermes:
     tags: [llama.cpp, GGUF, Quantization, Hugging Face Hub, CPU Inference, Apple Silicon, Edge Deployment, AMD GPUs, Intel GPUs, NVIDIA, URL-first]
@@ -96,7 +97,6 @@ llama-server \
 ```bash
 curl http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -H "Authorization: Bearer no-key" \
   -d '{
     "messages": [
       {"role": "user", "content": "Write a limerick about Python exceptions"}
@@ -104,6 +104,68 @@ curl http://localhost:8080/v1/chat/completions \
 }'
 ```
+
+## Python bindings (llama-cpp-python)
+
+`pip install llama-cpp-python` (CUDA: `CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir`; Metal: `CMAKE_ARGS="-DGGML_METAL=on" ...`).
+
+### Basic generation
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,  # 0 for CPU, 99 to offload everything
+    n_threads=8,
+)
+
+out = llm("What is machine learning?", max_tokens=256, temperature=0.7)
+print(out["choices"][0]["text"])
+```
+
+### Chat + streaming
+
+```python
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,
+    chat_format="llama-3",  # or "chatml", "mistral", etc.
+)
+
+resp = llm.create_chat_completion(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is Python?"},
+    ],
+    max_tokens=256,
+)
+print(resp["choices"][0]["message"]["content"])
+
+# Streaming
+for chunk in llm("Explain quantum computing:", max_tokens=256, stream=True):
+    print(chunk["choices"][0]["text"], end="", flush=True)
+```
+
+### Embeddings
+
+```python
+llm = Llama(model_path="./model-q4_k_m.gguf", embedding=True, n_gpu_layers=35)
+vec = llm.embed("This is a test sentence.")
+print(f"Embedding dimension: {len(vec)}")
+```
+
+You can also load a GGUF straight from the Hub:
+
+```python
+llm = Llama.from_pretrained(
+    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
+    filename="*Q4_K_M.gguf",
+    n_gpu_layers=35,
+)
+```
+
 ## Choosing a quant
 
 Use the Hub page first, generic heuristics second.