feat(ocr-and-documents): add OCR and document extraction skills

- Introduced new skills for extracting text from PDFs, scanned documents, and images using OCR and document parsing tools.
- Added detailed documentation for usage and installation of `pymupdf` and `marker-pdf` for local extraction.
- Implemented scripts for text extraction with both lightweight and high-quality options, including support for various document formats.
- Updated web extraction functionality to handle PDF URLs directly, enhancing usability for academic papers and documents.
2026-04-28 06:51:16 +08:00 · 2026-02-26 23:06:08 -08:00
parent 21cf339a85
commit 19abbfff96
5 changed files with 322 additions and 1 deletion
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -1240,7 +1240,7 @@ WEB_SEARCH_SCHEMA = {

 WEB_EXTRACT_SCHEMA = {
    "name": "web_extract",
-    "description": "Extract content from web page URLs. Returns page content in markdown format. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
+    "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
    "parameters": {
        "type": "object",
        "properties": {