130 Commits
v1.1.0 ... main

Author SHA1 Message Date
zrguo
1c7247c5ab Merge pull request #142 from yrangana/fix/status-case-mismatch-enum
Fix status comparison case mismatch in processor.py
2025-10-20 15:46:31 +08:00
Yasiru Rangana
e70cf8d38a fix: use DocStatus.PROCESSED enum instead of hardcoded uppercase string
## Problem

Status comparisons used hardcoded uppercase string "PROCESSED" which
didn't match LightRAG's DocStatus enum that stores lowercase "processed".
This caused text_processed to always return False even when documents
were successfully processed.

**Evidence:**
- LightRAG's DocStatus enum (lightrag/base.py): PROCESSED = "processed"
- RAGAnything's DocStatus enum (raganything/base.py:11): PROCESSED = "processed"
- Current code checked: doc_status == "PROCESSED" (uppercase) ✗
- Actual value from LightRAG: "processed" (lowercase) ✓

**Impact:**
- is_document_fully_processed() always returned False
- get_document_processing_status() showed text_processed as False
- Multimodal processing logic incorrectly detected status

## Solution

Replace hardcoded string literals with DocStatus.PROCESSED enum constant
(already imported at line 14).

**Changes:**
- Line 481: doc_status == "PROCESSED" → DocStatus.PROCESSED
- Line 486: doc_status == "PROCESSED" → DocStatus.PROCESSED
- Line 1355: doc_status.get("status") == "PROCESSED" → DocStatus.PROCESSED
- Line 1387: doc_status.get("status") == "PROCESSED" → DocStatus.PROCESSED
- Updated comments (lines 463, 478) for consistency

**Benefits:**
1.  Fixes case mismatch bug - the enum's value is the lowercase string "processed"
2.  Type-safe - IDE/linter catches errors
3.  Maintainable - single source of truth (no magic strings)
4.  Future-proof - if enum changes, code updates automatically
5.  Follows Python best practices

**Compatibility:**
- Works with LightRAG v1.4.9.2+
- Compatible with LightRAG v1.4.9.3 (which added PREPROCESSED status)
- No breaking changes

**References:**
- LightRAG DocStatus: lightrag/base.py
- RAGAnything DocStatus: raganything/base.py:11
- Related: LightRAG v1.4.9.3 added PREPROCESSED = "multimodal_processed"
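To make the pattern concrete, here is a minimal, self-contained sketch of the comparison before and after the change. The `DocStatus` definition mirrors the enums cited above, and the `doc_status` dict is a hypothetical stand-in for the value LightRAG returns; this is not the actual processor.py code.

```python
from enum import Enum

class DocStatus(str, Enum):          # mirrors raganything/base.py and lightrag/base.py
    PROCESSED = "processed"

doc_status = {"status": "processed"}  # hypothetical value as stored by LightRAG

# Before: the hardcoded uppercase literal never matches the lowercase stored value
text_processed_before = doc_status.get("status") == "PROCESSED"          # always False

# After: compare against the enum constant; a str-valued enum member compares
# equal to its lowercase string value, so the check now succeeds
text_processed_after = doc_status.get("status") == DocStatus.PROCESSED   # True

print(text_processed_before, text_processed_after)  # False True
```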
2025-10-19 23:36:54 +11:00
zrguo
8079053506 Update README_zh.md 2025-10-16 14:53:16 +08:00
zrguo
e6af42be94 Update README.md 2025-10-16 14:49:16 +08:00
zrguo
e7273346e1 fix lint 2025-10-15 14:38:57 +08:00
zrguo
3c5c3fa3d5 Update Citation 2025-10-15 14:23:54 +08:00
chaohuang-ai
9207cbed46 Update README.md 2025-10-15 10:57:22 +08:00
chaohuang-ai
ba464f459a Update README.md 2025-10-15 10:56:30 +08:00
zrguo
806ac9ba3e Merge pull request #122 from LaansDole/main
feat: RAG-Anything runs offline
2025-10-13 10:56:37 +08:00
laansdole
ab552321ae chores: refactor examples 2025-10-04 21:27:41 +07:00
laansdole
0f6d3ea83a feat: implement tiktoken env variables 2025-10-04 21:26:59 +07:00
laansdole
10ee99952a docs: update decision records 2025-10-04 21:25:53 +07:00
Do Le Long An
0ac3dc6bf7 Merge branch 'HKUDS:main' into main 2025-09-26 15:30:13 +07:00
chaohuang-ai
1e0568543f Update README.md 2025-09-25 09:34:30 +08:00
laansdole
8757265825 chores: refactor 2025-09-24 13:56:13 +07:00
laansdole
06011b0e6a fix: lint 2025-09-24 13:52:24 +07:00
laansdole
d7bec4e472 feat: RAG-Anything runs offline 2025-09-24 13:38:53 +07:00
chaohuang-ai
d30e71502e Update README.md 2025-09-22 20:57:02 +08:00
zrguo
214eb0f94d update 2025-09-22 11:11:40 +08:00
zrguo
1d48f24b4a fix lint 2025-09-22 10:42:35 +08:00
zrguo
8e0e05d497 Merge pull request #99 from LaansDole/main
Feat: LM Studio integration example and uv implementation
2025-09-22 10:21:59 +08:00
zrguo
fcd21ea31d Merge pull request #103 from hongdongjian/dev
feat: Add support for Chinese characters in PDF generation
2025-09-22 10:21:14 +08:00
Do Le Long An
df99bfb82f Merge branch 'HKUDS:main' into main 2025-09-19 16:16:36 +07:00
hongdongjian
7e5e691650 feat: Add support for Chinese characters in PDF generation 2025-09-16 16:20:43 +08:00
zrguo
8d170e52c8 fix lint 2025-09-16 15:48:20 +08:00
zrguo
ba3f737c87 Merge pull request #106 from liz-in-tech/fix-cleanup-warning
fix: replace __del__ with atexit to fix RAGAnything cleanup warning
2025-09-16 15:40:18 +08:00
zrguo
48087eac78 Add log info 2025-09-16 15:32:12 +08:00
zrguo
d63798f05c fix lint 2025-09-16 11:31:35 +08:00
zrguo
991794361f Compatible with new fields 2025-09-16 11:10:26 +08:00
zrguo
30b6e2358b Merge pull request #113 from HKUDS/ui
Add RAGAnything processing to LightRAG's webui
2025-09-16 10:20:56 +08:00
laansdole
c16208de29 fix: unified env variables with other types 2025-09-13 18:29:09 +07:00
hzywhite
ba3c0154dd Update processor.py 2025-09-11 22:35:19 +08:00
liz
260044f449 fix: replace __del__ with atexit to fix RAGAnything cleanup warning 2025-09-09 17:10:36 +08:00
laansdole
8d8805c66f fix: lightrag compatibility 2025-09-06 15:45:36 +07:00
Do Le Long An
2a3d222140 Merge branch 'HKUDS:main' into main 2025-09-06 15:01:38 +07:00
hzywhite
bcd6cc16c0 Update processor.py 2025-09-05 20:30:47 +08:00
hzywhite
de2824f816 summary 2025-09-05 14:57:00 +08:00
hzywhite
9872b86d13 summary 2025-09-05 14:56:35 +08:00
laansdole
b9f5e9d7d3 chores: refactor 2025-09-03 16:17:39 +07:00
laansdole
0b0c20aa7a fix: lmstudio implementation 2025-09-03 14:57:54 +07:00
laansdole
0d73279aab chores: refactor 2025-09-03 14:27:53 +07:00
laansdole
12855ec5bb feat: lmstudio implementation 2025-09-03 14:26:28 +07:00
zrguo
c3ca5dd756 Merge pull request #97 from HKUDS/ui
Add RAGAnything processing to LightRAG's webui
2025-09-03 15:11:11 +08:00
hzywhite
6877983a71 summary 2025-09-02 18:19:56 +08:00
laansdole
4f879408e3 feat: lmstudio and uv implementation 2025-09-02 16:03:38 +07:00
hzywhite
92cf72fe8a summary 2025-09-02 16:38:41 +08:00
hzywhite
c6d805005e Update processor.py 2025-09-02 06:18:52 +08:00
hzywhite
ec858868ee Update processor.py 2025-09-02 06:17:27 +08:00
hzywhite
6781662a3e summary 2025-09-01 23:04:03 +08:00
hzywhite
dc3a46f247 summary 2025-09-01 22:14:07 +08:00
hzywhite
a87166814c summary 2025-09-01 21:57:31 +08:00
hzywhite
2addb7b799 summary 2025-09-01 21:30:36 +08:00
hzywhite
e5a3d0cfae summary 2025-09-01 15:39:34 +08:00
chaohuang-ai
362302276c Update README.md 2025-08-17 13:27:10 +08:00
zrguo
79078b2f6c Merge pull request #86 from HKUDS/vlm_enhanced_query
VLM Enhanced Query
2025-08-15 20:18:21 +08:00
zrguo
d031468437 Update query.py 2025-08-12 19:17:27 +08:00
zrguo
801f276d82 update debug log 2025-08-12 17:46:36 +08:00
zrguo
dfd9ec855e vlm_enhanced_query 2025-08-12 15:59:50 +08:00
zrguo
cf2aa70cfd Add MinerU log 2025-08-06 18:13:20 +08:00
zrguo
0f6f41aafb Update parser.py 2025-08-06 17:31:57 +08:00
zrguo
9649e31d1a Update _ensure_lightrag_initialized 2025-08-06 17:09:02 +08:00
zrguo
d7eaa8642b Update processor.py 2025-07-31 18:51:48 +08:00
zrguo
380e3cdf3c Update __init__.py 2025-07-31 18:35:06 +08:00
zrguo
d5ff598a9b Add finalize_storages 2025-07-31 18:34:36 +08:00
zrguo
bc7d5ec0b3 fix lint 2025-07-29 21:36:23 +08:00
zzhtx258
c31610200c Merge branch 'main' of https://github.com/HKUDS/RAG-Anything 2025-07-29 20:20:37 +08:00
zzhtx258
d858eabaf9 Fixed docling parser 2025-07-29 19:54:55 +08:00
zrguo
5e56140300 fix lint 2025-07-29 17:07:15 +08:00
zrguo
4f900db761 Merge pull request #64 from ShorthillsAI/main
Add Batch Processing and Enhanced Markdown Features
2025-07-29 17:06:09 +08:00
zrguo
180082cbf5 Merge pull request #68 from BenjaminX/git_ignore_for_AI
Update .gitignore to include AI-related files and directories
2025-07-29 17:05:08 +08:00
Benjamin
935b70a65f Update .gitignore to include AI-related files and directories 2025-07-28 13:13:53 +08:00
Shorthills AI
a7c46d5e55 Merge pull request #5 from MinalMahalaShorthillsAI/my-feature-branch
Improvised version
2025-07-28 10:24:51 +05:30
Shorthills AI
336ae09177 Merge branch 'HKUDS:main' into main 2025-07-28 10:23:59 +05:30
MinalMahalaShorthillsAI
60f05e04cf improvised version 2025-07-28 10:08:54 +05:30
Shorthills AI
099b502860 Merge pull request #2 from MinalMahalaShorthillsAI/main
Corrected the Lint Errors and Restored the batch functionalities along with new batch processing
2025-07-25 08:21:55 +05:30
MinalMahalaShorthillsAI
1764e1ee8d corrected the linting errors 2025-07-24 15:07:33 +05:30
MinalMahalaShorthillsAI
5b44298214 Merge remote-tracking branch 'upstream/main' 2025-07-24 15:00:42 +05:30
MinalMahalaShorthillsAI
356f26a390 Restored previous version along with new features 2025-07-24 14:22:10 +05:30
MinalMahalaShorthillsAI
0653b0c7f0 Fixed Lint and formatting errors 2025-07-24 14:20:50 +05:30
zrguo
7775bb35ea Update __init__.py 2025-07-24 15:00:48 +08:00
zrguo
8d4bb554a1 Merge pull request #65 from HKUDS/async
Comprehensive Optimization of Multimodal Chunk Processing in RAGAnything
2025-07-24 14:19:47 +08:00
zrguo
fd418b69f6 Update processor.py 2025-07-24 14:10:16 +08:00
zrguo
1d40425c81 update multimodal process pipeline 2025-07-24 14:09:26 +08:00
Shorthills AI
2056c358ac Merge pull request #1 from MinalMahalaShorthillsAI/batch-markdown-enhancements
Add batch processing and enhanced markdown features
2025-07-24 10:48:28 +05:30
zrguo
6dc0effafb Merge pull request #62 from HKUDS/insert_content_list
Direct Content List Insertion
2025-07-23 19:04:53 +08:00
zrguo
84276c38ca fix lint 2025-07-23 19:04:11 +08:00
zrguo
710ed38d88 direct content list insertion 2025-07-23 19:01:26 +08:00
MinalMahalaShorthillsAI
a1a783b481 Add batch processing and enhanced markdown features 2025-07-23 13:23:22 +05:30
zrguo
905466436d Update lightrag_kwargs 2025-07-22 19:23:17 +08:00
zrguo
a88831e540 Update query.py 2025-07-22 16:26:52 +08:00
zrguo
4b08d62f74 Update examples 2025-07-22 11:20:51 +08:00
zrguo
9f9fb68010 Update __init__.py 2025-07-22 02:25:00 +08:00
zrguo
f40fe6fbf3 Merge pull request #59 from HKUDS/cache
Added caching mechanism
2025-07-22 02:23:01 +08:00
zrguo
84bdd21073 Add multimodal_query cache 2025-07-22 02:05:38 +08:00
zrguo
2f52f9b4e0 Add caching mechanism 2025-07-22 01:58:42 +08:00
zrguo
33a9dd6bb9 Add parse cache 2025-07-22 00:15:09 +08:00
zrguo
d8302d0cf8 Update parser param 2025-07-21 23:48:27 +08:00
zrguo
7aafb58e41 Merge pull request #58 from HKUDS/docling
Add Docling Parser
2025-07-21 12:30:43 +08:00
zrguo
de7b401b23 Delete mineru_parser.py 2025-07-21 12:30:25 +08:00
zrguo
13306ef249 Update README 2025-07-21 12:27:19 +08:00
zrguo
9867ac38ab Update docling output 2025-07-21 12:21:20 +08:00
zrguo
131d41a60e Update RAGAnything_example 2025-07-21 10:58:14 +08:00
zrguo
656c4cdae6 Merge branch 'main' into docling 2025-07-17 23:02:39 +08:00
zrguo
9781605b94 Update modalprocessors.py 2025-07-17 11:11:42 +08:00
zrguo
0f3cce4ad0 fix image parse 2025-07-15 17:54:42 +08:00
zzhtx258
e8f9a877e2 Updated env example, adapted generic parser into rag anything core code 2025-07-15 17:11:00 +08:00
zzhtx258
b39efde039 sync with main 2025-07-15 15:31:23 +08:00
zzhtx258
c0da599225 Create generic Parser class, add docling as an available parser 2025-07-15 15:06:00 +08:00
zrguo
9ccb55cde8 fix lint 2025-07-14 18:08:04 +08:00
zrguo
e6fd04bbd9 Merge pull request #53 from liseri/main
Update some MinerU 2.0 parameter configurations
2025-07-14 18:07:22 +08:00
liseri
f1d4867a0c Update readme 2025-07-14 04:00:18 +00:00
liseri
2ba6d1cf94 improve mineru_parser params of mineru2.0 2025-07-14 11:39:50 +08:00
chaohuang-ai
f4ff60b88c Update README.md 2025-07-09 15:50:27 +08:00
zrguo
2baa214cd4 Fix _read_output_files() 2025-07-07 19:13:09 +08:00
zrguo
92aecaa2b3 Merge pull request #47 from nssai001/feat/add_method__read_output_files
fix: _read_output_files could not accurately locate the path of the md file
2025-07-07 19:03:07 +08:00
wangliguo
2b030435d1 fix: _read_output_files could not accurately locate the path of the md file 2025-07-07 16:55:19 +08:00
zrguo
788540f01e Fix MinerU parsing configuration 2025-07-05 18:32:27 +08:00
zrguo
983d994c4c Add Link 2025-07-05 14:31:13 +08:00
zrguo
5f1c650bd8 Update News 2025-07-05 14:27:49 +08:00
zrguo
ef094868de Update RAGAnything init 2025-07-05 14:24:28 +08:00
zrguo
c818cd145f Update README 2025-07-05 02:48:01 +08:00
zrguo
322358df42 fix lint 2025-07-04 21:16:03 +08:00
zrguo
5288be9d0c Merge pull request #42 from HKUDS/context
Process multimodal with context
2025-07-04 21:09:08 +08:00
zrguo
4e9a7354a7 process multimodal with context 2025-07-04 21:05:22 +08:00
zrguo
217978e0e9 Merge pull request #39 from wbrettenny/patch-1
Update README.md
2025-07-04 16:41:49 +08:00
Warren Brettenny
1e4bdbd5b3 Update README.md
Added `await initialize_pipeline_status()` to Section 6 `Loading Existing LightRAG Instance`, so that the example will work.
2025-07-04 10:14:48 +02:00
zrguo
5ba5beceb3 Update raganything_example.py 2025-07-04 11:31:49 +08:00
zrguo
d9e6e80735 Merge pull request #35 from HKUDS/query_with_multimodal
Query with Multimodal
2025-07-04 11:25:21 +08:00
zrguo
711ac23ade fix lint 2025-07-04 11:24:35 +08:00
zrguo
b33f3bebcd Query with Multimodal 2025-07-04 11:20:00 +08:00
36 changed files with 12999 additions and 2880 deletions


@@ -28,3 +28,9 @@ jobs:
- name: Run pre-commit
run: pre-commit run --all-files --show-diff-on-failure
- name: Commit lint changes
uses: stefanzweifel/git-auto-commit-action@v5
with:
commit_message: "chore: apply linting and formatting"
branch: ${{ github.head_ref }}

.gitignore

@@ -11,6 +11,7 @@ __pycache__/
.venv/
env/
venv/
*.env*
.env_example
@@ -46,7 +47,7 @@ neo4jWorkDir/
# Data & Storage
inputs/
rag_storage/
rag_storage*/
examples/input/
examples/output/
output*/
@@ -61,12 +62,19 @@ ignore_this.txt
dickens*/
book.txt
LightRAG.pdf
LightRAG_2-4.pdf
download_models_hf.py
lightrag-dev/
gui/
tiktoken_cache/
# unit-test files
test_*
# Cline files
memory-bank/
# AI
.claude/
.cursor/
CLAUDE.md

README.md

@@ -4,7 +4,9 @@
<img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
</div>
# 🚀 RAG-Anything: All-in-One RAG System
# 🚀 RAG-Anything: All-in-One RAG Framework
<a href="https://trendshift.io/repositories/14959" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14959" alt="HKUDS%2FRAG-Anything | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<div align="center">
<img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=24&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=Welcome+to+RAG-Anything;Next-Gen+Multimodal+RAG+System;Powered+by+Advanced+AI+Technology" alt="Typing Animation" />
@@ -14,13 +16,14 @@
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
<p>
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥Project-Page-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡Based%20on-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
</p>
<p>
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
<img src="https://img.shields.io/badge/🐍Python-3.9+-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/⚡uv-Ready-ff6b6b?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e"></a>
</p>
<p>
<a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-Community-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
@@ -47,6 +50,15 @@
---
## 🎉 News
- [X] [2025.10]🎯📢 🚀 We have released the technical report of [RAG-Anything](http://arxiv.org/abs/2510.12323). Access it now to explore our latest research findings.
- [X] [2025.08]🎯📢 🔍 RAG-Anything now features **VLM-Enhanced Query** mode! When documents include images, the system seamlessly integrates them into VLM for advanced multimodal analysis, combining visual and textual context for deeper insights.
- [X] [2025.07]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
- [X] [2025.07]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
- [X] [2025.07]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
---
## 🌟 System Overview
*Next-Generation Multimodal Intelligence*
@@ -72,6 +84,7 @@ Users can query documents containing **interleaved text**, **visual diagrams**,
- **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types
- **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding
- **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows
- **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources
- **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding
</div>
@@ -176,7 +189,7 @@ The system deploys modality-aware processing units for heterogeneous data modali
</div>
### 4. Multi-Modal Knowledge Graph Index
### 4. Multimodal Knowledge Graph Index
<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
@@ -236,14 +249,26 @@ pip install 'raganything[image,text]' # Multiple features
```
#### Option 2: Install from Source
```bash
# Install uv (if not already installed)
curl -LsSf https://astral.sh/uv/install.sh | sh
# Clone and setup the project with uv
git clone https://github.com/HKUDS/RAG-Anything.git
cd RAG-Anything
pip install -e .
# With optional dependencies
pip install -e '.[all]'
# Install the package and dependencies in a virtual environment
uv sync
# If you encounter network timeouts (especially for opencv packages):
# UV_HTTP_TIMEOUT=120 uv sync
# Run commands directly with uv (recommended approach)
uv run python examples/raganything_example.py --help
# Install with optional dependencies
uv sync --extra image --extra text # Specific extras
uv sync --all-extras # All optional features
```
#### Optional Dependencies
@@ -267,7 +292,7 @@ pip install -e '.[all]'
mineru --version
# Check if properly configured
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_mineru_installation() else '❌ MinerU installation issue')"
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')"
```
Models are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README.md#22-model-source-configuration).
@@ -278,42 +303,162 @@ Models are downloaded automatically on first use. For manual download, refer to
```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
async def main():
# Set up API configuration
api_key = "your-api-key"
base_url = "your-base-url" # Optional
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir="./rag_storage",
parser="mineru", # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
)
# Define LLM model function
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
return openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt}
if system_prompt
else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# Define embedding function
embedding_func = EmbeddingFunc(
embedding_dim=3072,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key=api_key,
base_url=base_url,
),
)
# Initialize RAGAnything
rag = RAGAnything(
config=config,
llm_model_func=llm_model_func,
vision_model_func=vision_model_func,
embedding_func=embedding_func,
)
# Process a document
await rag.process_document_complete(
file_path="path/to/your/document.pdf",
output_dir="./output",
parse_method="auto"
)
# Query the processed content
# Pure text query - for basic knowledge base search
text_result = await rag.aquery(
"What are the main findings shown in the figures and tables?",
mode="hybrid"
)
print("Text query result:", text_result)
# Multimodal query with specific multimodal content
multimodal_result = await rag.aquery_with_multimodal(
"Explain this formula and its relevance to the document content",
multimodal_content=[{
"type": "equation",
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
"equation_caption": "Document relevance probability"
}],
mode="hybrid"
)
print("Multimodal query result:", multimodal_result)
if __name__ == "__main__":
asyncio.run(main())
```
#### 2. Direct Multimodal Content Processing
```python
import asyncio
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
async def process_multimodal_content():
# Set up API configuration
api_key = "your-api-key"
base_url = "your-base-url" # Optional
# Initialize LightRAG
rag = LightRAG(
working_dir="./rag_storage",
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
**kwargs,
),
vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key="your-api-key",
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
api_key=api_key,
base_url=base_url,
**kwargs,
),
embedding_func=EmbeddingFunc(
@@ -325,52 +470,43 @@ async def main():
api_key=api_key,
base_url=base_url,
),
),
)
# Process a document
await rag.process_document_complete(
file_path="path/to/your/document.pdf",
output_dir="./output",
parse_method="auto"
)
# Query the processed content
result = await rag.query_with_multimodal(
"What are the main findings shown in the figures and tables?",
mode="hybrid"
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
#### 2. Direct Multimodal Content Processing
```python
import asyncio
from lightrag import LightRAG
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
async def process_multimodal_content():
# Initialize LightRAG
rag = LightRAG(
working_dir="./rag_storage",
# ... your LLM and embedding configurations
)
)
await rag.initialize_storages()
# Process an image
image_processor = ImageModalProcessor(
lightrag=rag,
modal_caption_func=your_vision_model_func
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key=api_key,
base_url=base_url,
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
)
image_content = {
"img_path": "path/to/image.jpg",
"img_caption": ["Figure 1: Experimental results"],
"img_footnote": ["Data collected in 2024"]
"image_caption": ["Figure 1: Experimental results"],
"image_footnote": ["Data collected in 2024"]
}
description, entity_info = await image_processor.process_multimodal_content(
@@ -383,7 +519,15 @@ async def process_multimodal_content():
# Process a table
table_processor = TableModalProcessor(
lightrag=rag,
modal_caption_func=your_llm_model_func
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
)
table_content = {
@@ -436,25 +580,93 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options
RAG-Anything provides three types of query methods:
**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
# Different query modes
result_hybrid = await rag.query_with_multimodal("Your question", mode="hybrid")
result_local = await rag.query_with_multimodal("Your question", mode="local")
result_global = await rag.query_with_multimodal("Your question", mode="global")
# Different query modes for text queries
text_result_hybrid = await rag.aquery("Your question", mode="hybrid")
text_result_local = await rag.aquery("Your question", mode="local")
text_result_global = await rag.aquery("Your question", mode="global")
text_result_naive = await rag.aquery("Your question", mode="naive")
# Synchronous version
sync_text_result = rag.query("Your question", mode="hybrid")
```
**VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
"Analyze the charts and figures in the document",
mode="hybrid"
# vlm_enhanced=True is automatically set when vision_model_func is available
)
# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=True # Force enable VLM enhancement
)
vlm_disabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=False # Force disable VLM enhancement
)
# When documents contain images, VLM can see and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load and encode images as base64
# 3. Send both text context and images to VLM for comprehensive analysis
```
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
"Compare these performance metrics with the document content",
multimodal_content=[{
"type": "table",
"table_data": """Method,Accuracy,Speed
RAGAnything,95.2%,120ms
Traditional,87.3%,180ms""",
"table_caption": "Performance comparison"
}],
mode="hybrid"
)
# Query with equation content
equation_result = await rag.aquery_with_multimodal(
"Explain this formula and its relevance to the document content",
multimodal_content=[{
"type": "equation",
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
"equation_caption": "Document relevance probability"
}],
mode="hybrid"
)
```
#### 6. Loading Existing LightRAG Instance
```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import EmbeddingFunc
import os
async def load_existing_lightrag():
# First, create or load an existing LightRAG instance
# Set up API configuration
api_key = "your-api-key"
base_url = "your-base-url" # Optional
# First, create or load existing LightRAG instance
lightrag_working_dir = "./existing_lightrag_storage"
# Check if previous LightRAG instance exists
@@ -463,7 +675,7 @@ async def load_existing_lightrag():
else:
print("❌ No existing LightRAG instance found, will create new one")
# Create/Load LightRAG instance with your configurations
# Create/load LightRAG instance with your configuration
lightrag_instance = LightRAG(
working_dir=lightrag_working_dir,
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
@@ -471,7 +683,8 @@ async def load_existing_lightrag():
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
api_key=api_key,
base_url=base_url,
**kwargs,
),
embedding_func=EmbeddingFunc(
@@ -488,44 +701,73 @@ async def load_existing_lightrag():
# Initialize storage (this will load existing data if available)
await lightrag_instance.initialize_storages()
await initialize_pipeline_status()
# Now initialize RAGAnything with the existing LightRAG instance
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt}
if system_prompt
else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# Now use existing LightRAG instance to initialize RAGAnything
rag = RAGAnything(
lightrag=lightrag_instance, # Pass the existing LightRAG instance
# Only need vision model for multimodal processing
vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key="your-api-key",
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
**kwargs,
)
lightrag=lightrag_instance, # Pass existing LightRAG instance
vision_model_func=vision_model_func,
# Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance
)
# Query the existing knowledge base
result = await rag.query_with_multimodal(
# Query existing knowledge base
result = await rag.aquery(
"What data has been processed in this LightRAG instance?",
mode="hybrid"
)
print("Query result:", result)
# Add new multimodal documents to the existing LightRAG instance
# Add new multimodal document to existing LightRAG instance
await rag.process_document_complete(
file_path="path/to/new/multimodal_document.pdf",
output_dir="./output"
@@ -535,6 +777,195 @@ if __name__ == "__main__":
asyncio.run(load_existing_lightrag())
```
#### 7. Direct Content List Insertion
For scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing:
```python
import asyncio
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
async def insert_content_list_example():
# Set up API configuration
api_key = "your-api-key"
base_url = "your-base-url" # Optional
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir="./rag_storage",
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
)
# Define model functions
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
return openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
],
} if image_data else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
embedding_func = EmbeddingFunc(
embedding_dim=3072,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key=api_key,
base_url=base_url,
),
)
# Initialize RAGAnything
rag = RAGAnything(
config=config,
llm_model_func=llm_model_func,
vision_model_func=vision_model_func,
embedding_func=embedding_func,
)
# Example: Pre-parsed content list from external source
content_list = [
{
"type": "text",
"text": "This is the introduction section of our research paper.",
"page_idx": 0 # Page number where this content appears
},
{
"type": "image",
"img_path": "/absolute/path/to/figure1.jpg", # IMPORTANT: Use absolute path
"image_caption": ["Figure 1: System Architecture"],
"image_footnote": ["Source: Authors' original design"],
"page_idx": 1 # Page number where this image appears
},
{
"type": "table",
"table_body": "| Method | Accuracy | F1-Score |\n|--------|----------|----------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |",
"table_caption": ["Table 1: Performance Comparison"],
"table_footnote": ["Results on test dataset"],
"page_idx": 2 # Page number where this table appears
},
{
"type": "equation",
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
"text": "Document relevance probability formula",
"page_idx": 3 # Page number where this equation appears
},
{
"type": "text",
"text": "In conclusion, our method demonstrates superior performance across all metrics.",
"page_idx": 4 # Page number where this content appears
}
]
# Insert the content list directly
await rag.insert_content_list(
content_list=content_list,
file_path="research_paper.pdf", # Reference file name for citation
split_by_character=None, # Optional text splitting
split_by_character_only=False, # Optional text splitting mode
doc_id=None, # Optional custom document ID (will be auto-generated if not provided)
display_stats=True # Show content statistics
)
# Query the inserted content
result = await rag.aquery(
"What are the key findings and performance metrics mentioned in the research?",
mode="hybrid"
)
print("Query result:", result)
# You can also insert multiple content lists with different document IDs
another_content_list = [
{
"type": "text",
"text": "This is content from another document.",
"page_idx": 0 # Page number where this content appears
},
{
"type": "table",
"table_body": "| Feature | Value |\n|---------|-------|\n| Speed | Fast |\n| Accuracy | High |",
"table_caption": ["Feature Comparison"],
"page_idx": 1 # Page number where this table appears
}
]
await rag.insert_content_list(
content_list=another_content_list,
file_path="another_document.pdf",
doc_id="custom-doc-id-123" # Custom document ID
)
if __name__ == "__main__":
asyncio.run(insert_content_list_example())
```
**Content List Format:**
The `content_list` should follow the standard format with each item being a dictionary containing:
- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`
**Important Notes:**
- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`)
- **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing)
- **Content ordering**: Items are processed in the order they appear in the list
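For the `img_path` requirement above, a minimal sketch of building an image item with a resolved absolute path; the relative path `figures/chart.jpg` is only an illustration, not a path from this repository:

```python
import os

image_item = {
    "type": "image",
    "img_path": os.path.abspath("figures/chart.jpg"),  # resolve to an absolute path
    "image_caption": ["Figure 1: System Architecture"],
    "image_footnote": [],
    "page_idx": 0,  # 0-based page number in the source document
}
print(image_item["img_path"])
```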
This method is particularly useful when:
- You have content from external parsers (non-MinerU/Docling)
- You want to process programmatically generated content
- You need to insert content from multiple sources into a single knowledge base
- You have cached parsing results that you want to reuse
---
## 🛠️ Examples
@@ -556,8 +987,8 @@ The `examples/` directory contains comprehensive usage examples:
**Run examples:**
```bash
# End-to-end processing
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# End-to-end processing with parser selection
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
@@ -594,14 +1025,32 @@ Create a `.env` file (refer to `.env.example`):
```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # Optional
OUTPUT_DIR=./output # Default output directory for parsed documents
PARSER=mineru # Parser selection: mineru or docling
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
```
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
**Note:** For backward compatibility, legacy environment variable names are still supported:
- `MINERU_PARSE_METHOD` is deprecated, please use `PARSE_METHOD`
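As an illustration of that fallback, here is a minimal sketch of reading the new variable while still honoring the deprecated name; this is an assumption about how one might handle it in user code, not RAGAnythingConfig's internal logic:

```python
import os

# Prefer the new PARSE_METHOD, fall back to the deprecated MINERU_PARSE_METHOD
parse_method = (
    os.getenv("PARSE_METHOD")
    or os.getenv("MINERU_PARSE_METHOD")  # deprecated legacy name
    or "auto"
)
parser = os.getenv("PARSER", "mineru")
print(f"parser={parser}, parse_method={parse_method}")
```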
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
### Parser Configuration
RAGAnything now supports multiple parsers, each with specific advantages:
#### MinerU Parser
- Supports PDF, images, Office documents, and more formats
- Powerful OCR and table extraction capabilities
- GPU acceleration support
#### Docling Parser
- Optimized for Office documents and HTML files
- Better document structure preservation
- Native support for multiple Office formats
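Putting the two parser options above into code, a minimal selection sketch based on the `RAGAnythingConfig` and `process_document_complete` options shown elsewhere in this README (the `report.docx` file name is illustrative):

```python
from raganything import RAGAnythingConfig

# Choose the parser globally via configuration ...
config = RAGAnythingConfig(
    working_dir="./rag_storage",
    parser="docling",     # "mineru" or "docling"
    parse_method="auto",  # "auto", "ocr", or "txt"
)

# ... or override it per document when calling process_document_complete:
# await rag.process_document_complete("report.docx", output_dir="./output", parser="docling")
```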
### MinerU Configuration
MinerU 2.0 uses a simplified configuration approach:
```bash
# MinerU 2.0 uses command-line parameters instead of config files
# Check available options:
@@ -613,20 +1062,43 @@ mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
```
You can also configure MinerU through RAGAnything parameters:
You can also configure parsing through RAGAnything parameters:
```python
# Configure parsing behavior
# Basic parsing configuration with parser selection
await rag.process_document_complete(
file_path="document.pdf",
parse_method="auto", # or "ocr", "txt"
device="cuda", # GPU acceleration
backend="pipeline", # parsing backend
lang="en" # language optimization
output_dir="./output/",
parse_method="auto", # or "ocr", "txt"
parser="mineru" # Optional: "mineru" or "docling"
)
# Advanced parsing configuration with special parameters
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # Parsing method: "auto", "ocr", "txt"
parser="mineru", # Parser selection: "mineru" or "docling"
# MinerU special parameters - all supported kwargs:
lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
device="cuda:0", # Inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
start_page=0, # Starting page number (0-based, for PDF)
end_page=10, # Ending page number (0-based, for PDF)
formula=True, # Enable formula parsing
table=True, # Enable table parsing
backend="pipeline", # Parsing backend: pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client.
source="huggingface", # Model source: "huggingface", "modelscope", "local"
# vlm_url="http://127.0.0.1:3000" # Service address when using backend=vlm-sglang-client
# Standard RAGAnything parameters
display_stats=True, # Display content statistics
split_by_character=None, # Optional character to split text by
doc_id=None # Optional document ID
)
```
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments.
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
### Processing Requirements
@@ -676,13 +1148,14 @@ Different content types require specific optional dependencies:
If you find RAG-Anything useful in your research, please cite our paper:
```bibtex
@article{guo2024lightrag,
title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
year={2024},
eprint={2410.05779},
archivePrefix={arXiv},
primaryClass={cs.IR}
@misc{guo2025raganythingallinoneragframework,
title={RAG-Anything: All-in-One RAG Framework},
author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},
year={2025},
eprint={2510.12323},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2510.12323},
}
```

README_zh.md

@@ -14,12 +14,12 @@
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
<p>
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥项目-主页-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡基于-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
</p>
<p>
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
<img src="https://img.shields.io/badge/🐍Python-3.9+-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
</p>
<p>
@@ -47,6 +47,14 @@
---
## 🎉 新闻
- [X] [2025.08.12]🎯📢 🔍 RAGAnything 现在支持 **VLM增强查询** 模式当文档包含图片时系统可以自动将图片与文本上下文一起直接传递给VLM进行综合多模态分析。
- [X] [2025.07.05]🎯📢 RAGAnything 新增[上下文配置模块](docs/context_aware_processing.md),支持为多模态内容处理添加相关上下文信息。
- [X] [2025.07.04]🎯📢 RAGAnything 现在支持多模态内容查询,实现了集成文本、图像、表格和公式处理的增强检索生成功能。
- [X] [2025.07.03]🎯📢 RAGAnything 在GitHub上达到了1K星标🌟感谢您的支持和贡献。
---
## 🌟 系统概述
*下一代多模态智能*
@@ -68,6 +76,7 @@
- **🧠 多模态内容分析引擎** - 针对图像、表格、公式和通用文本内容部署专门的处理器,确保各类内容的精准解析
- **🔗 基于知识图谱索引** - 实现自动化实体提取和关系构建,建立跨模态的语义连接网络
- **⚡ 灵活的处理架构** - 支持基于MinerU的智能解析模式和直接多模态内容插入模式满足不同应用场景需求
- **📋 直接内容列表插入** - 跳过文档解析,直接插入来自外部源的预解析内容列表,支持多种数据来源整合
- **🎯 跨模态检索机制** - 实现跨文本和多模态内容的智能检索,提供精准的信息定位和匹配能力
</div>
@@ -263,7 +272,7 @@ pip install -e '.[all]'
mineru --version
# 检查是否正确配置
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU安装正常' if rag.check_mineru_installation() else '❌ MinerU安装有问题')"
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU安装正常' if rag.check_parser_installation() else '❌ MinerU安装有问题')"
```
模型在首次使用时自动下载。手动下载参考[MinerU模型源配置](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#22-%E6%A8%A1%E5%9E%8B%E6%BA%90%E9%85%8D%E7%BD%AE)
@@ -274,42 +283,164 @@ python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅
```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
async def main():
# 初始化RAGAnything
# 设置 API 配置
api_key = "your-api-key"
base_url = "your-base-url" # 可选
# 创建 RAGAnything 配置
config = RAGAnythingConfig(
working_dir="./rag_storage",
parser="mineru", # 选择解析器mineru 或 docling
parse_method="auto", # 解析方法auto, ocr 或 txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
)
# 定义 LLM 模型函数
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
return openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# 定义视觉模型函数用于图像处理
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
# 如果提供了messages格式用于多模态VLM增强查询直接使用
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# 传统单图片格式
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt}
if system_prompt
else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# 纯文本格式
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# 定义嵌入函数
embedding_func = EmbeddingFunc(
embedding_dim=3072,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key=api_key,
base_url=base_url,
),
)
# 初始化 RAGAnything
rag = RAGAnything(
config=config,
llm_model_func=llm_model_func,
vision_model_func=vision_model_func,
embedding_func=embedding_func,
)
# 处理文档
await rag.process_document_complete(
file_path="path/to/your/document.pdf",
output_dir="./output",
parse_method="auto"
)
# 查询处理后的内容
# 纯文本查询 - 基本知识库搜索
text_result = await rag.aquery(
"文档的主要内容是什么?",
mode="hybrid"
)
print("文本查询结果:", text_result)
# 多模态查询 - 包含具体多模态内容的查询
multimodal_result = await rag.aquery_with_multimodal(
"分析这个性能数据并解释与现有文档内容的关系",
multimodal_content=[{
"type": "table",
"table_data": """系统,准确率,F1分数
RAGAnything,95.2%,0.94
基准方法,87.3%,0.85""",
"table_caption": "性能对比结果"
}],
mode="hybrid"
)
print("多模态查询结果:", multimodal_result)
if __name__ == "__main__":
asyncio.run(main())
```
#### 2. 直接多模态内容处理
```python
import asyncio
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
async def process_multimodal_content():
# 设置 API 配置
api_key = "your-api-key"
base_url = "your-base-url" # 可选
# 初始化 LightRAG
rag = LightRAG(
working_dir="./rag_storage",
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
**kwargs,
),
vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key="your-api-key",
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
api_key=api_key,
base_url=base_url,
**kwargs,
),
embedding_func=EmbeddingFunc(
@@ -321,52 +452,43 @@ async def main():
api_key=api_key,
base_url=base_url,
),
),
)
# 处理文档
await rag.process_document_complete(
file_path="path/to/your/document.pdf",
output_dir="./output",
parse_method="auto"
)
# 查询处理后的内容
result = await rag.query_with_multimodal(
"图表中显示的主要发现是什么?",
mode="hybrid"
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
#### 2. 直接多模态内容处理
```python
import asyncio
from lightrag import LightRAG
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
async def process_multimodal_content():
# 初始化LightRAG
rag = LightRAG(
working_dir="./rag_storage",
# ... 你的LLM和嵌入配置
)
)
await rag.initialize_storages()
# 处理图像
image_processor = ImageModalProcessor(
lightrag=rag,
modal_caption_func=your_vision_model_func
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key=api_key,
base_url=base_url,
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
)
image_content = {
"img_path": "path/to/image.jpg",
"img_caption": ["图1实验结果"],
"img_footnote": ["数据收集于2024年"]
"image_caption": ["图1实验结果"],
"image_footnote": ["数据收集于2024年"]
}
description, entity_info = await image_processor.process_multimodal_content(
@@ -379,7 +501,15 @@ async def process_multimodal_content():
# 处理表格
table_processor = TableModalProcessor(
lightrag=rag,
modal_caption_func=your_llm_model_func
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
)
table_content = {
@@ -432,11 +562,74 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. 查询选项
RAG-Anything 提供三种类型的查询方法:
**纯文本查询** - 使用LightRAG直接进行知识库搜索
```python
# 不同的查询模式
result_hybrid = await rag.query_with_multimodal("你的问题", mode="hybrid")
result_local = await rag.query_with_multimodal("你的问题", mode="local")
result_global = await rag.query_with_multimodal("你的问题", mode="global")
# 文本查询的不同模式
text_result_hybrid = await rag.aquery("你的问题", mode="hybrid")
text_result_local = await rag.aquery("你的问题", mode="local")
text_result_global = await rag.aquery("你的问题", mode="global")
text_result_naive = await rag.aquery("你的问题", mode="naive")
# 同步版本
sync_text_result = rag.query("你的问题", mode="hybrid")
```
**VLM增强查询** - 使用VLM自动分析检索上下文中的图像
```python
# VLM增强查询当提供vision_model_func时自动启用
vlm_result = await rag.aquery(
"分析文档中的图表和数据",
mode="hybrid"
# vlm_enhanced=True 当vision_model_func可用时自动设置
)
# 手动控制VLM增强
vlm_enabled = await rag.aquery(
"这个文档中的图片显示了什么内容?",
mode="hybrid",
vlm_enhanced=True # 强制启用VLM增强
)
vlm_disabled = await rag.aquery(
"这个文档中的图片显示了什么内容?",
mode="hybrid",
vlm_enhanced=False # 强制禁用VLM增强
)
# 当文档包含图片时VLM可以直接查看和分析图片
# 系统将自动:
# 1. 检索包含图片路径的相关上下文
# 2. 加载图片并编码为base64格式
# 3. 将文本上下文和图片一起发送给VLM进行综合分析
```
**多模态查询** - 包含特定多模态内容分析的增强查询:
```python
# 包含表格数据的查询
table_result = await rag.aquery_with_multimodal(
"比较这些性能指标与文档内容",
multimodal_content=[{
"type": "table",
"table_data": """方法,准确率,速度
LightRAG,95.2%,120ms
传统方法,87.3%,180ms""",
"table_caption": "性能对比"
}],
mode="hybrid"
)
# 包含公式内容的查询
equation_result = await rag.aquery_with_multimodal(
"解释这个公式及其与文档内容的相关性",
multimodal_content=[{
"type": "equation",
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
"equation_caption": "文档相关性概率"
}],
mode="hybrid"
)
```
#### 6. 加载已存在的LightRAG实例
@@ -450,16 +643,20 @@ from lightrag.utils import EmbeddingFunc
import os
async def load_existing_lightrag():
# 首先创建或加载已存在的LightRAG实例
# 设置 API 配置
api_key = "your-api-key"
base_url = "your-base-url" # 可选
# 首先,创建或加载已存在的 LightRAG 实例
lightrag_working_dir = "./existing_lightrag_storage"
# 检查是否存在之前的LightRAG实例
# 检查是否存在之前的 LightRAG 实例
if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
print("✅ 发现已存在的LightRAG实例正在加载...")
print("✅ 发现已存在的 LightRAG 实例,正在加载...")
else:
print("❌ 未找到已存在的LightRAG实例将创建新实例")
print("❌ 未找到已存在的 LightRAG 实例,将创建新实例")
# 使用您的配置创建/加载LightRAG实例
# 使用您的配置创建/加载 LightRAG 实例
lightrag_instance = LightRAG(
working_dir=lightrag_working_dir,
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
@@ -467,7 +664,8 @@ async def load_existing_lightrag():
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
api_key=api_key,
base_url=base_url,
**kwargs,
),
embedding_func=EmbeddingFunc(
@@ -484,44 +682,73 @@ async def load_existing_lightrag():
# 初始化存储(如果有现有数据,这将加载它们)
await lightrag_instance.initialize_storages()
await initialize_pipeline_status()
# 现在使用已存在的LightRAG实例初始化RAGAnything
# 定义视觉模型函数用于图像处理
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
# If a messages format is provided (multimodal VLM-enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt}
if system_prompt
else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# Now initialize RAGAnything with the existing LightRAG instance
rag = RAGAnything(
lightrag=lightrag_instance,  # pass in the existing LightRAG instance
# Only a vision model needs to be configured for multimodal processing
vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]} if image_data else {"role": "user", "content": prompt}
],
api_key="your-api-key",
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
**kwargs,
)
lightrag=lightrag_instance,  # pass in the existing LightRAG instance
vision_model_func=vision_model_func,
# Note: working_dir, llm_model_func, embedding_func, etc. are all inherited from lightrag_instance
)
# Query the existing knowledge base
result = await rag.query_with_multimodal(
"这个LightRAG实例中处理了哪些数据",
result = await rag.aquery(
"这个 LightRAG 实例中处理了哪些数据?",
mode="hybrid"
)
print("查询结果:", result)
# Add new multimodal documents to the existing LightRAG instance
await rag.process_document_complete(
file_path="path/to/new/multimodal_document.pdf",
output_dir="./output"
@@ -531,6 +758,195 @@ if __name__ == "__main__":
asyncio.run(load_existing_lightrag())
```
#### 7. Insert Content Lists Directly
When you already have a pre-parsed content list (for example, from an external parser or from earlier processing results), you can insert it into RAGAnything directly, without document parsing:
```python
import asyncio
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
async def insert_content_list_example():
# Set up API configuration
api_key = "your-api-key"
base_url = "your-base-url"  # optional
# Create the RAGAnything configuration
config = RAGAnythingConfig(
working_dir="./rag_storage",
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
)
# Define the model functions
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
return openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If a messages format is provided (multimodal VLM-enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
],
} if image_data else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
embedding_func = EmbeddingFunc(
embedding_dim=3072,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key=api_key,
base_url=base_url,
),
)
# Initialize RAGAnything
rag = RAGAnything(
config=config,
llm_model_func=llm_model_func,
vision_model_func=vision_model_func,
embedding_func=embedding_func,
)
# Example: pre-parsed content list from an external source
content_list = [
{
"type": "text",
"text": "这是我们研究论文的引言部分。",
"page_idx": 0 # 此内容出现的页码
},
{
"type": "image",
"img_path": "/absolute/path/to/figure1.jpg", # 重要:使用绝对路径
"image_caption": ["图1系统架构"],
"image_footnote": ["来源:作者原创设计"],
"page_idx": 1 # 此图像出现的页码
},
{
"type": "table",
"table_body": "| 方法 | 准确率 | F1分数 |\n|------|--------|--------|\n| 我们的方法 | 95.2% | 0.94 |\n| 基准方法 | 87.3% | 0.85 |",
"table_caption": ["表1性能对比"],
"table_footnote": ["测试数据集结果"],
"page_idx": 2 # 此表格出现的页码
},
{
"type": "equation",
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
"text": "文档相关性概率公式",
"page_idx": 3 # 此公式出现的页码
},
{
"type": "text",
"text": "总之,我们的方法在所有指标上都表现出优越的性能。",
"page_idx": 4 # 此内容出现的页码
}
]
# Insert the content list directly
await rag.insert_content_list(
content_list=content_list,
file_path="research_paper.pdf", # 用于引用的参考文件名
split_by_character=None, # 可选的文本分割
split_by_character_only=False, # 可选的文本分割模式
doc_id=None, # 可选的自定义文档ID如果未提供将自动生成
display_stats=True # 显示内容统计信息
)
# Query the inserted content
result = await rag.aquery(
"研究中提到的主要发现和性能指标是什么?",
mode="hybrid"
)
print("查询结果:", result)
# 您也可以使用不同的文档ID插入多个内容列表
another_content_list = [
{
"type": "text",
"text": "这是来自另一个文档的内容。",
"page_idx": 0 # 此内容出现的页码
},
{
"type": "table",
"table_body": "| 特性 | 值 |\n|------|----|\n| 速度 | 快速 |\n| 准确性 | 高 |",
"table_caption": ["特性对比"],
"page_idx": 1 # 此表格出现的页码
}
]
await rag.insert_content_list(
content_list=another_content_list,
file_path="another_document.pdf",
doc_id="custom-doc-id-123" # 自定义文档ID
)
if __name__ == "__main__":
asyncio.run(insert_content_list_example())
```
**Content list format:**
The `content_list` should follow the standard format, where each item is a dictionary containing:
- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`
**Important notes:**
- **`img_path`**: Must be an absolute path to the image file (e.g. `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`)
- **`page_idx`**: The page number where the content appears in the original document (0-based index)
- **Content order**: Items are processed in the order they appear in the list
This method is particularly useful when:
- You have content from an external parser (other than MinerU/Docling)
- You want to process programmatically generated content
- You need to insert content from multiple sources into a single knowledge base
- You have cached parsing results that you want to reuse
---
## 🛠️ Examples
@@ -552,8 +968,8 @@ if __name__ == "__main__":
**Run the examples:**
```bash
# End-to-end processing
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# End-to-end processing (with parser selection)
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
@@ -592,11 +1008,29 @@ python examples/text_format_test.py --check-reportlab --file dummy
```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url  # optional
OUTPUT_DIR=./output  # default output directory for parsed documents
PARSER=mineru  # parser selection: mineru or docling
PARSE_METHOD=auto  # parse method: auto, ocr, or txt
```
### MinerU Configuration
**Note:** For backward compatibility, the old environment variable names still work:
- `MINERU_PARSE_METHOD` is deprecated; use `PARSE_METHOD` instead
MinerU 2.0 uses a simplified configuration approach:
### Parser Configuration
RAGAnything now supports multiple parsers, each with its own strengths:
#### MinerU Parser
- Supports many formats, including PDF, images, and Office documents
- Strong OCR and table extraction capabilities
- Supports GPU acceleration
#### Docling Parser
- Optimized specifically for parsing Office documents and HTML files
- Better preservation of document structure
- Native support for many Office formats
### MinerU Configuration
```bash
# MinerU 2.0 uses command-line arguments instead of a configuration file
@@ -609,20 +1043,43 @@ mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
```
You can also configure parsing through RAGAnything parameters:
```python
# Configure parsing behavior
# Basic parsing configuration (parser selection)
await rag.process_document_complete(
file_path="document.pdf",
parse_method="auto", # 或 "ocr", "txt"
device="cuda", # GPU加速
backend="pipeline", # 解析后端
lang="ch" # 语言优化
output_dir="./output/",
parse_method="auto", # 或 "ocr", "txt"
parser="mineru" # 可选:"mineru" 或 "docling"
)
# Advanced parsing configuration (with special parameters)
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # 解析方法:"auto", "ocr", "txt"
parser="mineru", # 解析器选择:"mineru" 或 "docling"
# MinerU特殊参数 - 支持的所有kwargs
lang="ch", # 文档语言优化(如:"ch", "en", "ja"
device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
start_page=0, # 起始页码0为基准适用于PDF
end_page=10, # 结束页码0为基准适用于PDF
formula=True, # 启用公式解析
table=True, # 启用表格解析
backend="pipeline", # 解析后端pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
source="huggingface", # 模型源:"huggingface", "modelscope", "local"
# vlm_url="http://127.0.0.1:3000" # 当backend=vlm-sglang-client时需指定服务地址
# RAGAnything标准参数
display_stats=True, # 显示内容统计信息
split_by_character=None, # 可选的文本分割字符
doc_id=None # 可选的文档ID
)
```
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed via command-line arguments or function parameters. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
### Processing Requirements
@@ -670,13 +1127,14 @@ await rag.process_document_complete(
</div>
```bibtex
@article{guo2024lightrag,
title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
year={2024},
eprint={2410.05779},
archivePrefix={arXiv},
primaryClass={cs.IR}
@misc{guo2025raganythingallinoneragframework,
title={RAG-Anything: All-in-One RAG Framework},
author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},
year={2025},
eprint={2510.12323},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2510.12323},
}
```

docs/batch_processing.md Normal file
@@ -0,0 +1,341 @@
# Batch Processing
This document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput.
## Overview
The batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options.
## Key Features
- **Parallel Processing**: Process multiple files concurrently using thread pools
- **Progress Tracking**: Real-time progress bars with `tqdm`
- **Error Handling**: Comprehensive error reporting and recovery
- **Flexible Input**: Support for files, directories, and recursive search
- **Configurable Workers**: Adjustable number of parallel workers
- **Installation Check Bypass**: Optional skip for environments with package conflicts
## Installation
```bash
# Basic installation
pip install raganything[all]
# Required for batch processing
pip install tqdm
```
## Usage
### Basic Batch Processing
```python
from raganything.batch_parser import BatchParser
# Create batch parser
batch_parser = BatchParser(
parser_type="mineru", # or "docling"
max_workers=4,
show_progress=True,
timeout_per_file=300,
skip_installation_check=False # Set to True if having parser installation issues
)
# Process multiple files
result = batch_parser.process_batch(
file_paths=["doc1.pdf", "doc2.docx", "folder/"],
output_dir="./batch_output",
parse_method="auto",
recursive=True
)
# Check results
print(result.summary())
print(f"Success rate: {result.success_rate:.1f}%")
print(f"Processing time: {result.processing_time:.2f} seconds")
```
### Asynchronous Batch Processing
```python
import asyncio
from raganything.batch_parser import BatchParser
async def async_batch_processing():
batch_parser = BatchParser(
parser_type="mineru",
max_workers=4,
show_progress=True
)
# Process files asynchronously
result = await batch_parser.process_batch_async(
file_paths=["doc1.pdf", "doc2.docx"],
output_dir="./output",
parse_method="auto"
)
return result
# Run async processing
result = asyncio.run(async_batch_processing())
```
### Integration with RAG-Anything
```python
from raganything import RAGAnything
rag = RAGAnything()
# Process documents with batch functionality
result = rag.process_documents_batch(
file_paths=["doc1.pdf", "doc2.docx"],
output_dir="./output",
max_workers=4,
show_progress=True
)
print(f"Processed {len(result.successful_files)} files successfully")
```
### Process Documents with RAG Integration
```python
# Process documents in batch and then add them to RAG
result = await rag.process_documents_with_rag_batch(
file_paths=["doc1.pdf", "doc2.docx"],
output_dir="./output",
max_workers=4,
show_progress=True
)
print(f"Processed {result['successful_rag_files']} files with RAG")
print(f"Total processing time: {result['total_processing_time']:.2f} seconds")
```
### Command Line Interface
```bash
# Basic batch processing
python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4
# With specific parser
python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto
# Without progress bar
python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress
# Help
python -m raganything.batch_parser --help
```
## Configuration
### Environment Variables
```env
# Batch processing configuration
MAX_CONCURRENT_FILES=4
SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md
RECURSIVE_FOLDER_PROCESSING=true
PARSER_OUTPUT_DIR=./parsed_output
```
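If you construct a `BatchParser` directly, one way to honor these settings is to read them yourself. Below is a minimal sketch; the variable names match the ones above, while the default values and the mapping onto `BatchParser` arguments are illustrative assumptions rather than the library's built-in behavior.
```python
import os

from raganything.batch_parser import BatchParser

# Illustrative mapping of the environment variables above onto BatchParser arguments
max_workers = int(os.getenv("MAX_CONCURRENT_FILES", "4"))
output_dir = os.getenv("PARSER_OUTPUT_DIR", "./parsed_output")
recursive = os.getenv("RECURSIVE_FOLDER_PROCESSING", "true").lower() == "true"

batch_parser = BatchParser(parser_type="mineru", max_workers=max_workers)
result = batch_parser.process_batch(
    file_paths=["./documents"],
    output_dir=output_dir,
    recursive=recursive,
)
print(result.summary())
```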
### BatchParser Parameters
- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`)
- **max_workers**: Number of parallel workers (default: `4`)
- **show_progress**: Show progress bar (default: `True`)
- **timeout_per_file**: Timeout per file in seconds (default: `300`)
- **skip_installation_check**: Skip parser installation check (default: `False`)
## Supported File Types
- **PDF files**: `.pdf`
- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx`
- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp`
- **Text files**: `.txt`, `.md`
## API Reference
### BatchProcessingResult
```python
@dataclass
class BatchProcessingResult:
successful_files: List[str] # Successfully processed files
failed_files: List[str] # Failed files
total_files: int # Total number of files
processing_time: float # Total processing time in seconds
errors: Dict[str, str] # Error messages for failed files
output_dir: str # Output directory used
def summary(self) -> str: # Human-readable summary
def success_rate(self) -> float: # Success rate as percentage
```
### BatchParser Methods
```python
class BatchParser:
def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...):
"""Initialize batch parser"""
def get_supported_extensions(self) -> List[str]:
"""Get list of supported file extensions"""
def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:
"""Filter files to only supported types"""
def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
"""Process files in batch"""
async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
"""Process files in batch asynchronously"""
```
## Performance Considerations
### Memory Usage
- Each worker uses additional memory
- Recommended: 2-4 workers for most systems
- Monitor memory usage with large files
### CPU Usage
- Parallel processing utilizes multiple cores
- Optimal worker count depends on CPU cores and file sizes
- I/O may become bottleneck with many small files
### Recommended Settings
- **Small files** (< 1MB): Higher worker count (6-8)
- **Large files** (> 100MB): Lower worker count (2-3)
- **Mixed sizes**: Start with 4 workers and adjust
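Following the sizing guidance above, here is a small heuristic sketch for picking the worker count from the average input size. The thresholds mirror the bullets above, and `pick_worker_count` is an illustrative helper, not part of the library.
```python
from pathlib import Path

from raganything.batch_parser import BatchParser

def pick_worker_count(file_paths):
    """Illustrative heuristic: fewer workers for large files, more for small ones."""
    sizes = [Path(p).stat().st_size for p in file_paths if Path(p).is_file()]
    if not sizes:
        return 4
    avg_mb = sum(sizes) / len(sizes) / (1024 * 1024)
    if avg_mb < 1:
        return 8   # many small files
    if avg_mb > 100:
        return 2   # few very large files
    return 4       # mixed sizes: start with the default

files = ["doc1.pdf", "doc2.docx"]
batch_parser = BatchParser(parser_type="mineru", max_workers=pick_worker_count(files))
```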
## Troubleshooting
### Common Issues
#### Memory Errors
```python
# Solution: Reduce max_workers
batch_parser = BatchParser(max_workers=2)
```
#### Timeout Errors
```python
# Solution: Increase timeout_per_file
batch_parser = BatchParser(timeout_per_file=600) # 10 minutes
```
#### Parser Installation Issues
```python
# Solution: Skip installation check
batch_parser = BatchParser(skip_installation_check=True)
```
#### File Not Found Errors
- Check file paths and permissions
- Ensure input files exist
- Verify directory access rights
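A quick way to apply these checks is to validate paths before handing them to the parser. This sketch assumes `batch_parser` was created as shown earlier; the path filtering itself is plain Python.
```python
import os
from pathlib import Path

candidate_files = ["doc1.pdf", "missing.docx", "./reports"]

# Keep only paths that exist and are readable
valid_paths = [p for p in candidate_files if Path(p).exists() and os.access(p, os.R_OK)]
skipped = sorted(set(candidate_files) - set(valid_paths))
if skipped:
    print(f"Skipping missing or unreadable paths: {skipped}")

result = batch_parser.process_batch(file_paths=valid_paths, output_dir="./output")
```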
### Debug Mode
Enable debug logging for detailed information:
```python
import logging
logging.basicConfig(level=logging.DEBUG)
# Create batch parser with debug logging
batch_parser = BatchParser(parser_type="mineru", max_workers=2)
```
### Error Handling
The batch processor provides comprehensive error handling:
```python
result = batch_parser.process_batch(file_paths=["doc1.pdf", "doc2.docx"])
# Check for errors
if result.failed_files:
print("Failed files:")
for file_path in result.failed_files:
error_message = result.errors.get(file_path, "Unknown error")
print(f" - {file_path}: {error_message}")
# Process only successful files
for file_path in result.successful_files:
print(f"Successfully processed: {file_path}")
```
## Examples
### Process Entire Directory
```python
from pathlib import Path
# Process all supported files in a directory
batch_parser = BatchParser(max_workers=4)
directory_path = Path("./documents")
result = batch_parser.process_batch(
file_paths=[str(directory_path)],
output_dir="./processed",
recursive=True # Include subdirectories
)
print(f"Processed {len(result.successful_files)} out of {result.total_files} files")
```
### Filter Files Before Processing
```python
# Get all files in directory
all_files = ["doc1.pdf", "image.png", "spreadsheet.xlsx", "unsupported.xyz"]
# Filter to supported files only
supported_files = batch_parser.filter_supported_files(all_files)
print(f"Will process {len(supported_files)} out of {len(all_files)} files")
# Process only supported files
result = batch_parser.process_batch(
file_paths=supported_files,
output_dir="./output"
)
```
### Custom Error Handling
```python
def process_with_retry(file_paths, max_retries=3):
"""Process files with retry logic"""
for attempt in range(max_retries):
result = batch_parser.process_batch(file_paths, "./output")
if not result.failed_files:
break # All files processed successfully
print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed")
file_paths = result.failed_files # Retry failed files
return result
```
## Best Practices
1. **Start with default settings** and adjust based on performance
2. **Monitor system resources** during batch processing
3. **Use appropriate worker counts** for your hardware
4. **Handle errors gracefully** with retry logic
5. **Test with small batches** before processing large collections
6. **Use skip_installation_check** if facing parser installation issues
7. **Enable progress tracking** for long-running operations
8. **Set appropriate timeouts** based on expected file processing times
## Conclusion
The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.

@@ -0,0 +1,375 @@
# Context-Aware Multimodal Processing in RAGAnything
This document describes the context-aware multimodal processing feature in RAGAnything, which provides surrounding content information to LLMs when analyzing images, tables, equations, and other multimodal content for enhanced accuracy and relevance.
## Overview
The context-aware feature enables RAGAnything to automatically extract and provide surrounding text content as context when processing multimodal content. This leads to more accurate and contextually relevant analysis by giving AI models additional information about where the content appears in the document structure.
### Key Benefits
- **Enhanced Accuracy**: Context helps AI understand the purpose and meaning of multimodal content
- **Semantic Coherence**: Generated descriptions align with document context and terminology
- **Automated Integration**: Context extraction is automatically enabled during document processing
- **Flexible Configuration**: Multiple extraction modes and filtering options
## Key Features
### 1. Configuration Support
- **Integrated Configuration**: Complete context options in `RAGAnythingConfig`
- **Environment Variables**: Configure all context parameters via environment variables
- **Dynamic Updates**: Runtime configuration updates supported
- **Content Format Control**: Configurable content source format detection
### 2. Automated Integration
- **Auto-Initialization**: Modal processors automatically receive tokenizer and context configuration
- **Content Source Setup**: Document processing automatically sets content sources for context extraction
- **Position Information**: Automatic position info (page_idx, index) passed to processors
- **Batch Processing**: Context-aware batch processing for efficient document handling
### 3. Advanced Token Management
- **Accurate Token Counting**: Uses LightRAG's tokenizer for precise token calculation
- **Smart Boundary Preservation**: Truncates at sentence/paragraph boundaries
- **Backward Compatibility**: Fallback to character truncation when tokenizer unavailable
### 4. Universal Context Extraction
- **Multiple Formats**: Support for MinerU, plain text, custom formats
- **Flexible Modes**: Page-based and chunk-based context extraction
- **Content Filtering**: Configurable content type filtering
- **Header Support**: Optional inclusion of document headers and structure
## Configuration
### RAGAnythingConfig Parameters
```python
# Context Extraction Configuration
context_window: int = 1 # Context window size (pages/chunks)
context_mode: str = "page" # Context mode ("page" or "chunk")
max_context_tokens: int = 2000 # Maximum context tokens
include_headers: bool = True # Include document headers
include_captions: bool = True # Include image/table captions
context_filter_content_types: List[str] = ["text"] # Content types to include
content_format: str = "minerU" # Default content format for context extraction
```
### Environment Variables
```bash
# Context extraction settings
CONTEXT_WINDOW=2
CONTEXT_MODE=page
MAX_CONTEXT_TOKENS=3000
INCLUDE_HEADERS=true
INCLUDE_CAPTIONS=true
CONTEXT_FILTER_CONTENT_TYPES=text,image
CONTENT_FORMAT=minerU
```
## Usage Guide
### 1. Basic Configuration
```python
from raganything import RAGAnything, RAGAnythingConfig
# Create configuration with context settings
config = RAGAnythingConfig(
context_window=2,
context_mode="page",
max_context_tokens=3000,
include_headers=True,
include_captions=True,
context_filter_content_types=["text", "image"],
content_format="minerU"
)
# Create RAGAnything instance
rag_anything = RAGAnything(
config=config,
llm_model_func=your_llm_function,
embedding_func=your_embedding_function
)
```
### 2. Automatic Document Processing
```python
# Context is automatically enabled during document processing
await rag_anything.process_document_complete("document.pdf")
```
### 3. Manual Content Source Configuration
```python
# Set content source for specific content lists
rag_anything.set_content_source_for_context(content_list, "minerU")
# Update context configuration at runtime
rag_anything.update_context_config(
context_window=1,
max_context_tokens=1500,
include_captions=False
)
```
### 4. Direct Modal Processor Usage
```python
from raganything.modalprocessors import (
ContextExtractor,
ContextConfig,
ImageModalProcessor
)
# Configure context extraction
config = ContextConfig(
context_window=1,
context_mode="page",
max_context_tokens=2000,
include_headers=True,
include_captions=True,
filter_content_types=["text"]
)
# Initialize context extractor
context_extractor = ContextExtractor(config)
# Initialize modal processor with context support
processor = ImageModalProcessor(lightrag, caption_func, context_extractor)
# Set content source
processor.set_content_source(content_list, "minerU")
# Process with context
item_info = {
"page_idx": 2,
"index": 5,
"type": "image"
}
result = await processor.process_multimodal_content(
modal_content=image_data,
content_type="image",
file_path="document.pdf",
entity_name="Architecture Diagram",
item_info=item_info
)
```
## Context Modes
### Page-Based Context (`context_mode="page"`)
- Extracts context based on page boundaries
- Uses `page_idx` field from content items
- Suitable for document-structured content
- Example: Include text from 2 pages before and after current image
### Chunk-Based Context (`context_mode="chunk"`)
- Extracts context based on content item positions
- Uses sequential position in content list
- Suitable for fine-grained control
- Example: Include 5 content items before and after current table
## Processing Workflow
### 1. Document Parsing
```
Document Input → MinerU Parsing → content_list Generation
```
### 2. Context Setup
```
content_list → Set as Context Source → All Modal Processors Gain Context Capability
```
### 3. Multimodal Processing
```
Multimodal Content → Extract Surrounding Context → Enhanced LLM Analysis → More Accurate Results
```
## Content Source Formats
### MinerU Format
```json
[
{
"type": "text",
"text": "Document content here...",
"text_level": 1,
"page_idx": 0
},
{
"type": "image",
"img_path": "images/figure1.jpg",
"image_caption": ["Figure 1: Architecture"],
"image_footnote": [],
"page_idx": 1
}
]
```
### Custom Text Chunks
```python
text_chunks = [
"First chunk of text content...",
"Second chunk of text content...",
"Third chunk of text content..."
]
```
### Plain Text
```python
full_document = "Complete document text with all content..."
```
## Configuration Examples
### High-Precision Context
For focused analysis with minimal context:
```python
config = RAGAnythingConfig(
context_window=1,
context_mode="page",
max_context_tokens=1000,
include_headers=True,
include_captions=False,
context_filter_content_types=["text"]
)
```
### Comprehensive Context
For broad analysis with rich context:
```python
config = RAGAnythingConfig(
context_window=2,
context_mode="page",
max_context_tokens=3000,
include_headers=True,
include_captions=True,
context_filter_content_types=["text", "image", "table"]
)
```
### Chunk-Based Analysis
For fine-grained sequential context:
```python
config = RAGAnythingConfig(
context_window=5,
context_mode="chunk",
max_context_tokens=2000,
include_headers=False,
include_captions=False,
context_filter_content_types=["text"]
)
```
## Performance Optimization
### 1. Accurate Token Control
- Uses real tokenizer for precise token counting
- Avoids exceeding LLM token limits
- Provides consistent performance
### 2. Smart Truncation
- Truncates at sentence boundaries
- Maintains semantic integrity
- Adds truncation indicators
### 3. Caching Optimization
- Context extraction results can be reused
- Reduces redundant computation overhead
## Advanced Features
### Context Truncation
The system automatically truncates context to fit within token limits:
- Uses actual tokenizer for accurate token counting
- Attempts to end at sentence boundaries (periods)
- Falls back to line boundaries if needed
- Adds "..." indicator for truncated content
### Header Formatting
When `include_headers=True`, headers are formatted with markdown-style prefixes:
```
# Level 1 Header
## Level 2 Header
### Level 3 Header
```
### Caption Integration
When `include_captions=True`, image and table captions are included as:
```
[Image: Figure 1 caption text]
[Table: Table 1 caption text]
```
## Integration with RAGAnything
The context-aware feature is seamlessly integrated into RAGAnything's workflow:
1. **Automatic Setup**: Context extractors are automatically created and configured
2. **Content Source Management**: Document processing automatically sets content sources
3. **Processor Integration**: All modal processors receive context capabilities
4. **Configuration Consistency**: Single configuration system for all context settings
## Error Handling
The system includes robust error handling:
- Gracefully handles missing or invalid content sources
- Returns empty context for unsupported formats
- Logs warnings for configuration issues
- Continues processing even if context extraction fails
## Compatibility
- **Backward Compatible**: Existing code works without modification
- **Optional Feature**: Context can be selectively enabled/disabled
- **Flexible Configuration**: Supports multiple configuration combinations
## Best Practices
1. **Token Limits**: Ensure `max_context_tokens` doesn't exceed LLM context limits
2. **Performance Impact**: Larger context windows increase processing time
3. **Content Quality**: Context quality directly affects analysis accuracy
4. **Window Size**: Match window size to content structure (documents vs articles)
5. **Content Filtering**: Use `context_filter_content_types` to reduce noise
## Troubleshooting
### Common Issues
**Context Not Extracted**
- Check if `set_content_source_for_context()` was called
- Verify `item_info` contains required fields (`page_idx`, `index`)
- Confirm content source format is correct
**Context Too Long/Short**
- Adjust `max_context_tokens` setting
- Modify `context_window` size
- Check `context_filter_content_types` configuration
**Irrelevant Context**
- Refine `context_filter_content_types` to exclude noise
- Reduce `context_window` size
- Set `include_captions=False` if captions are not helpful
**Configuration Issues**
- Verify environment variables are set correctly
- Check RAGAnythingConfig parameter names
- Ensure content_format matches your data source
## Examples
Check out these example files for complete usage demonstrations:
- **Configuration Examples**: See how to set up different context configurations
- **Integration Examples**: Learn how to integrate context-aware processing into your workflow
- **Custom Processors**: Examples of creating custom modal processors with context support
## API Reference
For detailed API documentation, see the docstrings in:
- `raganything/modalprocessors.py` - Context extraction and modal processors
- `raganything/config.py` - Configuration options
- `raganything/raganything.py` - Main RAGAnything class integration

docs/enhanced_markdown.md Normal file
@@ -0,0 +1,552 @@
# Enhanced Markdown Conversion
This document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling.
## Overview
The enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline.
## Key Features
- **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection
- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts
- **Image Support**: Embedded images with proper scaling and positioning
- **Table Support**: Formatted tables with borders and professional styling
- **Code Highlighting**: Syntax highlighting for code blocks using Pygments
- **Custom Templates**: Support for custom CSS and document templates
- **Table of Contents**: Automatic TOC generation with navigation links
- **Professional Typography**: High-quality fonts and spacing
## Installation
### Required Dependencies
```bash
# Basic installation
pip install raganything[all]
# Required for enhanced markdown conversion
pip install markdown weasyprint pygments
```
### Optional Dependencies
```bash
# For Pandoc backend (system installation required)
# Ubuntu/Debian:
sudo apt-get install pandoc wkhtmltopdf
# macOS:
brew install pandoc wkhtmltopdf
# Or using conda:
conda install -c conda-forge pandoc wkhtmltopdf
```
### Backend-Specific Installation
#### WeasyPrint (Recommended)
```bash
# Install WeasyPrint with system dependencies
pip install weasyprint
# Ubuntu/Debian system dependencies:
sudo apt-get install -y build-essential python3-dev python3-pip \
python3-setuptools python3-wheel python3-cffi libcairo2 \
libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
libffi-dev shared-mime-info
```
#### Pandoc
- Download from: https://pandoc.org/installing.html
- Requires system-wide installation
- Used for complex document structures and LaTeX-quality output
## Usage
### Basic Conversion
```python
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
# Create converter with default settings
converter = EnhancedMarkdownConverter()
# Convert markdown file to PDF
success = converter.convert_file_to_pdf(
input_path="document.md",
output_path="document.pdf",
method="auto" # Automatically select best available backend
)
if success:
print("✅ Conversion successful!")
else:
print("❌ Conversion failed")
```
### Advanced Configuration
```python
# Create custom configuration
config = MarkdownConfig(
page_size="A4", # A4, Letter, Legal, etc.
margin="1in", # CSS-style margins
font_size="12pt", # Base font size
line_height="1.5", # Line spacing
include_toc=True, # Generate table of contents
syntax_highlighting=True, # Enable code syntax highlighting
# Custom CSS styling
custom_css="""
body {
font-family: 'Georgia', serif;
color: #333;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 0.3em;
}
code {
background-color: #f8f9fa;
padding: 2px 4px;
border-radius: 3px;
}
pre {
background-color: #f8f9fa;
border-left: 4px solid #3498db;
padding: 15px;
border-radius: 5px;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
th, td {
border: 1px solid #ddd;
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f2f2f2;
font-weight: bold;
}
"""
)
converter = EnhancedMarkdownConverter(config)
```
### Backend Selection
```python
# Check available backends
converter = EnhancedMarkdownConverter()
backend_info = converter.get_backend_info()
print("Available backends:")
for backend, available in backend_info["available_backends"].items():
status = "" if available else ""
print(f" {status} {backend}")
print(f"Recommended backend: {backend_info['recommended_backend']}")
# Use specific backend
converter.convert_file_to_pdf(
input_path="document.md",
output_path="document.pdf",
method="weasyprint" # or "pandoc", "pandoc_system", "auto"
)
```
### Content Conversion
```python
# Convert markdown content directly (not from file)
markdown_content = """
# Sample Document
## Introduction
This is a **bold** statement with *italic* text.
## Code Example
```python
def hello_world():
print("Hello, World!")
return "Success"
```
## Table
| Feature | Status | Notes |
|---------|--------|-------|
| PDF Generation | ✅ | Working |
| Syntax Highlighting | ✅ | Pygments |
| Custom CSS | ✅ | Full support |
"""
success = converter.convert_markdown_to_pdf(
markdown_content=markdown_content,
output_path="sample.pdf",
method="auto"
)
```
### Command Line Interface
```bash
# Basic conversion
python -m raganything.enhanced_markdown document.md --output document.pdf
# With specific backend
python -m raganything.enhanced_markdown document.md --method weasyprint
# With custom CSS file
python -m raganything.enhanced_markdown document.md --css custom_style.css
# Show backend information
python -m raganything.enhanced_markdown --info
# Help
python -m raganything.enhanced_markdown --help
```
## Backend Comparison
| Backend | Pros | Cons | Best For | Quality |
|---------|------|------|----------|---------|
| **WeasyPrint** | • Excellent CSS support<br>• Fast rendering<br>• Great web-style layouts<br>• Python-based | • Limited LaTeX features<br>• Requires system deps | • Web-style documents<br>• Custom styling<br>• Fast conversion | ⭐⭐⭐⭐ |
| **Pandoc** | • Extensive features<br>• LaTeX-quality output<br>• Academic formatting<br>• Many input/output formats | • Slower conversion<br>• System installation<br>• Complex setup | • Academic papers<br>• Complex documents<br>• Publication quality | ⭐⭐⭐⭐⭐ |
| **Auto** | • Automatic selection<br>• Fallback support<br>• User-friendly | • May not use optimal backend | • General use<br>• Quick setup<br>• Development | ⭐⭐⭐⭐ |
## Configuration Options
### MarkdownConfig Parameters
```python
@dataclass
class MarkdownConfig:
# Page layout
page_size: str = "A4" # A4, Letter, Legal, A3, etc.
margin: str = "1in" # CSS margin format
font_size: str = "12pt" # Base font size
line_height: str = "1.5" # Line spacing multiplier
# Content options
include_toc: bool = True # Generate table of contents
syntax_highlighting: bool = True # Enable code highlighting
image_max_width: str = "100%" # Maximum image width
table_style: str = "..." # Default table CSS
# Styling
css_file: Optional[str] = None # External CSS file path
custom_css: Optional[str] = None # Inline CSS content
template_file: Optional[str] = None # Custom HTML template
# Output options
output_format: str = "pdf" # Currently only PDF supported
output_dir: Optional[str] = None # Output directory
# Metadata
metadata: Optional[Dict[str, str]] = None # Document metadata
```
### Supported Markdown Features
#### Basic Formatting
- **Headers**: `# ## ### #### ##### ######`
- **Emphasis**: `*italic*`, `**bold**`, `***bold italic***`
- **Links**: `[text](url)`, `[text][ref]`
- **Images**: `![alt](url)`, `![alt][ref]`
- **Lists**: Ordered and unordered, nested
- **Blockquotes**: `> quote`
- **Line breaks**: Double space or `\n\n`
#### Advanced Features
- **Tables**: GitHub-style tables with alignment
- **Code blocks**: Fenced code blocks with language specification
- **Inline code**: `backtick code`
- **Horizontal rules**: `---` or `***`
- **Footnotes**: `[^1]` references
- **Definition lists**: Term and definition pairs
- **Attributes**: `{#id .class key=value}`
#### Code Highlighting
```markdown
```python
def example_function():
"""This will be syntax highlighted"""
return "Hello, World!"
```
```javascript
function exampleFunction() {
// This will also be highlighted
return "Hello, World!";
}
```
```
## Integration with RAG-Anything
The enhanced markdown conversion integrates seamlessly with RAG-Anything:
```python
from raganything import RAGAnything
# Initialize RAG-Anything
rag = RAGAnything()
# Process markdown files - enhanced conversion is used automatically
await rag.process_document_complete("document.md")
# Batch processing with enhanced markdown conversion
result = rag.process_documents_batch(
file_paths=["doc1.md", "doc2.md", "doc3.md"],
output_dir="./output"
)
# The .md files will be converted to PDF using enhanced conversion
# before being processed by the RAG system
```
## Performance Considerations
### Conversion Speed
- **WeasyPrint**: ~1-3 seconds for typical documents
- **Pandoc**: ~3-10 seconds for typical documents
- **Large documents**: Time scales roughly linearly with content
### Memory Usage
- **WeasyPrint**: ~50-100MB per conversion
- **Pandoc**: ~100-200MB per conversion
- **Images**: Large images increase memory usage significantly
### Optimization Tips
1. **Resize large images** before embedding
2. **Use compressed images** (JPEG for photos, PNG for graphics)
3. **Limit concurrent conversions** to avoid memory issues
4. **Cache converted content** when processing multiple times
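For tips 1 and 2, here is a small Pillow-based sketch that downsizes and recompresses an image before it is referenced from markdown. Pillow is an extra dependency, and `downscale_image` is an illustrative helper, not part of RAG-Anything.
```python
from pathlib import Path

from PIL import Image  # Pillow, installed separately: pip install pillow

def downscale_image(path: str, max_width: int = 1200, quality: int = 85) -> str:
    """Resize and recompress an image, returning the path of the optimized copy."""
    src = Path(path)
    with Image.open(src) as img:
        if img.width > max_width:
            ratio = max_width / img.width
            img = img.resize((max_width, int(img.height * ratio)))
        out = src.with_name(f"{src.stem}_web{src.suffix}")
        img.save(out, quality=quality, optimize=True)
    return str(out)

# Reference downscale_image("figure.png") from your markdown instead of the original file
```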
## Examples
### Sample Markdown Document
```markdown
# Technical Documentation
## Table of Contents
[TOC]
## Overview
This document provides comprehensive technical specifications.
## Architecture
### System Components
1. **Parser Engine**: Handles document processing
2. **Storage Layer**: Manages data persistence
3. **Query Interface**: Provides search capabilities
### Code Implementation
```python
from raganything import RAGAnything
# Initialize system
rag = RAGAnything(config={
"working_dir": "./storage",
"enable_image_processing": True
})
# Process document
await rag.process_document_complete("document.pdf")
```
### Performance Metrics
| Component | Throughput | Latency | Memory |
|-----------|------------|---------|--------|
| Parser | 100 docs/hour | 36s avg | 2.5 GB |
| Storage | 1000 ops/sec | 1ms avg | 512 MB |
| Query | 50 queries/sec | 20ms avg | 1 GB |
## Integration Notes
> **Important**: Always validate input before processing.
## Conclusion
The enhanced system provides excellent performance for document processing workflows.
```
### Generated PDF Features
The enhanced markdown converter produces PDFs with:
- **Professional typography** with proper font selection and spacing
- **Syntax-highlighted code blocks** using Pygments
- **Formatted tables** with borders and alternating row colors
- **Clickable table of contents** with navigation links
- **Responsive images** that scale appropriately
- **Custom styling** through CSS
- **Proper page breaks** and margins
- **Document metadata** and properties
## Troubleshooting
### Common Issues
#### WeasyPrint Installation Problems
```bash
# Ubuntu/Debian: Install system dependencies
sudo apt-get update
sudo apt-get install -y build-essential python3-dev libcairo2 \
libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
libffi-dev shared-mime-info
# Then reinstall WeasyPrint
pip install --force-reinstall weasyprint
```
#### Pandoc Not Found
```bash
# Check if Pandoc is installed
pandoc --version
# Install Pandoc (Ubuntu/Debian)
sudo apt-get install pandoc wkhtmltopdf
# Or download from: https://pandoc.org/installing.html
```
#### CSS Issues
- Check CSS syntax in custom_css
- Verify CSS file paths exist
- Test CSS with simple HTML first
- Use browser developer tools to debug styling
#### Image Problems
- Ensure images are accessible (correct paths)
- Check image file formats (PNG, JPEG, GIF supported)
- Verify image file permissions
- Consider image size and format optimization
#### Font Issues
```python
# Use web-safe fonts
config = MarkdownConfig(
custom_css="""
body {
font-family: 'Arial', 'Helvetica', sans-serif;
}
"""
)
```
### Debug Mode
Enable detailed logging for troubleshooting:
```python
import logging
# Enable debug logging
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Create converter with debug logging
converter = EnhancedMarkdownConverter()
result = converter.convert_file_to_pdf("test.md", "test.pdf")
```
### Error Handling
```python
def robust_conversion(input_path, output_path):
"""Convert with fallback backends"""
converter = EnhancedMarkdownConverter()
# Try backends in order of preference
backends = ["weasyprint", "pandoc", "auto"]
for backend in backends:
try:
success = converter.convert_file_to_pdf(
input_path=input_path,
output_path=output_path,
method=backend
)
if success:
print(f"✅ Conversion successful with {backend}")
return True
except Exception as e:
print(f"{backend} failed: {str(e)}")
continue
print("❌ All backends failed")
return False
```
## API Reference
### EnhancedMarkdownConverter
```python
class EnhancedMarkdownConverter:
def __init__(self, config: Optional[MarkdownConfig] = None):
"""Initialize converter with optional configuration"""
def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool:
"""Convert markdown file to PDF"""
def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool:
"""Convert markdown content to PDF"""
def get_backend_info(self) -> Dict[str, Any]:
"""Get information about available backends"""
def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
"""Convert using WeasyPrint backend"""
def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:
"""Convert using Pandoc backend"""
```
## Best Practices
1. **Choose the right backend** for your use case:
- **WeasyPrint** for web-style documents and custom CSS
- **Pandoc** for academic papers and complex formatting
- **Auto** for general use and development
2. **Optimize images** before embedding:
- Use appropriate formats (JPEG for photos, PNG for graphics)
- Compress images to reduce file size
- Set reasonable maximum widths
3. **Design responsive layouts**:
- Use relative units (%, em) instead of absolute (px)
- Test with different page sizes
- Consider print-specific CSS
4. **Test your styling**:
- Start with default styling and incrementally customize
- Test with sample content before production use
- Validate CSS syntax
5. **Handle errors gracefully**:
- Implement fallback backends
- Provide meaningful error messages
- Log conversion attempts for debugging
6. **Performance optimization**:
- Cache converted content when possible
- Process large batches with appropriate worker counts
- Monitor memory usage with large documents
## Conclusion
The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.

docs/offline_setup.md Normal file
@@ -0,0 +1,78 @@
# Running RAG-Anything in an Offline Environment
This document explains a critical consideration for running the RAG-Anything project in an environment with no internet access.
## The Network Dependency: `LightRAG` and `tiktoken`
The `RAGAnything` core engine relies on the `LightRAG` library for its primary functionality. `LightRAG`, in turn, uses OpenAI's `tiktoken` library for text tokenization.
By default, the `tiktoken` library has a network dependency. On its first use, it attempts to download tokenizer models from OpenAI's public servers (`openaipublic.blob.core.windows.net`). If the application is running in an offline or network-restricted environment, this download will fail, causing the `LightRAG` instance to fail to initialize.
This results in an error similar to the following:
```
Failed to initialize LightRAG instance: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_ba
```
This dependency is indirect. The `RAG-Anything` codebase itself does not directly import or call `tiktoken`. The call is made from within the `lightrag` library.
## The Solution: Using a Local `tiktoken` Cache
To resolve this issue and enable fully offline operation, you must provide a local cache for the `tiktoken` models. This is achieved by setting the `TIKTOKEN_CACHE_DIR` environment variable **before** the application starts.
When this environment variable is set, `tiktoken` will look for its model files in the specified local directory instead of attempting to download them from the internet.
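For reference, the cache can be primed with a few lines of Python run once on a machine with internet access. This is a minimal sketch of the idea behind `scripts/create_tiktoken_cache.py` (the actual script in the repository may differ); it relies on `tiktoken` honoring `TIKTOKEN_CACHE_DIR` when it downloads an encoding.
```python
import os

# Point tiktoken at the local cache BEFORE importing it
os.environ["TIKTOKEN_CACHE_DIR"] = "./tiktoken_cache"
os.makedirs("./tiktoken_cache", exist_ok=True)

import tiktoken  # noqa: E402

# Loading an encoding while online writes the model file into the cache directory;
# the same call then works offline as long as TIKTOKEN_CACHE_DIR is set.
for name in ("o200k_base", "cl100k_base"):
    tiktoken.get_encoding(name)

print("tiktoken cache populated at ./tiktoken_cache")
```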
### Steps to Implement the Solution:
1. **Create a Model Cache:** In an environment *with* internet access, run the provided script to download and cache the necessary `tiktoken` models.
```bash
# Run the cache creation script
uv run scripts/create_tiktoken_cache.py
```
This will create a `tiktoken_cache` directory in your project root containing the required model files.
2. **Configure the Environment Variable:** Add the following line to your `.env` file:
```bash
TIKTOKEN_CACHE_DIR=./tiktoken_cache
```
**Important:** You should ensure that the `.env` file is loaded **before** `LightRAG` imports `tiktoken`, making this configuration effective.
```python
import os
from typing import Dict, Any, Optional, Callable
import sys
import asyncio
import atexit
from dataclasses import dataclass, field
from pathlib import Path
from dotenv import load_dotenv
# Add project root directory to Python path
sys.path.insert(0, str(Path(__file__).parent.parent))
# Load environment variables FIRST - before any imports that use tiktoken
load_dotenv(dotenv_path=".env", override=False)
# Now import LightRAG (which will import tiktoken with the correct env var set)
from lightrag import LightRAG
from lightrag.utils import logger
# Rest of the code...
```
### Testing the Offline Setup
1. **Create a `tiktoken_cache` directory:** If you don't have one already, create a directory named `tiktoken_cache` in the project root.
2. **Populate the cache:** Run the `scripts/create_tiktoken_cache.py` script to download the necessary tiktoken models into the `tiktoken_cache` directory.
3. **Set the `TIKTOKEN_CACHE_DIR` environment variable:** Add the line `TIKTOKEN_CACHE_DIR=./tiktoken_cache` to your `.env` file.
4. **Disconnect from the internet:** Disable your internet connection or put your machine in airplane mode.
5. **Run the application:** Start the `RAG-Anything` application. For example:
```
uv run examples/raganything_example.py requirements.txt
```
By following these steps, you can eliminate the network dependency and run the `RAG-Anything` project successfully in a fully offline environment.

@@ -10,6 +10,12 @@ OLLAMA_EMULATING_MODEL_TAG=latest
# WORKERS=2
# CORS_ORIGINS=http://localhost:3000,http://localhost:8080
### Tiktoken Cache Configuration (for offline deployment)
### Set this to a local directory containing cached tiktoken models
### This prevents tiktoken from downloading models from the internet on initialization
### See docs/offline_setup.md for setup instructions
# TIKTOKEN_CACHE_DIR=./tiktoken_cache
### Login Configuration
# AUTH_ACCOUNTS='admin:admin123,user1:pass456'
# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server
@@ -33,9 +39,10 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAGAnything Configuration (Multimodal Document Processing)
### ---
### MinerU Parser Configuration
# MINERU_PARSE_METHOD=auto
# MINERU_OUTPUT_DIR=./output
### Parser Configuration
# PARSE_METHOD=auto
# OUTPUT_DIR=./output
# PARSER=mineru
# DISPLAY_CONTENT_STATS=true
### Multimodal Processing Configuration
@@ -48,6 +55,15 @@ OLLAMA_EMULATING_MODEL_TAG=latest
# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
# RECURSIVE_FOLDER_PROCESSING=true
### Context Extraction Configuration
# CONTEXT_WINDOW=1
# CONTEXT_MODE=page
# MAX_CONTEXT_TOKENS=2000
# INCLUDE_HEADERS=true
# INCLUDE_CAPTIONS=true
# CONTEXT_FILTER_CONTENT_TYPES=text
# CONTENT_FORMAT=minerU
### Max nodes return from grap retrieval
# MAX_GRAPH_NODES=1000
@@ -93,7 +109,7 @@ MAX_ASYNC=4
### MAX_TOKENS: max tokens sent to LLM for entity relation summaries (less than context size of the model)
### MAX_TOKENS: set as num_ctx option for Ollama by API Server
MAX_TOKENS=32768
### LLM Binding type: openai, ollama, lollms, azure_openai
### LLM Binding type: openai, ollama, lollms, azure_openai, lmstudio
LLM_BINDING=openai
LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
@@ -103,7 +119,7 @@ LLM_BINDING_API_KEY=your_api_key
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
### Embedding Configuration
### Embedding Binding type: openai, ollama, lollms, azure_openai
### Embedding Binding type: openai, ollama, lollms, azure_openai, lmstudio
EMBEDDING_BINDING=ollama
EMBEDDING_MODEL=bge-m3:latest
EMBEDDING_DIM=1024

@@ -0,0 +1,561 @@
#!/usr/bin/env python
"""
Batch Processing Example for RAG-Anything
This example demonstrates how to use the batch processing capabilities
to process multiple documents in parallel for improved throughput.
Features demonstrated:
- Basic batch processing with BatchParser
- Asynchronous batch processing
- Integration with RAG-Anything
- Error handling and progress tracking
- File filtering and directory processing
"""
import asyncio
import logging
from pathlib import Path
import tempfile
import time
# Add project root directory to Python path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from raganything import RAGAnything, RAGAnythingConfig
from raganything.batch_parser import BatchParser
def create_sample_documents():
"""Create sample documents for batch processing testing"""
temp_dir = Path(tempfile.mkdtemp())
sample_files = []
# Create various document types
documents = {
"document1.txt": "This is a simple text document for testing batch processing.",
"document2.txt": "Another text document with different content.",
"document3.md": """# Markdown Document
## Introduction
This is a markdown document for testing.
### Features
- Markdown formatting
- Code blocks
- Lists
```python
def example():
return "Hello from markdown"
```
""",
"report.txt": """Business Report
Executive Summary:
This report demonstrates batch processing capabilities.
Key Findings:
1. Parallel processing improves throughput
2. Progress tracking enhances user experience
3. Error handling ensures reliability
Conclusion:
Batch processing is essential for large-scale document processing.
""",
"notes.md": """# Meeting Notes
## Date: 2024-01-15
### Attendees
- Alice Johnson
- Bob Smith
- Carol Williams
### Discussion Topics
1. **Batch Processing Implementation**
- Parallel document processing
- Progress tracking
- Error handling strategies
2. **Performance Metrics**
- Target: 100 documents/hour
- Memory usage: < 4GB
- Success rate: > 95%
### Action Items
- [ ] Implement batch processing
- [ ] Add progress bars
- [ ] Test with large document sets
- [ ] Optimize memory usage
### Next Steps
Continue development and testing of batch processing features.
""",
}
# Create files
for filename, content in documents.items():
file_path = temp_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
sample_files.append(str(file_path))
return sample_files, temp_dir
def demonstrate_basic_batch_processing():
"""Demonstrate basic batch processing functionality"""
print("\n" + "=" * 60)
print("BASIC BATCH PROCESSING DEMONSTRATION")
print("=" * 60)
# Create sample documents
sample_files, temp_dir = create_sample_documents()
try:
print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
for file_path in sample_files:
print(f" - {Path(file_path).name}")
# Create batch parser
batch_parser = BatchParser(
parser_type="mineru",
max_workers=3,
show_progress=True,
timeout_per_file=60,
skip_installation_check=True, # Skip installation check for demo
)
print("\nBatch parser configured:")
print(" - Parser type: mineru")
print(" - Max workers: 3")
print(" - Progress tracking: enabled")
print(" - Timeout per file: 60 seconds")
# Check supported extensions
supported_extensions = batch_parser.get_supported_extensions()
print(f" - Supported extensions: {supported_extensions}")
# Filter files to supported types
supported_files = batch_parser.filter_supported_files(sample_files)
print("\nFile filtering results:")
print(f" - Total files: {len(sample_files)}")
print(f" - Supported files: {len(supported_files)}")
# Process batch
output_dir = temp_dir / "batch_output"
print("\nStarting batch processing...")
print(f"Output directory: {output_dir}")
start_time = time.time()
result = batch_parser.process_batch(
file_paths=supported_files,
output_dir=str(output_dir),
parse_method="auto",
recursive=False,
)
processing_time = time.time() - start_time
# Display results
print("\n" + "-" * 40)
print("BATCH PROCESSING RESULTS")
print("-" * 40)
print(result.summary())
print(f"Total processing time: {processing_time:.2f} seconds")
print(f"Success rate: {result.success_rate:.1f}%")
if result.successful_files:
print("\nSuccessfully processed files:")
for file_path in result.successful_files:
print(f"{Path(file_path).name}")
if result.failed_files:
print("\nFailed files:")
for file_path in result.failed_files:
error = result.errors.get(file_path, "Unknown error")
print(f"{Path(file_path).name}: {error}")
return result
except Exception as e:
print(f"❌ Batch processing demonstration failed: {str(e)}")
return None
async def demonstrate_async_batch_processing():
"""Demonstrate asynchronous batch processing"""
print("\n" + "=" * 60)
print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
print("=" * 60)
# Create sample documents
sample_files, temp_dir = create_sample_documents()
try:
print(f"Processing {len(sample_files)} documents asynchronously...")
# Create batch parser
batch_parser = BatchParser(
parser_type="mineru",
max_workers=2,
show_progress=True,
skip_installation_check=True,
)
# Process batch asynchronously
output_dir = temp_dir / "async_output"
start_time = time.time()
result = await batch_parser.process_batch_async(
file_paths=sample_files,
output_dir=str(output_dir),
parse_method="auto",
recursive=False,
)
processing_time = time.time() - start_time
# Display results
print("\n" + "-" * 40)
print("ASYNC BATCH PROCESSING RESULTS")
print("-" * 40)
print(result.summary())
print(f"Async processing time: {processing_time:.2f} seconds")
print(f"Success rate: {result.success_rate:.1f}%")
return result
except Exception as e:
print(f"❌ Async batch processing demonstration failed: {str(e)}")
return None
async def demonstrate_rag_integration():
"""Demonstrate batch processing integration with RAG-Anything"""
print("\n" + "=" * 60)
print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
print("=" * 60)
# Create sample documents
sample_files, temp_dir = create_sample_documents()
try:
# Initialize RAG-Anything with temporary storage
config = RAGAnythingConfig(
working_dir=str(temp_dir / "rag_storage"),
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
max_concurrent_files=2,
)
rag = RAGAnything(config=config)
print("RAG-Anything initialized with batch processing capabilities")
# Show available batch methods
batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
print(f"Available batch methods: {batch_methods}")
# Demonstrate batch processing with RAG integration
print(f"\nProcessing {len(sample_files)} documents with RAG integration...")
# Use the RAG-integrated batch processing
try:
# Process documents in batch
result = rag.process_documents_batch(
file_paths=sample_files,
output_dir=str(temp_dir / "rag_batch_output"),
max_workers=2,
show_progress=True,
)
print("\n" + "-" * 40)
print("RAG BATCH PROCESSING RESULTS")
print("-" * 40)
print(result.summary())
print(f"Success rate: {result.success_rate:.1f}%")
# Demonstrate batch processing with full RAG integration
print("\nProcessing documents with full RAG integration...")
rag_result = await rag.process_documents_with_rag_batch(
file_paths=sample_files[:2], # Process subset for demo
output_dir=str(temp_dir / "rag_full_output"),
max_workers=1,
show_progress=True,
)
print("\n" + "-" * 40)
print("FULL RAG INTEGRATION RESULTS")
print("-" * 40)
print(f"Parse result: {rag_result['parse_result'].summary()}")
print(
f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
)
print(
f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
)
print(f"Failed RAG processing: {rag_result['failed_rag_files']}")
return rag_result
except Exception as e:
print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
print(
"Note: This is expected in environments without full API configuration"
)
return None
except Exception as e:
print(f"❌ RAG integration demonstration failed: {str(e)}")
return None
def demonstrate_directory_processing():
"""Demonstrate processing entire directories"""
print("\n" + "=" * 60)
print("DIRECTORY PROCESSING DEMONSTRATION")
print("=" * 60)
# Create a directory structure with nested files
temp_dir = Path(tempfile.mkdtemp())
# Create main directory files
main_files = {
"overview.txt": "Main directory overview document",
"readme.md": "# Project README\n\nThis is the main project documentation.",
}
# Create subdirectory
sub_dir = temp_dir / "subdirectory"
sub_dir.mkdir()
sub_files = {
"details.txt": "Detailed information in subdirectory",
"notes.md": "# Notes\n\nAdditional notes and information.",
}
# Write all files
all_files = []
for filename, content in main_files.items():
file_path = temp_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
all_files.append(str(file_path))
for filename, content in sub_files.items():
file_path = sub_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
all_files.append(str(file_path))
try:
print("Created directory structure:")
print(f" Main directory: {temp_dir}")
print(f" Files in main: {list(main_files.keys())}")
print(f" Subdirectory: {sub_dir}")
print(f" Files in sub: {list(sub_files.keys())}")
# Create batch parser
batch_parser = BatchParser(
parser_type="mineru",
max_workers=2,
show_progress=True,
skip_installation_check=True,
)
# Process entire directory recursively
print("\nProcessing entire directory recursively...")
result = batch_parser.process_batch(
file_paths=[str(temp_dir)], # Pass directory path
output_dir=str(temp_dir / "directory_output"),
parse_method="auto",
recursive=True, # Include subdirectories
)
print("\n" + "-" * 40)
print("DIRECTORY PROCESSING RESULTS")
print("-" * 40)
print(result.summary())
print(f"Total files found and processed: {result.total_files}")
print(f"Success rate: {result.success_rate:.1f}%")
if result.successful_files:
print("\nSuccessfully processed:")
for file_path in result.successful_files:
relative_path = Path(file_path).relative_to(temp_dir)
print(f"{relative_path}")
return result
except Exception as e:
print(f"❌ Directory processing demonstration failed: {str(e)}")
return None
def demonstrate_error_handling():
"""Demonstrate error handling and recovery"""
print("\n" + "=" * 60)
print("ERROR HANDLING DEMONSTRATION")
print("=" * 60)
temp_dir = Path(tempfile.mkdtemp())
# Create files with various issues
files_with_issues = {
"valid_file.txt": "This is a valid file that should process successfully.",
"empty_file.txt": "", # Empty file
"large_file.txt": "x" * 1000000, # Large file (1MB of 'x')
}
created_files = []
for filename, content in files_with_issues.items():
file_path = temp_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
created_files.append(str(file_path))
# Add a non-existent file to the list
created_files.append(str(temp_dir / "non_existent_file.txt"))
try:
print(f"Testing error handling with {len(created_files)} files:")
for file_path in created_files:
name = Path(file_path).name
exists = Path(file_path).exists()
size = Path(file_path).stat().st_size if exists else 0
print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes")
# Create batch parser with short timeout for demonstration
batch_parser = BatchParser(
parser_type="mineru",
max_workers=2,
show_progress=True,
timeout_per_file=30, # Short timeout for demo
skip_installation_check=True,
)
# Process files and handle errors
result = batch_parser.process_batch(
file_paths=created_files,
output_dir=str(temp_dir / "error_test_output"),
parse_method="auto",
)
print("\n" + "-" * 40)
print("ERROR HANDLING RESULTS")
print("-" * 40)
print(result.summary())
if result.successful_files:
print("\nSuccessful files:")
for file_path in result.successful_files:
print(f"{Path(file_path).name}")
if result.failed_files:
print("\nFailed files with error details:")
for file_path in result.failed_files:
error = result.errors.get(file_path, "Unknown error")
print(f"{Path(file_path).name}: {error}")
# Demonstrate retry logic
if result.failed_files:
print(
f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
)
# Retry only the failed files
retry_result = batch_parser.process_batch(
file_paths=result.failed_files,
output_dir=str(temp_dir / "retry_output"),
parse_method="auto",
)
print(f"Retry results: {retry_result.summary()}")
return result
except Exception as e:
print(f"❌ Error handling demonstration failed: {str(e)}")
return None
async def main():
"""Main demonstration function"""
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
print("RAG-Anything Batch Processing Demonstration")
print("=" * 70)
print("This example demonstrates various batch processing capabilities:")
print(" - Basic batch processing with progress tracking")
print(" - Asynchronous processing for improved performance")
print(" - Integration with RAG-Anything pipeline")
print(" - Directory processing with recursive file discovery")
print(" - Comprehensive error handling and recovery")
results = {}
# Run demonstrations
print("\n🚀 Starting demonstrations...")
# Basic batch processing
results["basic"] = demonstrate_basic_batch_processing()
# Asynchronous processing
results["async"] = await demonstrate_async_batch_processing()
# RAG integration
results["rag"] = await demonstrate_rag_integration()
# Directory processing
results["directory"] = demonstrate_directory_processing()
# Error handling
results["error_handling"] = demonstrate_error_handling()
# Summary
print("\n" + "=" * 70)
print("DEMONSTRATION SUMMARY")
print("=" * 70)
for demo_name, result in results.items():
if result:
if hasattr(result, "success_rate"):
print(
f"{demo_name.upper()}: {result.success_rate:.1f}% success rate"
)
else:
print(f"{demo_name.upper()}: Completed successfully")
else:
print(f"{demo_name.upper()}: Failed or had limitations")
print("\n📊 Key Features Demonstrated:")
print(" - Parallel document processing with configurable worker counts")
print(" - Real-time progress tracking with tqdm progress bars")
print(" - Comprehensive error handling and reporting")
print(" - File filtering based on supported document types")
print(" - Directory processing with recursive file discovery")
print(" - Asynchronous processing for improved performance")
print(" - Integration with RAG-Anything document pipeline")
print(" - Retry logic for failed documents")
print(" - Detailed processing statistics and timing")
print("\n💡 Best Practices Highlighted:")
print(" - Use appropriate worker counts for your system")
print(" - Enable progress tracking for long-running operations")
print(" - Handle errors gracefully with retry mechanisms")
print(" - Filter files to supported types before processing")
print(" - Set reasonable timeouts for document processing")
print(" - Use skip_installation_check for environments with conflicts")
if __name__ == "__main__":
asyncio.run(main())

File diff suppressed because it is too large

View File

@@ -14,6 +14,7 @@ Usage:
"""
import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -51,7 +52,7 @@ def get_image_info(image_path: Path):
return {"error": str(e)}
def test_image_format_parsing(file_path: str):
async def test_image_format_parsing(file_path: str):
"""Test image format parsing with MinerU"""
print(f"🧪 Testing image format parsing: {file_path}")
@@ -101,12 +102,12 @@ def test_image_format_parsing(file_path: str):
print(f"✅ Format {file_path.suffix.upper()} is natively supported by MinerU")
# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything(working_dir="./temp_parsing_test")
rag = RAGAnything()
try:
# Test image parsing with MinerU
print("\n🔄 Testing image parsing with MinerU...")
content_list, md_content = rag.parse_document(
content_list, md_content = await rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
parse_method="ocr", # Images use OCR method
@@ -147,10 +148,9 @@ def test_image_format_parsing(file_path: str):
print(f"\n🖼️ Found {len(image_items)} processed image(s):")
for i, item in enumerate(image_items, 1):
print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
if item.get("img_caption"):
print(
f" Caption: {item.get('img_caption', [])[0] if item.get('img_caption') else 'N/A'}"
)
caption = item.get("image_caption", item.get("img_caption", []))
if caption:
print(f" Caption: {caption[0] if caption else 'N/A'}")
# Display text blocks (OCR results)
text_items = [
@@ -196,7 +196,7 @@ def main():
parser = argparse.ArgumentParser(
description="Test image format parsing with MinerU"
)
parser.add_argument("--file", required=True, help="Path to the image file to test")
parser.add_argument("--file", help="Path to the image file to test")
parser.add_argument(
"--check-pillow", action="store_true", help="Only check PIL/Pillow installation"
)
@@ -212,9 +212,15 @@ def main():
print("✅ PIL/Pillow installation check passed!")
return 0
# If not just checking dependencies, file argument is required
if not args.file:
print("❌ Error: --file argument is required when not using --check-pillow")
parser.print_help()
return 1
# Run the parsing test
try:
success = test_image_format_parsing(args.file)
success = asyncio.run(test_image_format_parsing(args.file))
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ Test interrupted by user")

View File

@@ -0,0 +1,422 @@
#!/usr/bin/env python
"""
Example script demonstrating direct content list insertion with RAGAnything
This example shows how to:
1. Create a simple content list with different content types
2. Insert content list directly without document parsing using insert_content_list() method
3. Perform pure text queries using aquery() method
4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
5. Handle different types of multimodal content in the inserted knowledge base
"""
import os
import argparse
import asyncio
import logging
import logging.config
from pathlib import Path
# Add project root directory to Python path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
from raganything import RAGAnything, RAGAnythingConfig
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env", override=False)
def configure_logging():
"""Configure logging for the application"""
# Get log directory path from environment variable or use current directory
log_dir = os.getenv("LOG_DIR", os.getcwd())
log_file_path = os.path.abspath(
os.path.join(log_dir, "insert_content_list_example.log")
)
print(f"\nInsert Content List example log file: {log_file_path}\n")
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
# Get log file max size and backup count from environment variables
log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB
log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups
logging.config.dictConfig(
{
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"default": {
"format": "%(levelname)s: %(message)s",
},
"detailed": {
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
},
},
"handlers": {
"console": {
"formatter": "default",
"class": "logging.StreamHandler",
"stream": "ext://sys.stderr",
},
"file": {
"formatter": "detailed",
"class": "logging.handlers.RotatingFileHandler",
"filename": log_file_path,
"maxBytes": log_max_bytes,
"backupCount": log_backup_count,
"encoding": "utf-8",
},
},
"loggers": {
"lightrag": {
"handlers": ["console", "file"],
"level": "INFO",
"propagate": False,
},
},
}
)
# Set the logger level to INFO
logger.setLevel(logging.INFO)
# Enable verbose debug if needed
set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")
def create_sample_content_list():
"""
Create a simple content list for testing insert_content_list functionality
Returns:
List[Dict]: Sample content list with various content types
Note:
- img_path should be absolute path to the image file
- page_idx represents the page number where the content appears (0-based)
"""
content_list = [
# Introduction text
{
"type": "text",
"text": "Welcome to the RAGAnything System Documentation. This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.",
"page_idx": 0, # Page number where this content appears
},
# System architecture image
{
"type": "image",
"img_path": "/absolute/path/to/system_architecture.jpg", # IMPORTANT: Use absolute path to image file
"image_caption": ["Figure 1: RAGAnything System Architecture"],
"image_footnote": [
"The architecture shows the complete pipeline from document parsing to multimodal query processing"
],
"page_idx": 1, # Page number where this image appears
},
# Performance comparison table
{
"type": "table",
"table_body": """| System | Accuracy | Processing Speed | Memory Usage |
|--------|----------|------------------|--------------|
| RAGAnything | 95.2% | 120ms | 2.1GB |
| Traditional RAG | 87.3% | 180ms | 3.2GB |
| Baseline System | 82.1% | 220ms | 4.1GB |
| Simple Retrieval | 76.5% | 95ms | 1.8GB |""",
"table_caption": [
"Table 1: Performance Comparison of Different RAG Systems"
],
"table_footnote": [
"All tests conducted on the same hardware with identical test datasets"
],
"page_idx": 2, # Page number where this table appears
},
# Mathematical formula
{
"type": "equation",
"latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i",
"text": "Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance factors",
"page_idx": 3, # Page number where this equation appears
},
# Feature description
{
"type": "text",
"text": "The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.",
"page_idx": 4, # Page number where this content appears
},
# Technical specifications table
{
"type": "table",
"table_body": """| Feature | Specification |
|---------|---------------|
| Supported Formats | PDF, DOCX, PPTX, XLSX, Images |
| Max Document Size | 100MB |
| Concurrent Processing | Up to 8 documents |
| Query Response Time | <200ms average |
| Knowledge Graph Nodes | Up to 1M entities |""",
"table_caption": ["Table 2: Technical Specifications"],
"table_footnote": [
"Specifications may vary based on hardware configuration"
],
"page_idx": 5, # Page number where this table appears
},
# Conclusion
{
"type": "text",
"text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.",
"page_idx": 6, # Page number where this content appears
},
]
return content_list
async def demo_insert_content_list(
api_key: str,
base_url: str = None,
working_dir: str = None,
):
"""
Demonstrate content list insertion and querying with RAGAnything
Args:
api_key: OpenAI API key
base_url: Optional base URL for API
working_dir: Working directory for RAG storage
"""
try:
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage",
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
display_content_stats=True, # Show content statistics
)
# Define LLM model function
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
return openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
):
if image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt}
if system_prompt
else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# Define embedding function - using environment variables for configuration
embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
embedding_func = EmbeddingFunc(
embedding_dim=embedding_dim,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model=embedding_model,
api_key=api_key,
base_url=base_url,
),
)
# Initialize RAGAnything
rag = RAGAnything(
config=config,
llm_model_func=llm_model_func,
vision_model_func=vision_model_func,
embedding_func=embedding_func,
)
# Create sample content list
logger.info("Creating sample content list...")
content_list = create_sample_content_list()
logger.info(f"Created content list with {len(content_list)} items")
# Insert content list directly
logger.info("\nInserting content list into RAGAnything...")
await rag.insert_content_list(
content_list=content_list,
file_path="raganything_documentation.pdf", # Reference file name for citation
split_by_character=None, # Optional text splitting
split_by_character_only=False, # Optional text splitting mode
doc_id="demo-doc-001", # Custom document ID
display_stats=True, # Show content statistics
)
logger.info("Content list insertion completed!")
# Example queries - demonstrating different query approaches
logger.info("\nQuerying inserted content:")
# 1. Pure text queries using aquery()
text_queries = [
"What is RAGAnything and what are its main features?",
"How does RAGAnything compare to traditional RAG systems?",
"What are the technical specifications of the system?",
]
for query in text_queries:
logger.info(f"\n[Text Query]: {query}")
result = await rag.aquery(query, mode="hybrid")
logger.info(f"Answer: {result}")
# 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
logger.info(
"\n[Multimodal Query]: Analyzing new performance data against existing benchmarks"
)
multimodal_result = await rag.aquery_with_multimodal(
"Compare this new performance data with the existing benchmark results in the documentation",
multimodal_content=[
{
"type": "table",
"table_data": """Method,Accuracy,Speed,Memory
New_Approach,97.1%,110ms,1.9GB
Enhanced_RAG,91.4%,140ms,2.5GB""",
"table_caption": "Latest experimental results",
}
],
mode="hybrid",
)
logger.info(f"Answer: {multimodal_result}")
# 3. Another multimodal query with equation content
logger.info("\n[Multimodal Query]: Mathematical formula analysis")
equation_result = await rag.aquery_with_multimodal(
"How does this similarity formula relate to the relevance scoring mentioned in the documentation?",
multimodal_content=[
{
"type": "equation",
"latex": "sim(a, b) = \\frac{a \\cdot b}{||a|| \\times ||b||} + \\beta \\cdot context\\_weight",
"equation_caption": "Enhanced cosine similarity with context weighting",
}
],
mode="hybrid",
)
logger.info(f"Answer: {equation_result}")
# 4. Insert another content list with different document ID
logger.info("\nInserting additional content list...")
additional_content = [
{
"type": "text",
"text": "This is additional documentation about advanced features and configuration options.",
"page_idx": 0, # Page number where this content appears
},
{
"type": "table",
"table_body": """| Configuration | Default Value | Range |
|---------------|---------------|-------|
| Chunk Size | 512 tokens | 128-2048 |
| Context Window | 4096 tokens | 1024-8192 |
| Batch Size | 32 | 1-128 |""",
"table_caption": ["Advanced Configuration Parameters"],
"page_idx": 1, # Page number where this table appears
},
]
await rag.insert_content_list(
content_list=additional_content,
file_path="advanced_configuration.pdf",
doc_id="demo-doc-002", # Different document ID
)
# Query combined knowledge base
logger.info("\n[Combined Query]: What configuration options are available?")
combined_result = await rag.aquery(
"What configuration options are available and what are their default values?",
mode="hybrid",
)
logger.info(f"Answer: {combined_result}")
except Exception as e:
logger.error(f"Error in content list insertion demo: {str(e)}")
import traceback
logger.error(traceback.format_exc())
def main():
"""Main function to run the example"""
parser = argparse.ArgumentParser(description="Insert Content List Example")
parser.add_argument(
"--working_dir", "-w", default="./rag_storage", help="Working directory path"
)
parser.add_argument(
"--api-key",
default=os.getenv("LLM_BINDING_API_KEY"),
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
)
parser.add_argument(
"--base-url",
default=os.getenv("LLM_BINDING_HOST"),
help="Optional base URL for API",
)
args = parser.parse_args()
# Check if API key is provided
if not args.api_key:
logger.error("Error: OpenAI API key is required")
logger.error("Set api key environment variable or use --api-key option")
return
# Run the demo
asyncio.run(
demo_insert_content_list(
args.api_key,
args.base_url,
args.working_dir,
)
)
if __name__ == "__main__":
# Configure logging first
configure_logging()
print("RAGAnything Insert Content List Example")
print("=" * 45)
print("Demonstrating direct content list insertion without document parsing")
print("=" * 45)
main()

View File

@@ -0,0 +1,334 @@
"""
LM Studio Integration Example with RAG-Anything
This example demonstrates how to integrate LM Studio with RAG-Anything for local
text document processing and querying.
Requirements:
- LM Studio running locally with server enabled
- OpenAI Python package: pip install openai
- RAG-Anything installed: pip install raganything
Environment Setup:
Create a .env file with:
LLM_BINDING=lmstudio
LLM_MODEL=openai/gpt-oss-20b
LLM_BINDING_HOST=http://localhost:1234/v1
LLM_BINDING_API_KEY=lm-studio
EMBEDDING_BINDING=lmstudio
EMBEDDING_MODEL=text-embedding-nomic-embed-text-v1.5
EMBEDDING_BINDING_HOST=http://localhost:1234/v1
EMBEDDING_BINDING_API_KEY=lm-studio
"""
import os
import uuid
import asyncio
from typing import List, Dict, Optional
from dotenv import load_dotenv
from openai import AsyncOpenAI
# Load environment variables
load_dotenv()
# RAG-Anything imports
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.utils import EmbeddingFunc
from lightrag.llm.openai import openai_complete_if_cache
LM_BASE_URL = os.getenv("LLM_BINDING_HOST", "http://localhost:1234/v1")
LM_API_KEY = os.getenv("LLM_BINDING_API_KEY", "lm-studio")
LM_MODEL_NAME = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
LM_EMBED_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-nomic-embed-text-v1.5")
async def lmstudio_llm_model_func(
prompt: str,
system_prompt: Optional[str] = None,
history_messages: List[Dict] = None,
**kwargs,
) -> str:
"""Top-level LLM function for LightRAG (pickle-safe)."""
return await openai_complete_if_cache(
model=LM_MODEL_NAME,
prompt=prompt,
system_prompt=system_prompt,
history_messages=history_messages or [],
base_url=LM_BASE_URL,
api_key=LM_API_KEY,
**kwargs,
)
async def lmstudio_embedding_async(texts: List[str]) -> List[List[float]]:
"""Top-level embedding function for LightRAG (pickle-safe)."""
from lightrag.llm.openai import openai_embed
embeddings = await openai_embed(
texts=texts,
model=LM_EMBED_MODEL,
base_url=LM_BASE_URL,
api_key=LM_API_KEY,
)
return embeddings.tolist()
class LMStudioRAGIntegration:
"""Integration class for LM Studio with RAG-Anything."""
def __init__(self):
# LM Studio configuration using standard LLM_BINDING variables
self.base_url = os.getenv("LLM_BINDING_HOST", "http://localhost:1234/v1")
self.api_key = os.getenv("LLM_BINDING_API_KEY", "lm-studio")
self.model_name = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
self.embedding_model = os.getenv(
"EMBEDDING_MODEL", "text-embedding-nomic-embed-text-v1.5"
)
# RAG-Anything configuration
# Use a fresh working directory each run to avoid legacy doc_status schema conflicts
self.config = RAGAnythingConfig(
working_dir=f"./rag_storage_lmstudio/{uuid.uuid4()}",
parser="mineru",
parse_method="auto",
enable_image_processing=False,
enable_table_processing=True,
enable_equation_processing=True,
)
print(f"📁 Using working_dir: {self.config.working_dir}")
self.rag = None
async def test_connection(self) -> bool:
"""Test LM Studio connection."""
try:
print(f"🔌 Testing LM Studio connection at: {self.base_url}")
client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
models = await client.models.list()
print(f"✅ Connected successfully! Found {len(models.data)} models")
# Show available models
print("📊 Available models:")
for i, model in enumerate(models.data[:5]):
marker = "🎯" if model.id == self.model_name else " "
print(f"{marker} {i+1}. {model.id}")
if len(models.data) > 5:
print(f" ... and {len(models.data) - 5} more models")
return True
except Exception as e:
print(f"❌ Connection failed: {str(e)}")
print("\n💡 Troubleshooting tips:")
print("1. Ensure LM Studio is running")
print("2. Start the local server in LM Studio")
print("3. Load a model or enable just-in-time loading")
print(f"4. Verify server address: {self.base_url}")
return False
finally:
try:
await client.close()
except Exception:
pass
async def test_chat_completion(self) -> bool:
"""Test basic chat functionality."""
try:
print(f"💬 Testing chat with model: {self.model_name}")
client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
response = await client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{
"role": "user",
"content": "Hello! Please confirm you're working and tell me your capabilities.",
},
],
max_tokens=100,
temperature=0.7,
)
result = response.choices[0].message.content.strip()
print("✅ Chat test successful!")
print(f"Response: {result}")
return True
except Exception as e:
print(f"❌ Chat test failed: {str(e)}")
return False
finally:
try:
await client.close()
except Exception:
pass
# Deprecated factory helpers removed to reduce redundancy
def embedding_func_factory(self):
"""Create a completely serializable embedding function."""
return EmbeddingFunc(
embedding_dim=768, # nomic-embed-text-v1.5 default dimension
max_token_size=8192, # nomic-embed-text-v1.5 context length
func=lmstudio_embedding_async,
)
async def initialize_rag(self):
"""Initialize RAG-Anything with LM Studio functions."""
print("Initializing RAG-Anything with LM Studio...")
try:
self.rag = RAGAnything(
config=self.config,
llm_model_func=lmstudio_llm_model_func,
embedding_func=self.embedding_func_factory(),
)
# Compatibility: avoid writing unknown field 'multimodal_processed' to LightRAG doc_status
# Older LightRAG versions may not accept this extra field in DocProcessingStatus
async def _noop_mark_multimodal(doc_id: str):
return None
self.rag._mark_multimodal_processing_complete = _noop_mark_multimodal
print("✅ RAG-Anything initialized successfully!")
return True
except Exception as e:
print(f"❌ RAG initialization failed: {str(e)}")
return False
async def process_document_example(self, file_path: str):
"""Example: Process a document with LM Studio backend."""
if not self.rag:
print("❌ RAG not initialized. Call initialize_rag() first.")
return
try:
print(f"📄 Processing document: {file_path}")
await self.rag.process_document_complete(
file_path=file_path,
output_dir="./output_lmstudio",
parse_method="auto",
display_stats=True,
)
print("✅ Document processing completed!")
except Exception as e:
print(f"❌ Document processing failed: {str(e)}")
async def query_examples(self):
"""Example queries with different modes."""
if not self.rag:
print("❌ RAG not initialized. Call initialize_rag() first.")
return
# Example queries
queries = [
("What are the main topics in the processed documents?", "hybrid"),
("Summarize any tables or data found in the documents", "local"),
("What images or figures are mentioned?", "global"),
]
print("\n🔍 Running example queries...")
for query, mode in queries:
try:
print(f"\nQuery ({mode}): {query}")
result = await self.rag.aquery(query, mode=mode)
print(f"Answer: {result[:200]}...")
except Exception as e:
print(f"❌ Query failed: {str(e)}")
async def simple_query_example(self):
"""Example basic text query with sample content."""
if not self.rag:
print("❌ RAG not initialized")
return
try:
print("\nAdding sample content for testing...")
# Create content list in the format expected by RAGAnything
content_list = [
{
"type": "text",
"text": """LM Studio Integration with RAG-Anything
This integration demonstrates how to connect LM Studio's local AI models with RAG-Anything's document processing capabilities. The system uses:
- LM Studio for local LLM inference
- nomic-embed-text-v1.5 for embeddings (768 dimensions)
- RAG-Anything for document processing and retrieval
Key benefits include:
- Privacy: All processing happens locally
- Performance: Direct API access to local models
- Flexibility: Support for various document formats
- Cost-effective: No external API usage""",
"page_idx": 0,
}
]
# Insert the content list using the correct method
await self.rag.insert_content_list(
content_list=content_list,
file_path="lmstudio_integration_demo.txt",
# Use a unique doc_id to avoid collisions and doc_status reuse across runs
doc_id=f"demo-content-{uuid.uuid4()}",
display_stats=True,
)
print("✅ Sample content added to knowledge base")
print("\nTesting basic text query...")
# Simple text query example
result = await self.rag.aquery(
"What are the key benefits of this LM Studio integration?",
mode="hybrid",
)
print(f"✅ Query result: {result[:300]}...")
except Exception as e:
print(f"❌ Query failed: {str(e)}")
async def main():
"""Main example function."""
print("=" * 70)
print("LM Studio + RAG-Anything Integration Example")
print("=" * 70)
# Initialize integration
integration = LMStudioRAGIntegration()
# Test connection
if not await integration.test_connection():
return False
print()
if not await integration.test_chat_completion():
return False
# Initialize RAG
print("\n" + "" * 50)
if not await integration.initialize_rag():
return False
# Example document processing (uncomment and provide a real file path)
# await integration.process_document_example("path/to/your/document.pdf")
# Example queries (uncomment after processing documents)
# await integration.query_examples()
# Example basic query
await integration.simple_query_example()
print("\n" + "=" * 70)
print("Integration example completed successfully!")
print("=" * 70)
return True
if __name__ == "__main__":
print("🚀 Starting LM Studio integration example...")
success = asyncio.run(main())
exit(0 if success else 1)

View File

@@ -91,12 +91,12 @@ async def process_image_example(lightrag: LightRAG, vision_model_func):
# Prepare image content
image_content = {
"img_path": "image.jpg",
"img_caption": ["Example image caption"],
"img_footnote": ["Example image footnote"],
"image_caption": ["Example image caption"],
"image_footnote": ["Example image footnote"],
}
# Process image
description, entity_info = await image_processor.process_multimodal_content(
(description, entity_info, _) = await image_processor.process_multimodal_content(
modal_content=image_content,
content_type="image",
file_path="image_example.jpg",
@@ -128,7 +128,7 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
}
# Process table
description, entity_info = await table_processor.process_multimodal_content(
(description, entity_info, _) = await table_processor.process_multimodal_content(
modal_content=table_content,
content_type="table",
file_path="table_example.md",
@@ -151,7 +151,7 @@ async def process_equation_example(lightrag: LightRAG, llm_model_func):
equation_content = {"text": "E = mc^2", "text_format": "LaTeX"}
# Process equation
description, entity_info = await equation_processor.process_multimodal_content(
(description, entity_info, _) = await equation_processor.process_multimodal_content(
modal_content=equation_content,
content_type="equation",
file_path="equation_example.txt",
@@ -164,14 +164,20 @@ async def process_equation_example(lightrag: LightRAG, llm_model_func):
async def initialize_rag(api_key: str, base_url: str = None):
# Use environment variables for embedding configuration
import os
embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
rag = LightRAG(
working_dir=WORKING_DIR,
embedding_func=EmbeddingFunc(
embedding_dim=3072,
embedding_dim=embedding_dim,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
model=embedding_model,
api_key=api_key,
base_url=base_url,
),

View File

@@ -14,6 +14,7 @@ Usage:
"""
import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -45,7 +46,7 @@ def check_libreoffice_installation():
return False
def test_office_document_parsing(file_path: str):
async def test_office_document_parsing(file_path: str):
"""Test Office document parsing with MinerU"""
print(f"🧪 Testing Office document parsing: {file_path}")
@@ -66,12 +67,12 @@ def test_office_document_parsing(file_path: str):
print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")
# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything(working_dir="./temp_parsing_test")
rag = RAGAnything()
try:
# Test document parsing with MinerU
print("\n🔄 Testing document parsing with MinerU...")
content_list, md_content = rag.parse_document(
content_list, md_content = await rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
parse_method="auto",
@@ -157,9 +158,7 @@ def main():
parser = argparse.ArgumentParser(
description="Test Office document parsing with MinerU"
)
parser.add_argument(
"--file", required=True, help="Path to the Office document to test"
)
parser.add_argument("--file", help="Path to the Office document to test")
parser.add_argument(
"--check-libreoffice",
action="store_true",
@@ -177,9 +176,17 @@ def main():
print("✅ LibreOffice installation check passed!")
return 0
# If not just checking dependencies, file argument is required
if not args.file:
print(
"❌ Error: --file argument is required when not using --check-libreoffice"
)
parser.print_help()
return 1
# Run the parsing test
try:
success = test_office_document_parsing(args.file)
success = asyncio.run(test_office_document_parsing(args.file))
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ Test interrupted by user")

View File

@@ -3,9 +3,10 @@
Example script demonstrating the integration of MinerU parser with RAGAnything
This example shows how to:
1. Process parsed documents with RAGAnything
2. Perform multimodal queries on the processed documents
3. Handle different types of content (text, images, tables)
1. Process documents with RAGAnything using MinerU parser
2. Perform pure text queries using aquery() method
3. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
4. Handle different types of multimodal content (tables, equations) in queries
"""
import os
@@ -24,6 +25,10 @@ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
from raganything import RAGAnything, RAGAnythingConfig
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env", override=False)
def configure_logging():
"""Configure logging for the application"""
@@ -87,6 +92,7 @@ async def process_with_rag(
api_key: str,
base_url: str = None,
working_dir: str = None,
parser: str = None,
):
"""
Process document with RAGAnything
@@ -102,7 +108,8 @@ async def process_with_rag(
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage",
mineru_parse_method="auto",
parser=parser, # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
@@ -122,9 +129,27 @@ async def process_with_rag(
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt,
system_prompt=None,
history_messages=[],
image_data=None,
messages=None,
**kwargs,
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -153,16 +178,20 @@ async def process_with_rag(
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
# Define embedding function
# Define embedding function - using environment variables for configuration
embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
embedding_func = EmbeddingFunc(
embedding_dim=3072,
embedding_dim=embedding_dim,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
model=embedding_model,
api_key=api_key,
base_url=base_url,
),
@@ -181,19 +210,55 @@ async def process_with_rag(
file_path=file_path, output_dir=output_dir, parse_method="auto"
)
# Example queries
queries = [
# Example queries - demonstrating different query approaches
logger.info("\nQuerying processed document:")
# 1. Pure text queries using aquery()
text_queries = [
"What is the main content of the document?",
"Describe the images and figures in the document",
"Tell me about the experimental results and data tables",
"What are the key topics discussed?",
]
logger.info("\nQuerying processed document:")
for query in queries:
logger.info(f"\nQuery: {query}")
result = await rag.query_with_multimodal(query, mode="hybrid")
for query in text_queries:
logger.info(f"\n[Text Query]: {query}")
result = await rag.aquery(query, mode="hybrid")
logger.info(f"Answer: {result}")
# 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
logger.info(
"\n[Multimodal Query]: Analyzing performance data in context of document"
)
multimodal_result = await rag.aquery_with_multimodal(
"Compare this performance data with any similar results mentioned in the document",
multimodal_content=[
{
"type": "table",
"table_data": """Method,Accuracy,Processing_Time
RAGAnything,95.2%,120ms
Traditional_RAG,87.3%,180ms
Baseline,82.1%,200ms""",
"table_caption": "Performance comparison results",
}
],
mode="hybrid",
)
logger.info(f"Answer: {multimodal_result}")
# 3. Another multimodal query with equation content
logger.info("\n[Multimodal Query]: Mathematical formula analysis")
equation_result = await rag.aquery_with_multimodal(
"Explain this formula and relate it to any mathematical concepts in the document",
multimodal_content=[
{
"type": "equation",
"latex": "F1 = 2 \\cdot \\frac{precision \\cdot recall}{precision + recall}",
"equation_caption": "F1-score calculation formula",
}
],
mode="hybrid",
)
logger.info(f"Answer: {equation_result}")
except Exception as e:
logger.error(f"Error processing with RAG: {str(e)}")
import traceback
@@ -213,17 +278,26 @@ def main():
)
parser.add_argument(
"--api-key",
default=os.getenv("OPENAI_API_KEY"),
help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
default=os.getenv("LLM_BINDING_API_KEY"),
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
)
parser.add_argument(
"--base-url",
default=os.getenv("LLM_BINDING_HOST"),
help="Optional base URL for API",
)
parser.add_argument(
"--parser",
default=os.getenv("PARSER", "mineru"),
help="Optional base URL for API",
)
parser.add_argument("--base-url", help="Optional base URL for API")
args = parser.parse_args()
# Check if API key is provided
if not args.api_key:
logger.error("Error: OpenAI API key is required")
logger.error("Set OPENAI_API_KEY environment variable or use --api-key option")
logger.error("Set api key environment variable or use --api-key option")
return
# Create output directory if specified
@@ -233,7 +307,12 @@ def main():
# Process with RAG
asyncio.run(
process_with_rag(
args.file_path, args.output, args.api_key, args.base_url, args.working_dir
args.file_path,
args.output,
args.api_key,
args.base_url,
args.working_dir,
args.parser,
)
)

View File

@@ -14,6 +14,7 @@ Usage:
"""
import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -34,7 +35,7 @@ def check_reportlab_installation():
return False
def test_text_format_parsing(file_path: str):
async def test_text_format_parsing(file_path: str):
"""Test text format parsing with MinerU"""
print(f"🧪 Testing text format parsing: {file_path}")
@@ -66,12 +67,12 @@ def test_text_format_parsing(file_path: str):
)
# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything(working_dir="./temp_parsing_test")
rag = RAGAnything()
try:
# Test text parsing with MinerU
print("\n🔄 Testing text parsing with MinerU...")
content_list, md_content = rag.parse_document(
content_list, md_content = await rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
parse_method="auto",
@@ -157,7 +158,7 @@ def test_text_format_parsing(file_path: str):
def main():
"""Main function"""
parser = argparse.ArgumentParser(description="Test text format parsing with MinerU")
parser.add_argument("--file", required=True, help="Path to the text file to test")
parser.add_argument("--file", help="Path to the text file to test")
parser.add_argument(
"--check-reportlab",
action="store_true",
@@ -175,9 +176,15 @@ def main():
print("✅ ReportLab installation check passed!")
return 0
# If not just checking dependencies, file argument is required
if not args.file:
print("❌ Error: --file argument is required when not using --check-reportlab")
parser.print_help()
return 1
# Run the parsing test
try:
success = test_text_format_parsing(args.file)
success = asyncio.run(test_text_format_parsing(args.file))
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ Test interrupted by user")

pyproject.toml Normal file
View File

@@ -0,0 +1,75 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "raganything"
dynamic = ["version"]
authors = [
{name = "Zirui Guo"}
]
description = "RAGAnything: All-in-One RAG System"
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.10"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"huggingface_hub",
"lightrag-hku",
"mineru[core]",
"tqdm",
]
[project.optional-dependencies]
image = ["Pillow>=10.0.0"]
text = ["reportlab>=4.0.0"]
office = [] # Requires LibreOffice (external program)
markdown = [
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0",
]
all = [
"Pillow>=10.0.0",
"reportlab>=4.0.0",
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0"
]
[project.urls]
Homepage = "https://github.com/HKUDS/RAG-Anything"
Documentation = "https://github.com/HKUDS/RAG-Anything"
Repository = "https://github.com/HKUDS/RAG-Anything"
Issues = "https://github.com/HKUDS/RAG-Anything/issues"
[tool.uv]
dev-dependencies = [
"pytest>=6.0",
"pytest-asyncio",
"black",
"isort",
"flake8",
"mypy",
"openai",
"python-dotenv",
]
[tool.setuptools.packages.find]
include = ["raganything*"]
[tool.setuptools]
include-package-data = true
[tool.setuptools.dynamic]
version = {attr = "raganything.__version__"}
[tool.ruff]
target-version = "py310"

View File

@@ -1,7 +1,7 @@
from .raganything import RAGAnything as RAGAnything
from .raganything import RAGAnythingConfig as RAGAnythingConfig
from .config import RAGAnythingConfig as RAGAnythingConfig
__version__ = "1.1.0"
__version__ = "1.2.8"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/RAG-Anything"

raganything/base.py Normal file
View File

@@ -0,0 +1,12 @@
from enum import Enum
class DocStatus(str, Enum):
"""Document processing status"""
READY = "ready"
HANDLING = "handling"
PENDING = "pending"
PROCESSING = "processing"
PROCESSED = "processed"
FAILED = "failed"

raganything/batch.py Normal file
View File

@@ -0,0 +1,386 @@
"""
Batch processing functionality for RAGAnything
Contains methods for processing multiple documents in batch mode
"""
import asyncio
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, TYPE_CHECKING
import time
from .batch_parser import BatchParser, BatchProcessingResult
if TYPE_CHECKING:
from .config import RAGAnythingConfig
class BatchMixin:
"""BatchMixin class containing batch processing functionality for RAGAnything"""
# Type hints for mixin attributes (will be available when mixed into RAGAnything)
config: "RAGAnythingConfig"
logger: logging.Logger
# Type hints for methods that will be available from other mixins
async def _ensure_lightrag_initialized(self) -> None: ...
async def process_document_complete(self, file_path: str, **kwargs) -> None: ...
# ==========================================
# ORIGINAL BATCH PROCESSING METHOD (RESTORED)
# ==========================================
async def process_folder_complete(
self,
folder_path: str,
output_dir: str = None,
parse_method: str = None,
display_stats: bool = None,
split_by_character: str | None = None,
split_by_character_only: bool = False,
file_extensions: Optional[List[str]] = None,
recursive: bool = None,
max_workers: int = None,
):
"""
Process all supported files in a folder
Args:
folder_path: Path to the folder containing files to process
output_dir: Directory for parsed outputs (optional)
parse_method: Parsing method to use (optional)
display_stats: Whether to display statistics (optional)
split_by_character: Character to split by (optional)
split_by_character_only: Whether to split only by character (optional)
file_extensions: List of file extensions to process (optional)
recursive: Whether to process folders recursively (optional)
max_workers: Maximum number of workers for concurrent processing (optional)
"""
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if display_stats is None:
display_stats = True
if file_extensions is None:
file_extensions = self.config.supported_file_extensions
if recursive is None:
recursive = self.config.recursive_folder_processing
if max_workers is None:
max_workers = self.config.max_concurrent_files
await self._ensure_lightrag_initialized()
# Get all files in the folder
folder_path_obj = Path(folder_path)
if not folder_path_obj.exists():
raise FileNotFoundError(f"Folder not found: {folder_path}")
# Collect files based on supported extensions
files_to_process = []
for file_ext in file_extensions:
if recursive:
pattern = f"**/*{file_ext}"
else:
pattern = f"*{file_ext}"
files_to_process.extend(folder_path_obj.glob(pattern))
if not files_to_process:
self.logger.warning(f"No supported files found in {folder_path}")
return
self.logger.info(
f"Found {len(files_to_process)} files to process in {folder_path}"
)
# Create output directory if it doesn't exist
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Process files with controlled concurrency
semaphore = asyncio.Semaphore(max_workers)
tasks = []
async def process_single_file(file_path: Path):
async with semaphore:
try:
await self.process_document_complete(
str(file_path),
output_dir=output_dir,
parse_method=parse_method,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
)
return True, str(file_path), None
except Exception as e:
self.logger.error(f"Failed to process {file_path}: {str(e)}")
return False, str(file_path), str(e)
# Create tasks for all files
for file_path in files_to_process:
task = asyncio.create_task(process_single_file(file_path))
tasks.append(task)
# Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True)
# Process results
successful_files = []
failed_files = []
for result in results:
if isinstance(result, Exception):
failed_files.append(("unknown", str(result)))
else:
success, file_path, error = result
if success:
successful_files.append(file_path)
else:
failed_files.append((file_path, error))
# Display statistics if requested
if display_stats:
self.logger.info("Processing complete!")
self.logger.info(f" Successful: {len(successful_files)} files")
self.logger.info(f" Failed: {len(failed_files)} files")
if failed_files:
self.logger.warning("Failed files:")
for file_path, error in failed_files:
self.logger.warning(f" - {file_path}: {error}")
# ==========================================
# NEW ENHANCED BATCH PROCESSING METHODS
# ==========================================
def process_documents_batch(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Process multiple documents in batch using the new BatchParser
Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser
Returns:
BatchProcessingResult: Results of the batch processing
"""
# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing
# Create batch parser
batch_parser = BatchParser(
parser_type=self.config.parser,
max_workers=max_workers,
show_progress=show_progress,
skip_installation_check=True, # Skip installation check for better UX
)
# Process batch
return batch_parser.process_batch(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
recursive=recursive,
**kwargs,
)
async def process_documents_batch_async(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Asynchronously process multiple documents in batch
Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser
Returns:
BatchProcessingResult: Results of the batch processing
"""
# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing
# Create batch parser
batch_parser = BatchParser(
parser_type=self.config.parser,
max_workers=max_workers,
show_progress=show_progress,
skip_installation_check=True, # Skip installation check for better UX
)
# Process batch asynchronously
return await batch_parser.process_batch_async(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
recursive=recursive,
**kwargs,
)
def get_supported_file_extensions(self) -> List[str]:
"""Get list of supported file extensions for batch processing"""
batch_parser = BatchParser(parser_type=self.config.parser)
return batch_parser.get_supported_extensions()
def filter_supported_files(
self, file_paths: List[str], recursive: Optional[bool] = None
) -> List[str]:
"""
Filter file paths to only include supported file types
Args:
file_paths: List of file paths to filter
recursive: Whether to process directories recursively
Returns:
List of supported file paths
"""
if recursive is None:
recursive = self.config.recursive_folder_processing
batch_parser = BatchParser(parser_type=self.config.parser)
return batch_parser.filter_supported_files(file_paths, recursive)
async def process_documents_with_rag_batch(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> Dict[str, Any]:
"""
Process documents in batch and then add them to RAG
This method combines document parsing and RAG insertion:
1. First, parse all documents using batch processing
2. Then, process each successfully parsed document with RAG
Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser
Returns:
Dict containing both parse results and RAG processing results
"""
start_time = time.time()
# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing
self.logger.info("Starting batch processing with RAG integration")
# Step 1: Parse documents in batch
parse_result = self.process_documents_batch(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
max_workers=max_workers,
recursive=recursive,
show_progress=show_progress,
**kwargs,
)
# Step 2: Process with RAG
# Initialize RAG system
await self._ensure_lightrag_initialized()
# Then, process each successful file with RAG
rag_results = {}
if parse_result.successful_files:
self.logger.info(
f"Processing {len(parse_result.successful_files)} files with RAG"
)
# Process files with RAG (this could be parallelized in the future)
for file_path in parse_result.successful_files:
try:
# Process the successfully parsed file with RAG
await self.process_document_complete(
file_path,
output_dir=output_dir,
parse_method=parse_method,
**kwargs,
)
# Get some statistics about the processed content
# This would require additional tracking in the RAG system
rag_results[file_path] = {"status": "success", "processed": True}
except Exception as e:
self.logger.error(
f"Failed to process {file_path} with RAG: {str(e)}"
)
rag_results[file_path] = {
"status": "failed",
"error": str(e),
"processed": False,
}
processing_time = time.time() - start_time
return {
"parse_result": parse_result,
"rag_results": rag_results,
"total_processing_time": processing_time,
"successful_rag_files": len(
[r for r in rag_results.values() if r["processed"]]
),
"failed_rag_files": len(
[r for r in rag_results.values() if not r["processed"]]
),
}
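As a usage sketch of the combined path above: the snippet below parses a folder and then feeds each successfully parsed document into RAG. It assumes rag is an already-initialized RAGAnything instance with working model functions, and the paths are placeholders.

import asyncio

async def run_batch_with_rag(rag):
    # Parse everything under ./docs, then insert each parsed document into RAG
    result = await rag.process_documents_with_rag_batch(
        file_paths=["./docs"],       # directories are expanded by the batch parser
        output_dir="./output",
        parse_method="auto",
        max_workers=2,
        recursive=True,
        show_progress=True,
    )
    print(f"Parsed OK:  {len(result['parse_result'].successful_files)}")
    print(f"RAG OK:     {result['successful_rag_files']}")
    print(f"RAG failed: {result['failed_rag_files']}")

# asyncio.run(run_batch_with_rag(rag))  # 'rag' must be constructed and configured elsewhere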

raganything/batch_parser.py Normal file
@@ -0,0 +1,430 @@
"""
Batch and Parallel Document Parsing
This module provides functionality for processing multiple documents in parallel,
with progress reporting and error handling.
"""
import asyncio
import functools
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import time
from tqdm import tqdm
from .parser import MineruParser, DoclingParser
@dataclass
class BatchProcessingResult:
"""Result of batch processing operation"""
successful_files: List[str]
failed_files: List[str]
total_files: int
processing_time: float
errors: Dict[str, str]
output_dir: str
@property
def success_rate(self) -> float:
"""Calculate success rate as percentage"""
if self.total_files == 0:
return 0.0
return (len(self.successful_files) / self.total_files) * 100
def summary(self) -> str:
"""Generate a summary of the batch processing results"""
return (
f"Batch Processing Summary:\n"
f" Total files: {self.total_files}\n"
f" Successful: {len(self.successful_files)} ({self.success_rate:.1f}%)\n"
f" Failed: {len(self.failed_files)}\n"
f" Processing time: {self.processing_time:.2f} seconds\n"
f" Output directory: {self.output_dir}"
)
class BatchParser:
"""
Batch document parser with parallel processing capabilities
Supports processing multiple documents concurrently with progress tracking
and comprehensive error handling.
"""
def __init__(
self,
parser_type: str = "mineru",
max_workers: int = 4,
show_progress: bool = True,
timeout_per_file: int = 300,
skip_installation_check: bool = False,
):
"""
Initialize batch parser
Args:
parser_type: Type of parser to use ("mineru" or "docling")
max_workers: Maximum number of parallel workers
show_progress: Whether to show progress bars
timeout_per_file: Timeout in seconds for each file
skip_installation_check: Skip parser installation check (useful for testing)
"""
self.parser_type = parser_type
self.max_workers = max_workers
self.show_progress = show_progress
self.timeout_per_file = timeout_per_file
self.logger = logging.getLogger(__name__)
# Initialize parser
if parser_type == "mineru":
self.parser = MineruParser()
elif parser_type == "docling":
self.parser = DoclingParser()
else:
raise ValueError(f"Unsupported parser type: {parser_type}")
# Check parser installation (optional)
if not skip_installation_check:
if not self.parser.check_installation():
self.logger.warning(
f"{parser_type.title()} parser installation check failed. "
f"This may be due to package conflicts. "
f"Use skip_installation_check=True to bypass this check."
)
# Don't raise an error, just warn - the parser might still work
def get_supported_extensions(self) -> List[str]:
"""Get list of supported file extensions"""
return list(
self.parser.OFFICE_FORMATS
| self.parser.IMAGE_FORMATS
| self.parser.TEXT_FORMATS
| {".pdf"}
)
def filter_supported_files(
self, file_paths: List[str], recursive: bool = True
) -> List[str]:
"""
Filter file paths to only include supported file types
Args:
file_paths: List of file paths or directories
recursive: Whether to search directories recursively
Returns:
List of supported file paths
"""
supported_extensions = set(self.get_supported_extensions())
supported_files = []
for path_str in file_paths:
path = Path(path_str)
if path.is_file():
if path.suffix.lower() in supported_extensions:
supported_files.append(str(path))
else:
self.logger.warning(f"Unsupported file type: {path}")
elif path.is_dir():
if recursive:
# Recursively find all files
for file_path in path.rglob("*"):
if (
file_path.is_file()
and file_path.suffix.lower() in supported_extensions
):
supported_files.append(str(file_path))
else:
# Only files in the directory (not subdirectories)
for file_path in path.glob("*"):
if (
file_path.is_file()
and file_path.suffix.lower() in supported_extensions
):
supported_files.append(str(file_path))
else:
self.logger.warning(f"Path does not exist: {path}")
return supported_files
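A small sketch of the filtering helper above, with a placeholder directory name: it expands the directory and keeps only extensions the selected parser supports.

from raganything.batch_parser import BatchParser

parser = BatchParser(parser_type="mineru", skip_installation_check=True)
files = parser.filter_supported_files(["./mixed_docs"], recursive=True)  # placeholder path
print(f"{len(files)} supported files found")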
def process_single_file(
self, file_path: str, output_dir: str, parse_method: str = "auto", **kwargs
) -> Tuple[bool, str, Optional[str]]:
"""
Process a single file
Args:
file_path: Path to the file to process
output_dir: Output directory
parse_method: Parsing method
**kwargs: Additional parser arguments
Returns:
Tuple of (success, file_path, error_message)
"""
try:
start_time = time.time()
# Create file-specific output directory
file_name = Path(file_path).stem
file_output_dir = Path(output_dir) / file_name
file_output_dir.mkdir(parents=True, exist_ok=True)
# Parse the document
content_list = self.parser.parse_document(
file_path=file_path,
output_dir=str(file_output_dir),
method=parse_method,
**kwargs,
)
processing_time = time.time() - start_time
self.logger.info(
f"Successfully processed {file_path} "
f"({len(content_list)} content blocks, {processing_time:.2f}s)"
)
return True, file_path, None
except Exception as e:
error_msg = f"Failed to process {file_path}: {str(e)}"
self.logger.error(error_msg)
return False, file_path, error_msg
def process_batch(
self,
file_paths: List[str],
output_dir: str,
parse_method: str = "auto",
recursive: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Process multiple files in parallel
Args:
file_paths: List of file paths or directories to process
output_dir: Base output directory
parse_method: Parsing method for all files
recursive: Whether to search directories recursively
**kwargs: Additional parser arguments
Returns:
BatchProcessingResult with processing statistics
"""
start_time = time.time()
# Filter to supported files
supported_files = self.filter_supported_files(file_paths, recursive)
if not supported_files:
self.logger.warning("No supported files found to process")
return BatchProcessingResult(
successful_files=[],
failed_files=[],
total_files=0,
processing_time=0.0,
errors={},
output_dir=output_dir,
)
self.logger.info(f"Found {len(supported_files)} files to process")
# Create output directory
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Process files in parallel
successful_files = []
failed_files = []
errors = {}
# Create progress bar if requested
pbar = None
if self.show_progress:
pbar = tqdm(
total=len(supported_files),
desc=f"Processing files ({self.parser_type})",
unit="file",
)
try:
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all tasks
future_to_file = {
executor.submit(
self.process_single_file,
file_path,
output_dir,
parse_method,
**kwargs,
): file_path
for file_path in supported_files
}
# Process completed tasks (the timeout bounds the whole as_completed wait, not each individual file)
for future in as_completed(
future_to_file, timeout=self.timeout_per_file
):
success, file_path, error_msg = future.result()
if success:
successful_files.append(file_path)
else:
failed_files.append(file_path)
errors[file_path] = error_msg
if pbar:
pbar.update(1)
except Exception as e:
self.logger.error(f"Batch processing failed: {str(e)}")
# Mark remaining files as failed
for future in future_to_file:
if not future.done():
file_path = future_to_file[future]
failed_files.append(file_path)
errors[file_path] = f"Processing interrupted: {str(e)}"
if pbar:
pbar.update(1)
finally:
if pbar:
pbar.close()
processing_time = time.time() - start_time
# Create result
result = BatchProcessingResult(
successful_files=successful_files,
failed_files=failed_files,
total_files=len(supported_files),
processing_time=processing_time,
errors=errors,
output_dir=output_dir,
)
# Log summary
self.logger.info(result.summary())
return result
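For orientation, a minimal synchronous usage sketch of process_batch; the input paths are placeholders.

from raganything.batch_parser import BatchParser

parser = BatchParser(parser_type="mineru", max_workers=4, show_progress=True)
result = parser.process_batch(
    file_paths=["./docs", "./extra/report.pdf"],   # placeholder paths
    output_dir="./parsed_output",
    parse_method="auto",
    recursive=True,
)
print(result.summary())
for path, error in result.errors.items():
    print(f"{path}: {error}")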
async def process_batch_async(
self,
file_paths: List[str],
output_dir: str,
parse_method: str = "auto",
recursive: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Async version of batch processing
Args:
file_paths: List of file paths or directories to process
output_dir: Base output directory
parse_method: Parsing method for all files
recursive: Whether to search directories recursively
**kwargs: Additional parser arguments
Returns:
BatchProcessingResult with processing statistics
"""
# Run the sync version in a thread pool
# (run_in_executor does not forward keyword arguments, so bind them with functools.partial)
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
None,
functools.partial(
self.process_batch,
file_paths,
output_dir,
parse_method,
recursive,
**kwargs,
),
)
def main():
"""Command-line interface for batch parsing"""
import argparse
parser = argparse.ArgumentParser(description="Batch document parsing")
parser.add_argument("paths", nargs="+", help="File paths or directories to process")
parser.add_argument("--output", "-o", required=True, help="Output directory")
parser.add_argument(
"--parser",
choices=["mineru", "docling"],
default="mineru",
help="Parser to use",
)
parser.add_argument(
"--method",
choices=["auto", "txt", "ocr"],
default="auto",
help="Parsing method",
)
parser.add_argument(
"--workers", type=int, default=4, help="Number of parallel workers"
)
parser.add_argument(
"--no-progress", action="store_true", help="Disable progress bar"
)
parser.add_argument(
"--recursive",
action="store_true",
default=True,
help="Search directories recursively",
)
parser.add_argument(
"--timeout", type=int, default=300, help="Timeout per file (seconds)"
)
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
try:
# Create batch parser
batch_parser = BatchParser(
parser_type=args.parser,
max_workers=args.workers,
show_progress=not args.no_progress,
timeout_per_file=args.timeout,
)
# Process files
result = batch_parser.process_batch(
file_paths=args.paths,
output_dir=args.output,
parse_method=args.method,
recursive=args.recursive,
)
# Print summary
print("\n" + result.summary())
# Exit with error code if any files failed
if result.failed_files:
return 1
return 0
except Exception as e:
print(f"Error: {str(e)}")
return 1
if __name__ == "__main__":
exit(main())
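The async wrapper can be driven from an event loop as sketched below (the paths are again placeholders); it runs the synchronous batch in a worker thread so a larger async application is not blocked.

import asyncio
from raganything.batch_parser import BatchParser

async def parse_all():
    parser = BatchParser(parser_type="docling", max_workers=2)
    result = await parser.process_batch_async(
        file_paths=["./docs"],
        output_dir="./parsed_output",
        parse_method="auto",
        recursive=True,
    )
    print(result.summary())

asyncio.run(parse_all())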

raganything/config.py Normal file
@@ -0,0 +1,147 @@
"""
Configuration classes for RAGAnything
Contains configuration dataclasses with environment variable support
"""
from dataclasses import dataclass, field
from typing import List
from lightrag.utils import get_env_value
@dataclass
class RAGAnythingConfig:
"""Configuration class for RAGAnything with environment variable support"""
# Directory Configuration
# ---
working_dir: str = field(default=get_env_value("WORKING_DIR", "./rag_storage", str))
"""Directory where RAG storage and cache files are stored."""
# Parser Configuration
# ---
parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
"""Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
"""Default output directory for parsed content."""
parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'."""
display_content_stats: bool = field(
default=get_env_value("DISPLAY_CONTENT_STATS", True, bool)
)
"""Whether to display content statistics during parsing."""
# Multimodal Processing Configuration
# ---
enable_image_processing: bool = field(
default=get_env_value("ENABLE_IMAGE_PROCESSING", True, bool)
)
"""Enable image content processing."""
enable_table_processing: bool = field(
default=get_env_value("ENABLE_TABLE_PROCESSING", True, bool)
)
"""Enable table content processing."""
enable_equation_processing: bool = field(
default=get_env_value("ENABLE_EQUATION_PROCESSING", True, bool)
)
"""Enable equation content processing."""
# Batch Processing Configuration
# ---
max_concurrent_files: int = field(
default=get_env_value("MAX_CONCURRENT_FILES", 1, int)
)
"""Maximum number of files to process concurrently."""
supported_file_extensions: List[str] = field(
default_factory=lambda: get_env_value(
"SUPPORTED_FILE_EXTENSIONS",
".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
str,
).split(",")
)
"""List of supported file extensions for batch processing."""
recursive_folder_processing: bool = field(
default=get_env_value("RECURSIVE_FOLDER_PROCESSING", True, bool)
)
"""Whether to recursively process subfolders in batch mode."""
# Context Extraction Configuration
# ---
context_window: int = field(default=get_env_value("CONTEXT_WINDOW", 1, int))
"""Number of pages/chunks to include before and after current item for context."""
context_mode: str = field(default=get_env_value("CONTEXT_MODE", "page", str))
"""Context extraction mode: 'page' for page-based, 'chunk' for chunk-based."""
max_context_tokens: int = field(
default=get_env_value("MAX_CONTEXT_TOKENS", 2000, int)
)
"""Maximum number of tokens in extracted context."""
include_headers: bool = field(default=get_env_value("INCLUDE_HEADERS", True, bool))
"""Whether to include document headers and titles in context."""
include_captions: bool = field(
default=get_env_value("INCLUDE_CAPTIONS", True, bool)
)
"""Whether to include image/table captions in context."""
context_filter_content_types: List[str] = field(
default_factory=lambda: get_env_value(
"CONTEXT_FILTER_CONTENT_TYPES", "text", str
).split(",")
)
"""Content types to include in context extraction (e.g., 'text', 'image', 'table')."""
content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
"""Default content format for context extraction when processing documents."""
def __post_init__(self):
"""Post-initialization setup for backward compatibility"""
# Support legacy environment variable names for backward compatibility
legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
if legacy_parse_method and not get_env_value("PARSE_METHOD", None, str):
self.parse_method = legacy_parse_method
import warnings
warnings.warn(
"MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
DeprecationWarning,
stacklevel=2,
)
@property
def mineru_parse_method(self) -> str:
"""
Backward compatibility property for old code.
.. deprecated::
Use `parse_method` instead. This property will be removed in a future version.
"""
import warnings
warnings.warn(
"mineru_parse_method is deprecated. Use parse_method instead.",
DeprecationWarning,
stacklevel=2,
)
return self.parse_method
@mineru_parse_method.setter
def mineru_parse_method(self, value: str):
"""Setter for backward compatibility"""
import warnings
warnings.warn(
"mineru_parse_method is deprecated. Use parse_method instead.",
DeprecationWarning,
stacklevel=2,
)
self.parse_method = value
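Because the dataclass defaults above are computed with get_env_value at class-definition time, the environment variables must be set before raganything.config is imported; explicit constructor arguments always override them. A brief sketch:

# Set PARSER, PARSE_METHOD, etc. in the shell (or via a .env loader) before this import runs.
from raganything.config import RAGAnythingConfig

config = RAGAnythingConfig()                 # values derived from the environment, else the defaults above
custom = RAGAnythingConfig(
    working_dir="./rag_storage",
    parse_method="ocr",
    enable_table_processing=False,           # explicit arguments win over env-derived defaults
)
print(config.parser, custom.parse_method)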

@@ -0,0 +1,534 @@
"""
Enhanced Markdown to PDF Conversion
This module provides improved Markdown to PDF conversion with:
- Better formatting and styling
- Image support
- Table support
- Code syntax highlighting
- Custom templates
- Multiple output formats
"""
import os
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass
import tempfile
import subprocess
try:
import markdown
MARKDOWN_AVAILABLE = True
except ImportError:
MARKDOWN_AVAILABLE = False
try:
from weasyprint import HTML
WEASYPRINT_AVAILABLE = True
except ImportError:
WEASYPRINT_AVAILABLE = False
try:
# Check if pandoc module exists (not used directly, just for detection)
import importlib.util
spec = importlib.util.find_spec("pandoc")
PANDOC_AVAILABLE = spec is not None
except ImportError:
PANDOC_AVAILABLE = False
@dataclass
class MarkdownConfig:
"""Configuration for Markdown to PDF conversion"""
# Styling options
css_file: Optional[str] = None
template_file: Optional[str] = None
page_size: str = "A4"
margin: str = "1in"
font_size: str = "12pt"
line_height: str = "1.5"
# Content options
include_toc: bool = True
syntax_highlighting: bool = True
image_max_width: str = "100%"
table_style: str = "border-collapse: collapse; width: 100%;"
# Output options
output_format: str = "pdf" # pdf, html, docx
output_dir: Optional[str] = None
# Advanced options
custom_css: Optional[str] = None
metadata: Optional[Dict[str, str]] = None
class EnhancedMarkdownConverter:
"""
Enhanced Markdown to PDF converter with multiple backends
Supports multiple conversion methods:
- WeasyPrint (recommended for HTML/CSS styling)
- Pandoc (recommended for complex documents)
- ReportLab (fallback, basic styling)
"""
def __init__(self, config: Optional[MarkdownConfig] = None):
"""
Initialize the converter
Args:
config: Configuration for conversion
"""
self.config = config or MarkdownConfig()
self.logger = logging.getLogger(__name__)
# Check available backends
self.available_backends = self._check_backends()
self.logger.info(f"Available backends: {list(self.available_backends.keys())}")
def _check_backends(self) -> Dict[str, bool]:
"""Check which conversion backends are available"""
backends = {
"weasyprint": WEASYPRINT_AVAILABLE,
"pandoc": PANDOC_AVAILABLE,
"markdown": MARKDOWN_AVAILABLE,
}
# Check if pandoc is installed on system
try:
subprocess.run(["pandoc", "--version"], capture_output=True, check=True)
backends["pandoc_system"] = True
except (subprocess.CalledProcessError, FileNotFoundError):
backends["pandoc_system"] = False
return backends
def _get_default_css(self) -> str:
"""Get default CSS styling"""
return """
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
h1, h2, h3, h4, h5, h6 {
color: #2c3e50;
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; }
h3 { font-size: 1.3em; }
h4 { font-size: 1.1em; }
p { margin-bottom: 1em; }
code {
background-color: #f8f9fa;
padding: 2px 4px;
border-radius: 3px;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}
pre {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
overflow-x: auto;
border-left: 4px solid #3498db;
}
pre code {
background-color: transparent;
padding: 0;
}
blockquote {
border-left: 4px solid #3498db;
margin: 0;
padding-left: 20px;
color: #7f8c8d;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
th, td {
border: 1px solid #ddd;
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f2f2f2;
font-weight: bold;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 1em auto;
}
ul, ol {
margin-bottom: 1em;
}
li {
margin-bottom: 0.5em;
}
a {
color: #3498db;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
.toc {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
margin-bottom: 2em;
}
.toc ul {
list-style-type: none;
padding-left: 0;
}
.toc li {
margin-bottom: 0.3em;
}
.toc a {
color: #2c3e50;
}
"""
def _process_markdown_content(self, content: str) -> str:
"""Process Markdown content with extensions"""
if not MARKDOWN_AVAILABLE:
raise RuntimeError(
"Markdown library not available. Install with: pip install markdown"
)
# Configure Markdown extensions
extensions = [
"markdown.extensions.tables",
"markdown.extensions.fenced_code",
"markdown.extensions.codehilite",
"markdown.extensions.toc",
"markdown.extensions.attr_list",
"markdown.extensions.def_list",
"markdown.extensions.footnotes",
]
extension_configs = {
"codehilite": {
"css_class": "highlight",
"use_pygments": True,
},
"toc": {
"title": "Table of Contents",
"permalink": True,
},
}
# Convert Markdown to HTML
md = markdown.Markdown(
extensions=extensions, extension_configs=extension_configs
)
html_content = md.convert(content)
# Add CSS styling
css = self.config.custom_css or self._get_default_css()
# Create complete HTML document
html_doc = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Converted Document</title>
<style>
{css}
</style>
</head>
<body>
{html_content}
</body>
</html>
"""
return html_doc
def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
"""Convert using WeasyPrint (best for styling)"""
if not WEASYPRINT_AVAILABLE:
raise RuntimeError(
"WeasyPrint not available. Install with: pip install weasyprint"
)
try:
# Process Markdown to HTML
html_content = self._process_markdown_content(markdown_content)
# Convert HTML to PDF
html = HTML(string=html_content)
html.write_pdf(output_path)
self.logger.info(
f"Successfully converted to PDF using WeasyPrint: {output_path}"
)
return True
except Exception as e:
self.logger.error(f"WeasyPrint conversion failed: {str(e)}")
return False
def convert_with_pandoc(
self, markdown_content: str, output_path: str, use_system_pandoc: bool = False
) -> bool:
"""Convert using Pandoc (best for complex documents)"""
if (
not self.available_backends.get("pandoc_system", False)
and not use_system_pandoc
):
raise RuntimeError(
"Pandoc not available. Install from: https://pandoc.org/installing.html"
)
temp_md_path = None
try:
# Create temporary markdown file
with tempfile.NamedTemporaryFile(
mode="w", suffix=".md", delete=False
) as temp_file:
temp_file.write(markdown_content)
temp_md_path = temp_file.name
# Build pandoc command with wkhtmltopdf engine
cmd = [
"pandoc",
temp_md_path,
"-o",
output_path,
"--pdf-engine=wkhtmltopdf",
"--standalone",
"--toc",
"--number-sections",
]
# Run pandoc
result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
if result.returncode == 0:
self.logger.info(
f"Successfully converted to PDF using Pandoc: {output_path}"
)
return True
else:
self.logger.error(f"Pandoc conversion failed: {result.stderr}")
return False
except Exception as e:
self.logger.error(f"Pandoc conversion failed: {str(e)}")
return False
finally:
if temp_md_path and os.path.exists(temp_md_path):
try:
os.unlink(temp_md_path)
except OSError as e:
self.logger.error(
f"Failed to clean up temp file {temp_md_path}: {str(e)}"
)
def convert_markdown_to_pdf(
self, markdown_content: str, output_path: str, method: str = "auto"
) -> bool:
"""
Convert markdown content to PDF
Args:
markdown_content: Markdown content to convert
output_path: Output PDF file path
method: Conversion method ("auto", "weasyprint", "pandoc", "pandoc_system")
Returns:
True if conversion successful, False otherwise
"""
if method == "auto":
method = self._get_recommended_backend()
try:
if method == "weasyprint":
return self.convert_with_weasyprint(markdown_content, output_path)
elif method == "pandoc":
return self.convert_with_pandoc(markdown_content, output_path)
elif method == "pandoc_system":
return self.convert_with_pandoc(
markdown_content, output_path, use_system_pandoc=True
)
else:
raise ValueError(f"Unknown conversion method: {method}")
except Exception as e:
self.logger.error(f"{method.title()} conversion failed: {str(e)}")
return False
def convert_file_to_pdf(
self, input_path: str, output_path: Optional[str] = None, method: str = "auto"
) -> bool:
"""
Convert Markdown file to PDF
Args:
input_path: Input Markdown file path
output_path: Output PDF file path (optional)
method: Conversion method
Returns:
bool: True if conversion successful
"""
input_path_obj = Path(input_path)
if not input_path_obj.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")
# Read markdown content
try:
with open(input_path_obj, "r", encoding="utf-8") as f:
markdown_content = f.read()
except UnicodeDecodeError:
# Try with different encodings
for encoding in ["gbk", "latin-1", "cp1252"]:
try:
with open(input_path_obj, "r", encoding=encoding) as f:
markdown_content = f.read()
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(
f"Could not decode file {input_path} with any supported encoding"
)
# Determine output path
if output_path is None:
output_path = str(input_path_obj.with_suffix(".pdf"))
return self.convert_markdown_to_pdf(markdown_content, output_path, method)
def get_backend_info(self) -> Dict[str, Any]:
"""Get information about available backends"""
return {
"available_backends": self.available_backends,
"recommended_backend": self._get_recommended_backend(),
"config": {
"page_size": self.config.page_size,
"margin": self.config.margin,
"font_size": self.config.font_size,
"include_toc": self.config.include_toc,
"syntax_highlighting": self.config.syntax_highlighting,
},
}
def _get_recommended_backend(self) -> str:
"""Get recommended backend based on availability"""
if self.available_backends.get("pandoc_system", False):
return "pandoc"
elif self.available_backends.get("weasyprint", False):
return "weasyprint"
else:
return "none"
def main():
"""Command-line interface for enhanced markdown conversion"""
import argparse
parser = argparse.ArgumentParser(description="Enhanced Markdown to PDF conversion")
parser.add_argument("input", nargs="?", help="Input markdown file")
parser.add_argument("--output", "-o", help="Output PDF file")
parser.add_argument(
"--method",
choices=["auto", "weasyprint", "pandoc", "pandoc_system"],
default="auto",
help="Conversion method",
)
parser.add_argument("--css", help="Custom CSS file")
parser.add_argument("--info", action="store_true", help="Show backend information")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Create converter
config = MarkdownConfig()
if args.css:
config.css_file = args.css
converter = EnhancedMarkdownConverter(config)
# Show backend info if requested
if args.info:
info = converter.get_backend_info()
print("Backend Information:")
for backend, available in info["available_backends"].items():
status = "" if available else ""
print(f" {status} {backend}")
print(f"Recommended backend: {info['recommended_backend']}")
return 0
# Check if input file is provided
if not args.input:
parser.error("Input file is required when not using --info")
# Convert file
try:
success = converter.convert_file_to_pdf(
input_path=args.input, output_path=args.output, method=args.method
)
if success:
print(f"✅ Successfully converted {args.input} to PDF")
return 0
else:
print("❌ Conversion failed")
return 1
except Exception as e:
print(f"❌ Error: {str(e)}")
return 1
if __name__ == "__main__":
exit(main())
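A usage sketch of the converter above; the module path in the import is an assumption (the header naming this file was not captured in this view), and report.md is a placeholder input.

from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig  # assumed module path

converter = EnhancedMarkdownConverter(MarkdownConfig(page_size="A4", include_toc=True))
print(converter.get_backend_info()["recommended_backend"])

# method="auto" picks the recommended backend (system pandoc if present, else WeasyPrint)
ok = converter.convert_file_to_pdf("report.md", "report.pdf", method="auto")
print("converted" if ok else "conversion failed")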

File diff suppressed because it is too large

File diff suppressed because it is too large

raganything/parser.py Normal file (1831 lines)
File diff suppressed because it is too large

raganything/processor.py Normal file (1824 lines)
File diff suppressed because it is too large

@@ -56,6 +56,38 @@ Additional context:
Focus on providing accurate, detailed visual analysis that would be useful for knowledge retrieval."""
# Image analysis prompt with context support
PROMPTS[
"vision_prompt_with_context"
] = """Please analyze this image in detail, considering the surrounding context. Provide a JSON response with the following structure:
{{
"detailed_description": "A comprehensive and detailed visual description of the image following these guidelines:
- Describe the overall composition and layout
- Identify all objects, people, text, and visual elements
- Explain relationships between elements and how they relate to the surrounding context
- Note colors, lighting, and visual style
- Describe any actions or activities shown
- Include technical details if relevant (charts, diagrams, etc.)
- Reference connections to the surrounding content when relevant
- Always use specific names instead of pronouns",
"entity_info": {{
"entity_name": "{entity_name}",
"entity_type": "image",
"summary": "concise summary of the image content, its significance, and relationship to surrounding content (max 100 words)"
}}
}}
Context from surrounding content:
{context}
Image details:
- Image Path: {image_path}
- Captions: {captions}
- Footnotes: {footnotes}
Focus on providing accurate, detailed visual analysis that incorporates the context and would be useful for knowledge retrieval."""
# Image analysis prompt with text fallback
PROMPTS["text_prompt"] = """Based on the following image information, provide analysis:
@@ -94,6 +126,39 @@ Footnotes: {table_footnote}
Focus on extracting meaningful insights and relationships from the tabular data."""
# Table analysis prompt with context support
PROMPTS[
"table_prompt_with_context"
] = """Please analyze this table content considering the surrounding context, and provide a JSON response with the following structure:
{{
"detailed_description": "A comprehensive analysis of the table including:
- Table structure and organization
- Column headers and their meanings
- Key data points and patterns
- Statistical insights and trends
- Relationships between data elements
- Significance of the data presented in relation to surrounding context
- How the table supports or illustrates concepts from the surrounding content
Always use specific names and values instead of general references.",
"entity_info": {{
"entity_name": "{entity_name}",
"entity_type": "table",
"summary": "concise summary of the table's purpose, key findings, and relationship to surrounding content (max 100 words)"
}}
}}
Context from surrounding content:
{context}
Table Information:
Image Path: {table_img_path}
Caption: {table_caption}
Body: {table_body}
Footnotes: {table_footnote}
Focus on extracting meaningful insights and relationships from the tabular data in the context of the surrounding content."""
# Equation analysis prompt template
PROMPTS[
"equation_prompt"
@@ -122,6 +187,38 @@ Format: {equation_format}
Focus on providing mathematical insights and explaining the equation's significance."""
# Equation analysis prompt with context support
PROMPTS[
"equation_prompt_with_context"
] = """Please analyze this mathematical equation considering the surrounding context, and provide a JSON response with the following structure:
{{
"detailed_description": "A comprehensive analysis of the equation including:
- Mathematical meaning and interpretation
- Variables and their definitions in the context of surrounding content
- Mathematical operations and functions used
- Application domain and context based on surrounding material
- Physical or theoretical significance
- Relationship to other mathematical concepts mentioned in the context
- Practical applications or use cases
- How the equation relates to the broader discussion or framework
Always use specific mathematical terminology.",
"entity_info": {{
"entity_name": "{entity_name}",
"entity_type": "equation",
"summary": "concise summary of the equation's purpose, significance, and role in the surrounding context (max 100 words)"
}}
}}
Context from surrounding content:
{context}
Equation Information:
Equation: {equation_text}
Format: {equation_format}
Focus on providing mathematical insights and explaining the equation's significance within the broader context."""
# Generic content analysis prompt template
PROMPTS[
"generic_prompt"
@@ -146,6 +243,34 @@ Content: {content}
Focus on extracting meaningful information that would be useful for knowledge retrieval."""
# Generic content analysis prompt with context support
PROMPTS[
"generic_prompt_with_context"
] = """Please analyze this {content_type} content considering the surrounding context, and provide a JSON response with the following structure:
{{
"detailed_description": "A comprehensive analysis of the content including:
- Content structure and organization
- Key information and elements
- Relationships between components
- Context and significance in relation to surrounding content
- How this content connects to or supports the broader discussion
- Relevant details for knowledge retrieval
Always use specific terminology appropriate for {content_type} content.",
"entity_info": {{
"entity_name": "{entity_name}",
"entity_type": "{content_type}",
"summary": "concise summary of the content's purpose, key points, and relationship to surrounding context (max 100 words)"
}}
}}
Context from surrounding content:
{context}
Content: {content}
Focus on extracting meaningful information that would be useful for knowledge retrieval and understanding the content's role in the broader context."""
# Modal chunk templates
PROMPTS["image_chunk"] = """
Image Content Analysis:
@@ -173,3 +298,56 @@ PROMPTS["generic_chunk"] = """{content_type} Content Analysis:
Content: {content}
Analysis: {enhanced_caption}"""
# Query-related prompts
PROMPTS["QUERY_IMAGE_DESCRIPTION"] = (
"Please briefly describe the main content, key elements, and important information in this image."
)
PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"] = (
"You are a professional image analyst who can accurately describe image content."
)
PROMPTS[
"QUERY_TABLE_ANALYSIS"
] = """Please analyze the main content, structure, and key information of the following table data:
Table data:
{table_data}
Table caption: {table_caption}
Please briefly summarize the main content, data characteristics, and important findings of the table."""
PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"] = (
"You are a professional data analyst who can accurately analyze table data."
)
PROMPTS[
"QUERY_EQUATION_ANALYSIS"
] = """Please explain the meaning and purpose of the following mathematical formula:
LaTeX formula: {latex}
Formula caption: {equation_caption}
Please briefly explain the mathematical meaning, application scenarios, and importance of this formula."""
PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"] = (
"You are a mathematics expert who can clearly explain mathematical formulas."
)
PROMPTS[
"QUERY_GENERIC_ANALYSIS"
] = """Please analyze the following {content_type} type content and extract its main information and key features:
Content: {content_str}
Please briefly summarize the main characteristics and important information of this content."""
PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"] = (
"You are a professional content analyst who can accurately analyze {content_type} type content."
)
PROMPTS["QUERY_ENHANCEMENT_SUFFIX"] = (
"\n\nPlease provide a comprehensive answer based on the user query and the provided multimodal content information."
)
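The entries above are plain Python format strings keyed in the PROMPTS dict; the query helpers in raganything/query.py fill them in as sketched below, with placeholder table data.

from raganything.prompt import PROMPTS

prompt = PROMPTS["QUERY_TABLE_ANALYSIS"].format(
    table_data="Name,Age\nAlice,25\nBob,30",     # placeholder CSV-style table body
    table_caption="Sample roster",
)
system_prompt = PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"]
print(system_prompt)
print(prompt)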

raganything/query.py Normal file

@@ -0,0 +1,746 @@
"""
Query functionality for RAGAnything
Contains all query-related methods for both text and multimodal queries
"""
import json
import hashlib
import re
from typing import Dict, List, Any
from pathlib import Path
from lightrag import QueryParam
from lightrag.utils import always_get_an_event_loop
from raganything.prompt import PROMPTS
from raganything.utils import (
get_processor_for_type,
encode_image_to_base64,
validate_image_file,
)
class QueryMixin:
"""QueryMixin class containing query functionality for RAGAnything"""
def _generate_multimodal_cache_key(
self, query: str, multimodal_content: List[Dict[str, Any]], mode: str, **kwargs
) -> str:
"""
Generate cache key for multimodal query
Args:
query: Base query text
multimodal_content: List of multimodal content
mode: Query mode
**kwargs: Additional parameters
Returns:
str: Cache key hash
"""
# Create a normalized representation of the query parameters
cache_data = {
"query": query.strip(),
"mode": mode,
}
# Normalize multimodal content for stable caching
normalized_content = []
if multimodal_content:
for item in multimodal_content:
if isinstance(item, dict):
normalized_item = {}
for key, value in item.items():
# For file paths, use basename to make cache more portable
if key in [
"img_path",
"image_path",
"file_path",
] and isinstance(value, str):
normalized_item[key] = Path(value).name
# For large content, create a hash instead of storing directly
elif (
key in ["table_data", "table_body"]
and isinstance(value, str)
and len(value) > 200
):
normalized_item[f"{key}_hash"] = hashlib.md5(
value.encode()
).hexdigest()
else:
normalized_item[key] = value
normalized_content.append(normalized_item)
else:
normalized_content.append(item)
cache_data["multimodal_content"] = normalized_content
# Add relevant kwargs to cache data
relevant_kwargs = {
k: v
for k, v in kwargs.items()
if k
in [
"stream",
"response_type",
"top_k",
"max_tokens",
"temperature",
# "only_need_context",
# "only_need_prompt",
]
}
cache_data.update(relevant_kwargs)
# Generate hash from the cache data
cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
cache_hash = hashlib.md5(cache_str.encode()).hexdigest()
return f"multimodal_query:{cache_hash}"
async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Pure text query - directly calls LightRAG's query functionality
Args:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
"""
if self.lightrag is None:
raise ValueError(
"No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
)
# Check if VLM enhanced query should be used
vlm_enhanced = kwargs.pop("vlm_enhanced", None)
# Auto-determine VLM enhanced based on availability
if vlm_enhanced is None:
vlm_enhanced = (
hasattr(self, "vision_model_func")
and self.vision_model_func is not None
)
# Use VLM enhanced query if enabled and available
if (
vlm_enhanced
and hasattr(self, "vision_model_func")
and self.vision_model_func
):
return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
elif vlm_enhanced and (
not hasattr(self, "vision_model_func") or not self.vision_model_func
):
self.logger.warning(
"VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
)
# Create query parameters
query_param = QueryParam(mode=mode, **kwargs)
self.logger.info(f"Executing text query: {query[:100]}...")
self.logger.info(f"Query mode: {mode}")
# Call LightRAG's query method
result = await self.lightrag.aquery(query, param=query_param)
self.logger.info("Text query completed")
return result
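A short sketch of the plain text query path above, assuming rag is an already-initialized RAGAnything instance:

import asyncio

async def ask(rag):
    answer = await rag.aquery(
        "What is the main finding of the ingested report?",
        mode="mix",
        vlm_enhanced=False,   # force the plain LightRAG path even if a vision model is configured
    )
    print(answer)

# asyncio.run(ask(rag))  # 'rag' must be initialized elsewhere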
async def aquery_with_multimodal(
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "mix",
**kwargs,
) -> str:
"""
Multimodal query - combines text and multimodal content for querying
Args:
query: Base query text
multimodal_content: List of multimodal content, each element contains:
- type: Content type ("image", "table", "equation", etc.)
- Other fields depend on type (e.g., img_path, table_data, latex, etc.)
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
Returns:
str: Query result
Examples:
# Pure text query
result = await rag.aquery_with_multimodal("What is machine learning?")
# Image query
result = await rag.aquery_with_multimodal(
"Analyze the content in this image",
multimodal_content=[{
"type": "image",
"img_path": "./image.jpg"
}]
)
# Table query
result = await rag.aquery_with_multimodal(
"Analyze the data trends in this table",
multimodal_content=[{
"type": "table",
"table_data": "Name,Age\nAlice,25\nBob,30"
}]
)
"""
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
self.logger.info(f"Executing multimodal query: {query[:100]}...")
self.logger.info(f"Query mode: {mode}")
# If no multimodal content, fallback to pure text query
if not multimodal_content:
self.logger.info("No multimodal content provided, executing text query")
return await self.aquery(query, mode=mode, **kwargs)
# Generate cache key for multimodal query
cache_key = self._generate_multimodal_cache_key(
query, multimodal_content, mode, **kwargs
)
# Check cache if available and enabled
cached_result = None
if (
hasattr(self, "lightrag")
and self.lightrag
and hasattr(self.lightrag, "llm_response_cache")
and self.lightrag.llm_response_cache
):
if self.lightrag.llm_response_cache.global_config.get(
"enable_llm_cache", True
):
try:
cached_result = await self.lightrag.llm_response_cache.get_by_id(
cache_key
)
if cached_result and isinstance(cached_result, dict):
result_content = cached_result.get("return")
if result_content:
self.logger.info(
f"Multimodal query cache hit: {cache_key[:16]}..."
)
return result_content
except Exception as e:
self.logger.debug(f"Error accessing multimodal query cache: {e}")
# Process multimodal content to generate enhanced query text
enhanced_query = await self._process_multimodal_query_content(
query, multimodal_content
)
self.logger.info(
f"Generated enhanced query length: {len(enhanced_query)} characters"
)
# Execute enhanced query
result = await self.aquery(enhanced_query, mode=mode, **kwargs)
# Save to cache if available and enabled
if (
hasattr(self, "lightrag")
and self.lightrag
and hasattr(self.lightrag, "llm_response_cache")
and self.lightrag.llm_response_cache
):
if self.lightrag.llm_response_cache.global_config.get(
"enable_llm_cache", True
):
try:
# Create cache entry for multimodal query
cache_entry = {
"return": result,
"cache_type": "multimodal_query",
"original_query": query,
"multimodal_content_count": len(multimodal_content),
"mode": mode,
}
await self.lightrag.llm_response_cache.upsert(
{cache_key: cache_entry}
)
self.logger.info(
f"Saved multimodal query result to cache: {cache_key[:16]}..."
)
except Exception as e:
self.logger.debug(f"Error saving multimodal query to cache: {e}")
# Ensure cache is persisted to disk
if (
hasattr(self, "lightrag")
and self.lightrag
and hasattr(self.lightrag, "llm_response_cache")
and self.lightrag.llm_response_cache
):
try:
await self.lightrag.llm_response_cache.index_done_callback()
except Exception as e:
self.logger.debug(f"Error persisting multimodal query cache: {e}")
self.logger.info("Multimodal query completed")
return result
async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing
Args:
query: User query
mode: Underlying LightRAG query mode
**kwargs: Other query parameters
Returns:
str: VLM query result
"""
# Ensure VLM is available
if not hasattr(self, "vision_model_func") or not self.vision_model_func:
raise ValueError(
"VLM enhanced query requires vision_model_func. "
"Please provide a vision model function when initializing RAGAnything."
)
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")
# Clear previous image cache
if hasattr(self, "_current_images_base64"):
delattr(self, "_current_images_base64")
# 1. Get original retrieval prompt (without generating final answer)
query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
raw_prompt = await self.lightrag.aquery(query, param=query_param)
self.logger.debug("Retrieved raw prompt from LightRAG")
# 2. Extract and process image paths
enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
raw_prompt
)
if not images_found:
self.logger.info("No valid images found, falling back to normal query")
# Fallback to normal query
query_param = QueryParam(mode=mode, **kwargs)
return await self.lightrag.aquery(query, param=query_param)
self.logger.info(f"Processed {images_found} images for VLM")
# 3. Build VLM message format
messages = self._build_vlm_messages_with_images(enhanced_prompt, query)
# 4. Call VLM for question answering
result = await self._call_vlm_with_multimodal_content(messages)
self.logger.info("VLM enhanced query completed")
return result
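The VLM-enhanced path only requires that vision_model_func be an async callable which accepts either a plain prompt with an optional system_prompt, or an OpenAI-style messages list whose user content mixes text and image_url parts. The stub below illustrates the expected shape; its body is a placeholder, not a real model call.

from typing import Any, Dict, List, Optional

async def vision_model_func(
    prompt: str,
    system_prompt: Optional[str] = None,
    messages: Optional[List[Dict[str, Any]]] = None,
    **kwargs: Any,
) -> str:
    if messages is not None:
        # Multimodal mode: _call_vlm_with_multimodal_content passes the full
        # messages list (system message plus user content parts with image_url entries).
        parts = messages[-1]["content"]
        n_images = sum(1 for p in parts if isinstance(p, dict) and p.get("type") == "image_url")
        return f"[stub] would answer using {n_images} image(s)"
    # Text-only mode: called with a prompt string and an optional system prompt.
    return f"[stub] would answer: {prompt[:60]}"

A real deployment would forward these arguments to an actual vision-capable model API instead of returning stub text.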
async def _process_multimodal_query_content(
self, base_query: str, multimodal_content: List[Dict[str, Any]]
) -> str:
"""
Process multimodal query content to generate enhanced query text
Args:
base_query: Base query text
multimodal_content: List of multimodal content
Returns:
str: Enhanced query text
"""
self.logger.info("Starting multimodal query content processing...")
enhanced_parts = [f"User query: {base_query}"]
for i, content in enumerate(multimodal_content):
content_type = content.get("type", "unknown")
self.logger.info(
f"Processing {i+1}/{len(multimodal_content)} multimodal content: {content_type}"
)
try:
# Get appropriate processor
processor = get_processor_for_type(self.modal_processors, content_type)
if processor:
# Generate content description
description = await self._generate_query_content_description(
processor, content, content_type
)
enhanced_parts.append(
f"\nRelated {content_type} content: {description}"
)
else:
# If no appropriate processor, use basic description
basic_desc = str(content)[:200]
enhanced_parts.append(
f"\nRelated {content_type} content: {basic_desc}"
)
except Exception as e:
self.logger.error(f"Error processing multimodal content: {str(e)}")
# Continue processing other content
continue
enhanced_query = "\n".join(enhanced_parts)
enhanced_query += PROMPTS["QUERY_ENHANCEMENT_SUFFIX"]
self.logger.info("Multimodal query content processing completed")
return enhanced_query
async def _generate_query_content_description(
self, processor, content: Dict[str, Any], content_type: str
) -> str:
"""
Generate content description for query
Args:
processor: Multimodal processor
content: Content data
content_type: Content type
Returns:
str: Content description
"""
try:
if content_type == "image":
return await self._describe_image_for_query(processor, content)
elif content_type == "table":
return await self._describe_table_for_query(processor, content)
elif content_type == "equation":
return await self._describe_equation_for_query(processor, content)
else:
return await self._describe_generic_for_query(
processor, content, content_type
)
except Exception as e:
self.logger.error(f"Error generating {content_type} description: {str(e)}")
return f"{content_type} content: {str(content)[:100]}"
async def _describe_image_for_query(
self, processor, content: Dict[str, Any]
) -> str:
"""Generate image description for query"""
image_path = content.get("img_path")
captions = content.get("image_caption", content.get("img_caption", []))
footnotes = content.get("image_footnote", content.get("img_footnote", []))
if image_path and Path(image_path).exists():
# If image exists, use vision model to generate description
image_base64 = processor._encode_image_to_base64(image_path)
if image_base64:
prompt = PROMPTS["QUERY_IMAGE_DESCRIPTION"]
description = await processor.modal_caption_func(
prompt,
image_data=image_base64,
system_prompt=PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"],
)
return description
# If image doesn't exist or processing failed, use existing information
parts = []
if image_path:
parts.append(f"Image path: {image_path}")
if captions:
parts.append(f"Image captions: {', '.join(captions)}")
if footnotes:
parts.append(f"Image footnotes: {', '.join(footnotes)}")
return "; ".join(parts) if parts else "Image content information incomplete"
async def _describe_table_for_query(
self, processor, content: Dict[str, Any]
) -> str:
"""Generate table description for query"""
table_data = content.get("table_data", "")
table_caption = content.get("table_caption", "")
prompt = PROMPTS["QUERY_TABLE_ANALYSIS"].format(
table_data=table_data, table_caption=table_caption
)
description = await processor.modal_caption_func(
prompt, system_prompt=PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"]
)
return description
async def _describe_equation_for_query(
self, processor, content: Dict[str, Any]
) -> str:
"""Generate equation description for query"""
latex = content.get("latex", "")
equation_caption = content.get("equation_caption", "")
prompt = PROMPTS["QUERY_EQUATION_ANALYSIS"].format(
latex=latex, equation_caption=equation_caption
)
description = await processor.modal_caption_func(
prompt, system_prompt=PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"]
)
return description
async def _describe_generic_for_query(
self, processor, content: Dict[str, Any], content_type: str
) -> str:
"""Generate generic content description for query"""
content_str = str(content)
prompt = PROMPTS["QUERY_GENERIC_ANALYSIS"].format(
content_type=content_type, content_str=content_str
)
description = await processor.modal_caption_func(
prompt,
system_prompt=PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"].format(
content_type=content_type
),
)
return description
async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
"""
Process image paths in prompt, keeping original paths and adding VLM markers
Args:
prompt: Original prompt
Returns:
tuple: (processed prompt, image count)
"""
enhanced_prompt = prompt
images_processed = 0
# Initialize image cache
self._current_images_base64 = []
# Enhanced regex pattern for matching image paths
# Matches only the path ending with image file extensions
image_path_pattern = (
r"Image Path:\s*([^\r\n]*?\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))"
)
# First, let's see what matches we find
matches = re.findall(image_path_pattern, prompt)
self.logger.info(f"Found {len(matches)} image path matches in prompt")
def replace_image_path(match):
nonlocal images_processed
image_path = match.group(1).strip()
self.logger.debug(f"Processing image path: '{image_path}'")
# Validate path format (basic check)
if not image_path or len(image_path) < 3:
self.logger.warning(f"Invalid image path format: {image_path}")
return match.group(0) # Keep original
# Use utility function to validate image file
self.logger.debug(f"Calling validate_image_file for: {image_path}")
is_valid = validate_image_file(image_path)
self.logger.debug(f"Validation result for {image_path}: {is_valid}")
if not is_valid:
self.logger.warning(f"Image validation failed for: {image_path}")
return match.group(0) # Keep original if validation fails
try:
# Encode image to base64 using utility function
self.logger.debug(f"Attempting to encode image: {image_path}")
image_base64 = encode_image_to_base64(image_path)
if image_base64:
images_processed += 1
# Save base64 to instance variable for later use
self._current_images_base64.append(image_base64)
# Keep original path info and add VLM marker
result = f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
self.logger.debug(
f"Successfully processed image {images_processed}: {image_path}"
)
return result
else:
self.logger.error(f"Failed to encode image: {image_path}")
return match.group(0) # Keep original if encoding failed
except Exception as e:
self.logger.error(f"Failed to process image {image_path}: {e}")
return match.group(0) # Keep original
# Execute replacement
enhanced_prompt = re.sub(
image_path_pattern, replace_image_path, enhanced_prompt
)
return enhanced_prompt, images_processed
def _build_vlm_messages_with_images(
self, enhanced_prompt: str, user_query: str
) -> List[Dict]:
"""
Build VLM message format, using markers to correspond images with text positions
Args:
enhanced_prompt: Enhanced prompt with image markers
user_query: User query
Returns:
List[Dict]: VLM message format
"""
images_base64 = getattr(self, "_current_images_base64", [])
if not images_base64:
# Pure text mode
return [
{
"role": "user",
"content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
}
]
# Build multimodal content
content_parts = []
# Split text at image markers and insert images
text_parts = enhanced_prompt.split("[VLM_IMAGE_")
for i, text_part in enumerate(text_parts):
if i == 0:
# First text part
if text_part.strip():
content_parts.append({"type": "text", "text": text_part})
else:
# Find marker number and insert corresponding image
marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
if marker_match:
image_num = (
int(marker_match.group(1)) - 1
) # Convert to 0-based index
remaining_text = marker_match.group(2)
# Insert corresponding image
if 0 <= image_num < len(images_base64):
content_parts.append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{images_base64[image_num]}"
},
}
)
# Insert remaining text
if remaining_text.strip():
content_parts.append({"type": "text", "text": remaining_text})
# Add user question
content_parts.append(
{
"type": "text",
"text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
}
)
return [
{
"role": "system",
"content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
},
{"role": "user", "content": content_parts},
]
async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
"""
Call VLM to process multimodal content
Args:
messages: VLM message format
Returns:
str: VLM response result
"""
try:
user_message = messages[1]
content = user_message["content"]
system_prompt = messages[0]["content"]
if isinstance(content, str):
# Pure text mode
result = await self.vision_model_func(
content, system_prompt=system_prompt
)
else:
# Multimodal mode - pass complete messages directly to VLM
result = await self.vision_model_func(
"", # Empty prompt since we're using messages format
messages=messages,
)
return result
except Exception as e:
self.logger.error(f"VLM call failed: {e}")
raise
# Synchronous versions of query methods
def query(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Synchronous version of pure text query
Args:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
"""
loop = always_get_an_event_loop()
return loop.run_until_complete(self.aquery(query, mode=mode, **kwargs))
def query_with_multimodal(
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "mix",
**kwargs,
) -> str:
"""
Synchronous version of multimodal query
Args:
query: Base query text
multimodal_content: List of multimodal content, each element contains:
- type: Content type ("image", "table", "equation", etc.)
- Other fields depend on type (e.g., img_path, table_data, latex, etc.)
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
Returns:
str: Query result
"""
loop = always_get_an_event_loop()
return loop.run_until_complete(
self.aquery_with_multimodal(query, multimodal_content, mode=mode, **kwargs)
)
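A short sketch of the synchronous wrappers above; rag is assumed to be an already-initialized RAGAnything instance, and the table data is a placeholder.

answer = rag.query("Summarize the ingested documents", mode="hybrid")

table_answer = rag.query_with_multimodal(
    "Which row has the highest score?",
    multimodal_content=[{
        "type": "table",
        "table_data": "Name,Score\nAlice,91\nBob,87",   # placeholder table
    }],
    mode="mix",
)
print(answer)
print(table_answer)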

File diff suppressed because it is too large

raganything/utils.py Normal file

@@ -0,0 +1,274 @@
"""
Utility functions for RAGAnything
Contains helper functions for content separation, text insertion, and other utilities
"""
import base64
from typing import Dict, List, Any, Tuple
from pathlib import Path
from lightrag.utils import logger
def separate_content(
content_list: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
"""
Separate text content and multimodal content
Args:
content_list: Content list from MinerU parsing
Returns:
(text_content, multimodal_items): Pure text content and multimodal items list
"""
text_parts = []
multimodal_items = []
for item in content_list:
content_type = item.get("type", "text")
if content_type == "text":
# Text content
text = item.get("text", "")
if text.strip():
text_parts.append(text)
else:
# Multimodal content (image, table, equation, etc.)
multimodal_items.append(item)
# Merge all text content
text_content = "\n\n".join(text_parts)
logger.info("Content separation complete:")
logger.info(f" - Text content length: {len(text_content)} characters")
logger.info(f" - Multimodal items count: {len(multimodal_items)}")
# Count multimodal types
modal_types = {}
for item in multimodal_items:
modal_type = item.get("type", "unknown")
modal_types[modal_type] = modal_types.get(modal_type, 0) + 1
if modal_types:
logger.info(f" - Multimodal type distribution: {modal_types}")
return text_content, multimodal_items
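A self-contained sketch of separate_content on a tiny MinerU-style content list (the entries are illustrative placeholders):

from raganything.utils import separate_content

content_list = [
    {"type": "text", "text": "Introduction paragraph.", "page_idx": 0},
    {"type": "image", "img_path": "images/fig1.png", "img_caption": ["Figure 1"], "page_idx": 1},
    {"type": "text", "text": "Conclusion paragraph.", "page_idx": 2},
]

text, multimodal_items = separate_content(content_list)
print(text)                   # text blocks joined with blank lines
print(len(multimodal_items))  # 1 (the image item)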
def encode_image_to_base64(image_path: str) -> str:
"""
Encode image file to base64 string
Args:
image_path: Path to the image file
Returns:
str: Base64 encoded string, empty string if encoding fails
"""
try:
with open(image_path, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
return encoded_string
except Exception as e:
logger.error(f"Failed to encode image {image_path}: {e}")
return ""
def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:
"""
Validate if a file is a valid image file
Args:
image_path: Path to the image file
max_size_mb: Maximum file size in MB
Returns:
bool: True if valid, False otherwise
"""
try:
path = Path(image_path)
logger.debug(f"Validating image path: {image_path}")
logger.debug(f"Resolved path object: {path}")
logger.debug(f"Path exists check: {path.exists()}")
# Check if file exists
if not path.exists():
logger.warning(f"Image file not found: {image_path}")
return False
# Check file extension
image_extensions = [
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".webp",
".tiff",
".tif",
]
path_lower = str(path).lower()
has_valid_extension = any(path_lower.endswith(ext) for ext in image_extensions)
logger.debug(
f"File extension check - path: {path_lower}, valid: {has_valid_extension}"
)
if not has_valid_extension:
logger.warning(f"File does not appear to be an image: {image_path}")
return False
# Check file size
file_size = path.stat().st_size
max_size = max_size_mb * 1024 * 1024
logger.debug(
f"File size check - size: {file_size} bytes, max: {max_size} bytes"
)
if file_size > max_size:
logger.warning(f"Image file too large ({file_size} bytes): {image_path}")
return False
logger.debug(f"Image validation successful: {image_path}")
return True
except Exception as e:
logger.error(f"Error validating image file {image_path}: {e}")
return False
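
A sketch of how the two helpers above might be combined before handing an image to a VLM; the path and the data-URL wrapping are assumptions, not part of this module:

# Illustrative combination of validate_image_file + encode_image_to_base64
img_path = "./images/fig1.png"  # hypothetical path
if validate_image_file(img_path, max_size_mb=10):
    b64 = encode_image_to_base64(img_path)
    if b64:
        # OpenAI-style image payload fragment (assumed shape, not defined here)
        image_payload = {"url": f"data:image/png;base64,{b64}"}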
async def insert_text_content(
lightrag,
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: str | list[str] | None = None,
file_paths: str | list[str] | None = None,
):
"""
Insert pure text content into LightRAG
Args:
lightrag: LightRAG instance
input: Single document string or list of document strings
split_by_character: if not None, split the string by this character; chunks longer than
chunk_token_size are split again by token size.
split_by_character_only: if True, split the string by character only; ignored when
split_by_character is None.
ids: single document ID or list of unique document IDs; if not provided, MD5 hash IDs are generated
file_paths: single file path or list of file paths, used for citation
"""
logger.info("Starting text content insertion into LightRAG...")
# Use LightRAG's insert method with all parameters
await lightrag.ainsert(
input=input,
file_paths=file_paths,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
ids=ids,
)
logger.info("Text content insertion complete")
async def insert_text_content_with_multimodal_content(
lightrag,
input: str | list[str],
multimodal_content: list[dict[str, Any]] | None = None,
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: str | list[str] | None = None,
file_paths: str | list[str] | None = None,
scheme_name: str | None = None,
):
"""
Insert text content together with optional multimodal content into LightRAG
Args:
lightrag: LightRAG instance
input: Single document string or list of document strings
multimodal_content: Multimodal content list (optional)
split_by_character: if not None, split the string by this character; chunks longer than
chunk_token_size are split again by token size.
split_by_character_only: if True, split the string by character only; ignored when
split_by_character is None.
ids: single document ID or list of unique document IDs; if not provided, MD5 hash IDs are generated
file_paths: single file path or list of file paths, used for citation
scheme_name: optional processing scheme name
"""
logger.info("Starting text content insertion into LightRAG...")
# Use LightRAG's insert method with all parameters
try:
await lightrag.ainsert(
input=input,
multimodal_content=multimodal_content,
file_paths=file_paths,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
ids=ids,
scheme_name=scheme_name,
)
except Exception as e:
logger.error(f"Failed to insert content: {e}")
logger.warning(
"If the error is caused by ainsert not accepting a multimodal_content parameter, update LightRAG to the raganything branch, which supports it"
)
logger.info("Text content insertion complete")
def get_processor_for_type(modal_processors: Dict[str, Any], content_type: str):
"""
Get appropriate processor based on content type
Args:
modal_processors: Dictionary of available processors
content_type: Content type
Returns:
Corresponding processor instance
"""
# Direct mapping to corresponding processor
if content_type == "image":
return modal_processors.get("image")
elif content_type == "table":
return modal_processors.get("table")
elif content_type == "equation":
return modal_processors.get("equation")
else:
# For other types, use generic processor
return modal_processors.get("generic")
def get_processor_supports(proc_type: str) -> List[str]:
"""Get processor supported features"""
supports_map = {
"image": [
"Image content analysis",
"Visual understanding",
"Image description generation",
"Image entity extraction",
],
"table": [
"Table structure analysis",
"Data statistics",
"Trend identification",
"Table entity extraction",
],
"equation": [
"Mathematical formula parsing",
"Variable identification",
"Formula meaning explanation",
"Formula entity extraction",
],
"generic": [
"General content analysis",
"Structured processing",
"Entity extraction",
],
}
return supports_map.get(proc_type, ["Basic processing"])
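
To illustrate how the two lookups compose; the modal_processors dict below uses string stand-ins, whereas in practice the values are processor instances created during RAGAnything initialization:

# Stand-in registry; real values are processor instances, not strings
modal_processors = {"image": "ImageModalProcessor", "generic": "GenericModalProcessor"}

for content_type in ("image", "video"):
    processor = get_processor_for_type(modal_processors, content_type)
    print(content_type, "->", processor, get_processor_supports(content_type))
# "image" resolves to the image processor; "video" falls back to the generic entry,
# and its supports list is the default ["Basic processing"]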


@@ -1,10 +1,10 @@
huggingface_hub
# LightRAG packages
lightrag-hku
# MinerU 2.0 packages (replaces magic-pdf)
mineru[core]
# Progress bars for batch processing
tqdm
# Note: Optional dependencies are now defined in setup.py extras_require:
# - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion)
# - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion)


@@ -0,0 +1,17 @@
import tiktoken
import os
# Reuse an existing TIKTOKEN_CACHE_DIR if one is set; otherwise default to a local cache directory
cache_dir = os.environ.get("TIKTOKEN_CACHE_DIR", "./tiktoken_cache")
os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir
# Create the directory if it doesn't exist
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
print("Downloading and caching tiktoken models...")
tiktoken.get_encoding("cl100k_base")
# tiktoken.get_encoding("p50k_base")
print(f"tiktoken models have been cached in '{cache_dir}'")


@@ -64,6 +64,11 @@ extras_require = {
"text": ["reportlab>=4.0.0"], # For text file to PDF conversion (TXT, MD)
"office": [], # Office document processing requires LibreOffice (external program)
"all": ["Pillow>=10.0.0", "reportlab>=4.0.0"], # All optional features
"markdown": [
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0",
], # Enhanced markdown conversion
}
setuptools.setup(