Compare commits

130 Commits

| SHA1 |
|---|
| 1c7247c5ab |
| e70cf8d38a |
| 8079053506 |
| e6af42be94 |
| e7273346e1 |
| 3c5c3fa3d5 |
| 9207cbed46 |
| ba464f459a |
| 806ac9ba3e |
| ab552321ae |
| 0f6d3ea83a |
| 10ee99952a |
| 0ac3dc6bf7 |
| 1e0568543f |
| 8757265825 |
| 06011b0e6a |
| d7bec4e472 |
| d30e71502e |
| 214eb0f94d |
| 1d48f24b4a |
| 8e0e05d497 |
| fcd21ea31d |
| df99bfb82f |
| 7e5e691650 |
| 8d170e52c8 |
| ba3f737c87 |
| 48087eac78 |
| d63798f05c |
| 991794361f |
| 30b6e2358b |
| c16208de29 |
| ba3c0154dd |
| 260044f449 |
| 8d8805c66f |
| 2a3d222140 |
| bcd6cc16c0 |
| de2824f816 |
| 9872b86d13 |
| b9f5e9d7d3 |
| 0b0c20aa7a |
| 0d73279aab |
| 12855ec5bb |
| c3ca5dd756 |
| 6877983a71 |
| 4f879408e3 |
| 92cf72fe8a |
| c6d805005e |
| ec858868ee |
| 6781662a3e |
| dc3a46f247 |
| a87166814c |
| 2addb7b799 |
| e5a3d0cfae |
| 362302276c |
| 79078b2f6c |
| d031468437 |
| 801f276d82 |
| dfd9ec855e |
| cf2aa70cfd |
| 0f6f41aafb |
| 9649e31d1a |
| d7eaa8642b |
| 380e3cdf3c |
| d5ff598a9b |
| bc7d5ec0b3 |
| c31610200c |
| d858eabaf9 |
| 5e56140300 |
| 4f900db761 |
| 180082cbf5 |
| 935b70a65f |
| a7c46d5e55 |
| 336ae09177 |
| 60f05e04cf |
| 099b502860 |
| 1764e1ee8d |
| 5b44298214 |
| 356f26a390 |
| 0653b0c7f0 |
| 7775bb35ea |
| 8d4bb554a1 |
| fd418b69f6 |
| 1d40425c81 |
| 2056c358ac |
| 6dc0effafb |
| 84276c38ca |
| 710ed38d88 |
| a1a783b481 |
| 905466436d |
| a88831e540 |
| 4b08d62f74 |
| 9f9fb68010 |
| f40fe6fbf3 |
| 84bdd21073 |
| 2f52f9b4e0 |
| 33a9dd6bb9 |
| d8302d0cf8 |
| 7aafb58e41 |
| de7b401b23 |
| 13306ef249 |
| 9867ac38ab |
| 131d41a60e |
| 656c4cdae6 |
| 9781605b94 |
| 0f3cce4ad0 |
| e8f9a877e2 |
| b39efde039 |
| c0da599225 |
| 9ccb55cde8 |
| e6fd04bbd9 |
| f1d4867a0c |
| 2ba6d1cf94 |
| f4ff60b88c |
| 2baa214cd4 |
| 92aecaa2b3 |
| 2b030435d1 |
| 788540f01e |
| 983d994c4c |
| 5f1c650bd8 |
| ef094868de |
| c818cd145f |
| 322358df42 |
| 5288be9d0c |
| 4e9a7354a7 |
| 217978e0e9 |
| 1e4bdbd5b3 |
| 5ba5beceb3 |
| d9e6e80735 |
| 711ac23ade |
| b33f3bebcd |
.github/workflows/linting.yaml (vendored, 6 changes)

@@ -28,3 +28,9 @@ jobs:
      - name: Run pre-commit
        run: pre-commit run --all-files --show-diff-on-failure

      - name: Commit lint changes
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: "chore: apply linting and formatting"
          branch: ${{ github.head_ref }}
.gitignore (vendored, 10 changes)

@@ -11,6 +11,7 @@ __pycache__/
.venv/
env/
venv/

*.env*
.env_example

@@ -46,7 +47,7 @@ neo4jWorkDir/

# Data & Storage
inputs/
rag_storage/
rag_storage*/
examples/input/
examples/output/
output*/

@@ -61,12 +62,19 @@ ignore_this.txt
dickens*/
book.txt
LightRAG.pdf
LightRAG_2-4.pdf
download_models_hf.py
lightrag-dev/
gui/
tiktoken_cache/

# unit-test files
test_*

# Cline files
memory-bank/

# AI
.claude/
.cursor/
CLAUDE.md
README.md (723 changes)

@@ -4,7 +4,9 @@
<img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
</div>

# 🚀 RAG-Anything: All-in-One RAG System
# 🚀 RAG-Anything: All-in-One RAG Framework

<a href="https://trendshift.io/repositories/14959" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14959" alt="HKUDS%2FRAG-Anything | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

<div align="center">
<img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=24&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=Welcome+to+RAG-Anything;Next-Gen+Multimodal+RAG+System;Powered+by+Advanced+AI+Technology" alt="Typing Animation" />

@@ -14,13 +16,14 @@
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
<p>
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥Project-Page-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡Based%20on-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
</p>
<p>
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
<img src="https://img.shields.io/badge/🐍Python-3.9+-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/⚡uv-Ready-ff6b6b?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e"></a>
</p>
<p>
<a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-Community-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
@@ -47,6 +50,15 @@

---

## 🎉 News
- [X] [2025.10]🎯📢 🚀 We have released the technical report of [RAG-Anything](http://arxiv.org/abs/2510.12323). Access it now to explore our latest research findings.
- [X] [2025.08]🎯📢 🔍 RAG-Anything now features **VLM-Enhanced Query** mode! When documents include images, the system seamlessly integrates them into VLM for advanced multimodal analysis, combining visual and textual context for deeper insights.
- [X] [2025.07]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
- [X] [2025.07]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
- [X] [2025.07]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.

---

## 🌟 System Overview

*Next-Generation Multimodal Intelligence*

@@ -72,6 +84,7 @@ Users can query documents containing **interleaved text**, **visual diagrams**,
- **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types
- **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding
- **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows
- **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources
- **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding

</div>

@@ -176,7 +189,7 @@ The system deploys modality-aware processing units for heterogeneous data modalities

</div>

### 4. Multi-Modal Knowledge Graph Index
### 4. Multimodal Knowledge Graph Index

<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
@@ -236,14 +249,26 @@ pip install 'raganything[image,text]' # Multiple features
```

#### Option 2: Install from Source

```bash
# Install uv (if not already installed)
curl -LsSf https://astral.sh/uv/install.sh | sh

# Clone and setup the project with uv
git clone https://github.com/HKUDS/RAG-Anything.git
cd RAG-Anything
pip install -e .

# With optional dependencies
pip install -e '.[all]'
# Install the package and dependencies in a virtual environment
uv sync

# If you encounter network timeouts (especially for opencv packages):
# UV_HTTP_TIMEOUT=120 uv sync

# Run commands directly with uv (recommended approach)
uv run python examples/raganything_example.py --help

# Install with optional dependencies
uv sync --extra image --extra text  # Specific extras
uv sync --all-extras  # All optional features
```

#### Optional Dependencies

@@ -267,7 +292,7 @@ pip install -e '.[all]'
mineru --version

# Check if properly configured
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_mineru_installation() else '❌ MinerU installation issue')"
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')"
```

Models are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README.md#22-model-source-configuration).
@@ -278,42 +303,162 @@ Models are downloaded automatically on first use. For manual download, refer to

```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc

async def main():
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # Create RAGAnything configuration
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
        parser="mineru",  # Parser selection: mineru or docling
        parse_method="auto",  # Parse method: auto, ocr, or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # Define LLM model function
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Define vision model function for image processing
    def vision_model_func(
        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
    ):
        # If messages format is provided (for multimodal VLM enhanced query), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt}
                    if system_prompt
                    else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_data}"
                                },
                            },
                        ],
                    }
                    if image_data
                    else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Pure text format
        else:
            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Define embedding function
    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
    )

    # Initialize RAGAnything
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Process a document
    await rag.process_document_complete(
        file_path="path/to/your/document.pdf",
        output_dir="./output",
        parse_method="auto"
    )

    # Query the processed content
    # Pure text query - for basic knowledge base search
    text_result = await rag.aquery(
        "What are the main findings shown in the figures and tables?",
        mode="hybrid"
    )
    print("Text query result:", text_result)

    # Multimodal query with specific multimodal content
    multimodal_result = await rag.aquery_with_multimodal(
        "Explain this formula and its relevance to the document content",
        multimodal_content=[{
            "type": "equation",
            "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
            "equation_caption": "Document relevance probability"
        }],
        mode="hybrid"
    )
    print("Multimodal query result:", multimodal_result)

if __name__ == "__main__":
    asyncio.run(main())
```
#### 2. Direct Multimodal Content Processing

```python
import asyncio
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor

async def process_multimodal_content():
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # Initialize LightRAG
    rag = LightRAG(
        working_dir="./rag_storage",
        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            **kwargs,
        ),
        vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key="your-api-key",
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ),
        embedding_func=EmbeddingFunc(
@@ -325,52 +470,43 @@ async def main():
                api_key=api_key,
                base_url=base_url,
            ),
        ),
    )

    # Process a document
    await rag.process_document_complete(
        file_path="path/to/your/document.pdf",
        output_dir="./output",
        parse_method="auto"
    )

    # Query the processed content
    result = await rag.query_with_multimodal(
        "What are the main findings shown in the figures and tables?",
        mode="hybrid"
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

#### 2. Direct Multimodal Content Processing

```python
import asyncio
from lightrag import LightRAG
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor

async def process_multimodal_content():
    # Initialize LightRAG
    rag = LightRAG(
        working_dir="./rag_storage",
        # ... your LLM and embedding configurations
    )
    )
    await rag.initialize_storages()

    # Process an image
    image_processor = ImageModalProcessor(
        lightrag=rag,
        modal_caption_func=your_vision_model_func
        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )
    )

    image_content = {
        "img_path": "path/to/image.jpg",
        "img_caption": ["Figure 1: Experimental results"],
        "img_footnote": ["Data collected in 2024"]
        "image_caption": ["Figure 1: Experimental results"],
        "image_footnote": ["Data collected in 2024"]
    }

    description, entity_info = await image_processor.process_multimodal_content(
@@ -383,7 +519,15 @@ async def process_multimodal_content():
    # Process a table
    table_processor = TableModalProcessor(
        lightrag=rag,
        modal_caption_func=your_llm_model_func
        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )
    )

    table_content = {
@@ -436,25 +580,93 @@ class CustomModalProcessor(GenericModalProcessor):

#### 5. Query Options

RAG-Anything provides three types of query methods:

**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
# Different query modes
result_hybrid = await rag.query_with_multimodal("Your question", mode="hybrid")
result_local = await rag.query_with_multimodal("Your question", mode="local")
result_global = await rag.query_with_multimodal("Your question", mode="global")
# Different query modes for text queries
text_result_hybrid = await rag.aquery("Your question", mode="hybrid")
text_result_local = await rag.aquery("Your question", mode="local")
text_result_global = await rag.aquery("Your question", mode="global")
text_result_naive = await rag.aquery("Your question", mode="naive")

# Synchronous version
sync_text_result = rag.query("Your question", mode="hybrid")
```

**VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
    "Analyze the charts and figures in the document",
    mode="hybrid"
    # vlm_enhanced=True is automatically set when vision_model_func is available
)

# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
    "What do the images show in this document?",
    mode="hybrid",
    vlm_enhanced=True  # Force enable VLM enhancement
)

vlm_disabled = await rag.aquery(
    "What do the images show in this document?",
    mode="hybrid",
    vlm_enhanced=False  # Force disable VLM enhancement
)

# When documents contain images, VLM can see and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load and encode images as base64
# 3. Send both text context and images to VLM for comprehensive analysis
```

**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
    "Compare these performance metrics with the document content",
    multimodal_content=[{
        "type": "table",
        "table_data": """Method,Accuracy,Speed
RAGAnything,95.2%,120ms
Traditional,87.3%,180ms""",
        "table_caption": "Performance comparison"
    }],
    mode="hybrid"
)

# Query with equation content
equation_result = await rag.aquery_with_multimodal(
    "Explain this formula and its relevance to the document content",
    multimodal_content=[{
        "type": "equation",
        "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
        "equation_caption": "Document relevance probability"
    }],
    mode="hybrid"
)
```
#### 6. Loading Existing LightRAG Instance

```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import EmbeddingFunc
import os

async def load_existing_lightrag():
    # First, create or load an existing LightRAG instance
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # First, create or load existing LightRAG instance
    lightrag_working_dir = "./existing_lightrag_storage"

    # Check if previous LightRAG instance exists
@@ -463,7 +675,7 @@ async def load_existing_lightrag():
    else:
        print("❌ No existing LightRAG instance found, will create new one")

    # Create/Load LightRAG instance with your configurations
    # Create/load LightRAG instance with your configuration
    lightrag_instance = LightRAG(
        working_dir=lightrag_working_dir,
        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
@@ -471,7 +683,8 @@ async def load_existing_lightrag():
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ),
        embedding_func=EmbeddingFunc(
@@ -488,44 +701,73 @@ async def load_existing_lightrag():

    # Initialize storage (this will load existing data if available)
    await lightrag_instance.initialize_storages()
    await initialize_pipeline_status()

    # Now initialize RAGAnything with the existing LightRAG instance
    # Define vision model function for image processing
    def vision_model_func(
        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
    ):
        # If messages format is provided (for multimodal VLM enhanced query), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt}
                    if system_prompt
                    else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_data}"
                                },
                            },
                        ],
                    }
                    if image_data
                    else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Pure text format
        else:
            return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Now use existing LightRAG instance to initialize RAGAnything
    rag = RAGAnything(
        lightrag=lightrag_instance,  # Pass the existing LightRAG instance
        # Only need vision model for multimodal processing
        vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key="your-api-key",
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            **kwargs,
        )
        lightrag=lightrag_instance,  # Pass existing LightRAG instance
        vision_model_func=vision_model_func,
        # Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance
    )

    # Query the existing knowledge base
    result = await rag.query_with_multimodal(
    # Query existing knowledge base
    result = await rag.aquery(
        "What data has been processed in this LightRAG instance?",
        mode="hybrid"
    )
    print("Query result:", result)

    # Add new multimodal documents to the existing LightRAG instance
    # Add new multimodal document to existing LightRAG instance
    await rag.process_document_complete(
        file_path="path/to/new/multimodal_document.pdf",
        output_dir="./output"
@@ -535,6 +777,195 @@ if __name__ == "__main__":
    asyncio.run(load_existing_lightrag())
```

#### 7. Direct Content List Insertion

For scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing:

```python
import asyncio
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc

async def insert_content_list_example():
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # Create RAGAnything configuration
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # Define model functions
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
        # If messages format is provided (for multimodal VLM enhanced query), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt} if system_prompt else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                        ],
                    } if image_data else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Pure text format
        else:
            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
    )

    # Initialize RAGAnything
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Example: Pre-parsed content list from external source
    content_list = [
        {
            "type": "text",
            "text": "This is the introduction section of our research paper.",
            "page_idx": 0  # Page number where this content appears
        },
        {
            "type": "image",
            "img_path": "/absolute/path/to/figure1.jpg",  # IMPORTANT: Use absolute path
            "image_caption": ["Figure 1: System Architecture"],
            "image_footnote": ["Source: Authors' original design"],
            "page_idx": 1  # Page number where this image appears
        },
        {
            "type": "table",
            "table_body": "| Method | Accuracy | F1-Score |\n|--------|----------|----------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |",
            "table_caption": ["Table 1: Performance Comparison"],
            "table_footnote": ["Results on test dataset"],
            "page_idx": 2  # Page number where this table appears
        },
        {
            "type": "equation",
            "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
            "text": "Document relevance probability formula",
            "page_idx": 3  # Page number where this equation appears
        },
        {
            "type": "text",
            "text": "In conclusion, our method demonstrates superior performance across all metrics.",
            "page_idx": 4  # Page number where this content appears
        }
    ]

    # Insert the content list directly
    await rag.insert_content_list(
        content_list=content_list,
        file_path="research_paper.pdf",  # Reference file name for citation
        split_by_character=None,  # Optional text splitting
        split_by_character_only=False,  # Optional text splitting mode
        doc_id=None,  # Optional custom document ID (will be auto-generated if not provided)
        display_stats=True  # Show content statistics
    )

    # Query the inserted content
    result = await rag.aquery(
        "What are the key findings and performance metrics mentioned in the research?",
        mode="hybrid"
    )
    print("Query result:", result)

    # You can also insert multiple content lists with different document IDs
    another_content_list = [
        {
            "type": "text",
            "text": "This is content from another document.",
            "page_idx": 0  # Page number where this content appears
        },
        {
            "type": "table",
            "table_body": "| Feature | Value |\n|---------|-------|\n| Speed | Fast |\n| Accuracy | High |",
            "table_caption": ["Feature Comparison"],
            "page_idx": 1  # Page number where this table appears
        }
    ]

    await rag.insert_content_list(
        content_list=another_content_list,
        file_path="another_document.pdf",
        doc_id="custom-doc-id-123"  # Custom document ID
    )

if __name__ == "__main__":
    asyncio.run(insert_content_list_example())
```

**Content List Format:**

The `content_list` should follow the standard format with each item being a dictionary containing:

- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`

**Important Notes** (checked programmatically in the sketch below):
- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`)
- **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing)
- **Content ordering**: Items are processed in the order they appear in the list
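These rules are easy to violate when content lists come from external tooling. The following is a minimal sketch of a pre-insertion check; `validate_content_list` and `REQUIRED_KEYS` are illustrative helpers written for this README, not part of the RAG-Anything API:

```python
import os

# Keys each item type must carry, per the format list above (illustrative).
REQUIRED_KEYS = {
    "text": {"text"},
    "image": {"img_path"},
    "table": {"table_body"},
    "equation": {"latex"},
}

def validate_content_list(content_list):
    """Raise ValueError on the first item that violates the format rules."""
    for i, item in enumerate(content_list):
        item_type = item.get("type")
        # Unknown types fall back to generic content carrying a "content" key
        missing = REQUIRED_KEYS.get(item_type, {"content"}) - item.keys()
        if missing:
            raise ValueError(f"item {i} ({item_type!r}): missing keys {missing}")
        # page_idx is 0-based, so it must be a non-negative integer
        page_idx = item.get("page_idx")
        if not isinstance(page_idx, int) or page_idx < 0:
            raise ValueError(f"item {i}: page_idx must be a non-negative int")
        # Image paths must be absolute so the processor can load the file
        if item_type == "image" and not os.path.isabs(item["img_path"]):
            raise ValueError(f"item {i}: img_path must be an absolute path")
```

Running it on `content_list` before calling `insert_content_list` fails fast on malformed items instead of surfacing errors mid-ingestion.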
This method is particularly useful when:
- You have content from external parsers (non-MinerU/Docling)
- You want to process programmatically generated content
- You need to insert content from multiple sources into a single knowledge base
- You have cached parsing results that you want to reuse

---

## 🛠️ Examples
@@ -556,8 +987,8 @@ The `examples/` directory contains comprehensive usage examples:

**Run examples:**

```bash
# End-to-end processing
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# End-to-end processing with parser selection
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru

# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
```

@@ -594,14 +1025,32 @@ Create a `.env` file (refer to `.env.example`):

```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url  # Optional
OUTPUT_DIR=./output  # Default output directory for parsed documents
PARSER=mineru  # Parser selection: mineru or docling
PARSE_METHOD=auto  # Parse method: auto, ocr, or txt
```

> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
**Note:** For backward compatibility, legacy environment variable names are still supported:
- `MINERU_PARSE_METHOD` is deprecated, please use `PARSE_METHOD`

> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
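In practice, the backward-compatibility note above amounts to a name fallback with a deprecation path. A minimal sketch of such a lookup (the helper below is illustrative only, not the library's actual resolution code):

```python
import os
import warnings

def resolve_parse_method(default: str = "auto") -> str:
    """Prefer PARSE_METHOD; fall back to the deprecated MINERU_PARSE_METHOD."""
    value = os.getenv("PARSE_METHOD")
    if value is None:
        legacy = os.getenv("MINERU_PARSE_METHOD")
        if legacy is not None:
            warnings.warn("MINERU_PARSE_METHOD is deprecated; use PARSE_METHOD")
            value = legacy
    return value or default
```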
### Parser Configuration

RAGAnything now supports multiple parsers, each with specific advantages (see the configuration sketch after the lists below):

#### MinerU Parser
- Supports PDF, images, Office documents, and more formats
- Powerful OCR and table extraction capabilities
- GPU acceleration support

#### Docling Parser
- Optimized for Office documents and HTML files
- Better document structure preservation
- Native support for multiple Office formats
### MinerU Configuration

MinerU 2.0 uses a simplified configuration approach:

```bash
# MinerU 2.0 uses command-line parameters instead of config files
# Check available options:
@@ -613,20 +1062,43 @@ mineru -p input.pdf -o output_dir -m ocr  # OCR-focused parsing
mineru -p input.pdf -o output_dir -b pipeline --device cuda  # GPU acceleration
```

You can also configure MinerU through RAGAnything parameters:
You can also configure parsing through RAGAnything parameters:

```python
# Configure parsing behavior
# Basic parsing configuration with parser selection
await rag.process_document_complete(
    file_path="document.pdf",
    parse_method="auto",  # or "ocr", "txt"
    device="cuda",  # GPU acceleration
    backend="pipeline",  # parsing backend
    lang="en"  # language optimization
    output_dir="./output/",
    parse_method="auto",  # or "ocr", "txt"
    parser="mineru"  # Optional: "mineru" or "docling"
)

# Advanced parsing configuration with special parameters
await rag.process_document_complete(
    file_path="document.pdf",
    output_dir="./output/",
    parse_method="auto",  # Parsing method: "auto", "ocr", "txt"
    parser="mineru",  # Parser selection: "mineru" or "docling"

    # MinerU special parameters - all supported kwargs:
    lang="ch",  # Document language for OCR optimization (e.g., "ch", "en", "ja")
    device="cuda:0",  # Inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
    start_page=0,  # Starting page number (0-based, for PDF)
    end_page=10,  # Ending page number (0-based, for PDF)
    formula=True,  # Enable formula parsing
    table=True,  # Enable table parsing
    backend="pipeline",  # Parsing backend: pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
    source="huggingface",  # Model source: "huggingface", "modelscope", "local"
    # vlm_url="http://127.0.0.1:3000"  # Service address when using backend=vlm-sglang-client

    # Standard RAGAnything parameters
    display_stats=True,  # Display content statistics
    split_by_character=None,  # Optional character to split text by
    doc_id=None  # Optional document ID
)
```

> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments.
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.

### Processing Requirements

@@ -676,13 +1148,14 @@ Different content types require specific optional dependencies:
If you find RAG-Anything useful in your research, please cite our paper:

```bibtex
@article{guo2024lightrag,
      title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
      author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
      year={2024},
      eprint={2410.05779},
      archivePrefix={arXiv},
      primaryClass={cs.IR}
@misc{guo2025raganythingallinoneragframework,
      title={RAG-Anything: All-in-One RAG Framework},
      author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},
      year={2025},
      eprint={2510.12323},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2510.12323},
}
```
README_zh.md (702 changes)

@@ -14,12 +14,12 @@
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
<p>
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥项目-主页-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡基于-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
</p>
<p>
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
<img src="https://img.shields.io/badge/🐍Python-3.9+-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
</p>
<p>

@@ -47,6 +47,14 @@

---

## 🎉 News
- [X] [2025.08.12]🎯📢 🔍 RAGAnything now supports a **VLM-Enhanced Query** mode! When documents contain images, the system can automatically pass the images together with the text context directly to the VLM for comprehensive multimodal analysis.
- [X] [2025.07.05]🎯📢 RAGAnything adds a [context configuration module](docs/context_aware_processing.md) that supplies relevant contextual information for multimodal content processing.
- [X] [2025.07.04]🎯📢 RAGAnything now supports multimodal content queries, enabling enhanced retrieval-augmented generation that integrates text, image, table, and equation processing.
- [X] [2025.07.03]🎯📢 RAGAnything has reached 1K stars 🌟 on GitHub! Thank you for your support and contributions.

---

## 🌟 System Overview

*Next-Generation Multimodal Intelligence*

@@ -68,6 +76,7 @@
- **🧠 Multimodal Content Analysis Engine** - Deploys dedicated processors for images, tables, equations, and general text to ensure accurate parsing of every content type
- **🔗 Knowledge-Graph-Based Indexing** - Automates entity extraction and relationship construction, building a cross-modal semantic network
- **⚡ Flexible Processing Architecture** - Supports both MinerU-based intelligent parsing and direct multimodal content insertion to cover different application scenarios
- **📋 Direct Content List Insertion** - Skips document parsing and directly inserts pre-parsed content lists from external sources, integrating multiple data origins
- **🎯 Cross-Modal Retrieval Mechanism** - Intelligent retrieval across textual and multimodal content with precise localization and matching

</div>

@@ -263,7 +272,7 @@ pip install -e '.[all]'
mineru --version

# Check if properly configured
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_mineru_installation() else '❌ MinerU installation issue')"
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')"
```

Models are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#22-%E6%A8%A1%E5%9E%8B%E6%BA%90%E9%85%8D%E7%BD%AE):
@@ -274,42 +283,164 @@ python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅

```python
import asyncio
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc

async def main():
    # Initialize RAGAnything
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # Create RAGAnything configuration
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
        parser="mineru",  # Parser selection: mineru or docling
        parse_method="auto",  # Parse method: auto, ocr, or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # Define LLM model function
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Define vision model function for image processing
    def vision_model_func(
        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
    ):
        # If messages format is provided (for multimodal VLM enhanced query), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt}
                    if system_prompt
                    else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_data}"
                                },
                            },
                        ],
                    }
                    if image_data
                    else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Pure text format
        else:
            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Define embedding function
    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
    )

    # Initialize RAGAnything
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Process a document
    await rag.process_document_complete(
        file_path="path/to/your/document.pdf",
        output_dir="./output",
        parse_method="auto"
    )

    # Query the processed content
    # Pure text query - basic knowledge base search
    text_result = await rag.aquery(
        "What is the main content of the document?",
        mode="hybrid"
    )
    print("Text query result:", text_result)

    # Multimodal query with specific multimodal content
    multimodal_result = await rag.aquery_with_multimodal(
        "Analyze this performance data and explain how it relates to the existing document content",
        multimodal_content=[{
            "type": "table",
            "table_data": """System,Accuracy,F1-Score
RAGAnything,95.2%,0.94
Baseline,87.3%,0.85""",
            "table_caption": "Performance comparison results"
        }],
        mode="hybrid"
    )
    print("Multimodal query result:", multimodal_result)

if __name__ == "__main__":
    asyncio.run(main())
```
#### 2. Direct Multimodal Content Processing

```python
import asyncio
from lightrag import LightRAG
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor

async def process_multimodal_content():
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # Initialize LightRAG
    rag = LightRAG(
        working_dir="./rag_storage",
        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            **kwargs,
        ),
        vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key="your-api-key",
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ),
        embedding_func=EmbeddingFunc(
@@ -321,52 +452,43 @@ async def main():
                api_key=api_key,
                base_url=base_url,
            ),
        ),
    )

    # Process a document
    await rag.process_document_complete(
        file_path="path/to/your/document.pdf",
        output_dir="./output",
        parse_method="auto"
    )

    # Query the processed content
    result = await rag.query_with_multimodal(
        "What are the main findings shown in the figures and tables?",
        mode="hybrid"
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```
#### 2. Direct Multimodal Content Processing

```python
import asyncio
from lightrag import LightRAG
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor

async def process_multimodal_content():
    # Initialize LightRAG
    rag = LightRAG(
        working_dir="./rag_storage",
        # ... your LLM and embedding configurations
    )
    )
    await rag.initialize_storages()

    # Process an image
    image_processor = ImageModalProcessor(
        lightrag=rag,
        modal_caption_func=your_vision_model_func
        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )
    )

    image_content = {
        "img_path": "path/to/image.jpg",
        "img_caption": ["Figure 1: Experimental results"],
        "img_footnote": ["Data collected in 2024"]
        "image_caption": ["Figure 1: Experimental results"],
        "image_footnote": ["Data collected in 2024"]
    }

    description, entity_info = await image_processor.process_multimodal_content(
@@ -379,7 +501,15 @@ async def process_multimodal_content():
    # Process a table
    table_processor = TableModalProcessor(
        lightrag=rag,
        modal_caption_func=your_llm_model_func
        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )
    )

    table_content = {
@@ -432,11 +562,74 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options

RAG-Anything provides three types of query methods:

**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
# Different query modes
result_hybrid = await rag.query_with_multimodal("Your question", mode="hybrid")
result_local = await rag.query_with_multimodal("Your question", mode="local")
result_global = await rag.query_with_multimodal("Your question", mode="global")
# Different query modes for text queries
text_result_hybrid = await rag.aquery("Your question", mode="hybrid")
text_result_local = await rag.aquery("Your question", mode="local")
text_result_global = await rag.aquery("Your question", mode="global")
text_result_naive = await rag.aquery("Your question", mode="naive")

# Synchronous version
sync_text_result = rag.query("Your question", mode="hybrid")
```

**VLM Enhanced Queries** - Automatically analyze images in retrieved context using the VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
    "Analyze the charts and data in the document",
    mode="hybrid"
    # vlm_enhanced=True is set automatically when vision_model_func is available
)

# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
    "What do the images show in this document?",
    mode="hybrid",
    vlm_enhanced=True  # Force enable VLM enhancement
)

vlm_disabled = await rag.aquery(
    "What do the images show in this document?",
    mode="hybrid",
    vlm_enhanced=False  # Force disable VLM enhancement
)

# When documents contain images, the VLM can view and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load images and encode them as base64
# 3. Send both text context and images to the VLM for comprehensive analysis
```

**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
    "Compare these performance metrics with the document content",
    multimodal_content=[{
        "type": "table",
        "table_data": """Method,Accuracy,Speed
LightRAG,95.2%,120ms
Traditional,87.3%,180ms""",
        "table_caption": "Performance comparison"
    }],
    mode="hybrid"
)

# Query with equation content
equation_result = await rag.aquery_with_multimodal(
    "Explain this formula and its relevance to the document content",
    multimodal_content=[{
        "type": "equation",
        "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
        "equation_caption": "Document relevance probability"
    }],
    mode="hybrid"
)
```
#### 6. Loading an Existing LightRAG Instance

@@ -450,16 +643,20 @@ from lightrag.utils import EmbeddingFunc
import os

async def load_existing_lightrag():
    # First, create or load an existing LightRAG instance
    # Set up API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # Optional

    # First, create or load an existing LightRAG instance
    lightrag_working_dir = "./existing_lightrag_storage"

    # Check if a previous LightRAG instance exists
    if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
        print("✅ Found existing LightRAG instance, loading...")
    else:
        print("❌ No existing LightRAG instance found, will create a new one")

    # Create/load LightRAG instance with your configuration
    lightrag_instance = LightRAG(
        working_dir=lightrag_working_dir,
        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
@@ -467,7 +664,8 @@ async def load_existing_lightrag():
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        ),
        embedding_func=EmbeddingFunc(
@@ -484,44 +682,73 @@ async def load_existing_lightrag():

    # Initialize storages (this will load existing data if available)
    await lightrag_instance.initialize_storages()
    await initialize_pipeline_status()

    # Now initialize RAGAnything with the existing LightRAG instance
    # Define vision model function for image processing
    def vision_model_func(
        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
    ):
        # If messages format is provided (for multimodal VLM enhanced query), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt}
                    if system_prompt
                    else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_data}"
                                },
                            },
                        ],
                    }
                    if image_data
                    else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Pure text format
        else:
            return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Now initialize RAGAnything with the existing LightRAG instance
    rag = RAGAnything(
        lightrag=lightrag_instance,  # Pass the existing LightRAG instance
        # Only the vision model is needed for multimodal processing
        vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=[
                {"role": "system", "content": system_prompt} if system_prompt else None,
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]} if image_data else {"role": "user", "content": prompt}
            ],
            api_key="your-api-key",
            **kwargs,
        ) if image_data else openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key="your-api-key",
            **kwargs,
        )
        lightrag=lightrag_instance,  # Pass the existing LightRAG instance
        vision_model_func=vision_model_func,
        # Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance
    )

    # Query the existing knowledge base
    result = await rag.query_with_multimodal(
    result = await rag.aquery(
        "What data has been processed in this LightRAG instance?",
        mode="hybrid"
    )
    print("Query result:", result)

    # Add new multimodal documents to the existing LightRAG instance
    await rag.process_document_complete(
        file_path="path/to/new/multimodal_document.pdf",
        output_dir="./output"
@@ -531,6 +758,195 @@ if __name__ == "__main__":
    asyncio.run(load_existing_lightrag())
```

#### 7. Inserting Content Lists Directly

When you already have a pre-parsed content list (for example, from an external parser or a previous processing run), you can insert it into RAGAnything directly, without document parsing:

```python
import asyncio
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc

async def insert_content_list_example():
    # Set up the API configuration
    api_key = "your-api-key"
    base_url = "your-base-url"  # optional

    # Create the RAGAnything configuration
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # Define the model functions
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
        # If the messages format is provided (for multimodal VLM-enhanced queries), use it directly
        if messages:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Traditional single-image format
        elif image_data:
            return openai_complete_if_cache(
                "gpt-4o",
                "",
                system_prompt=None,
                history_messages=[],
                messages=[
                    {"role": "system", "content": system_prompt} if system_prompt else None,
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                        ],
                    } if image_data else {"role": "user", "content": prompt},
                ],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )
        # Plain text format
        else:
            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
    )

    # Initialize RAGAnything
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Example: pre-parsed content list from an external source
    content_list = [
        {
            "type": "text",
            "text": "This is the introduction section of our research paper.",
            "page_idx": 0  # page number where this content appears
        },
        {
            "type": "image",
            "img_path": "/absolute/path/to/figure1.jpg",  # IMPORTANT: use an absolute path
            "image_caption": ["Figure 1: System Architecture"],
            "image_footnote": ["Source: author's original design"],
            "page_idx": 1  # page number where this image appears
        },
        {
            "type": "table",
            "table_body": "| Method | Accuracy | F1 Score |\n|------|--------|--------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |",
            "table_caption": ["Table 1: Performance Comparison"],
            "table_footnote": ["Results on the test dataset"],
            "page_idx": 2  # page number where this table appears
        },
        {
            "type": "equation",
            "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
            "text": "Document relevance probability formula",
            "page_idx": 3  # page number where this equation appears
        },
        {
            "type": "text",
            "text": "In conclusion, our method shows superior performance across all metrics.",
            "page_idx": 4  # page number where this content appears
        }
    ]

    # Insert the content list directly
    await rag.insert_content_list(
        content_list=content_list,
        file_path="research_paper.pdf",  # reference file name for citations
        split_by_character=None,  # optional text splitting
        split_by_character_only=False,  # optional text splitting mode
        doc_id=None,  # optional custom document ID (auto-generated if not provided)
        display_stats=True  # display content statistics
    )

    # Query the inserted content
    result = await rag.aquery(
        "What are the main findings and performance metrics mentioned in the research?",
        mode="hybrid"
    )
    print("Query result:", result)

    # You can also insert multiple content lists with different document IDs
    another_content_list = [
        {
            "type": "text",
            "text": "This is content from another document.",
            "page_idx": 0  # page number where this content appears
        },
        {
            "type": "table",
            "table_body": "| Feature | Value |\n|------|----|\n| Speed | Fast |\n| Accuracy | High |",
            "table_caption": ["Feature Comparison"],
            "page_idx": 1  # page number where this table appears
        }
    ]

    await rag.insert_content_list(
        content_list=another_content_list,
        file_path="another_document.pdf",
        doc_id="custom-doc-id-123"  # custom document ID
    )

if __name__ == "__main__":
    asyncio.run(insert_content_list_example())
```

**Content List Format:**

The `content_list` should follow the standard format, where each item is a dictionary:

- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`

**Important notes:**
- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`); a minimal normalization sketch follows this list
- **`page_idx`**: The page number where the content appears in the original document (0-based index)
- **Content order**: Items are processed in the order they appear in the list
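
If your content list was produced with image paths relative to the parser's output directory, a small preprocessing pass can normalize them before insertion. The helper below is a minimal sketch; the function and its `base_dir` argument are illustrative, not part of the RAGAnything API:

```python
import os

def absolutize_image_paths(content_list, base_dir):
    """Rewrite relative img_path entries as absolute paths (illustrative helper)."""
    for item in content_list:
        img_path = item.get("img_path")
        if img_path and not os.path.isabs(img_path):
            # Resolve the path against the directory the parser wrote images to
            item["img_path"] = os.path.abspath(os.path.join(base_dir, img_path))
    return content_list
```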

This method is particularly useful when:
- You have content from an external parser (other than MinerU/Docling)
- You want to process programmatically generated content
- You need to insert content from multiple sources into a single knowledge base
- You have cached parsing results that you want to reuse

---

## 🛠️ Examples

@@ -552,8 +968,8 @@ if __name__ == "__main__":

**Running the examples:**

```bash
# End-to-end processing (with parser selection)
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru

# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
```

@@ -592,11 +1008,29 @@ python examples/text_format_test.py --check-reportlab --file dummy

```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url  # optional
OUTPUT_DIR=./output  # default output directory for parsed documents
PARSER=mineru  # parser selection: mineru or docling
PARSE_METHOD=auto  # parse method: auto, ocr, or txt
```

**Note:** For backward compatibility, the old environment variable names still work:
- `MINERU_PARSE_METHOD` is deprecated; use `PARSE_METHOD` instead

### Parser Configuration

RAGAnything now supports multiple parsers, each with its own strengths:

#### MinerU Parser
- Supports many formats, including PDF, images, and Office documents
- Strong OCR and table extraction capabilities
- Supports GPU acceleration

#### Docling Parser
- Specifically optimized for parsing Office documents and HTML files
- Better preservation of document structure
- Native support for multiple Office formats

### MinerU Configuration

```bash
# MinerU 2.0 uses command-line arguments instead of a configuration file
@@ -609,20 +1043,43 @@ mineru -p input.pdf -o output_dir -m ocr  # OCR-focused parsing
mineru -p input.pdf -o output_dir -b pipeline --device cuda  # GPU acceleration
```

You can also configure parsing via RAGAnything parameters:

```python
# Basic parsing configuration and parser selection
await rag.process_document_complete(
    file_path="document.pdf",
    output_dir="./output/",
    parse_method="auto",  # or "ocr", "txt"
    parser="mineru"  # optional: "mineru" or "docling"
)

# Advanced parsing configuration (with special parameters)
await rag.process_document_complete(
    file_path="document.pdf",
    output_dir="./output/",
    parse_method="auto",  # parse method: "auto", "ocr", "txt"
    parser="mineru",  # parser selection: "mineru" or "docling"

    # MinerU-specific parameters - all supported kwargs:
    lang="ch",  # document language optimization (e.g., "ch", "en", "ja")
    device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
    start_page=0,  # starting page number (0-based, for PDFs)
    end_page=10,  # ending page number (0-based, for PDFs)
    formula=True,  # enable formula parsing
    table=True,  # enable table parsing
    backend="pipeline",  # parsing backend: pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
    source="huggingface",  # model source: "huggingface", "modelscope", "local"
    # vlm_url="http://127.0.0.1:3000"  # service address required when backend=vlm-sglang-client

    # Standard RAGAnything parameters
    display_stats=True,  # display content statistics
    split_by_character=None,  # optional character to split text on
    doc_id=None  # optional document ID
)
```

> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed via command-line arguments or function parameters. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling as needed.

### Processing Requirements

@@ -670,13 +1127,14 @@ await rag.process_document_complete(

</div>

```bibtex
@misc{guo2025raganythingallinoneragframework,
      title={RAG-Anything: All-in-One RAG Framework},
      author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},
      year={2025},
      eprint={2510.12323},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2510.12323},
}
```

341
docs/batch_processing.md
Normal file
@@ -0,0 +1,341 @@

# Batch Processing

This document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput.

## Overview

The batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options.

## Key Features

- **Parallel Processing**: Process multiple files concurrently using thread pools
- **Progress Tracking**: Real-time progress bars with `tqdm`
- **Error Handling**: Comprehensive error reporting and recovery
- **Flexible Input**: Support for files, directories, and recursive search
- **Configurable Workers**: Adjustable number of parallel workers
- **Installation Check Bypass**: Optional skip for environments with package conflicts

## Installation

```bash
# Basic installation
pip install raganything[all]

# Required for batch processing
pip install tqdm
```

## Usage

### Basic Batch Processing

```python
from raganything.batch_parser import BatchParser

# Create batch parser
batch_parser = BatchParser(
    parser_type="mineru",  # or "docling"
    max_workers=4,
    show_progress=True,
    timeout_per_file=300,
    skip_installation_check=False  # Set to True if you run into parser installation issues
)

# Process multiple files
result = batch_parser.process_batch(
    file_paths=["doc1.pdf", "doc2.docx", "folder/"],
    output_dir="./batch_output",
    parse_method="auto",
    recursive=True
)

# Check results
print(result.summary())
print(f"Success rate: {result.success_rate:.1f}%")
print(f"Processing time: {result.processing_time:.2f} seconds")
```

### Asynchronous Batch Processing

```python
import asyncio
from raganything.batch_parser import BatchParser

async def async_batch_processing():
    batch_parser = BatchParser(
        parser_type="mineru",
        max_workers=4,
        show_progress=True
    )

    # Process files asynchronously
    result = await batch_parser.process_batch_async(
        file_paths=["doc1.pdf", "doc2.docx"],
        output_dir="./output",
        parse_method="auto"
    )

    return result

# Run async processing
result = asyncio.run(async_batch_processing())
```

### Integration with RAG-Anything

```python
from raganything import RAGAnything

rag = RAGAnything()

# Process documents with batch functionality
result = rag.process_documents_batch(
    file_paths=["doc1.pdf", "doc2.docx"],
    output_dir="./output",
    max_workers=4,
    show_progress=True
)

print(f"Processed {len(result.successful_files)} files successfully")
```

### Process Documents with RAG Integration

```python
# Process documents in batch and then add them to RAG
result = await rag.process_documents_with_rag_batch(
    file_paths=["doc1.pdf", "doc2.docx"],
    output_dir="./output",
    max_workers=4,
    show_progress=True
)

print(f"Processed {result['successful_rag_files']} files with RAG")
print(f"Total processing time: {result['total_processing_time']:.2f} seconds")
```

### Command Line Interface

```bash
# Basic batch processing
python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4

# With specific parser
python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto

# Without progress bar
python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress

# Help
python -m raganything.batch_parser --help
```

## Configuration

### Environment Variables

```env
# Batch processing configuration
MAX_CONCURRENT_FILES=4
SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md
RECURSIVE_FOLDER_PROCESSING=true
PARSER_OUTPUT_DIR=./parsed_output
```
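
If you construct a `BatchParser` yourself, you can apply the same settings by reading these variables directly. A minimal sketch follows; the env-to-argument mapping shown here is an assumption for illustration, not a documented API:

```python
import os

from raganything.batch_parser import BatchParser

# Hypothetical mapping from the environment variables above to BatchParser arguments
batch_parser = BatchParser(
    parser_type="mineru",
    max_workers=int(os.getenv("MAX_CONCURRENT_FILES", "4")),
)

result = batch_parser.process_batch(
    file_paths=["./documents"],
    output_dir=os.getenv("PARSER_OUTPUT_DIR", "./parsed_output"),
    recursive=os.getenv("RECURSIVE_FOLDER_PROCESSING", "true").lower() == "true",
)
```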

### BatchParser Parameters

- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`)
- **max_workers**: Number of parallel workers (default: `4`)
- **show_progress**: Show progress bar (default: `True`)
- **timeout_per_file**: Timeout per file in seconds (default: `300`)
- **skip_installation_check**: Skip parser installation check (default: `False`)

## Supported File Types

- **PDF files**: `.pdf`
- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx`
- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp`
- **Text files**: `.txt`, `.md`

## API Reference

### BatchProcessingResult

```python
@dataclass
class BatchProcessingResult:
    successful_files: List[str]   # Successfully processed files
    failed_files: List[str]       # Failed files
    total_files: int              # Total number of files
    processing_time: float        # Total processing time in seconds
    errors: Dict[str, str]        # Error messages for failed files
    output_dir: str               # Output directory used

    def summary(self) -> str:     # Human-readable summary
    def success_rate(self) -> float:  # Success rate as percentage
```

### BatchParser Methods

```python
class BatchParser:
    def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...):
        """Initialize batch parser"""

    def get_supported_extensions(self) -> List[str]:
        """Get list of supported file extensions"""

    def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:
        """Filter files to only supported types"""

    def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
        """Process files in batch"""

    async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
        """Process files in batch asynchronously"""
```

## Performance Considerations

### Memory Usage
- Each worker uses additional memory
- Recommended: 2-4 workers for most systems
- Monitor memory usage with large files

### CPU Usage
- Parallel processing utilizes multiple cores
- Optimal worker count depends on CPU cores and file sizes
- I/O may become a bottleneck with many small files

### Recommended Settings
- **Small files** (< 1MB): Higher worker count (6-8)
- **Large files** (> 100MB): Lower worker count (2-3)
- **Mixed sizes**: Start with 4 workers and adjust (see the sketch below)
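
One way to apply these guidelines programmatically is to derive the worker count from the average input size. The helper below is illustrative (its thresholds simply mirror the list above) and is not part of the library:

```python
from pathlib import Path

from raganything.batch_parser import BatchParser

def suggest_max_workers(file_paths):
    """Pick a worker count from the average file size (heuristic, see above)."""
    sizes = [Path(p).stat().st_size for p in file_paths if Path(p).is_file()]
    if not sizes:
        return 4  # unknown workload: start with the default
    avg_size = sum(sizes) / len(sizes)
    if avg_size < 1 * 1024 * 1024:  # small files (< 1MB)
        return 8
    if avg_size > 100 * 1024 * 1024:  # large files (> 100MB)
        return 2
    return 4  # mixed sizes: start with 4 and adjust

batch_parser = BatchParser(max_workers=suggest_max_workers(["doc1.pdf", "doc2.docx"]))
```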

## Troubleshooting

### Common Issues

#### Memory Errors
```python
# Solution: Reduce max_workers
batch_parser = BatchParser(max_workers=2)
```

#### Timeout Errors
```python
# Solution: Increase timeout_per_file
batch_parser = BatchParser(timeout_per_file=600)  # 10 minutes
```

#### Parser Installation Issues
```python
# Solution: Skip installation check
batch_parser = BatchParser(skip_installation_check=True)
```

#### File Not Found Errors
- Check file paths and permissions
- Ensure input files exist
- Verify directory access rights

### Debug Mode

Enable debug logging for detailed information:

```python
import logging
logging.basicConfig(level=logging.DEBUG)

# Create batch parser with debug logging
batch_parser = BatchParser(parser_type="mineru", max_workers=2)
```

### Error Handling

The batch processor provides comprehensive error handling:

```python
result = batch_parser.process_batch(file_paths=["doc1.pdf", "doc2.docx"])

# Check for errors
if result.failed_files:
    print("Failed files:")
    for file_path in result.failed_files:
        error_message = result.errors.get(file_path, "Unknown error")
        print(f"  - {file_path}: {error_message}")

# Process only successful files
for file_path in result.successful_files:
    print(f"Successfully processed: {file_path}")
```

## Examples

### Process Entire Directory

```python
from pathlib import Path

# Process all supported files in a directory
batch_parser = BatchParser(max_workers=4)
directory_path = Path("./documents")

result = batch_parser.process_batch(
    file_paths=[str(directory_path)],
    output_dir="./processed",
    recursive=True  # Include subdirectories
)

print(f"Processed {len(result.successful_files)} out of {result.total_files} files")
```

### Filter Files Before Processing

```python
# Get all files in directory
all_files = ["doc1.pdf", "image.png", "spreadsheet.xlsx", "unsupported.xyz"]

# Filter to supported files only
supported_files = batch_parser.filter_supported_files(all_files)
print(f"Will process {len(supported_files)} out of {len(all_files)} files")

# Process only supported files
result = batch_parser.process_batch(
    file_paths=supported_files,
    output_dir="./output"
)
```

### Custom Error Handling

```python
def process_with_retry(file_paths, max_retries=3):
    """Process files with retry logic"""

    for attempt in range(max_retries):
        result = batch_parser.process_batch(file_paths, "./output")

        if not result.failed_files:
            break  # All files processed successfully

        print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed")
        file_paths = result.failed_files  # Retry failed files

    return result
```

## Best Practices

1. **Start with default settings** and adjust based on performance
2. **Monitor system resources** during batch processing
3. **Use appropriate worker counts** for your hardware
4. **Handle errors gracefully** with retry logic
5. **Test with small batches** before processing large collections
6. **Use skip_installation_check** if facing parser installation issues
7. **Enable progress tracking** for long-running operations
8. **Set appropriate timeouts** based on expected file processing times

## Conclusion

The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.

375
docs/context_aware_processing.md
Normal file
@@ -0,0 +1,375 @@

# Context-Aware Multimodal Processing in RAGAnything

This document describes the context-aware multimodal processing feature in RAGAnything, which provides surrounding content information to LLMs when analyzing images, tables, equations, and other multimodal content for enhanced accuracy and relevance.

## Overview

The context-aware feature enables RAGAnything to automatically extract and provide surrounding text content as context when processing multimodal content. This leads to more accurate and contextually relevant analysis by giving AI models additional information about where the content appears in the document structure.

### Key Benefits

- **Enhanced Accuracy**: Context helps AI understand the purpose and meaning of multimodal content
- **Semantic Coherence**: Generated descriptions align with document context and terminology
- **Automated Integration**: Context extraction is automatically enabled during document processing
- **Flexible Configuration**: Multiple extraction modes and filtering options

## Key Features

### 1. Configuration Support
- **Integrated Configuration**: Complete context options in `RAGAnythingConfig`
- **Environment Variables**: Configure all context parameters via environment variables
- **Dynamic Updates**: Runtime configuration updates supported
- **Content Format Control**: Configurable content source format detection

### 2. Automated Integration
- **Auto-Initialization**: Modal processors automatically receive tokenizer and context configuration
- **Content Source Setup**: Document processing automatically sets content sources for context extraction
- **Position Information**: Automatic position info (page_idx, index) passed to processors
- **Batch Processing**: Context-aware batch processing for efficient document handling

### 3. Advanced Token Management
- **Accurate Token Counting**: Uses LightRAG's tokenizer for precise token calculation
- **Smart Boundary Preservation**: Truncates at sentence/paragraph boundaries
- **Backward Compatibility**: Fallback to character truncation when tokenizer unavailable

### 4. Universal Context Extraction
- **Multiple Formats**: Support for MinerU, plain text, custom formats
- **Flexible Modes**: Page-based and chunk-based context extraction
- **Content Filtering**: Configurable content type filtering
- **Header Support**: Optional inclusion of document headers and structure

## Configuration

### RAGAnythingConfig Parameters

```python
# Context Extraction Configuration
context_window: int = 1                              # Context window size (pages/chunks)
context_mode: str = "page"                           # Context mode ("page" or "chunk")
max_context_tokens: int = 2000                       # Maximum context tokens
include_headers: bool = True                         # Include document headers
include_captions: bool = True                        # Include image/table captions
context_filter_content_types: List[str] = ["text"]  # Content types to include
content_format: str = "minerU"                       # Default content format for context extraction
```

### Environment Variables

```bash
# Context extraction settings
CONTEXT_WINDOW=2
CONTEXT_MODE=page
MAX_CONTEXT_TOKENS=3000
INCLUDE_HEADERS=true
INCLUDE_CAPTIONS=true
CONTEXT_FILTER_CONTENT_TYPES=text,image
CONTENT_FORMAT=minerU
```

## Usage Guide

### 1. Basic Configuration

```python
from raganything import RAGAnything, RAGAnythingConfig

# Create configuration with context settings
config = RAGAnythingConfig(
    context_window=2,
    context_mode="page",
    max_context_tokens=3000,
    include_headers=True,
    include_captions=True,
    context_filter_content_types=["text", "image"],
    content_format="minerU"
)

# Create RAGAnything instance
rag_anything = RAGAnything(
    config=config,
    llm_model_func=your_llm_function,
    embedding_func=your_embedding_function
)
```

### 2. Automatic Document Processing

```python
# Context is automatically enabled during document processing
await rag_anything.process_document_complete("document.pdf")
```

### 3. Manual Content Source Configuration

```python
# Set content source for specific content lists
rag_anything.set_content_source_for_context(content_list, "minerU")

# Update context configuration at runtime
rag_anything.update_context_config(
    context_window=1,
    max_context_tokens=1500,
    include_captions=False
)
```

### 4. Direct Modal Processor Usage

```python
from raganything.modalprocessors import (
    ContextExtractor,
    ContextConfig,
    ImageModalProcessor
)

# Configure context extraction
config = ContextConfig(
    context_window=1,
    context_mode="page",
    max_context_tokens=2000,
    include_headers=True,
    include_captions=True,
    filter_content_types=["text"]
)

# Initialize context extractor
context_extractor = ContextExtractor(config)

# Initialize modal processor with context support
processor = ImageModalProcessor(lightrag, caption_func, context_extractor)

# Set content source
processor.set_content_source(content_list, "minerU")

# Process with context
item_info = {
    "page_idx": 2,
    "index": 5,
    "type": "image"
}

result = await processor.process_multimodal_content(
    modal_content=image_data,
    content_type="image",
    file_path="document.pdf",
    entity_name="Architecture Diagram",
    item_info=item_info
)
```

## Context Modes

### Page-Based Context (`context_mode="page"`)
- Extracts context based on page boundaries
- Uses `page_idx` field from content items
- Suitable for document-structured content
- Example: Include text from 2 pages before and after current image

### Chunk-Based Context (`context_mode="chunk"`)
- Extracts context based on content item positions
- Uses sequential position in content list
- Suitable for fine-grained control
- Example: Include 5 content items before and after current table

## Processing Workflow

### 1. Document Parsing
```
Document Input → MinerU Parsing → content_list Generation
```

### 2. Context Setup
```
content_list → Set as Context Source → All Modal Processors Gain Context Capability
```

### 3. Multimodal Processing
```
Multimodal Content → Extract Surrounding Context → Enhanced LLM Analysis → More Accurate Results
```

## Content Source Formats

### MinerU Format
```json
[
  {
    "type": "text",
    "text": "Document content here...",
    "text_level": 1,
    "page_idx": 0
  },
  {
    "type": "image",
    "img_path": "images/figure1.jpg",
    "image_caption": ["Figure 1: Architecture"],
    "image_footnote": [],
    "page_idx": 1
  }
]
```

### Custom Text Chunks
```python
text_chunks = [
    "First chunk of text content...",
    "Second chunk of text content...",
    "Third chunk of text content..."
]
```

### Plain Text
```python
full_document = "Complete document text with all content..."
```

## Configuration Examples

### High-Precision Context
For focused analysis with minimal context:
```python
config = RAGAnythingConfig(
    context_window=1,
    context_mode="page",
    max_context_tokens=1000,
    include_headers=True,
    include_captions=False,
    context_filter_content_types=["text"]
)
```

### Comprehensive Context
For broad analysis with rich context:
```python
config = RAGAnythingConfig(
    context_window=2,
    context_mode="page",
    max_context_tokens=3000,
    include_headers=True,
    include_captions=True,
    context_filter_content_types=["text", "image", "table"]
)
```

### Chunk-Based Analysis
For fine-grained sequential context:
```python
config = RAGAnythingConfig(
    context_window=5,
    context_mode="chunk",
    max_context_tokens=2000,
    include_headers=False,
    include_captions=False,
    context_filter_content_types=["text"]
)
```

## Performance Optimization

### 1. Accurate Token Control
- Uses real tokenizer for precise token counting
- Avoids exceeding LLM token limits
- Provides consistent performance

### 2. Smart Truncation
- Truncates at sentence boundaries
- Maintains semantic integrity
- Adds truncation indicators

### 3. Caching Optimization
- Context extraction results can be reused
- Reduces redundant computation overhead

## Advanced Features

### Context Truncation
The system automatically truncates context to fit within token limits:
- Uses actual tokenizer for accurate token counting
- Attempts to end at sentence boundaries (periods)
- Falls back to line boundaries if needed
- Adds "..." indicator for truncated content
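
Conceptually, the truncation step behaves like the sketch below. This is a simplified illustration assuming a tokenizer object with `encode`/`decode` methods, not the library's actual implementation (see `raganything/modalprocessors.py` for that):

```python
def truncate_context(text: str, tokenizer, max_tokens: int) -> str:
    """Simplified sketch of token-limited, boundary-aware truncation."""
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return text  # already within the limit

    truncated = tokenizer.decode(tokens[:max_tokens])
    # Prefer a sentence boundary, then fall back to a line boundary
    for boundary in (".", "\n"):
        pos = truncated.rfind(boundary)
        if pos > 0:
            return truncated[: pos + 1] + "..."
    return truncated + "..."  # no boundary found; hard cut
```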

### Header Formatting
When `include_headers=True`, headers are formatted with markdown-style prefixes:
```
# Level 1 Header
## Level 2 Header
### Level 3 Header
```

### Caption Integration
When `include_captions=True`, image and table captions are included as:
```
[Image: Figure 1 caption text]
[Table: Table 1 caption text]
```

## Integration with RAGAnything

The context-aware feature is seamlessly integrated into RAGAnything's workflow:

1. **Automatic Setup**: Context extractors are automatically created and configured
2. **Content Source Management**: Document processing automatically sets content sources
3. **Processor Integration**: All modal processors receive context capabilities
4. **Configuration Consistency**: Single configuration system for all context settings

## Error Handling

The system includes robust error handling:
- Gracefully handles missing or invalid content sources
- Returns empty context for unsupported formats
- Logs warnings for configuration issues
- Continues processing even if context extraction fails
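
In practice this means a failed extraction degrades to an empty context instead of aborting the document. The pattern is roughly the following sketch; the `extract_context` method name is a placeholder for illustration, not a documented API:

```python
import logging

logger = logging.getLogger(__name__)

def safe_extract_context(context_extractor, item_info) -> str:
    """Return surrounding context, or an empty string if extraction fails."""
    try:
        # Hypothetical extraction call; any failure degrades to "no context"
        return context_extractor.extract_context(item_info)
    except Exception as exc:
        logger.warning("Context extraction failed, continuing without it: %s", exc)
        return ""
```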

## Compatibility

- **Backward Compatible**: Existing code works without modification
- **Optional Feature**: Context can be selectively enabled/disabled
- **Flexible Configuration**: Supports multiple configuration combinations

## Best Practices

1. **Token Limits**: Ensure `max_context_tokens` doesn't exceed LLM context limits
2. **Performance Impact**: Larger context windows increase processing time
3. **Content Quality**: Context quality directly affects analysis accuracy
4. **Window Size**: Match window size to content structure (documents vs articles)
5. **Content Filtering**: Use `context_filter_content_types` to reduce noise

## Troubleshooting

### Common Issues

**Context Not Extracted**
- Check if `set_content_source_for_context()` was called
- Verify `item_info` contains required fields (`page_idx`, `index`)
- Confirm content source format is correct

**Context Too Long/Short**
- Adjust `max_context_tokens` setting
- Modify `context_window` size
- Check `context_filter_content_types` configuration

**Irrelevant Context**
- Refine `context_filter_content_types` to exclude noise
- Reduce `context_window` size
- Set `include_captions=False` if captions are not helpful

**Configuration Issues**
- Verify environment variables are set correctly
- Check RAGAnythingConfig parameter names
- Ensure content_format matches your data source

## Examples

Check out these example files for complete usage demonstrations:

- **Configuration Examples**: See how to set up different context configurations
- **Integration Examples**: Learn how to integrate context-aware processing into your workflow
- **Custom Processors**: Examples of creating custom modal processors with context support

## API Reference

For detailed API documentation, see the docstrings in:
- `raganything/modalprocessors.py` - Context extraction and modal processors
- `raganything/config.py` - Configuration options
- `raganything/raganything.py` - Main RAGAnything class integration

552
docs/enhanced_markdown.md
Normal file
@@ -0,0 +1,552 @@

# Enhanced Markdown Conversion

This document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling.

## Overview

The enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline.

## Key Features

- **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection
- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts
- **Image Support**: Embedded images with proper scaling and positioning
- **Table Support**: Formatted tables with borders and professional styling
- **Code Highlighting**: Syntax highlighting for code blocks using Pygments
- **Custom Templates**: Support for custom CSS and document templates
- **Table of Contents**: Automatic TOC generation with navigation links
- **Professional Typography**: High-quality fonts and spacing

## Installation

### Required Dependencies

```bash
# Basic installation
pip install raganything[all]

# Required for enhanced markdown conversion
pip install markdown weasyprint pygments
```

### Optional Dependencies

```bash
# For Pandoc backend (system installation required)
# Ubuntu/Debian:
sudo apt-get install pandoc wkhtmltopdf

# macOS:
brew install pandoc wkhtmltopdf

# Or using conda:
conda install -c conda-forge pandoc wkhtmltopdf
```

### Backend-Specific Installation

#### WeasyPrint (Recommended)
```bash
# Install WeasyPrint with system dependencies
pip install weasyprint

# Ubuntu/Debian system dependencies:
sudo apt-get install -y build-essential python3-dev python3-pip \
    python3-setuptools python3-wheel python3-cffi libcairo2 \
    libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
    libffi-dev shared-mime-info
```

#### Pandoc
- Download from: https://pandoc.org/installing.html
- Requires system-wide installation
- Used for complex document structures and LaTeX-quality output

## Usage

### Basic Conversion

```python
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig

# Create converter with default settings
converter = EnhancedMarkdownConverter()

# Convert markdown file to PDF
success = converter.convert_file_to_pdf(
    input_path="document.md",
    output_path="document.pdf",
    method="auto"  # Automatically select best available backend
)

if success:
    print("✅ Conversion successful!")
else:
    print("❌ Conversion failed")
```

### Advanced Configuration

```python
# Create custom configuration
config = MarkdownConfig(
    page_size="A4",            # A4, Letter, Legal, etc.
    margin="1in",              # CSS-style margins
    font_size="12pt",          # Base font size
    line_height="1.5",         # Line spacing
    include_toc=True,          # Generate table of contents
    syntax_highlighting=True,  # Enable code syntax highlighting

    # Custom CSS styling
    custom_css="""
    body {
        font-family: 'Georgia', serif;
        color: #333;
    }
    h1 {
        color: #2c3e50;
        border-bottom: 2px solid #3498db;
        padding-bottom: 0.3em;
    }
    code {
        background-color: #f8f9fa;
        padding: 2px 4px;
        border-radius: 3px;
    }
    pre {
        background-color: #f8f9fa;
        border-left: 4px solid #3498db;
        padding: 15px;
        border-radius: 5px;
    }
    table {
        border-collapse: collapse;
        width: 100%;
        margin: 1em 0;
    }
    th, td {
        border: 1px solid #ddd;
        padding: 8px 12px;
        text-align: left;
    }
    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }
    """
)

converter = EnhancedMarkdownConverter(config)
```

### Backend Selection

```python
# Check available backends
converter = EnhancedMarkdownConverter()
backend_info = converter.get_backend_info()

print("Available backends:")
for backend, available in backend_info["available_backends"].items():
    status = "✅" if available else "❌"
    print(f"  {status} {backend}")

print(f"Recommended backend: {backend_info['recommended_backend']}")

# Use specific backend
converter.convert_file_to_pdf(
    input_path="document.md",
    output_path="document.pdf",
    method="weasyprint"  # or "pandoc", "pandoc_system", "auto"
)
```

### Content Conversion

````python
# Convert markdown content directly (not from file)
markdown_content = """
# Sample Document

## Introduction
This is a **bold** statement with *italic* text.

## Code Example
```python
def hello_world():
    print("Hello, World!")
    return "Success"
```

## Table
| Feature | Status | Notes |
|---------|--------|-------|
| PDF Generation | ✅ | Working |
| Syntax Highlighting | ✅ | Pygments |
| Custom CSS | ✅ | Full support |
"""

success = converter.convert_markdown_to_pdf(
    markdown_content=markdown_content,
    output_path="sample.pdf",
    method="auto"
)
````

### Command Line Interface

```bash
# Basic conversion
python -m raganything.enhanced_markdown document.md --output document.pdf

# With specific backend
python -m raganything.enhanced_markdown document.md --method weasyprint

# With custom CSS file
python -m raganything.enhanced_markdown document.md --css custom_style.css

# Show backend information
python -m raganything.enhanced_markdown --info

# Help
python -m raganything.enhanced_markdown --help
```

## Backend Comparison

| Backend | Pros | Cons | Best For | Quality |
|---------|------|------|----------|---------|
| **WeasyPrint** | • Excellent CSS support<br>• Fast rendering<br>• Great web-style layouts<br>• Python-based | • Limited LaTeX features<br>• Requires system deps | • Web-style documents<br>• Custom styling<br>• Fast conversion | ⭐⭐⭐⭐ |
| **Pandoc** | • Extensive features<br>• LaTeX-quality output<br>• Academic formatting<br>• Many input/output formats | • Slower conversion<br>• System installation<br>• Complex setup | • Academic papers<br>• Complex documents<br>• Publication quality | ⭐⭐⭐⭐⭐ |
| **Auto** | • Automatic selection<br>• Fallback support<br>• User-friendly | • May not use optimal backend | • General use<br>• Quick setup<br>• Development | ⭐⭐⭐⭐ |

## Configuration Options

### MarkdownConfig Parameters

```python
@dataclass
class MarkdownConfig:
    # Page layout
    page_size: str = "A4"              # A4, Letter, Legal, A3, etc.
    margin: str = "1in"                # CSS margin format
    font_size: str = "12pt"            # Base font size
    line_height: str = "1.5"           # Line spacing multiplier

    # Content options
    include_toc: bool = True           # Generate table of contents
    syntax_highlighting: bool = True   # Enable code highlighting
    image_max_width: str = "100%"      # Maximum image width
    table_style: str = "..."           # Default table CSS

    # Styling
    css_file: Optional[str] = None     # External CSS file path
    custom_css: Optional[str] = None   # Inline CSS content
    template_file: Optional[str] = None  # Custom HTML template

    # Output options
    output_format: str = "pdf"         # Currently only PDF supported
    output_dir: Optional[str] = None   # Output directory

    # Metadata
    metadata: Optional[Dict[str, str]] = None  # Document metadata
```

### Supported Markdown Features

#### Basic Formatting
- **Headers**: `# ## ### #### ##### ######`
- **Emphasis**: `*italic*`, `**bold**`, `***bold italic***`
- **Links**: `[text](url)`, `[text][ref]`
- **Images**: `![alt](url)`, `![alt][ref]`
- **Lists**: Ordered and unordered, nested
- **Blockquotes**: `> quote`
- **Line breaks**: Double space or `\n\n`

#### Advanced Features
- **Tables**: GitHub-style tables with alignment
- **Code blocks**: Fenced code blocks with language specification
- **Inline code**: `backtick code`
- **Horizontal rules**: `---` or `***`
- **Footnotes**: `[^1]` references
- **Definition lists**: Term and definition pairs
- **Attributes**: `{#id .class key=value}`

#### Code Highlighting

````markdown
```python
def example_function():
    """This will be syntax highlighted"""
    return "Hello, World!"
```

```javascript
function exampleFunction() {
    // This will also be highlighted
    return "Hello, World!";
}
```
````

## Integration with RAG-Anything

The enhanced markdown conversion integrates seamlessly with RAG-Anything:

```python
from raganything import RAGAnything

# Initialize RAG-Anything
rag = RAGAnything()

# Process markdown files - enhanced conversion is used automatically
await rag.process_document_complete("document.md")

# Batch processing with enhanced markdown conversion
result = rag.process_documents_batch(
    file_paths=["doc1.md", "doc2.md", "doc3.md"],
    output_dir="./output"
)

# The .md files will be converted to PDF using enhanced conversion
# before being processed by the RAG system
```

## Performance Considerations

### Conversion Speed
- **WeasyPrint**: ~1-3 seconds for typical documents
- **Pandoc**: ~3-10 seconds for typical documents
- **Large documents**: Time scales roughly linearly with content

### Memory Usage
- **WeasyPrint**: ~50-100MB per conversion
- **Pandoc**: ~100-200MB per conversion
- **Images**: Large images increase memory usage significantly

### Optimization Tips
1. **Resize large images** before embedding (see the sketch below)
2. **Use compressed images** (JPEG for photos, PNG for graphics)
3. **Limit concurrent conversions** to avoid memory issues
4. **Cache converted content** when processing multiple times
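
For tip 1, a small preprocessing step with Pillow (a separate dependency: `pip install pillow`) can downscale oversized images in place before they are referenced from the markdown. This is an illustrative sketch, not part of the converter:

```python
from PIL import Image

def downscale_image(path: str, max_width: int = 1200) -> None:
    """Resize an image in place so its width does not exceed max_width."""
    with Image.open(path) as img:
        if img.width <= max_width:
            return  # already small enough
        new_height = int(img.height * max_width / img.width)
        resized = img.resize((max_width, new_height))
    resized.save(path)  # overwrite with the downscaled version
```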

## Examples

### Sample Markdown Document

````markdown
# Technical Documentation

## Table of Contents
[TOC]

## Overview
This document provides comprehensive technical specifications.

## Architecture

### System Components
1. **Parser Engine**: Handles document processing
2. **Storage Layer**: Manages data persistence
3. **Query Interface**: Provides search capabilities

### Code Implementation
```python
from raganything import RAGAnything

# Initialize system
rag = RAGAnything(config={
    "working_dir": "./storage",
    "enable_image_processing": True
})

# Process document
await rag.process_document_complete("document.pdf")
```

### Performance Metrics

| Component | Throughput | Latency | Memory |
|-----------|------------|---------|--------|
| Parser | 100 docs/hour | 36s avg | 2.5 GB |
| Storage | 1000 ops/sec | 1ms avg | 512 MB |
| Query | 50 queries/sec | 20ms avg | 1 GB |

## Integration Notes

> **Important**: Always validate input before processing.

## Conclusion
The enhanced system provides excellent performance for document processing workflows.
````

### Generated PDF Features

The enhanced markdown converter produces PDFs with:

- **Professional typography** with proper font selection and spacing
- **Syntax-highlighted code blocks** using Pygments
- **Formatted tables** with borders and alternating row colors
- **Clickable table of contents** with navigation links
- **Responsive images** that scale appropriately
- **Custom styling** through CSS
- **Proper page breaks** and margins
- **Document metadata** and properties

## Troubleshooting

### Common Issues

#### WeasyPrint Installation Problems
```bash
# Ubuntu/Debian: Install system dependencies
sudo apt-get update
sudo apt-get install -y build-essential python3-dev libcairo2 \
    libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
    libffi-dev shared-mime-info

# Then reinstall WeasyPrint
pip install --force-reinstall weasyprint
```

#### Pandoc Not Found
```bash
# Check if Pandoc is installed
pandoc --version

# Install Pandoc (Ubuntu/Debian)
sudo apt-get install pandoc wkhtmltopdf

# Or download from: https://pandoc.org/installing.html
```

#### CSS Issues
- Check CSS syntax in custom_css
- Verify CSS file paths exist
- Test CSS with simple HTML first
- Use browser developer tools to debug styling

#### Image Problems
- Ensure images are accessible (correct paths)
- Check image file formats (PNG, JPEG, GIF supported)
- Verify image file permissions
- Consider image size and format optimization

#### Font Issues
```python
# Use web-safe fonts
config = MarkdownConfig(
    custom_css="""
    body {
        font-family: 'Arial', 'Helvetica', sans-serif;
    }
    """
)
```

### Debug Mode

Enable detailed logging for troubleshooting:

```python
import logging

# Enable debug logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Create converter with debug logging
converter = EnhancedMarkdownConverter()
result = converter.convert_file_to_pdf("test.md", "test.pdf")
```

### Error Handling

```python
def robust_conversion(input_path, output_path):
    """Convert with fallback backends"""
    converter = EnhancedMarkdownConverter()

    # Try backends in order of preference
    backends = ["weasyprint", "pandoc", "auto"]

    for backend in backends:
        try:
            success = converter.convert_file_to_pdf(
                input_path=input_path,
                output_path=output_path,
                method=backend
            )
            if success:
                print(f"✅ Conversion successful with {backend}")
                return True
        except Exception as e:
            print(f"❌ {backend} failed: {str(e)}")
            continue

    print("❌ All backends failed")
    return False
```

## API Reference

### EnhancedMarkdownConverter

```python
class EnhancedMarkdownConverter:
    def __init__(self, config: Optional[MarkdownConfig] = None):
        """Initialize converter with optional configuration"""

    def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool:
        """Convert markdown file to PDF"""

    def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool:
        """Convert markdown content to PDF"""

    def get_backend_info(self) -> Dict[str, Any]:
        """Get information about available backends"""

    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
        """Convert using WeasyPrint backend"""

    def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:
        """Convert using Pandoc backend"""
```

## Best Practices

1. **Choose the right backend** for your use case:
   - **WeasyPrint** for web-style documents and custom CSS
   - **Pandoc** for academic papers and complex formatting
   - **Auto** for general use and development

2. **Optimize images** before embedding:
   - Use appropriate formats (JPEG for photos, PNG for graphics)
   - Compress images to reduce file size
   - Set reasonable maximum widths

3. **Design responsive layouts**:
   - Use relative units (%, em) instead of absolute (px)
   - Test with different page sizes
   - Consider print-specific CSS

4. **Test your styling**:
   - Start with default styling and incrementally customize
   - Test with sample content before production use
   - Validate CSS syntax

5. **Handle errors gracefully**:
   - Implement fallback backends
   - Provide meaningful error messages
   - Log conversion attempts for debugging

6. **Performance optimization**:
   - Cache converted content when possible
   - Process large batches with appropriate worker counts
   - Monitor memory usage with large documents

## Conclusion

The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.

78
docs/offline_setup.md
Normal file
@@ -0,0 +1,78 @@

# Running RAG-Anything in an Offline Environment

This document explains a critical consideration for running the RAG-Anything project in an environment with no internet access.

## The Network Dependency: `LightRAG` and `tiktoken`

The `RAGAnything` core engine relies on the `LightRAG` library for its primary functionality. `LightRAG`, in turn, uses OpenAI's `tiktoken` library for text tokenization.

By default, the `tiktoken` library has a network dependency. On its first use, it attempts to download tokenizer models from OpenAI's public servers (`openaipublic.blob.core.windows.net`). If the application is running in an offline or network-restricted environment, this download will fail, causing the `LightRAG` instance to fail to initialize.

This results in an error similar to the following:

```
Failed to initialize LightRAG instance: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_ba
```

This dependency is indirect. The `RAG-Anything` codebase itself does not directly import or call `tiktoken`. The call is made from within the `lightrag` library.

## The Solution: Using a Local `tiktoken` Cache

To resolve this issue and enable fully offline operation, you must provide a local cache for the `tiktoken` models. This is achieved by setting the `TIKTOKEN_CACHE_DIR` environment variable **before** the application starts.

When this environment variable is set, `tiktoken` will look for its model files in the specified local directory instead of attempting to download them from the internet.

### Steps to Implement the Solution:

1. **Create a Model Cache:** In an environment *with* internet access, run the provided script to download and cache the necessary `tiktoken` models.

   ```bash
   # Run the cache creation script
   uv run scripts/create_tiktoken_cache.py
   ```

   This will create a `tiktoken_cache` directory in your project root containing the required model files.
|
||||
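   The contents of `scripts/create_tiktoken_cache.py` are not shown here; a minimal sketch of what such a script might do follows. It points `TIKTOKEN_CACHE_DIR` at a local directory before touching `tiktoken`, then loads each encoding once so the model files land in the cache. The encoding list is an assumption (cache whichever encodings your LightRAG version actually requests):

   ```python
   import os
   from pathlib import Path

   # Must be set BEFORE tiktoken fetches anything, so the model files
   # are written into our local cache directory.
   cache_dir = Path(__file__).parent.parent / "tiktoken_cache"
   cache_dir.mkdir(exist_ok=True)
   os.environ["TIKTOKEN_CACHE_DIR"] = str(cache_dir)

   import tiktoken  # noqa: E402  (imported after the env var on purpose)

   # Loading an encoding forces tiktoken to download and cache its model
   # file. o200k_base matches the encoding named in the error message
   # above; add others if your models need them (an assumption).
   for name in ("o200k_base", "cl100k_base"):
       tiktoken.get_encoding(name)
       print(f"cached {name} in {cache_dir}")
   ```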
2. **Configure the Environment Variable:** Add the following line to your `.env` file:

   ```bash
   TIKTOKEN_CACHE_DIR=./tiktoken_cache
   ```

   **Important:** Ensure the `.env` file is loaded **before** `LightRAG` imports `tiktoken`; otherwise the setting has no effect.

   ```python
   import os
   from typing import Dict, Any, Optional, Callable
   import sys
   import asyncio
   import atexit
   from dataclasses import dataclass, field
   from pathlib import Path
   from dotenv import load_dotenv

   # Add project root directory to Python path
   sys.path.insert(0, str(Path(__file__).parent.parent))

   # Load environment variables FIRST - before any imports that use tiktoken
   load_dotenv(dotenv_path=".env", override=False)

   # Now import LightRAG (which will import tiktoken with the correct env var set)
   from lightrag import LightRAG
   from lightrag.utils import logger

   # Rest of the code...
   ```

### Testing the Offline Setup

1. **Create a `tiktoken_cache` directory:** If you don't have one already, create a directory named `tiktoken_cache` in the project root.
2. **Populate the cache:** Run the `scripts/create_tiktoken_cache.py` script to download the necessary tiktoken models into the `tiktoken_cache` directory.
3. **Set the `TIKTOKEN_CACHE_DIR` environment variable:** Add the line `TIKTOKEN_CACHE_DIR=./tiktoken_cache` to your `.env` file.
4. **Disconnect from the internet:** Disable your internet connection or put your machine in airplane mode.
5. **Run the application:** Start the `RAG-Anything` application. For example:

   ```
   uv run examples/raganything_example.py requirements.txt
   ```
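   To verify the cache is actually being used (rather than the run merely succeeding over a leftover network connection), a quick check like the following can help. `o200k_base` here mirrors the encoding from the error message above and is an assumption about what your LightRAG version requests:

   ```python
   import os

   os.environ["TIKTOKEN_CACHE_DIR"] = "./tiktoken_cache"

   import tiktoken  # imported after the env var so the cache dir applies

   # With the cache populated and the network disabled, this succeeds
   # without any HTTP request; if the cache is missing it tries to
   # download and fails, reproducing the error shown earlier.
   enc = tiktoken.get_encoding("o200k_base")
   print(enc.encode("offline tokenization works"))
   ```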
By following these steps, you can eliminate the network dependency and run the `RAG-Anything` project successfully in a fully offline environment.

26
env.example
@@ -10,6 +10,12 @@ OLLAMA_EMULATING_MODEL_TAG=latest
# WORKERS=2
# CORS_ORIGINS=http://localhost:3000,http://localhost:8080

### Tiktoken Cache Configuration (for offline deployment)
### Set this to a local directory containing cached tiktoken models
### This prevents tiktoken from downloading models from the internet on initialization
### See docs/offline_setup.md for setup instructions
# TIKTOKEN_CACHE_DIR=./tiktoken_cache

### Login Configuration
# AUTH_ACCOUNTS='admin:admin123,user1:pass456'
# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server
@@ -33,9 +39,10 @@ OLLAMA_EMULATING_MODEL_TAG=latest

### RAGAnything Configuration (Multimodal Document Processing)
### ---
### MinerU Parser Configuration
# MINERU_PARSE_METHOD=auto
# MINERU_OUTPUT_DIR=./output
### Parser Configuration
# PARSE_METHOD=auto
# OUTPUT_DIR=./output
# PARSER=mineru
# DISPLAY_CONTENT_STATS=true

### Multimodal Processing Configuration
@@ -48,6 +55,15 @@ OLLAMA_EMULATING_MODEL_TAG=latest
# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
# RECURSIVE_FOLDER_PROCESSING=true

### Context Extraction Configuration
# CONTEXT_WINDOW=1
# CONTEXT_MODE=page
# MAX_CONTEXT_TOKENS=2000
# INCLUDE_HEADERS=true
# INCLUDE_CAPTIONS=true
# CONTEXT_FILTER_CONTENT_TYPES=text
# CONTENT_FORMAT=minerU

### Max nodes returned from graph retrieval
# MAX_GRAPH_NODES=1000

@@ -93,7 +109,7 @@ MAX_ASYNC=4
### MAX_TOKENS: max tokens sent to LLM for entity relation summaries (less than context size of the model)
### MAX_TOKENS: set as num_ctx option for Ollama by API Server
MAX_TOKENS=32768
### LLM Binding type: openai, ollama, lollms, azure_openai
### LLM Binding type: openai, ollama, lollms, azure_openai, lmstudio
LLM_BINDING=openai
LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
@@ -103,7 +119,7 @@ LLM_BINDING_API_KEY=your_api_key
# AZURE_OPENAI_DEPLOYMENT=gpt-4o

### Embedding Configuration
### Embedding Binding type: openai, ollama, lollms, azure_openai
### Embedding Binding type: openai, ollama, lollms, azure_openai, lmstudio
EMBEDDING_BINDING=ollama
EMBEDDING_MODEL=bge-m3:latest
EMBEDDING_DIM=1024
561
examples/batch_processing_example.py
Normal file
@@ -0,0 +1,561 @@
#!/usr/bin/env python
"""
Batch Processing Example for RAG-Anything

This example demonstrates how to use the batch processing capabilities
to process multiple documents in parallel for improved throughput.

Features demonstrated:
- Basic batch processing with BatchParser
- Asynchronous batch processing
- Integration with RAG-Anything
- Error handling and progress tracking
- File filtering and directory processing
"""

import asyncio
import logging
from pathlib import Path
import tempfile
import time

# Add project root directory to Python path
import sys

sys.path.append(str(Path(__file__).parent.parent))

from raganything import RAGAnything, RAGAnythingConfig
from raganything.batch_parser import BatchParser


def create_sample_documents():
    """Create sample documents for batch processing testing"""
    temp_dir = Path(tempfile.mkdtemp())
    sample_files = []

    # Create various document types
    documents = {
        "document1.txt": "This is a simple text document for testing batch processing.",
        "document2.txt": "Another text document with different content.",
        "document3.md": """# Markdown Document

## Introduction
This is a markdown document for testing.

### Features
- Markdown formatting
- Code blocks
- Lists

```python
def example():
    return "Hello from markdown"
```
""",
        "report.txt": """Business Report

Executive Summary:
This report demonstrates batch processing capabilities.

Key Findings:
1. Parallel processing improves throughput
2. Progress tracking enhances user experience
3. Error handling ensures reliability

Conclusion:
Batch processing is essential for large-scale document processing.
""",
        "notes.md": """# Meeting Notes

## Date: 2024-01-15

### Attendees
- Alice Johnson
- Bob Smith
- Carol Williams

### Discussion Topics
1. **Batch Processing Implementation**
   - Parallel document processing
   - Progress tracking
   - Error handling strategies

2. **Performance Metrics**
   - Target: 100 documents/hour
   - Memory usage: < 4GB
   - Success rate: > 95%

### Action Items
- [ ] Implement batch processing
- [ ] Add progress bars
- [ ] Test with large document sets
- [ ] Optimize memory usage

### Next Steps
Continue development and testing of batch processing features.
""",
    }

    # Create files
    for filename, content in documents.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        sample_files.append(str(file_path))

    return sample_files, temp_dir


def demonstrate_basic_batch_processing():
    """Demonstrate basic batch processing functionality"""
    print("\n" + "=" * 60)
    print("BASIC BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
        for file_path in sample_files:
            print(f"  - {Path(file_path).name}")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=3,
            show_progress=True,
            timeout_per_file=60,
            skip_installation_check=True,  # Skip installation check for demo
        )

        print("\nBatch parser configured:")
        print("  - Parser type: mineru")
        print("  - Max workers: 3")
        print("  - Progress tracking: enabled")
        print("  - Timeout per file: 60 seconds")

        # Check supported extensions
        supported_extensions = batch_parser.get_supported_extensions()
        print(f"  - Supported extensions: {supported_extensions}")

        # Filter files to supported types
        supported_files = batch_parser.filter_supported_files(sample_files)
        print("\nFile filtering results:")
        print(f"  - Total files: {len(sample_files)}")
        print(f"  - Supported files: {len(supported_files)}")

        # Process batch
        output_dir = temp_dir / "batch_output"
        print("\nStarting batch processing...")
        print(f"Output directory: {output_dir}")

        start_time = time.time()
        result = batch_parser.process_batch(
            file_paths=supported_files,
            output_dir=str(output_dir),
            parse_method="auto",
            recursive=False,
        )
        processing_time = time.time() - start_time

        # Display results
        print("\n" + "-" * 40)
        print("BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Total processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
            print("\nSuccessfully processed files:")
            for file_path in result.successful_files:
                print(f"  ✅ {Path(file_path).name}")

        if result.failed_files:
            print("\nFailed files:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f"  ❌ {Path(file_path).name}: {error}")

        return result

    except Exception as e:
        print(f"❌ Batch processing demonstration failed: {str(e)}")
        return None


async def demonstrate_async_batch_processing():
    """Demonstrate asynchronous batch processing"""
    print("\n" + "=" * 60)
    print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Processing {len(sample_files)} documents asynchronously...")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        # Process batch asynchronously
        output_dir = temp_dir / "async_output"

        start_time = time.time()
        result = await batch_parser.process_batch_async(
            file_paths=sample_files,
            output_dir=str(output_dir),
            parse_method="auto",
            recursive=False,
        )
        processing_time = time.time() - start_time

        # Display results
        print("\n" + "-" * 40)
        print("ASYNC BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Async processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")

        return result

    except Exception as e:
        print(f"❌ Async batch processing demonstration failed: {str(e)}")
        return None


async def demonstrate_rag_integration():
    """Demonstrate batch processing integration with RAG-Anything"""
    print("\n" + "=" * 60)
    print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        # Initialize RAG-Anything with temporary storage
        config = RAGAnythingConfig(
            working_dir=str(temp_dir / "rag_storage"),
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            max_concurrent_files=2,
        )

        rag = RAGAnything(config=config)

        print("RAG-Anything initialized with batch processing capabilities")

        # Show available batch methods
        batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
        print(f"Available batch methods: {batch_methods}")

        # Demonstrate batch processing with RAG integration
        print(f"\nProcessing {len(sample_files)} documents with RAG integration...")

        # Use the RAG-integrated batch processing
        try:
            # Process documents in batch
            result = rag.process_documents_batch(
                file_paths=sample_files,
                output_dir=str(temp_dir / "rag_batch_output"),
                max_workers=2,
                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("RAG BATCH PROCESSING RESULTS")
            print("-" * 40)
            print(result.summary())
            print(f"Success rate: {result.success_rate:.1f}%")

            # Demonstrate batch processing with full RAG integration
            print("\nProcessing documents with full RAG integration...")

            rag_result = await rag.process_documents_with_rag_batch(
                file_paths=sample_files[:2],  # Process subset for demo
                output_dir=str(temp_dir / "rag_full_output"),
                max_workers=1,
                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("FULL RAG INTEGRATION RESULTS")
            print("-" * 40)
            print(f"Parse result: {rag_result['parse_result'].summary()}")
            print(
                f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
            )
            print(
                f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
            )
            print(f"Failed RAG processing: {rag_result['failed_rag_files']}")

            return rag_result

        except Exception as e:
            print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
            print(
                "Note: This is expected in environments without full API configuration"
            )
            return None

    except Exception as e:
        print(f"❌ RAG integration demonstration failed: {str(e)}")
        return None


def demonstrate_directory_processing():
    """Demonstrate processing entire directories"""
    print("\n" + "=" * 60)
    print("DIRECTORY PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create a directory structure with nested files
    temp_dir = Path(tempfile.mkdtemp())

    # Create main directory files
    main_files = {
        "overview.txt": "Main directory overview document",
        "readme.md": "# Project README\n\nThis is the main project documentation.",
    }

    # Create subdirectory
    sub_dir = temp_dir / "subdirectory"
    sub_dir.mkdir()

    sub_files = {
        "details.txt": "Detailed information in subdirectory",
        "notes.md": "# Notes\n\nAdditional notes and information.",
    }

    # Write all files
    all_files = []
    for filename, content in main_files.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    for filename, content in sub_files.items():
        file_path = sub_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    try:
        print("Created directory structure:")
        print(f"  Main directory: {temp_dir}")
        print(f"  Files in main: {list(main_files.keys())}")
        print(f"  Subdirectory: {sub_dir}")
        print(f"  Files in sub: {list(sub_files.keys())}")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        # Process entire directory recursively
        print("\nProcessing entire directory recursively...")

        result = batch_parser.process_batch(
            file_paths=[str(temp_dir)],  # Pass directory path
            output_dir=str(temp_dir / "directory_output"),
            parse_method="auto",
            recursive=True,  # Include subdirectories
        )

        print("\n" + "-" * 40)
        print("DIRECTORY PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Total files found and processed: {result.total_files}")
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
            print("\nSuccessfully processed:")
            for file_path in result.successful_files:
                relative_path = Path(file_path).relative_to(temp_dir)
                print(f"  ✅ {relative_path}")

        return result

    except Exception as e:
        print(f"❌ Directory processing demonstration failed: {str(e)}")
        return None


def demonstrate_error_handling():
    """Demonstrate error handling and recovery"""
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)

    temp_dir = Path(tempfile.mkdtemp())

    # Create files with various issues
    files_with_issues = {
        "valid_file.txt": "This is a valid file that should process successfully.",
        "empty_file.txt": "",  # Empty file
        "large_file.txt": "x" * 1000000,  # Large file (1MB of 'x')
    }

    created_files = []
    for filename, content in files_with_issues.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        created_files.append(str(file_path))

    # Add a non-existent file to the list
    created_files.append(str(temp_dir / "non_existent_file.txt"))

    try:
        print(f"Testing error handling with {len(created_files)} files:")
        for file_path in created_files:
            name = Path(file_path).name
            exists = Path(file_path).exists()
            size = Path(file_path).stat().st_size if exists else 0
            print(f"  - {name}: {'exists' if exists else 'missing'}, {size} bytes")

        # Create batch parser with short timeout for demonstration
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            timeout_per_file=30,  # Short timeout for demo
            skip_installation_check=True,
        )

        # Process files and handle errors
        result = batch_parser.process_batch(
            file_paths=created_files,
            output_dir=str(temp_dir / "error_test_output"),
            parse_method="auto",
        )

        print("\n" + "-" * 40)
        print("ERROR HANDLING RESULTS")
        print("-" * 40)
        print(result.summary())

        if result.successful_files:
            print("\nSuccessful files:")
            for file_path in result.successful_files:
                print(f"  ✅ {Path(file_path).name}")

        if result.failed_files:
            print("\nFailed files with error details:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f"  ❌ {Path(file_path).name}: {error}")

        # Demonstrate retry logic
        if result.failed_files:
            print(
                f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
            )

            # Retry only the failed files
            retry_result = batch_parser.process_batch(
                file_paths=result.failed_files,
                output_dir=str(temp_dir / "retry_output"),
                parse_method="auto",
            )

            print(f"Retry results: {retry_result.summary()}")

        return result

    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None


async def main():
    """Main demonstration function"""
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Batch Processing Demonstration")
    print("=" * 70)
    print("This example demonstrates various batch processing capabilities:")
    print("  - Basic batch processing with progress tracking")
    print("  - Asynchronous processing for improved performance")
    print("  - Integration with RAG-Anything pipeline")
    print("  - Directory processing with recursive file discovery")
    print("  - Comprehensive error handling and recovery")

    results = {}

    # Run demonstrations
    print("\n🚀 Starting demonstrations...")

    # Basic batch processing
    results["basic"] = demonstrate_basic_batch_processing()

    # Asynchronous processing
    results["async"] = await demonstrate_async_batch_processing()

    # RAG integration
    results["rag"] = await demonstrate_rag_integration()

    # Directory processing
    results["directory"] = demonstrate_directory_processing()

    # Error handling
    results["error_handling"] = demonstrate_error_handling()

    # Summary
    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)

    for demo_name, result in results.items():
        if result:
            if hasattr(result, "success_rate"):
                print(
                    f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate"
                )
            else:
                print(f"✅ {demo_name.upper()}: Completed successfully")
        else:
            print(f"❌ {demo_name.upper()}: Failed or had limitations")

    print("\n📊 Key Features Demonstrated:")
    print("  - Parallel document processing with configurable worker counts")
    print("  - Real-time progress tracking with tqdm progress bars")
    print("  - Comprehensive error handling and reporting")
    print("  - File filtering based on supported document types")
    print("  - Directory processing with recursive file discovery")
    print("  - Asynchronous processing for improved performance")
    print("  - Integration with RAG-Anything document pipeline")
    print("  - Retry logic for failed documents")
    print("  - Detailed processing statistics and timing")

    print("\n💡 Best Practices Highlighted:")
    print("  - Use appropriate worker counts for your system")
    print("  - Enable progress tracking for long-running operations")
    print("  - Handle errors gracefully with retry mechanisms")
    print("  - Filter files to supported types before processing")
    print("  - Set reasonable timeouts for document processing")
    print("  - Use skip_installation_check for environments with conflicts")


if __name__ == "__main__":
    asyncio.run(main())
1055
examples/enhanced_markdown_example.py
Normal file
File diff suppressed because it is too large
@@ -14,6 +14,7 @@ Usage:
"""

import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -51,7 +52,7 @@ def get_image_info(image_path: Path):
        return {"error": str(e)}


def test_image_format_parsing(file_path: str):
async def test_image_format_parsing(file_path: str):
    """Test image format parsing with MinerU"""

    print(f"🧪 Testing image format parsing: {file_path}")
@@ -101,12 +102,12 @@ def test_image_format_parsing(file_path: str):
    print(f"✅ Format {file_path.suffix.upper()} is natively supported by MinerU")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything(working_dir="./temp_parsing_test")
    rag = RAGAnything()

    try:
        # Test image parsing with MinerU
        print("\n🔄 Testing image parsing with MinerU...")
        content_list, md_content = rag.parse_document(
        content_list, md_content = await rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="ocr",  # Images use OCR method
@@ -147,10 +148,9 @@ def test_image_format_parsing(file_path: str):
        print(f"\n🖼️ Found {len(image_items)} processed image(s):")
        for i, item in enumerate(image_items, 1):
            print(f"  {i}. Image path: {item.get('img_path', 'N/A')}")
            if item.get("img_caption"):
                print(
                    f"    Caption: {item.get('img_caption', [])[0] if item.get('img_caption') else 'N/A'}"
                )
            caption = item.get("image_caption", item.get("img_caption", []))
            if caption:
                print(f"    Caption: {caption[0] if caption else 'N/A'}")

        # Display text blocks (OCR results)
        text_items = [
@@ -196,7 +196,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Test image format parsing with MinerU"
    )
    parser.add_argument("--file", required=True, help="Path to the image file to test")
    parser.add_argument("--file", help="Path to the image file to test")
    parser.add_argument(
        "--check-pillow", action="store_true", help="Only check PIL/Pillow installation"
    )
@@ -212,9 +212,15 @@ def main():
        print("✅ PIL/Pillow installation check passed!")
        return 0

    # If not just checking dependencies, file argument is required
    if not args.file:
        print("❌ Error: --file argument is required when not using --check-pillow")
        parser.print_help()
        return 1

    # Run the parsing test
    try:
        success = test_image_format_parsing(args.file)
        success = asyncio.run(test_image_format_parsing(args.file))
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
422
examples/insert_content_list_example.py
Normal file
@@ -0,0 +1,422 @@
#!/usr/bin/env python
"""
Example script demonstrating direct content list insertion with RAGAnything

This example shows how to:
1. Create a simple content list with different content types
2. Insert content list directly without document parsing using insert_content_list() method
3. Perform pure text queries using aquery() method
4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
5. Handle different types of multimodal content in the inserted knowledge base
"""

import os
import argparse
import asyncio
import logging
import logging.config
from pathlib import Path

# Add project root directory to Python path
import sys

sys.path.append(str(Path(__file__).parent.parent))

from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
from raganything import RAGAnything, RAGAnythingConfig

from dotenv import load_dotenv

load_dotenv(dotenv_path=".env", override=False)


def configure_logging():
    """Configure logging for the application"""
    # Get log directory path from environment variable or use current directory
    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(
        os.path.join(log_dir, "insert_content_list_example.log")
    )

    print(f"\nInsert Content List example log file: {log_file_path}\n")
    os.makedirs(os.path.dirname(log_dir), exist_ok=True)

    # Get log file max size and backup count from environment variables
    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )

    # Set the logger level to INFO
    logger.setLevel(logging.INFO)
    # Enable verbose debug if needed
    set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")


def create_sample_content_list():
    """
    Create a simple content list for testing insert_content_list functionality

    Returns:
        List[Dict]: Sample content list with various content types

    Note:
        - img_path should be absolute path to the image file
        - page_idx represents the page number where the content appears (0-based)
    """
    content_list = [
        # Introduction text
        {
            "type": "text",
            "text": "Welcome to the RAGAnything System Documentation. This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.",
            "page_idx": 0,  # Page number where this content appears
        },
        # System architecture image
        {
            "type": "image",
            "img_path": "/absolute/path/to/system_architecture.jpg",  # IMPORTANT: Use absolute path to image file
            "image_caption": ["Figure 1: RAGAnything System Architecture"],
            "image_footnote": [
                "The architecture shows the complete pipeline from document parsing to multimodal query processing"
            ],
            "page_idx": 1,  # Page number where this image appears
        },
        # Performance comparison table
        {
            "type": "table",
            "table_body": """| System | Accuracy | Processing Speed | Memory Usage |
|--------|----------|------------------|--------------|
| RAGAnything | 95.2% | 120ms | 2.1GB |
| Traditional RAG | 87.3% | 180ms | 3.2GB |
| Baseline System | 82.1% | 220ms | 4.1GB |
| Simple Retrieval | 76.5% | 95ms | 1.8GB |""",
            "table_caption": [
                "Table 1: Performance Comparison of Different RAG Systems"
            ],
            "table_footnote": [
                "All tests conducted on the same hardware with identical test datasets"
            ],
            "page_idx": 2,  # Page number where this table appears
        },
        # Mathematical formula
        {
            "type": "equation",
            "latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i",
            "text": "Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance factors",
            "page_idx": 3,  # Page number where this equation appears
        },
        # Feature description
        {
            "type": "text",
            "text": "The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.",
            "page_idx": 4,  # Page number where this content appears
        },
        # Technical specifications table
        {
            "type": "table",
            "table_body": """| Feature | Specification |
|---------|---------------|
| Supported Formats | PDF, DOCX, PPTX, XLSX, Images |
| Max Document Size | 100MB |
| Concurrent Processing | Up to 8 documents |
| Query Response Time | <200ms average |
| Knowledge Graph Nodes | Up to 1M entities |""",
            "table_caption": ["Table 2: Technical Specifications"],
            "table_footnote": [
                "Specifications may vary based on hardware configuration"
            ],
            "page_idx": 5,  # Page number where this table appears
        },
        # Conclusion
        {
            "type": "text",
            "text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.",
            "page_idx": 6,  # Page number where this content appears
        },
    ]

    return content_list


async def demo_insert_content_list(
    api_key: str,
    base_url: str = None,
    working_dir: str = None,
):
    """
    Demonstrate content list insertion and querying with RAGAnything

    Args:
        api_key: OpenAI API key
        base_url: Optional base URL for API
        working_dir: Working directory for RAG storage
    """
    try:
        # Create RAGAnything configuration
        config = RAGAnythingConfig(
            working_dir=working_dir or "./rag_storage",
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            display_content_stats=True,  # Show content statistics
        )

        # Define LLM model function
        def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
            return openai_complete_if_cache(
                "gpt-4o-mini",
                prompt,
                system_prompt=system_prompt,
                history_messages=history_messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )

        # Define vision model function for image processing
        def vision_model_func(
            prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
        ):
            if image_data:
                return openai_complete_if_cache(
                    "gpt-4o",
                    "",
                    system_prompt=None,
                    history_messages=[],
                    messages=[
                        {"role": "system", "content": system_prompt}
                        if system_prompt
                        else None,
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{image_data}"
                                    },
                                },
                            ],
                        }
                        if image_data
                        else {"role": "user", "content": prompt},
                    ],
                    api_key=api_key,
                    base_url=base_url,
                    **kwargs,
                )
            else:
                return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

        # Define embedding function - using environment variables for configuration
        embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
        embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

        embedding_func = EmbeddingFunc(
            embedding_dim=embedding_dim,
            max_token_size=8192,
            func=lambda texts: openai_embed(
                texts,
                model=embedding_model,
                api_key=api_key,
                base_url=base_url,
            ),
        )

        # Initialize RAGAnything
        rag = RAGAnything(
            config=config,
            llm_model_func=llm_model_func,
            vision_model_func=vision_model_func,
            embedding_func=embedding_func,
        )

        # Create sample content list
        logger.info("Creating sample content list...")
        content_list = create_sample_content_list()
        logger.info(f"Created content list with {len(content_list)} items")

        # Insert content list directly
        logger.info("\nInserting content list into RAGAnything...")
        await rag.insert_content_list(
            content_list=content_list,
            file_path="raganything_documentation.pdf",  # Reference file name for citation
            split_by_character=None,  # Optional text splitting
            split_by_character_only=False,  # Optional text splitting mode
            doc_id="demo-doc-001",  # Custom document ID
            display_stats=True,  # Show content statistics
        )
        logger.info("Content list insertion completed!")

        # Example queries - demonstrating different query approaches
        logger.info("\nQuerying inserted content:")

        # 1. Pure text queries using aquery()
        text_queries = [
            "What is RAGAnything and what are its main features?",
            "How does RAGAnything compare to traditional RAG systems?",
            "What are the technical specifications of the system?",
        ]

        for query in text_queries:
            logger.info(f"\n[Text Query]: {query}")
            result = await rag.aquery(query, mode="hybrid")
            logger.info(f"Answer: {result}")

        # 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
        logger.info(
            "\n[Multimodal Query]: Analyzing new performance data against existing benchmarks"
        )
        multimodal_result = await rag.aquery_with_multimodal(
            "Compare this new performance data with the existing benchmark results in the documentation",
            multimodal_content=[
                {
                    "type": "table",
                    "table_data": """Method,Accuracy,Speed,Memory
New_Approach,97.1%,110ms,1.9GB
Enhanced_RAG,91.4%,140ms,2.5GB""",
                    "table_caption": "Latest experimental results",
                }
            ],
            mode="hybrid",
        )
        logger.info(f"Answer: {multimodal_result}")

        # 3. Another multimodal query with equation content
        logger.info("\n[Multimodal Query]: Mathematical formula analysis")
        equation_result = await rag.aquery_with_multimodal(
            "How does this similarity formula relate to the relevance scoring mentioned in the documentation?",
            multimodal_content=[
                {
                    "type": "equation",
                    "latex": "sim(a, b) = \\frac{a \\cdot b}{||a|| \\times ||b||} + \\beta \\cdot context\\_weight",
                    "equation_caption": "Enhanced cosine similarity with context weighting",
                }
            ],
            mode="hybrid",
        )
        logger.info(f"Answer: {equation_result}")

        # 4. Insert another content list with different document ID
        logger.info("\nInserting additional content list...")
        additional_content = [
            {
                "type": "text",
                "text": "This is additional documentation about advanced features and configuration options.",
                "page_idx": 0,  # Page number where this content appears
            },
            {
                "type": "table",
                "table_body": """| Configuration | Default Value | Range |
|---------------|---------------|-------|
| Chunk Size | 512 tokens | 128-2048 |
| Context Window | 4096 tokens | 1024-8192 |
| Batch Size | 32 | 1-128 |""",
                "table_caption": ["Advanced Configuration Parameters"],
                "page_idx": 1,  # Page number where this table appears
            },
        ]

        await rag.insert_content_list(
            content_list=additional_content,
            file_path="advanced_configuration.pdf",
            doc_id="demo-doc-002",  # Different document ID
        )

        # Query combined knowledge base
        logger.info("\n[Combined Query]: What configuration options are available?")
        combined_result = await rag.aquery(
            "What configuration options are available and what are their default values?",
            mode="hybrid",
        )
        logger.info(f"Answer: {combined_result}")

    except Exception as e:
        logger.error(f"Error in content list insertion demo: {str(e)}")
        import traceback

        logger.error(traceback.format_exc())


def main():
    """Main function to run the example"""
    parser = argparse.ArgumentParser(description="Insert Content List Example")
    parser.add_argument(
        "--working_dir", "-w", default="./rag_storage", help="Working directory path"
    )
    parser.add_argument(
        "--api-key",
        default=os.getenv("LLM_BINDING_API_KEY"),
        help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
    )
    parser.add_argument(
        "--base-url",
        default=os.getenv("LLM_BINDING_HOST"),
        help="Optional base URL for API",
    )

    args = parser.parse_args()

    # Check if API key is provided
    if not args.api_key:
        logger.error("Error: OpenAI API key is required")
        logger.error("Set api key environment variable or use --api-key option")
        return

    # Run the demo
    asyncio.run(
        demo_insert_content_list(
            args.api_key,
            args.base_url,
            args.working_dir,
        )
    )


if __name__ == "__main__":
    # Configure logging first
    configure_logging()

    print("RAGAnything Insert Content List Example")
    print("=" * 45)
    print("Demonstrating direct content list insertion without document parsing")
    print("=" * 45)

    main()
334
examples/lmstudio_integration_example.py
Normal file
@@ -0,0 +1,334 @@
"""
|
||||
LM Studio Integration Example with RAG-Anything
|
||||
|
||||
This example demonstrates how to integrate LM Studio with RAG-Anything for local
|
||||
text document processing and querying.
|
||||
|
||||
Requirements:
|
||||
- LM Studio running locally with server enabled
|
||||
- OpenAI Python package: pip install openai
|
||||
- RAG-Anything installed: pip install raganything
|
||||
|
||||
Environment Setup:
|
||||
Create a .env file with:
|
||||
LLM_BINDING=lmstudio
|
||||
LLM_MODEL=openai/gpt-oss-20b
|
||||
LLM_BINDING_HOST=http://localhost:1234/v1
|
||||
LLM_BINDING_API_KEY=lm-studio
|
||||
EMBEDDING_BINDING=lmstudio
|
||||
EMBEDDING_MODEL=text-embedding-nomic-embed-text-v1.5
|
||||
EMBEDDING_BINDING_HOST=http://localhost:1234/v1
|
||||
EMBEDDING_BINDING_API_KEY=lm-studio
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import asyncio
|
||||
from typing import List, Dict, Optional
|
||||
from dotenv import load_dotenv
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# RAG-Anything imports
|
||||
from raganything import RAGAnything, RAGAnythingConfig
|
||||
from lightrag.utils import EmbeddingFunc
|
||||
from lightrag.llm.openai import openai_complete_if_cache
|
||||
|
||||
LM_BASE_URL = os.getenv("LLM_BINDING_HOST", "http://localhost:1234/v1")
|
||||
LM_API_KEY = os.getenv("LLM_BINDING_API_KEY", "lm-studio")
|
||||
LM_MODEL_NAME = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
|
||||
LM_EMBED_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-nomic-embed-text-v1.5")
|
||||
|
||||
|
||||
async def lmstudio_llm_model_func(
|
||||
prompt: str,
|
||||
system_prompt: Optional[str] = None,
|
||||
history_messages: List[Dict] = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""Top-level LLM function for LightRAG (pickle-safe)."""
|
||||
return await openai_complete_if_cache(
|
||||
model=LM_MODEL_NAME,
|
||||
prompt=prompt,
|
||||
system_prompt=system_prompt,
|
||||
history_messages=history_messages or [],
|
||||
base_url=LM_BASE_URL,
|
||||
api_key=LM_API_KEY,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
async def lmstudio_embedding_async(texts: List[str]) -> List[List[float]]:
|
||||
"""Top-level embedding function for LightRAG (pickle-safe)."""
|
||||
from lightrag.llm.openai import openai_embed
|
||||
|
||||
embeddings = await openai_embed(
|
||||
texts=texts,
|
||||
model=LM_EMBED_MODEL,
|
||||
base_url=LM_BASE_URL,
|
||||
api_key=LM_API_KEY,
|
||||
)
|
||||
return embeddings.tolist()
|
||||
|
||||
|
||||
class LMStudioRAGIntegration:
|
||||
"""Integration class for LM Studio with RAG-Anything."""
|
||||
|
||||
def __init__(self):
|
||||
# LM Studio configuration using standard LLM_BINDING variables
|
||||
self.base_url = os.getenv("LLM_BINDING_HOST", "http://localhost:1234/v1")
|
||||
self.api_key = os.getenv("LLM_BINDING_API_KEY", "lm-studio")
|
||||
self.model_name = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
|
||||
self.embedding_model = os.getenv(
|
||||
"EMBEDDING_MODEL", "text-embedding-nomic-embed-text-v1.5"
|
||||
)
|
||||
|
||||
# RAG-Anything configuration
|
||||
# Use a fresh working directory each run to avoid legacy doc_status schema conflicts
|
||||
self.config = RAGAnythingConfig(
|
||||
working_dir=f"./rag_storage_lmstudio/{uuid.uuid4()}",
|
||||
parser="mineru",
|
||||
parse_method="auto",
|
||||
enable_image_processing=False,
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
)
|
||||
print(f"📁 Using working_dir: {self.config.working_dir}")
|
||||
|
||||
self.rag = None
|
||||
|
||||
async def test_connection(self) -> bool:
|
||||
"""Test LM Studio connection."""
|
||||
try:
|
||||
print(f"🔌 Testing LM Studio connection at: {self.base_url}")
|
||||
client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
|
||||
models = await client.models.list()
|
||||
print(f"✅ Connected successfully! Found {len(models.data)} models")
|
||||
|
||||
# Show available models
|
||||
print("📊 Available models:")
|
||||
for i, model in enumerate(models.data[:5]):
|
||||
marker = "🎯" if model.id == self.model_name else " "
|
||||
print(f"{marker} {i+1}. {model.id}")
|
||||
|
||||
if len(models.data) > 5:
|
||||
print(f" ... and {len(models.data) - 5} more models")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"❌ Connection failed: {str(e)}")
|
||||
print("\n💡 Troubleshooting tips:")
|
||||
print("1. Ensure LM Studio is running")
|
||||
print("2. Start the local server in LM Studio")
|
||||
print("3. Load a model or enable just-in-time loading")
|
||||
print(f"4. Verify server address: {self.base_url}")
|
||||
return False
|
||||
finally:
|
||||
try:
|
||||
await client.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def test_chat_completion(self) -> bool:
|
||||
"""Test basic chat functionality."""
|
||||
try:
|
||||
print(f"💬 Testing chat with model: {self.model_name}")
|
||||
client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
|
||||
response = await client.chat.completions.create(
|
||||
model=self.model_name,
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful AI assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello! Please confirm you're working and tell me your capabilities.",
|
||||
},
|
||||
],
|
||||
max_tokens=100,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
result = response.choices[0].message.content.strip()
|
||||
print("✅ Chat test successful!")
|
||||
print(f"Response: {result}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"❌ Chat test failed: {str(e)}")
|
||||
return False
|
||||
finally:
|
||||
try:
|
||||
await client.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Deprecated factory helpers removed to reduce redundancy
|
||||
|
||||
def embedding_func_factory(self):
|
||||
"""Create a completely serializable embedding function."""
|
||||
return EmbeddingFunc(
|
||||
embedding_dim=768, # nomic-embed-text-v1.5 default dimension
|
||||
max_token_size=8192, # nomic-embed-text-v1.5 context length
|
||||
func=lmstudio_embedding_async,
|
||||
)
|
||||
|
||||
async def initialize_rag(self):
|
||||
"""Initialize RAG-Anything with LM Studio functions."""
|
||||
print("Initializing RAG-Anything with LM Studio...")
|
||||
|
||||
try:
|
||||
self.rag = RAGAnything(
|
||||
config=self.config,
|
||||
llm_model_func=lmstudio_llm_model_func,
|
||||
embedding_func=self.embedding_func_factory(),
|
||||
)
|
||||
|
||||
# Compatibility: avoid writing unknown field 'multimodal_processed' to LightRAG doc_status
|
||||
# Older LightRAG versions may not accept this extra field in DocProcessingStatus
|
||||
async def _noop_mark_multimodal(doc_id: str):
|
||||
return None
|
||||
|
||||
self.rag._mark_multimodal_processing_complete = _noop_mark_multimodal
|
||||
|
||||
print("✅ RAG-Anything initialized successfully!")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"❌ RAG initialization failed: {str(e)}")
|
||||
return False
|
||||
|
||||
async def process_document_example(self, file_path: str):
|
||||
"""Example: Process a document with LM Studio backend."""
|
||||
if not self.rag:
|
||||
print("❌ RAG not initialized. Call initialize_rag() first.")
|
||||
return
|
||||
|
||||
try:
|
||||
print(f"📄 Processing document: {file_path}")
|
||||
await self.rag.process_document_complete(
|
||||
file_path=file_path,
|
||||
output_dir="./output_lmstudio",
|
||||
parse_method="auto",
|
||||
display_stats=True,
|
||||
)
|
||||
print("✅ Document processing completed!")
|
||||
except Exception as e:
|
||||
print(f"❌ Document processing failed: {str(e)}")
|
||||
|
||||
async def query_examples(self):
|
||||
"""Example queries with different modes."""
|
||||
if not self.rag:
|
||||
print("❌ RAG not initialized. Call initialize_rag() first.")
|
||||
return
|
||||
|
||||
# Example queries
|
||||
queries = [
|
||||
("What are the main topics in the processed documents?", "hybrid"),
|
||||
("Summarize any tables or data found in the documents", "local"),
|
||||
("What images or figures are mentioned?", "global"),
|
||||
]
|
||||
|
||||
print("\n🔍 Running example queries...")
|
||||
for query, mode in queries:
|
||||
try:
|
||||
print(f"\nQuery ({mode}): {query}")
|
||||
result = await self.rag.aquery(query, mode=mode)
|
||||
print(f"Answer: {result[:200]}...")
|
||||
except Exception as e:
|
||||
print(f"❌ Query failed: {str(e)}")
|
||||
|
||||
async def simple_query_example(self):
|
||||
"""Example basic text query with sample content."""
|
||||
if not self.rag:
|
||||
print("❌ RAG not initialized")
|
||||
return
|
||||
|
||||
try:
|
||||
print("\nAdding sample content for testing...")
|
||||
|
||||
# Create content list in the format expected by RAGAnything
|
||||
content_list = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": """LM Studio Integration with RAG-Anything
|
||||
|
||||
This integration demonstrates how to connect LM Studio's local AI models with RAG-Anything's document processing capabilities. The system uses:
|
||||
|
||||
- LM Studio for local LLM inference
|
||||
- nomic-embed-text-v1.5 for embeddings (768 dimensions)
|
||||
- RAG-Anything for document processing and retrieval
|
||||
|
||||
Key benefits include:
|
||||
- Privacy: All processing happens locally
|
||||
- Performance: Direct API access to local models
|
||||
- Flexibility: Support for various document formats
|
||||
- Cost-effective: No external API usage""",
|
||||
"page_idx": 0,
|
||||
}
|
||||
]
|
||||
|
||||
# Insert the content list using the correct method
|
||||
await self.rag.insert_content_list(
|
||||
content_list=content_list,
|
||||
file_path="lmstudio_integration_demo.txt",
|
||||
# Use a unique doc_id to avoid collisions and doc_status reuse across runs
|
||||
doc_id=f"demo-content-{uuid.uuid4()}",
|
||||
display_stats=True,
|
||||
)
|
||||
print("✅ Sample content added to knowledge base")
|
||||
|
||||
print("\nTesting basic text query...")
|
||||
|
||||
# Simple text query example
|
||||
result = await self.rag.aquery(
|
||||
"What are the key benefits of this LM Studio integration?",
|
||||
mode="hybrid",
|
||||
)
|
||||
print(f"✅ Query result: {result[:300]}...")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Query failed: {str(e)}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Main example function."""
|
||||
print("=" * 70)
|
||||
print("LM Studio + RAG-Anything Integration Example")
|
||||
print("=" * 70)
|
||||
|
||||
# Initialize integration
|
||||
integration = LMStudioRAGIntegration()
|
||||
|
||||
# Test connection
|
||||
if not await integration.test_connection():
|
||||
return False
|
||||
|
||||
print()
|
||||
if not await integration.test_chat_completion():
|
||||
return False
|
||||
|
||||
# Initialize RAG
|
||||
print("\n" + "─" * 50)
|
||||
if not await integration.initialize_rag():
|
||||
return False
|
||||
|
||||
# Example document processing (uncomment and provide a real file path)
|
||||
# await integration.process_document_example("path/to/your/document.pdf")
|
||||
|
||||
# Example queries (uncomment after processing documents)
|
||||
# await integration.query_examples()
|
||||
|
||||
# Example basic query
|
||||
await integration.simple_query_example()
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("Integration example completed successfully!")
|
||||
print("=" * 70)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("🚀 Starting LM Studio integration example...")
|
||||
success = asyncio.run(main())
|
||||
|
||||
exit(0 if success else 1)
|
||||
@@ -91,12 +91,12 @@ async def process_image_example(lightrag: LightRAG, vision_model_func):

# Prepare image content
image_content = {
"img_path": "image.jpg",
"img_caption": ["Example image caption"],
"img_footnote": ["Example image footnote"],
"image_caption": ["Example image caption"],
"image_footnote": ["Example image footnote"],
}

# Process image
description, entity_info = await image_processor.process_multimodal_content(
(description, entity_info, _) = await image_processor.process_multimodal_content(
modal_content=image_content,
content_type="image",
file_path="image_example.jpg",
@@ -128,7 +128,7 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
}

# Process table
description, entity_info = await table_processor.process_multimodal_content(
(description, entity_info, _) = await table_processor.process_multimodal_content(
modal_content=table_content,
content_type="table",
file_path="table_example.md",
@@ -151,7 +151,7 @@ async def process_equation_example(lightrag: LightRAG, llm_model_func):
equation_content = {"text": "E = mc^2", "text_format": "LaTeX"}

# Process equation
description, entity_info = await equation_processor.process_multimodal_content(
(description, entity_info, _) = await equation_processor.process_multimodal_content(
modal_content=equation_content,
content_type="equation",
file_path="equation_example.txt",
@@ -164,14 +164,20 @@ async def process_equation_example(lightrag: LightRAG, llm_model_func):


async def initialize_rag(api_key: str, base_url: str = None):
# Use environment variables for embedding configuration
import os

embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

rag = LightRAG(
working_dir=WORKING_DIR,
embedding_func=EmbeddingFunc(
embedding_dim=3072,
embedding_dim=embedding_dim,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
model=embedding_model,
api_key=api_key,
base_url=base_url,
),

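A note on the hunk above: EMBEDDING_DIM has to agree with the vector width that EMBEDDING_MODEL actually returns, otherwise the vector index is built at the wrong dimension. A minimal sketch of validating that pairing before constructing LightRAG (the model-to-dimension table is illustrative, not from this diff):

import os

# Known dimensions for a few common models (768 matches the
# nomic-embed-text-v1.5 setup mentioned earlier in this changeset)
KNOWN_DIMS = {
    "text-embedding-3-large": 3072,
    "text-embedding-3-small": 1536,
    "nomic-embed-text-v1.5": 768,
}

embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
expected = KNOWN_DIMS.get(embedding_model)
if expected is not None and expected != embedding_dim:
    raise ValueError(
        f"EMBEDDING_DIM={embedding_dim} does not match {embedding_model} ({expected})"
    )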
@@ -14,6 +14,7 @@ Usage:
"""

import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -45,7 +46,7 @@ def check_libreoffice_installation():
return False


def test_office_document_parsing(file_path: str):
async def test_office_document_parsing(file_path: str):
"""Test Office document parsing with MinerU"""

print(f"🧪 Testing Office document parsing: {file_path}")
@@ -66,12 +67,12 @@ def test_office_document_parsing(file_path: str):
print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything(working_dir="./temp_parsing_test")
rag = RAGAnything()

try:
# Test document parsing with MinerU
print("\n🔄 Testing document parsing with MinerU...")
content_list, md_content = rag.parse_document(
content_list, md_content = await rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
parse_method="auto",
@@ -157,9 +158,7 @@ def main():
parser = argparse.ArgumentParser(
description="Test Office document parsing with MinerU"
)
parser.add_argument(
"--file", required=True, help="Path to the Office document to test"
)
parser.add_argument("--file", help="Path to the Office document to test")
parser.add_argument(
"--check-libreoffice",
action="store_true",
@@ -177,9 +176,17 @@ def main():
print("✅ LibreOffice installation check passed!")
return 0

# If not just checking dependencies, file argument is required
if not args.file:
print(
"❌ Error: --file argument is required when not using --check-libreoffice"
)
parser.print_help()
return 1

# Run the parsing test
try:
success = test_office_document_parsing(args.file)
success = asyncio.run(test_office_document_parsing(args.file))
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ Test interrupted by user")

@@ -3,9 +3,10 @@
Example script demonstrating the integration of MinerU parser with RAGAnything

This example shows how to:
1. Process parsed documents with RAGAnything
2. Perform multimodal queries on the processed documents
3. Handle different types of content (text, images, tables)
1. Process documents with RAGAnything using MinerU parser
2. Perform pure text queries using aquery() method
3. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
4. Handle different types of multimodal content (tables, equations) in queries
"""

import os
@@ -24,6 +25,10 @@ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
from raganything import RAGAnything, RAGAnythingConfig

from dotenv import load_dotenv

load_dotenv(dotenv_path=".env", override=False)


def configure_logging():
"""Configure logging for the application"""
@@ -87,6 +92,7 @@ async def process_with_rag(
api_key: str,
base_url: str = None,
working_dir: str = None,
parser: str = None,
):
"""
Process document with RAGAnything
@@ -102,7 +108,8 @@ async def process_with_rag(
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage",
mineru_parse_method="auto",
parser=parser, # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
@@ -122,9 +129,27 @@ async def process_with_rag(

# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt,
system_prompt=None,
history_messages=[],
image_data=None,
messages=None,
**kwargs,
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -153,16 +178,20 @@ async def process_with_rag(
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

# Define embedding function
# Define embedding function - using environment variables for configuration
embedding_dim = int(os.getenv("EMBEDDING_DIM", "3072"))
embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

embedding_func = EmbeddingFunc(
embedding_dim=3072,
embedding_dim=embedding_dim,
max_token_size=8192,
func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
model=embedding_model,
api_key=api_key,
base_url=base_url,
),
@@ -181,19 +210,55 @@ async def process_with_rag(
file_path=file_path, output_dir=output_dir, parse_method="auto"
)

# Example queries
queries = [
# Example queries - demonstrating different query approaches
logger.info("\nQuerying processed document:")

# 1. Pure text queries using aquery()
text_queries = [
"What is the main content of the document?",
"Describe the images and figures in the document",
"Tell me about the experimental results and data tables",
"What are the key topics discussed?",
]

logger.info("\nQuerying processed document:")
for query in queries:
logger.info(f"\nQuery: {query}")
result = await rag.query_with_multimodal(query, mode="hybrid")
for query in text_queries:
logger.info(f"\n[Text Query]: {query}")
result = await rag.aquery(query, mode="hybrid")
logger.info(f"Answer: {result}")

# 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
logger.info(
"\n[Multimodal Query]: Analyzing performance data in context of document"
)
multimodal_result = await rag.aquery_with_multimodal(
"Compare this performance data with any similar results mentioned in the document",
multimodal_content=[
{
"type": "table",
"table_data": """Method,Accuracy,Processing_Time
RAGAnything,95.2%,120ms
Traditional_RAG,87.3%,180ms
Baseline,82.1%,200ms""",
"table_caption": "Performance comparison results",
}
],
mode="hybrid",
)
logger.info(f"Answer: {multimodal_result}")

# 3. Another multimodal query with equation content
logger.info("\n[Multimodal Query]: Mathematical formula analysis")
equation_result = await rag.aquery_with_multimodal(
"Explain this formula and relate it to any mathematical concepts in the document",
multimodal_content=[
{
"type": "equation",
"latex": "F1 = 2 \\cdot \\frac{precision \\cdot recall}{precision + recall}",
"equation_caption": "F1-score calculation formula",
}
],
mode="hybrid",
)
logger.info(f"Answer: {equation_result}")

except Exception as e:
logger.error(f"Error processing with RAG: {str(e)}")
import traceback
@@ -213,17 +278,26 @@ def main():
)
parser.add_argument(
"--api-key",
default=os.getenv("OPENAI_API_KEY"),
help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
default=os.getenv("LLM_BINDING_API_KEY"),
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
)
parser.add_argument(
"--base-url",
default=os.getenv("LLM_BINDING_HOST"),
help="Optional base URL for API",
)
parser.add_argument(
"--parser",
default=os.getenv("PARSER", "mineru"),
help="Parser to use: mineru or docling (defaults to PARSER env var)",
)
parser.add_argument("--base-url", help="Optional base URL for API")

args = parser.parse_args()

# Check if API key is provided
if not args.api_key:
logger.error("Error: OpenAI API key is required")
logger.error("Set OPENAI_API_KEY environment variable or use --api-key option")
logger.error("Set the LLM_BINDING_API_KEY environment variable or use --api-key option")
return

# Create output directory if specified
@@ -233,7 +307,12 @@ def main():
# Process with RAG
asyncio.run(
process_with_rag(
args.file_path, args.output, args.api_key, args.base_url, args.working_dir
args.file_path,
args.output,
args.api_key,
args.base_url,
args.working_dir,
args.parser,
)
)


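For orientation, a minimal sketch of calling the updated entry point directly rather than through argparse (the file name, key, and paths are placeholders; argument order follows the call site in main() above):

import asyncio

asyncio.run(
    process_with_rag(
        "paper.pdf",      # file to process (placeholder)
        "./output",       # parsed-output directory
        "sk-...",         # API key (placeholder)
        None,             # base_url
        "./rag_storage",  # working_dir
        "mineru",         # parser
    )
)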
@@ -14,6 +14,7 @@ Usage:
"""

import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
@@ -34,7 +35,7 @@ def check_reportlab_installation():
return False


def test_text_format_parsing(file_path: str):
async def test_text_format_parsing(file_path: str):
"""Test text format parsing with MinerU"""

print(f"🧪 Testing text format parsing: {file_path}")
@@ -66,12 +67,12 @@ def test_text_format_parsing(file_path: str):
)

# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything(working_dir="./temp_parsing_test")
rag = RAGAnything()

try:
# Test text parsing with MinerU
print("\n🔄 Testing text parsing with MinerU...")
content_list, md_content = rag.parse_document(
content_list, md_content = await rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
parse_method="auto",
@@ -157,7 +158,7 @@ def test_text_format_parsing(file_path: str):
def main():
"""Main function"""
parser = argparse.ArgumentParser(description="Test text format parsing with MinerU")
parser.add_argument("--file", required=True, help="Path to the text file to test")
parser.add_argument("--file", help="Path to the text file to test")
parser.add_argument(
"--check-reportlab",
action="store_true",
@@ -175,9 +176,15 @@ def main():
print("✅ ReportLab installation check passed!")
return 0

# If not just checking dependencies, file argument is required
if not args.file:
print("❌ Error: --file argument is required when not using --check-reportlab")
parser.print_help()
return 1

# Run the parsing test
try:
success = test_text_format_parsing(args.file)
success = asyncio.run(test_text_format_parsing(args.file))
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ Test interrupted by user")

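The two test-script diffs above follow the same pattern: parse_document is now a coroutine, so the test functions become async and are driven by asyncio.run. A minimal sketch of the new call pattern, assuming a local sample.pdf (placeholder):

import asyncio

from raganything import RAGAnything

async def run_parse_test() -> bool:
    rag = RAGAnything()
    # parse_document is awaitable after this changeset
    content_list, md_content = await rag.parse_document(
        file_path="sample.pdf",  # placeholder input
        output_dir="./test_output",
        parse_method="auto",
    )
    return bool(content_list)

if __name__ == "__main__":
    exit(0 if asyncio.run(run_parse_test()) else 1)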
75
pyproject.toml
Normal file
@@ -0,0 +1,75 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "raganything"
dynamic = ["version"]
authors = [
{name = "Zirui Guo"}
]
description = "RAGAnything: All-in-One RAG System"
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.10"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"huggingface_hub",
"lightrag-hku",
"mineru[core]",
"tqdm",
]

[project.optional-dependencies]
image = ["Pillow>=10.0.0"]
text = ["reportlab>=4.0.0"]
office = [] # Requires LibreOffice (external program)
markdown = [
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0",
]
all = [
"Pillow>=10.0.0",
"reportlab>=4.0.0",
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0"
]

[project.urls]
Homepage = "https://github.com/HKUDS/RAG-Anything"
Documentation = "https://github.com/HKUDS/RAG-Anything"
Repository = "https://github.com/HKUDS/RAG-Anything"
Issues = "https://github.com/HKUDS/RAG-Anything/issues"

[tool.uv]
dev-dependencies = [
"pytest>=6.0",
"pytest-asyncio",
"black",
"isort",
"flake8",
"mypy",
"openai",
"python-dotenv",
]

[tool.setuptools.packages.find]
include = ["raganything*"]

[tool.setuptools]
include-package-data = true

[tool.setuptools.dynamic]
version = {attr = "raganything.__version__"}

[tool.ruff]
target-version = "py310"
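With this packaging in place, optional features install as extras; for example (a usage note, not part of the diff):

pip install "raganything[all]"  # pulls in Pillow, reportlab, markdown, weasyprint, pygments

The office extra is intentionally empty because it depends on LibreOffice, an external program, as the inline comment above notes.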
@@ -1,7 +1,7 @@
from .raganything import RAGAnything as RAGAnything
from .raganything import RAGAnythingConfig as RAGAnythingConfig
from .config import RAGAnythingConfig as RAGAnythingConfig

__version__ = "1.1.0"
__version__ = "1.2.8"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/RAG-Anything"


12
raganything/base.py
Normal file
@@ -0,0 +1,12 @@
from enum import Enum


class DocStatus(str, Enum):
"""Document processing status"""

READY = "ready"
HANDLING = "handling"
PENDING = "pending"
PROCESSING = "processing"
PROCESSED = "processed"
FAILED = "failed"
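Since DocStatus subclasses both str and Enum, its members behave as plain strings wherever they are stored or serialized. A small illustrative sketch (not part of the diff):

from raganything.base import DocStatus

status = DocStatus.PROCESSED
assert status == "processed"  # str subclass: compares equal to its value
assert DocStatus("failed") is DocStatus.FAILED  # round-trips from stored strings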
386
raganything/batch.py
Normal file
@@ -0,0 +1,386 @@
"""
Batch processing functionality for RAGAnything

Contains methods for processing multiple documents in batch mode
"""

import asyncio
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, TYPE_CHECKING
import time

from .batch_parser import BatchParser, BatchProcessingResult

if TYPE_CHECKING:
from .config import RAGAnythingConfig


class BatchMixin:
"""BatchMixin class containing batch processing functionality for RAGAnything"""

# Type hints for mixin attributes (will be available when mixed into RAGAnything)
config: "RAGAnythingConfig"
logger: logging.Logger

# Type hints for methods that will be available from other mixins
async def _ensure_lightrag_initialized(self) -> None: ...
async def process_document_complete(self, file_path: str, **kwargs) -> None: ...

# ==========================================
# ORIGINAL BATCH PROCESSING METHOD (RESTORED)
# ==========================================

async def process_folder_complete(
self,
folder_path: str,
output_dir: str = None,
parse_method: str = None,
display_stats: bool = None,
split_by_character: str | None = None,
split_by_character_only: bool = False,
file_extensions: Optional[List[str]] = None,
recursive: bool = None,
max_workers: int = None,
):
"""
Process all supported files in a folder

Args:
folder_path: Path to the folder containing files to process
output_dir: Directory for parsed outputs (optional)
parse_method: Parsing method to use (optional)
display_stats: Whether to display statistics (optional)
split_by_character: Character to split by (optional)
split_by_character_only: Whether to split only by character (optional)
file_extensions: List of file extensions to process (optional)
recursive: Whether to process folders recursively (optional)
max_workers: Maximum number of workers for concurrent processing (optional)
"""
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if display_stats is None:
display_stats = True
if file_extensions is None:
file_extensions = self.config.supported_file_extensions
if recursive is None:
recursive = self.config.recursive_folder_processing
if max_workers is None:
max_workers = self.config.max_concurrent_files

await self._ensure_lightrag_initialized()

# Get all files in the folder
folder_path_obj = Path(folder_path)
if not folder_path_obj.exists():
raise FileNotFoundError(f"Folder not found: {folder_path}")

# Collect files based on supported extensions
files_to_process = []
for file_ext in file_extensions:
if recursive:
pattern = f"**/*{file_ext}"
else:
pattern = f"*{file_ext}"
files_to_process.extend(folder_path_obj.glob(pattern))

if not files_to_process:
self.logger.warning(f"No supported files found in {folder_path}")
return

self.logger.info(
f"Found {len(files_to_process)} files to process in {folder_path}"
)

# Create output directory if it doesn't exist
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

# Process files with controlled concurrency
semaphore = asyncio.Semaphore(max_workers)
tasks = []

async def process_single_file(file_path: Path):
async with semaphore:
try:
await self.process_document_complete(
str(file_path),
output_dir=output_dir,
parse_method=parse_method,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
)
return True, str(file_path), None
except Exception as e:
self.logger.error(f"Failed to process {file_path}: {str(e)}")
return False, str(file_path), str(e)

# Create tasks for all files
for file_path in files_to_process:
task = asyncio.create_task(process_single_file(file_path))
tasks.append(task)

# Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True)

# Process results
successful_files = []
failed_files = []
for result in results:
if isinstance(result, Exception):
failed_files.append(("unknown", str(result)))
else:
success, file_path, error = result
if success:
successful_files.append(file_path)
else:
failed_files.append((file_path, error))

# Display statistics if requested
if display_stats:
self.logger.info("Processing complete!")
self.logger.info(f" Successful: {len(successful_files)} files")
self.logger.info(f" Failed: {len(failed_files)} files")
if failed_files:
self.logger.warning("Failed files:")
for file_path, error in failed_files:
self.logger.warning(f" - {file_path}: {error}")

# ==========================================
# NEW ENHANCED BATCH PROCESSING METHODS
# ==========================================

def process_documents_batch(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Process multiple documents in batch using the new BatchParser

Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser

Returns:
BatchProcessingResult: Results of the batch processing
"""
# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing

# Create batch parser
batch_parser = BatchParser(
parser_type=self.config.parser,
max_workers=max_workers,
show_progress=show_progress,
skip_installation_check=True, # Skip installation check for better UX
)

# Process batch
return batch_parser.process_batch(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
recursive=recursive,
**kwargs,
)

async def process_documents_batch_async(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Asynchronously process multiple documents in batch

Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser

Returns:
BatchProcessingResult: Results of the batch processing
"""
# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing

# Create batch parser
batch_parser = BatchParser(
parser_type=self.config.parser,
max_workers=max_workers,
show_progress=show_progress,
skip_installation_check=True, # Skip installation check for better UX
)

# Process batch asynchronously
return await batch_parser.process_batch_async(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
recursive=recursive,
**kwargs,
)

def get_supported_file_extensions(self) -> List[str]:
"""Get list of supported file extensions for batch processing"""
batch_parser = BatchParser(parser_type=self.config.parser)
return batch_parser.get_supported_extensions()

def filter_supported_files(
self, file_paths: List[str], recursive: Optional[bool] = None
) -> List[str]:
"""
Filter file paths to only include supported file types

Args:
file_paths: List of file paths to filter
recursive: Whether to process directories recursively

Returns:
List of supported file paths
"""
if recursive is None:
recursive = self.config.recursive_folder_processing

batch_parser = BatchParser(parser_type=self.config.parser)
return batch_parser.filter_supported_files(file_paths, recursive)

async def process_documents_with_rag_batch(
self,
file_paths: List[str],
output_dir: Optional[str] = None,
parse_method: Optional[str] = None,
max_workers: Optional[int] = None,
recursive: Optional[bool] = None,
show_progress: bool = True,
**kwargs,
) -> Dict[str, Any]:
"""
Process documents in batch and then add them to RAG

This method combines document parsing and RAG insertion:
1. First, parse all documents using batch processing
2. Then, process each successfully parsed document with RAG

Args:
file_paths: List of file paths or directories to process
output_dir: Output directory for parsed files
parse_method: Parsing method to use
max_workers: Maximum number of workers for parallel processing
recursive: Whether to process directories recursively
show_progress: Whether to show progress bar
**kwargs: Additional arguments passed to the parser

Returns:
Dict containing both parse results and RAG processing results
"""
start_time = time.time()

# Use config defaults if not specified
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if max_workers is None:
max_workers = self.config.max_concurrent_files
if recursive is None:
recursive = self.config.recursive_folder_processing

self.logger.info("Starting batch processing with RAG integration")

# Step 1: Parse documents in batch
parse_result = self.process_documents_batch(
file_paths=file_paths,
output_dir=output_dir,
parse_method=parse_method,
max_workers=max_workers,
recursive=recursive,
show_progress=show_progress,
**kwargs,
)

# Step 2: Process with RAG
# Initialize RAG system
await self._ensure_lightrag_initialized()

# Then, process each successful file with RAG
rag_results = {}

if parse_result.successful_files:
self.logger.info(
f"Processing {len(parse_result.successful_files)} files with RAG"
)

# Process files with RAG (this could be parallelized in the future)
for file_path in parse_result.successful_files:
try:
# Process the successfully parsed file with RAG
await self.process_document_complete(
file_path,
output_dir=output_dir,
parse_method=parse_method,
**kwargs,
)

# Get some statistics about the processed content
# This would require additional tracking in the RAG system
rag_results[file_path] = {"status": "success", "processed": True}

except Exception as e:
self.logger.error(
f"Failed to process {file_path} with RAG: {str(e)}"
)
rag_results[file_path] = {
"status": "failed",
"error": str(e),
"processed": False,
}

processing_time = time.time() - start_time

return {
"parse_result": parse_result,
"rag_results": rag_results,
"total_processing_time": processing_time,
"successful_rag_files": len(
[r for r in rag_results.values() if r["processed"]]
),
"failed_rag_files": len(
[r for r in rag_results.values() if not r["processed"]]
),
}
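A minimal sketch of driving the combined parse-then-index path above, assuming RAGAnything composes BatchMixin as this changeset suggests (the input folder is a placeholder):

import asyncio

from raganything import RAGAnything

async def batch_ingest() -> None:
    rag = RAGAnything()
    result = await rag.process_documents_with_rag_batch(
        file_paths=["./inputs"],  # a folder; supported files are discovered
        max_workers=2,
        show_progress=True,
    )
    print(result["parse_result"].summary())
    print(f"RAG-indexed: {result['successful_rag_files']}")

asyncio.run(batch_ingest())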
430
raganything/batch_parser.py
Normal file
@@ -0,0 +1,430 @@
"""
Batch and Parallel Document Parsing

This module provides functionality for processing multiple documents in parallel,
with progress reporting and error handling.
"""

import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import time

from tqdm import tqdm

from .parser import MineruParser, DoclingParser


@dataclass
class BatchProcessingResult:
"""Result of batch processing operation"""

successful_files: List[str]
failed_files: List[str]
total_files: int
processing_time: float
errors: Dict[str, str]
output_dir: str

@property
def success_rate(self) -> float:
"""Calculate success rate as percentage"""
if self.total_files == 0:
return 0.0
return (len(self.successful_files) / self.total_files) * 100

def summary(self) -> str:
"""Generate a summary of the batch processing results"""
return (
f"Batch Processing Summary:\n"
f" Total files: {self.total_files}\n"
f" Successful: {len(self.successful_files)} ({self.success_rate:.1f}%)\n"
f" Failed: {len(self.failed_files)}\n"
f" Processing time: {self.processing_time:.2f} seconds\n"
f" Output directory: {self.output_dir}"
)


class BatchParser:
"""
Batch document parser with parallel processing capabilities

Supports processing multiple documents concurrently with progress tracking
and comprehensive error handling.
"""

def __init__(
self,
parser_type: str = "mineru",
max_workers: int = 4,
show_progress: bool = True,
timeout_per_file: int = 300,
skip_installation_check: bool = False,
):
"""
Initialize batch parser

Args:
parser_type: Type of parser to use ("mineru" or "docling")
max_workers: Maximum number of parallel workers
show_progress: Whether to show progress bars
timeout_per_file: Timeout in seconds for each file
skip_installation_check: Skip parser installation check (useful for testing)
"""
self.parser_type = parser_type
self.max_workers = max_workers
self.show_progress = show_progress
self.timeout_per_file = timeout_per_file
self.logger = logging.getLogger(__name__)

# Initialize parser
if parser_type == "mineru":
self.parser = MineruParser()
elif parser_type == "docling":
self.parser = DoclingParser()
else:
raise ValueError(f"Unsupported parser type: {parser_type}")

# Check parser installation (optional)
if not skip_installation_check:
if not self.parser.check_installation():
self.logger.warning(
f"{parser_type.title()} parser installation check failed. "
f"This may be due to package conflicts. "
f"Use skip_installation_check=True to bypass this check."
)
# Don't raise an error, just warn - the parser might still work

def get_supported_extensions(self) -> List[str]:
"""Get list of supported file extensions"""
return list(
self.parser.OFFICE_FORMATS
| self.parser.IMAGE_FORMATS
| self.parser.TEXT_FORMATS
| {".pdf"}
)

def filter_supported_files(
self, file_paths: List[str], recursive: bool = True
) -> List[str]:
"""
Filter file paths to only include supported file types

Args:
file_paths: List of file paths or directories
recursive: Whether to search directories recursively

Returns:
List of supported file paths
"""
supported_extensions = set(self.get_supported_extensions())
supported_files = []

for path_str in file_paths:
path = Path(path_str)

if path.is_file():
if path.suffix.lower() in supported_extensions:
supported_files.append(str(path))
else:
self.logger.warning(f"Unsupported file type: {path}")

elif path.is_dir():
if recursive:
# Recursively find all files
for file_path in path.rglob("*"):
if (
file_path.is_file()
and file_path.suffix.lower() in supported_extensions
):
supported_files.append(str(file_path))
else:
# Only files in the directory (not subdirectories)
for file_path in path.glob("*"):
if (
file_path.is_file()
and file_path.suffix.lower() in supported_extensions
):
supported_files.append(str(file_path))

else:
self.logger.warning(f"Path does not exist: {path}")

return supported_files

def process_single_file(
self, file_path: str, output_dir: str, parse_method: str = "auto", **kwargs
) -> Tuple[bool, str, Optional[str]]:
"""
Process a single file

Args:
file_path: Path to the file to process
output_dir: Output directory
parse_method: Parsing method
**kwargs: Additional parser arguments

Returns:
Tuple of (success, file_path, error_message)
"""
try:
start_time = time.time()

# Create file-specific output directory
file_name = Path(file_path).stem
file_output_dir = Path(output_dir) / file_name
file_output_dir.mkdir(parents=True, exist_ok=True)

# Parse the document
content_list = self.parser.parse_document(
file_path=file_path,
output_dir=str(file_output_dir),
method=parse_method,
**kwargs,
)

processing_time = time.time() - start_time

self.logger.info(
f"Successfully processed {file_path} "
f"({len(content_list)} content blocks, {processing_time:.2f}s)"
)

return True, file_path, None

except Exception as e:
error_msg = f"Failed to process {file_path}: {str(e)}"
self.logger.error(error_msg)
return False, file_path, error_msg

def process_batch(
self,
file_paths: List[str],
output_dir: str,
parse_method: str = "auto",
recursive: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Process multiple files in parallel

Args:
file_paths: List of file paths or directories to process
output_dir: Base output directory
parse_method: Parsing method for all files
recursive: Whether to search directories recursively
**kwargs: Additional parser arguments

Returns:
BatchProcessingResult with processing statistics
"""
start_time = time.time()

# Filter to supported files
supported_files = self.filter_supported_files(file_paths, recursive)

if not supported_files:
self.logger.warning("No supported files found to process")
return BatchProcessingResult(
successful_files=[],
failed_files=[],
total_files=0,
processing_time=0.0,
errors={},
output_dir=output_dir,
)

self.logger.info(f"Found {len(supported_files)} files to process")

# Create output directory
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

# Process files in parallel
successful_files = []
failed_files = []
errors = {}

# Create progress bar if requested
pbar = None
if self.show_progress:
pbar = tqdm(
total=len(supported_files),
desc=f"Processing files ({self.parser_type})",
unit="file",
)

try:
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all tasks
future_to_file = {
executor.submit(
self.process_single_file,
file_path,
output_dir,
parse_method,
**kwargs,
): file_path
for file_path in supported_files
}

# Process completed tasks
for future in as_completed(
future_to_file, timeout=self.timeout_per_file
):
success, file_path, error_msg = future.result()

if success:
successful_files.append(file_path)
else:
failed_files.append(file_path)
errors[file_path] = error_msg

if pbar:
pbar.update(1)

except Exception as e:
self.logger.error(f"Batch processing failed: {str(e)}")
# Mark remaining files as failed
for future in future_to_file:
if not future.done():
file_path = future_to_file[future]
failed_files.append(file_path)
errors[file_path] = f"Processing interrupted: {str(e)}"
if pbar:
pbar.update(1)

finally:
if pbar:
pbar.close()

processing_time = time.time() - start_time

# Create result
result = BatchProcessingResult(
successful_files=successful_files,
failed_files=failed_files,
total_files=len(supported_files),
processing_time=processing_time,
errors=errors,
output_dir=output_dir,
)

# Log summary
self.logger.info(result.summary())

return result

async def process_batch_async(
self,
file_paths: List[str],
output_dir: str,
parse_method: str = "auto",
recursive: bool = True,
**kwargs,
) -> BatchProcessingResult:
"""
Async version of batch processing

Args:
file_paths: List of file paths or directories to process
output_dir: Base output directory
parse_method: Parsing method for all files
recursive: Whether to search directories recursively
**kwargs: Additional parser arguments

Returns:
BatchProcessingResult with processing statistics
"""
# Run the sync version in a thread pool
# (run_in_executor does not accept **kwargs, so bind arguments with functools.partial)
import functools

loop = asyncio.get_event_loop()
return await loop.run_in_executor(
None,
functools.partial(
self.process_batch,
file_paths,
output_dir,
parse_method,
recursive,
**kwargs,
),
)


def main():
"""Command-line interface for batch parsing"""
import argparse

parser = argparse.ArgumentParser(description="Batch document parsing")
parser.add_argument("paths", nargs="+", help="File paths or directories to process")
parser.add_argument("--output", "-o", required=True, help="Output directory")
parser.add_argument(
"--parser",
choices=["mineru", "docling"],
default="mineru",
help="Parser to use",
)
parser.add_argument(
"--method",
choices=["auto", "txt", "ocr"],
default="auto",
help="Parsing method",
)
parser.add_argument(
"--workers", type=int, default=4, help="Number of parallel workers"
)
parser.add_argument(
"--no-progress", action="store_true", help="Disable progress bar"
)
parser.add_argument(
"--recursive",
action="store_true",
default=True,
help="Search directories recursively",
)
parser.add_argument(
"--timeout", type=int, default=300, help="Timeout per file (seconds)"
)

args = parser.parse_args()

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

try:
# Create batch parser
batch_parser = BatchParser(
parser_type=args.parser,
max_workers=args.workers,
show_progress=not args.no_progress,
timeout_per_file=args.timeout,
)

# Process files
result = batch_parser.process_batch(
file_paths=args.paths,
output_dir=args.output,
parse_method=args.method,
recursive=args.recursive,
)

# Print summary
print("\n" + result.summary())

# Exit with error code if any files failed
if result.failed_files:
return 1

return 0

except Exception as e:
print(f"Error: {str(e)}")
return 1


if __name__ == "__main__":
exit(main())
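The module doubles as a library and a CLI. A minimal usage sketch (paths are placeholders; the module invocation assumes the package is importable):

from raganything.batch_parser import BatchParser

parser = BatchParser(parser_type="mineru", max_workers=4, skip_installation_check=True)
result = parser.process_batch(
    file_paths=["./docs"], output_dir="./parsed", parse_method="auto"
)
print(result.summary())

# or, from the shell:
# python -m raganything.batch_parser ./docs --output ./parsed --parser mineru --workers 4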
147
raganything/config.py
Normal file
@@ -0,0 +1,147 @@
"""
Configuration classes for RAGAnything

Contains configuration dataclasses with environment variable support
"""

from dataclasses import dataclass, field
from typing import List
from lightrag.utils import get_env_value


@dataclass
class RAGAnythingConfig:
"""Configuration class for RAGAnything with environment variable support"""

# Directory Configuration
# ---
working_dir: str = field(default=get_env_value("WORKING_DIR", "./rag_storage", str))
"""Directory where RAG storage and cache files are stored."""

# Parser Configuration
# ---
parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
"""Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""

parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
"""Default output directory for parsed content."""

parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'."""

display_content_stats: bool = field(
default=get_env_value("DISPLAY_CONTENT_STATS", True, bool)
)
"""Whether to display content statistics during parsing."""

# Multimodal Processing Configuration
# ---
enable_image_processing: bool = field(
default=get_env_value("ENABLE_IMAGE_PROCESSING", True, bool)
)
"""Enable image content processing."""

enable_table_processing: bool = field(
default=get_env_value("ENABLE_TABLE_PROCESSING", True, bool)
)
"""Enable table content processing."""

enable_equation_processing: bool = field(
default=get_env_value("ENABLE_EQUATION_PROCESSING", True, bool)
)
"""Enable equation content processing."""

# Batch Processing Configuration
# ---
max_concurrent_files: int = field(
default=get_env_value("MAX_CONCURRENT_FILES", 1, int)
)
"""Maximum number of files to process concurrently."""

supported_file_extensions: List[str] = field(
default_factory=lambda: get_env_value(
"SUPPORTED_FILE_EXTENSIONS",
".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
str,
).split(",")
)
"""List of supported file extensions for batch processing."""

recursive_folder_processing: bool = field(
default=get_env_value("RECURSIVE_FOLDER_PROCESSING", True, bool)
)
"""Whether to recursively process subfolders in batch mode."""

# Context Extraction Configuration
# ---
context_window: int = field(default=get_env_value("CONTEXT_WINDOW", 1, int))
"""Number of pages/chunks to include before and after current item for context."""

context_mode: str = field(default=get_env_value("CONTEXT_MODE", "page", str))
"""Context extraction mode: 'page' for page-based, 'chunk' for chunk-based."""

max_context_tokens: int = field(
default=get_env_value("MAX_CONTEXT_TOKENS", 2000, int)
)
"""Maximum number of tokens in extracted context."""

include_headers: bool = field(default=get_env_value("INCLUDE_HEADERS", True, bool))
"""Whether to include document headers and titles in context."""

include_captions: bool = field(
default=get_env_value("INCLUDE_CAPTIONS", True, bool)
)
"""Whether to include image/table captions in context."""

context_filter_content_types: List[str] = field(
default_factory=lambda: get_env_value(
"CONTEXT_FILTER_CONTENT_TYPES", "text", str
).split(",")
)
"""Content types to include in context extraction (e.g., 'text', 'image', 'table')."""

content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
"""Default content format for context extraction when processing documents."""

def __post_init__(self):
"""Post-initialization setup for backward compatibility"""
# Support legacy environment variable names for backward compatibility
legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
if legacy_parse_method and not get_env_value("PARSE_METHOD", None, str):
self.parse_method = legacy_parse_method
import warnings

warnings.warn(
"MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
DeprecationWarning,
stacklevel=2,
)

@property
def mineru_parse_method(self) -> str:
"""
Backward compatibility property for old code.

.. deprecated::
Use `parse_method` instead. This property will be removed in a future version.
"""
import warnings

warnings.warn(
"mineru_parse_method is deprecated. Use parse_method instead.",
DeprecationWarning,
stacklevel=2,
)
return self.parse_method

@mineru_parse_method.setter
def mineru_parse_method(self, value: str):
"""Setter for backward compatibility"""
import warnings

warnings.warn(
"mineru_parse_method is deprecated. Use parse_method instead.",
DeprecationWarning,
stacklevel=2,
)
self.parse_method = value
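Two practical notes on the config class above: the env-var defaults are evaluated when the module is imported, so variables like PARSER or PARSE_METHOD must be set before the import happens; and explicit keyword arguments always win over those defaults. A minimal sketch (paths and values are placeholders):

from raganything import RAGAnythingConfig

# Explicit keyword arguments override both the dataclass defaults and any
# environment variables that were read at import time
config = RAGAnythingConfig(
    working_dir="./rag_storage_demo",  # placeholder path
    parser="docling",
    max_concurrent_files=4,
)
print(config.parser, config.parse_method)
print(config.mineru_parse_method)  # still works, but emits a DeprecationWarning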
534
raganything/enhanced_markdown.py
Normal file
@@ -0,0 +1,534 @@
"""
Enhanced Markdown to PDF Conversion

This module provides improved Markdown to PDF conversion with:
- Better formatting and styling
- Image support
- Table support
- Code syntax highlighting
- Custom templates
- Multiple output formats
"""

import os
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass
import tempfile
import subprocess

try:
import markdown

MARKDOWN_AVAILABLE = True
except ImportError:
MARKDOWN_AVAILABLE = False

try:
from weasyprint import HTML

WEASYPRINT_AVAILABLE = True
except ImportError:
WEASYPRINT_AVAILABLE = False

try:
# Check if pandoc module exists (not used directly, just for detection)
import importlib.util

spec = importlib.util.find_spec("pandoc")
PANDOC_AVAILABLE = spec is not None
except ImportError:
PANDOC_AVAILABLE = False


@dataclass
class MarkdownConfig:
"""Configuration for Markdown to PDF conversion"""

# Styling options
css_file: Optional[str] = None
template_file: Optional[str] = None
page_size: str = "A4"
margin: str = "1in"
font_size: str = "12pt"
line_height: str = "1.5"

# Content options
include_toc: bool = True
syntax_highlighting: bool = True
image_max_width: str = "100%"
table_style: str = "border-collapse: collapse; width: 100%;"

# Output options
output_format: str = "pdf" # pdf, html, docx
output_dir: Optional[str] = None

# Advanced options
custom_css: Optional[str] = None
metadata: Optional[Dict[str, str]] = None


class EnhancedMarkdownConverter:
"""
Enhanced Markdown to PDF converter with multiple backends

Supports multiple conversion methods:
- WeasyPrint (recommended for HTML/CSS styling)
- Pandoc (recommended for complex documents)
- ReportLab (fallback, basic styling)
"""

def __init__(self, config: Optional[MarkdownConfig] = None):
"""
Initialize the converter

Args:
config: Configuration for conversion
"""
self.config = config or MarkdownConfig()
self.logger = logging.getLogger(__name__)

# Check available backends
self.available_backends = self._check_backends()
self.logger.info(f"Available backends: {list(self.available_backends.keys())}")

def _check_backends(self) -> Dict[str, bool]:
"""Check which conversion backends are available"""
backends = {
"weasyprint": WEASYPRINT_AVAILABLE,
"pandoc": PANDOC_AVAILABLE,
"markdown": MARKDOWN_AVAILABLE,
}

# Check if pandoc is installed on system
try:
subprocess.run(["pandoc", "--version"], capture_output=True, check=True)
backends["pandoc_system"] = True
except (subprocess.CalledProcessError, FileNotFoundError):
backends["pandoc_system"] = False

return backends

def _get_default_css(self) -> str:
"""Get default CSS styling"""
return """
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}

h1, h2, h3, h4, h5, h6 {
color: #2c3e50;
margin-top: 1.5em;
margin-bottom: 0.5em;
}

h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; }
h3 { font-size: 1.3em; }
h4 { font-size: 1.1em; }

p { margin-bottom: 1em; }

code {
background-color: #f8f9fa;
padding: 2px 4px;
border-radius: 3px;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}

pre {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
overflow-x: auto;
border-left: 4px solid #3498db;
}

pre code {
background-color: transparent;
padding: 0;
}

blockquote {
border-left: 4px solid #3498db;
margin: 0;
padding-left: 20px;
color: #7f8c8d;
}

table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}

th, td {
border: 1px solid #ddd;
padding: 8px 12px;
text-align: left;
}

th {
background-color: #f2f2f2;
font-weight: bold;
}

img {
max-width: 100%;
height: auto;
display: block;
margin: 1em auto;
}

ul, ol {
margin-bottom: 1em;
}

li {
margin-bottom: 0.5em;
}

a {
color: #3498db;
text-decoration: none;
}

a:hover {
text-decoration: underline;
}

.toc {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
margin-bottom: 2em;
}

.toc ul {
list-style-type: none;
padding-left: 0;
}

.toc li {
margin-bottom: 0.3em;
}

.toc a {
color: #2c3e50;
}
"""

def _process_markdown_content(self, content: str) -> str:
"""Process Markdown content with extensions"""
if not MARKDOWN_AVAILABLE:
raise RuntimeError(
"Markdown library not available. Install with: pip install markdown"
)

# Configure Markdown extensions
extensions = [
"markdown.extensions.tables",
"markdown.extensions.fenced_code",
"markdown.extensions.codehilite",
"markdown.extensions.toc",
"markdown.extensions.attr_list",
"markdown.extensions.def_list",
"markdown.extensions.footnotes",
]

extension_configs = {
"codehilite": {
"css_class": "highlight",
"use_pygments": True,
},
"toc": {
"title": "Table of Contents",
"permalink": True,
},
}

# Convert Markdown to HTML
md = markdown.Markdown(
extensions=extensions, extension_configs=extension_configs
)

html_content = md.convert(content)

# Add CSS styling
css = self.config.custom_css or self._get_default_css()

# Create complete HTML document
html_doc = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Converted Document</title>
<style>
{css}
</style>
</head>
<body>
{html_content}
</body>
</html>
"""

return html_doc

def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
"""Convert using WeasyPrint (best for styling)"""
if not WEASYPRINT_AVAILABLE:
raise RuntimeError(
"WeasyPrint not available. Install with: pip install weasyprint"
)

try:
# Process Markdown to HTML
html_content = self._process_markdown_content(markdown_content)

# Convert HTML to PDF
html = HTML(string=html_content)
html.write_pdf(output_path)

self.logger.info(
f"Successfully converted to PDF using WeasyPrint: {output_path}"
)
return True

except Exception as e:
self.logger.error(f"WeasyPrint conversion failed: {str(e)}")
return False

def convert_with_pandoc(
self, markdown_content: str, output_path: str, use_system_pandoc: bool = False
) -> bool:
"""Convert using Pandoc (best for complex documents)"""
if (
not self.available_backends.get("pandoc_system", False)
and not use_system_pandoc
):
raise RuntimeError(
"Pandoc not available. Install from: https://pandoc.org/installing.html"
)

temp_md_path = None
try:
import subprocess

# Create temporary markdown file
with tempfile.NamedTemporaryFile(
mode="w", suffix=".md", delete=False
) as temp_file:
temp_file.write(markdown_content)
temp_md_path = temp_file.name

# Build pandoc command with wkhtmltopdf engine
cmd = [
"pandoc",
temp_md_path,
"-o",
output_path,
"--pdf-engine=wkhtmltopdf",
"--standalone",
"--toc",
"--number-sections",
]

# Run pandoc
result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

if result.returncode == 0:
self.logger.info(
f"Successfully converted to PDF using Pandoc: {output_path}"
)
return True
else:
self.logger.error(f"Pandoc conversion failed: {result.stderr}")
return False

except Exception as e:
self.logger.error(f"Pandoc conversion failed: {str(e)}")
return False

finally:
if temp_md_path and os.path.exists(temp_md_path):
try:
os.unlink(temp_md_path)
except OSError as e:
self.logger.error(
f"Failed to clean up temp file {temp_md_path}: {str(e)}"
)

def convert_markdown_to_pdf(
self, markdown_content: str, output_path: str, method: str = "auto"
) -> bool:
"""
Convert markdown content to PDF

Args:
markdown_content: Markdown content to convert
output_path: Output PDF file path
method: Conversion method ("auto", "weasyprint", "pandoc", "pandoc_system")

Returns:
True if conversion successful, False otherwise
"""
if method == "auto":
method = self._get_recommended_backend()

try:
if method == "weasyprint":
return self.convert_with_weasyprint(markdown_content, output_path)
elif method == "pandoc":
return self.convert_with_pandoc(markdown_content, output_path)
elif method == "pandoc_system":
return self.convert_with_pandoc(
markdown_content, output_path, use_system_pandoc=True
)
else:
raise ValueError(f"Unknown conversion method: {method}")

except Exception as e:
self.logger.error(f"{method.title()} conversion failed: {str(e)}")
return False

def convert_file_to_pdf(
self, input_path: str, output_path: Optional[str] = None, method: str = "auto"
) -> bool:
"""
Convert Markdown file to PDF

Args:
input_path: Input Markdown file path
output_path: Output PDF file path (optional)
method: Conversion method

Returns:
bool: True if conversion successful
"""
input_path_obj = Path(input_path)

if not input_path_obj.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")

# Read markdown content
try:
with open(input_path_obj, "r", encoding="utf-8") as f:
markdown_content = f.read()
except UnicodeDecodeError:
# Try with different encodings
for encoding in ["gbk", "latin-1", "cp1252"]:
try:
with open(input_path_obj, "r", encoding=encoding) as f:
markdown_content = f.read()
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(
f"Could not decode file {input_path} with any supported encoding"
|
||||
)
|
||||
|
||||
# Determine output path
|
||||
if output_path is None:
|
||||
output_path = str(input_path_obj.with_suffix(".pdf"))
|
||||
|
||||
return self.convert_markdown_to_pdf(markdown_content, output_path, method)
|
||||
|
||||
def get_backend_info(self) -> Dict[str, Any]:
|
||||
"""Get information about available backends"""
|
||||
return {
|
||||
"available_backends": self.available_backends,
|
||||
"recommended_backend": self._get_recommended_backend(),
|
||||
"config": {
|
||||
"page_size": self.config.page_size,
|
||||
"margin": self.config.margin,
|
||||
"font_size": self.config.font_size,
|
||||
"include_toc": self.config.include_toc,
|
||||
"syntax_highlighting": self.config.syntax_highlighting,
|
||||
},
|
||||
}
|
||||
|
||||
def _get_recommended_backend(self) -> str:
|
||||
"""Get recommended backend based on availability"""
|
||||
if self.available_backends.get("pandoc_system", False):
|
||||
return "pandoc"
|
||||
elif self.available_backends.get("weasyprint", False):
|
||||
return "weasyprint"
|
||||
else:
|
||||
return "none"
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface for enhanced markdown conversion"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Enhanced Markdown to PDF conversion")
|
||||
parser.add_argument("input", nargs="?", help="Input markdown file")
|
||||
parser.add_argument("--output", "-o", help="Output PDF file")
|
||||
parser.add_argument(
|
||||
"--method",
|
||||
choices=["auto", "weasyprint", "pandoc", "pandoc_system"],
|
||||
default="auto",
|
||||
help="Conversion method",
|
||||
)
|
||||
parser.add_argument("--css", help="Custom CSS file")
|
||||
parser.add_argument("--info", action="store_true", help="Show backend information")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
|
||||
# Create converter
|
||||
config = MarkdownConfig()
|
||||
if args.css:
|
||||
config.css_file = args.css
|
||||
|
||||
converter = EnhancedMarkdownConverter(config)
|
||||
|
||||
# Show backend info if requested
|
||||
if args.info:
|
||||
info = converter.get_backend_info()
|
||||
print("Backend Information:")
|
||||
for backend, available in info["available_backends"].items():
|
||||
status = "✅" if available else "❌"
|
||||
print(f" {status} {backend}")
|
||||
print(f"Recommended backend: {info['recommended_backend']}")
|
||||
return 0
|
||||
|
||||
# Check if input file is provided
|
||||
if not args.input:
|
||||
parser.error("Input file is required when not using --info")
|
||||
|
||||
# Convert file
|
||||
try:
|
||||
success = converter.convert_file_to_pdf(
|
||||
input_path=args.input, output_path=args.output, method=args.method
|
||||
)
|
||||
|
||||
if success:
|
||||
print(f"✅ Successfully converted {args.input} to PDF")
|
||||
return 0
|
||||
else:
|
||||
print("❌ Conversion failed")
|
||||
return 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {str(e)}")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
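
A minimal usage sketch of the converter module above (the import path and module name are assumptions; adjust them to wherever EnhancedMarkdownConverter actually lives in the package):

from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig

converter = EnhancedMarkdownConverter(MarkdownConfig())
print(converter.get_backend_info())  # shows detected backends and the recommendation
# "auto" resolves to pandoc when the system binary is present, else weasyprint
converter.convert_file_to_pdf("notes.md", "notes.pdf", method="auto")
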
File diff suppressed because it is too large

File diff suppressed because it is too large

raganything/parser.py (1831 lines, new file)
File diff suppressed because it is too large

raganything/processor.py (1824 lines, new file)
File diff suppressed because it is too large

raganything/prompt.py
@@ -56,6 +56,38 @@ Additional context:

Focus on providing accurate, detailed visual analysis that would be useful for knowledge retrieval."""

# Image analysis prompt with context support
PROMPTS[
    "vision_prompt_with_context"
] = """Please analyze this image in detail, considering the surrounding context. Provide a JSON response with the following structure:

{{
    "detailed_description": "A comprehensive and detailed visual description of the image following these guidelines:
    - Describe the overall composition and layout
    - Identify all objects, people, text, and visual elements
    - Explain relationships between elements and how they relate to the surrounding context
    - Note colors, lighting, and visual style
    - Describe any actions or activities shown
    - Include technical details if relevant (charts, diagrams, etc.)
    - Reference connections to the surrounding content when relevant
    - Always use specific names instead of pronouns",
    "entity_info": {{
        "entity_name": "{entity_name}",
        "entity_type": "image",
        "summary": "concise summary of the image content, its significance, and relationship to surrounding content (max 100 words)"
    }}
}}

Context from surrounding content:
{context}

Image details:
- Image Path: {image_path}
- Captions: {captions}
- Footnotes: {footnotes}

Focus on providing accurate, detailed visual analysis that incorporates the context and would be useful for knowledge retrieval."""

# Image analysis prompt with text fallback
PROMPTS["text_prompt"] = """Based on the following image information, provide analysis:

@@ -94,6 +126,39 @@ Footnotes: {table_footnote}

Focus on extracting meaningful insights and relationships from the tabular data."""

# Table analysis prompt with context support
PROMPTS[
    "table_prompt_with_context"
] = """Please analyze this table content considering the surrounding context, and provide a JSON response with the following structure:

{{
    "detailed_description": "A comprehensive analysis of the table including:
    - Table structure and organization
    - Column headers and their meanings
    - Key data points and patterns
    - Statistical insights and trends
    - Relationships between data elements
    - Significance of the data presented in relation to surrounding context
    - How the table supports or illustrates concepts from the surrounding content
    Always use specific names and values instead of general references.",
    "entity_info": {{
        "entity_name": "{entity_name}",
        "entity_type": "table",
        "summary": "concise summary of the table's purpose, key findings, and relationship to surrounding content (max 100 words)"
    }}
}}

Context from surrounding content:
{context}

Table Information:
Image Path: {table_img_path}
Caption: {table_caption}
Body: {table_body}
Footnotes: {table_footnote}

Focus on extracting meaningful insights and relationships from the tabular data in the context of the surrounding content."""

# Equation analysis prompt template
PROMPTS[
    "equation_prompt"
@@ -122,6 +187,38 @@ Format: {equation_format}

Focus on providing mathematical insights and explaining the equation's significance."""

# Equation analysis prompt with context support
PROMPTS[
    "equation_prompt_with_context"
] = """Please analyze this mathematical equation considering the surrounding context, and provide a JSON response with the following structure:

{{
    "detailed_description": "A comprehensive analysis of the equation including:
    - Mathematical meaning and interpretation
    - Variables and their definitions in the context of surrounding content
    - Mathematical operations and functions used
    - Application domain and context based on surrounding material
    - Physical or theoretical significance
    - Relationship to other mathematical concepts mentioned in the context
    - Practical applications or use cases
    - How the equation relates to the broader discussion or framework
    Always use specific mathematical terminology.",
    "entity_info": {{
        "entity_name": "{entity_name}",
        "entity_type": "equation",
        "summary": "concise summary of the equation's purpose, significance, and role in the surrounding context (max 100 words)"
    }}
}}

Context from surrounding content:
{context}

Equation Information:
Equation: {equation_text}
Format: {equation_format}

Focus on providing mathematical insights and explaining the equation's significance within the broader context."""

# Generic content analysis prompt template
PROMPTS[
    "generic_prompt"
@@ -146,6 +243,34 @@ Content: {content}

Focus on extracting meaningful information that would be useful for knowledge retrieval."""

# Generic content analysis prompt with context support
PROMPTS[
    "generic_prompt_with_context"
] = """Please analyze this {content_type} content considering the surrounding context, and provide a JSON response with the following structure:

{{
    "detailed_description": "A comprehensive analysis of the content including:
    - Content structure and organization
    - Key information and elements
    - Relationships between components
    - Context and significance in relation to surrounding content
    - How this content connects to or supports the broader discussion
    - Relevant details for knowledge retrieval
    Always use specific terminology appropriate for {content_type} content.",
    "entity_info": {{
        "entity_name": "{entity_name}",
        "entity_type": "{content_type}",
        "summary": "concise summary of the content's purpose, key points, and relationship to surrounding context (max 100 words)"
    }}
}}

Context from surrounding content:
{context}

Content: {content}

Focus on extracting meaningful information that would be useful for knowledge retrieval and understanding the content's role in the broader context."""

# Modal chunk templates
PROMPTS["image_chunk"] = """
Image Content Analysis:
@@ -173,3 +298,56 @@ PROMPTS["generic_chunk"] = """{content_type} Content Analysis:
Content: {content}

Analysis: {enhanced_caption}"""

# Query-related prompts
PROMPTS["QUERY_IMAGE_DESCRIPTION"] = (
    "Please briefly describe the main content, key elements, and important information in this image."
)

PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"] = (
    "You are a professional image analyst who can accurately describe image content."
)

PROMPTS[
    "QUERY_TABLE_ANALYSIS"
] = """Please analyze the main content, structure, and key information of the following table data:

Table data:
{table_data}

Table caption: {table_caption}

Please briefly summarize the main content, data characteristics, and important findings of the table."""

PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"] = (
    "You are a professional data analyst who can accurately analyze table data."
)

PROMPTS[
    "QUERY_EQUATION_ANALYSIS"
] = """Please explain the meaning and purpose of the following mathematical formula:

LaTeX formula: {latex}
Formula caption: {equation_caption}

Please briefly explain the mathematical meaning, application scenarios, and importance of this formula."""

PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"] = (
    "You are a mathematics expert who can clearly explain mathematical formulas."
)

PROMPTS[
    "QUERY_GENERIC_ANALYSIS"
] = """Please analyze the following {content_type} type content and extract its main information and key features:

Content: {content_str}

Please briefly summarize the main characteristics and important information of this content."""

PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"] = (
    "You are a professional content analyst who can accurately analyze {content_type} type content."
)

PROMPTS["QUERY_ENHANCEMENT_SUFFIX"] = (
    "\n\nPlease provide a comprehensive answer based on the user query and the provided multimodal content information."
)
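
Because the JSON skeletons double their braces, str.format substitutes only the named placeholders. A short sketch of filling in one of the context-aware templates above (all values are illustrative):

prompt = PROMPTS["table_prompt_with_context"].format(
    entity_name="Table 1",
    context="...text surrounding the table in the source document...",
    table_img_path="output/images/table_1.png",
    table_caption="Quarterly revenue by region",
    table_body="Region,Q1,Q2\nEMEA,10,12",
    table_footnote="",
)
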
raganything/query.py (746 lines, new file)
@@ -0,0 +1,746 @@
"""
Query functionality for RAGAnything

Contains all query-related methods for both text and multimodal queries
"""

import json
import hashlib
import re
from typing import Dict, List, Any
from pathlib import Path
from lightrag import QueryParam
from lightrag.utils import always_get_an_event_loop
from raganything.prompt import PROMPTS
from raganything.utils import (
    get_processor_for_type,
    encode_image_to_base64,
    validate_image_file,
)


class QueryMixin:
    """QueryMixin class containing query functionality for RAGAnything"""

    def _generate_multimodal_cache_key(
        self, query: str, multimodal_content: List[Dict[str, Any]], mode: str, **kwargs
    ) -> str:
        """
        Generate cache key for multimodal query

        Args:
            query: Base query text
            multimodal_content: List of multimodal content
            mode: Query mode
            **kwargs: Additional parameters

        Returns:
            str: Cache key hash
        """
        # Create a normalized representation of the query parameters
        cache_data = {
            "query": query.strip(),
            "mode": mode,
        }

        # Normalize multimodal content for stable caching
        normalized_content = []
        if multimodal_content:
            for item in multimodal_content:
                if isinstance(item, dict):
                    normalized_item = {}
                    for key, value in item.items():
                        # For file paths, use basename to make cache more portable
                        if key in [
                            "img_path",
                            "image_path",
                            "file_path",
                        ] and isinstance(value, str):
                            normalized_item[key] = Path(value).name
                        # For large content, create a hash instead of storing directly
                        elif (
                            key in ["table_data", "table_body"]
                            and isinstance(value, str)
                            and len(value) > 200
                        ):
                            normalized_item[f"{key}_hash"] = hashlib.md5(
                                value.encode()
                            ).hexdigest()
                        else:
                            normalized_item[key] = value
                    normalized_content.append(normalized_item)
                else:
                    normalized_content.append(item)

        cache_data["multimodal_content"] = normalized_content

        # Add relevant kwargs to cache data
        relevant_kwargs = {
            k: v
            for k, v in kwargs.items()
            if k
            in [
                "stream",
                "response_type",
                "top_k",
                "max_tokens",
                "temperature",
                # "only_need_context",
                # "only_need_prompt",
            ]
        }
        cache_data.update(relevant_kwargs)

        # Generate hash from the cache data
        cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
        cache_hash = hashlib.md5(cache_str.encode()).hexdigest()

        return f"multimodal_query:{cache_hash}"

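    # Illustration (hypothetical values): the normalization above makes the key
    # stable across machines, since file paths are reduced to their basename
    # before hashing:
    #
    #   k1 = rag._generate_multimodal_cache_key(
    #       "Explain this figure", [{"type": "image", "img_path": "/home/a/fig.png"}], "mix"
    #   )
    #   k2 = rag._generate_multimodal_cache_key(
    #       "Explain this figure", [{"type": "image", "img_path": "/tmp/fig.png"}], "mix"
    #   )
    #   assert k1 == k2
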
    async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
        """
        Pure text query - directly calls LightRAG's query functionality

        Args:
            query: Query text
            mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
            **kwargs: Other query parameters, will be passed to QueryParam
                - vlm_enhanced: bool, default True when vision_model_func is available.
                  If True, will parse image paths in retrieved context and replace them
                  with base64 encoded images for VLM processing.

        Returns:
            str: Query result
        """
        if self.lightrag is None:
            raise ValueError(
                "No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
            )

        # Check if VLM enhanced query should be used
        vlm_enhanced = kwargs.pop("vlm_enhanced", None)

        # Auto-determine VLM enhanced based on availability
        if vlm_enhanced is None:
            vlm_enhanced = (
                hasattr(self, "vision_model_func")
                and self.vision_model_func is not None
            )

        # Use VLM enhanced query if enabled and available
        if (
            vlm_enhanced
            and hasattr(self, "vision_model_func")
            and self.vision_model_func
        ):
            return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
        elif vlm_enhanced and (
            not hasattr(self, "vision_model_func") or not self.vision_model_func
        ):
            self.logger.warning(
                "VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
            )

        # Create query parameters
        query_param = QueryParam(mode=mode, **kwargs)

        self.logger.info(f"Executing text query: {query[:100]}...")
        self.logger.info(f"Query mode: {mode}")

        # Call LightRAG's query method
        result = await self.lightrag.aquery(query, param=query_param)

        self.logger.info("Text query completed")
        return result

    async def aquery_with_multimodal(
        self,
        query: str,
        multimodal_content: List[Dict[str, Any]] = None,
        mode: str = "mix",
        **kwargs,
    ) -> str:
        """
        Multimodal query - combines text and multimodal content for querying

        Args:
            query: Base query text
            multimodal_content: List of multimodal content, each element contains:
                - type: Content type ("image", "table", "equation", etc.)
                - Other fields depend on type (e.g., img_path, table_data, latex, etc.)
            mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
            **kwargs: Other query parameters, will be passed to QueryParam

        Returns:
            str: Query result

        Examples:
            # Pure text query
            result = await rag.aquery_with_multimodal("What is machine learning?")

            # Image query
            result = await rag.aquery_with_multimodal(
                "Analyze the content in this image",
                multimodal_content=[{
                    "type": "image",
                    "img_path": "./image.jpg"
                }]
            )

            # Table query
            result = await rag.aquery_with_multimodal(
                "Analyze the data trends in this table",
                multimodal_content=[{
                    "type": "table",
                    "table_data": "Name,Age\nAlice,25\nBob,30"
                }]
            )
        """
        # Ensure LightRAG is initialized
        await self._ensure_lightrag_initialized()

        self.logger.info(f"Executing multimodal query: {query[:100]}...")
        self.logger.info(f"Query mode: {mode}")

        # If no multimodal content, fallback to pure text query
        if not multimodal_content:
            self.logger.info("No multimodal content provided, executing text query")
            return await self.aquery(query, mode=mode, **kwargs)

        # Generate cache key for multimodal query
        cache_key = self._generate_multimodal_cache_key(
            query, multimodal_content, mode, **kwargs
        )

        # Check cache if available and enabled
        cached_result = None
        if (
            hasattr(self, "lightrag")
            and self.lightrag
            and hasattr(self.lightrag, "llm_response_cache")
            and self.lightrag.llm_response_cache
        ):
            if self.lightrag.llm_response_cache.global_config.get(
                "enable_llm_cache", True
            ):
                try:
                    cached_result = await self.lightrag.llm_response_cache.get_by_id(
                        cache_key
                    )
                    if cached_result and isinstance(cached_result, dict):
                        result_content = cached_result.get("return")
                        if result_content:
                            self.logger.info(
                                f"Multimodal query cache hit: {cache_key[:16]}..."
                            )
                            return result_content
                except Exception as e:
                    self.logger.debug(f"Error accessing multimodal query cache: {e}")

        # Process multimodal content to generate enhanced query text
        enhanced_query = await self._process_multimodal_query_content(
            query, multimodal_content
        )

        self.logger.info(
            f"Generated enhanced query length: {len(enhanced_query)} characters"
        )

        # Execute enhanced query
        result = await self.aquery(enhanced_query, mode=mode, **kwargs)

        # Save to cache if available and enabled
        if (
            hasattr(self, "lightrag")
            and self.lightrag
            and hasattr(self.lightrag, "llm_response_cache")
            and self.lightrag.llm_response_cache
        ):
            if self.lightrag.llm_response_cache.global_config.get(
                "enable_llm_cache", True
            ):
                try:
                    # Create cache entry for multimodal query
                    cache_entry = {
                        "return": result,
                        "cache_type": "multimodal_query",
                        "original_query": query,
                        "multimodal_content_count": len(multimodal_content),
                        "mode": mode,
                    }

                    await self.lightrag.llm_response_cache.upsert(
                        {cache_key: cache_entry}
                    )
                    self.logger.info(
                        f"Saved multimodal query result to cache: {cache_key[:16]}..."
                    )
                except Exception as e:
                    self.logger.debug(f"Error saving multimodal query to cache: {e}")

        # Ensure cache is persisted to disk
        if (
            hasattr(self, "lightrag")
            and self.lightrag
            and hasattr(self.lightrag, "llm_response_cache")
            and self.lightrag.llm_response_cache
        ):
            try:
                await self.lightrag.llm_response_cache.index_done_callback()
            except Exception as e:
                self.logger.debug(f"Error persisting multimodal query cache: {e}")

        self.logger.info("Multimodal query completed")
        return result

    async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
        """
        VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing

        Args:
            query: User query
            mode: Underlying LightRAG query mode
            **kwargs: Other query parameters

        Returns:
            str: VLM query result
        """
        # Ensure VLM is available
        if not hasattr(self, "vision_model_func") or not self.vision_model_func:
            raise ValueError(
                "VLM enhanced query requires vision_model_func. "
                "Please provide a vision model function when initializing RAGAnything."
            )

        # Ensure LightRAG is initialized
        await self._ensure_lightrag_initialized()

        self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")

        # Clear previous image cache
        if hasattr(self, "_current_images_base64"):
            delattr(self, "_current_images_base64")

        # 1. Get original retrieval prompt (without generating final answer)
        query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
        raw_prompt = await self.lightrag.aquery(query, param=query_param)

        self.logger.debug("Retrieved raw prompt from LightRAG")

        # 2. Extract and process image paths
        enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
            raw_prompt
        )

        if not images_found:
            self.logger.info("No valid images found, falling back to normal query")
            # Fallback to normal query
            query_param = QueryParam(mode=mode, **kwargs)
            return await self.lightrag.aquery(query, param=query_param)

        self.logger.info(f"Processed {images_found} images for VLM")

        # 3. Build VLM message format
        messages = self._build_vlm_messages_with_images(enhanced_prompt, query)

        # 4. Call VLM for question answering
        result = await self._call_vlm_with_multimodal_content(messages)

        self.logger.info("VLM enhanced query completed")
        return result

    async def _process_multimodal_query_content(
        self, base_query: str, multimodal_content: List[Dict[str, Any]]
    ) -> str:
        """
        Process multimodal query content to generate enhanced query text

        Args:
            base_query: Base query text
            multimodal_content: List of multimodal content

        Returns:
            str: Enhanced query text
        """
        self.logger.info("Starting multimodal query content processing...")

        enhanced_parts = [f"User query: {base_query}"]

        for i, content in enumerate(multimodal_content):
            content_type = content.get("type", "unknown")
            self.logger.info(
                f"Processing {i+1}/{len(multimodal_content)} multimodal content: {content_type}"
            )

            try:
                # Get appropriate processor
                processor = get_processor_for_type(self.modal_processors, content_type)

                if processor:
                    # Generate content description
                    description = await self._generate_query_content_description(
                        processor, content, content_type
                    )
                    enhanced_parts.append(
                        f"\nRelated {content_type} content: {description}"
                    )
                else:
                    # If no appropriate processor, use basic description
                    basic_desc = str(content)[:200]
                    enhanced_parts.append(
                        f"\nRelated {content_type} content: {basic_desc}"
                    )

            except Exception as e:
                self.logger.error(f"Error processing multimodal content: {str(e)}")
                # Continue processing other content
                continue

        enhanced_query = "\n".join(enhanced_parts)
        enhanced_query += PROMPTS["QUERY_ENHANCEMENT_SUFFIX"]

        self.logger.info("Multimodal query content processing completed")
        return enhanced_query

    async def _generate_query_content_description(
        self, processor, content: Dict[str, Any], content_type: str
    ) -> str:
        """
        Generate content description for query

        Args:
            processor: Multimodal processor
            content: Content data
            content_type: Content type

        Returns:
            str: Content description
        """
        try:
            if content_type == "image":
                return await self._describe_image_for_query(processor, content)
            elif content_type == "table":
                return await self._describe_table_for_query(processor, content)
            elif content_type == "equation":
                return await self._describe_equation_for_query(processor, content)
            else:
                return await self._describe_generic_for_query(
                    processor, content, content_type
                )

        except Exception as e:
            self.logger.error(f"Error generating {content_type} description: {str(e)}")
            return f"{content_type} content: {str(content)[:100]}"

    async def _describe_image_for_query(
        self, processor, content: Dict[str, Any]
    ) -> str:
        """Generate image description for query"""
        image_path = content.get("img_path")
        captions = content.get("image_caption", content.get("img_caption", []))
        footnotes = content.get("image_footnote", content.get("img_footnote", []))

        if image_path and Path(image_path).exists():
            # If image exists, use vision model to generate description
            image_base64 = processor._encode_image_to_base64(image_path)
            if image_base64:
                prompt = PROMPTS["QUERY_IMAGE_DESCRIPTION"]
                description = await processor.modal_caption_func(
                    prompt,
                    image_data=image_base64,
                    system_prompt=PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"],
                )
                return description

        # If image doesn't exist or processing failed, use existing information
        parts = []
        if image_path:
            parts.append(f"Image path: {image_path}")
        if captions:
            parts.append(f"Image captions: {', '.join(captions)}")
        if footnotes:
            parts.append(f"Image footnotes: {', '.join(footnotes)}")

        return "; ".join(parts) if parts else "Image content information incomplete"

    async def _describe_table_for_query(
        self, processor, content: Dict[str, Any]
    ) -> str:
        """Generate table description for query"""
        table_data = content.get("table_data", "")
        table_caption = content.get("table_caption", "")

        prompt = PROMPTS["QUERY_TABLE_ANALYSIS"].format(
            table_data=table_data, table_caption=table_caption
        )

        description = await processor.modal_caption_func(
            prompt, system_prompt=PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"]
        )

        return description

    async def _describe_equation_for_query(
        self, processor, content: Dict[str, Any]
    ) -> str:
        """Generate equation description for query"""
        latex = content.get("latex", "")
        equation_caption = content.get("equation_caption", "")

        prompt = PROMPTS["QUERY_EQUATION_ANALYSIS"].format(
            latex=latex, equation_caption=equation_caption
        )

        description = await processor.modal_caption_func(
            prompt, system_prompt=PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"]
        )

        return description

    async def _describe_generic_for_query(
        self, processor, content: Dict[str, Any], content_type: str
    ) -> str:
        """Generate generic content description for query"""
        content_str = str(content)

        prompt = PROMPTS["QUERY_GENERIC_ANALYSIS"].format(
            content_type=content_type, content_str=content_str
        )

        description = await processor.modal_caption_func(
            prompt,
            system_prompt=PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"].format(
                content_type=content_type
            ),
        )

        return description

    async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
        """
        Process image paths in prompt, keeping original paths and adding VLM markers

        Args:
            prompt: Original prompt

        Returns:
            tuple: (processed prompt, image count)
        """
        enhanced_prompt = prompt
        images_processed = 0

        # Initialize image cache
        self._current_images_base64 = []

        # Enhanced regex pattern for matching image paths
        # Matches only the path ending with image file extensions
        image_path_pattern = (
            r"Image Path:\s*([^\r\n]*?\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))"
        )

        # First, let's see what matches we find
        matches = re.findall(image_path_pattern, prompt)
        self.logger.info(f"Found {len(matches)} image path matches in prompt")

        def replace_image_path(match):
            nonlocal images_processed

            image_path = match.group(1).strip()
            self.logger.debug(f"Processing image path: '{image_path}'")

            # Validate path format (basic check)
            if not image_path or len(image_path) < 3:
                self.logger.warning(f"Invalid image path format: {image_path}")
                return match.group(0)  # Keep original

            # Use utility function to validate image file
            self.logger.debug(f"Calling validate_image_file for: {image_path}")
            is_valid = validate_image_file(image_path)
            self.logger.debug(f"Validation result for {image_path}: {is_valid}")

            if not is_valid:
                self.logger.warning(f"Image validation failed for: {image_path}")
                return match.group(0)  # Keep original if validation fails

            try:
                # Encode image to base64 using utility function
                self.logger.debug(f"Attempting to encode image: {image_path}")
                image_base64 = encode_image_to_base64(image_path)
                if image_base64:
                    images_processed += 1
                    # Save base64 to instance variable for later use
                    self._current_images_base64.append(image_base64)

                    # Keep original path info and add VLM marker
                    result = f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
                    self.logger.debug(
                        f"Successfully processed image {images_processed}: {image_path}"
                    )
                    return result
                else:
                    self.logger.error(f"Failed to encode image: {image_path}")
                    return match.group(0)  # Keep original if encoding failed

            except Exception as e:
                self.logger.error(f"Failed to process image {image_path}: {e}")
                return match.group(0)  # Keep original

        # Execute replacement
        enhanced_prompt = re.sub(
            image_path_pattern, replace_image_path, enhanced_prompt
        )

        return enhanced_prompt, images_processed

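    # Illustration (hypothetical prompt line): the pattern above captures just the
    # path portion of an "Image Path:" line, matching lazily up to the extension:
    #
    #   re.findall(
    #       r"Image Path:\s*([^\r\n]*?\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))",
    #       "Image Path: output/images/figure_3.png",
    #   )  # -> ['output/images/figure_3.png']
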
    def _build_vlm_messages_with_images(
        self, enhanced_prompt: str, user_query: str
    ) -> List[Dict]:
        """
        Build VLM message format, using markers to correspond images with text positions

        Args:
            enhanced_prompt: Enhanced prompt with image markers
            user_query: User query

        Returns:
            List[Dict]: VLM message format
        """
        images_base64 = getattr(self, "_current_images_base64", [])

        if not images_base64:
            # Pure text mode
            return [
                {
                    "role": "user",
                    "content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
                }
            ]

        # Build multimodal content
        content_parts = []

        # Split text at image markers and insert images
        text_parts = enhanced_prompt.split("[VLM_IMAGE_")

        for i, text_part in enumerate(text_parts):
            if i == 0:
                # First text part
                if text_part.strip():
                    content_parts.append({"type": "text", "text": text_part})
            else:
                # Find marker number and insert corresponding image
                marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
                if marker_match:
                    image_num = (
                        int(marker_match.group(1)) - 1
                    )  # Convert to 0-based index
                    remaining_text = marker_match.group(2)

                    # Insert corresponding image
                    if 0 <= image_num < len(images_base64):
                        content_parts.append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{images_base64[image_num]}"
                                },
                            }
                        )

                    # Insert remaining text
                    if remaining_text.strip():
                        content_parts.append({"type": "text", "text": remaining_text})

        # Add user question
        content_parts.append(
            {
                "type": "text",
                "text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
            }
        )

        return [
            {
                "role": "system",
                "content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
            },
            {"role": "user", "content": content_parts},
        ]

    async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
        """
        Call VLM to process multimodal content

        Args:
            messages: VLM message format

        Returns:
            str: VLM response result
        """
        try:
            user_message = messages[1]
            content = user_message["content"]
            system_prompt = messages[0]["content"]

            if isinstance(content, str):
                # Pure text mode
                result = await self.vision_model_func(
                    content, system_prompt=system_prompt
                )
            else:
                # Multimodal mode - pass complete messages directly to VLM
                result = await self.vision_model_func(
                    "",  # Empty prompt since we're using messages format
                    messages=messages,
                )

            return result

        except Exception as e:
            self.logger.error(f"VLM call failed: {e}")
            raise

    # Synchronous versions of query methods
    def query(self, query: str, mode: str = "mix", **kwargs) -> str:
        """
        Synchronous version of pure text query

        Args:
            query: Query text
            mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
            **kwargs: Other query parameters, will be passed to QueryParam
                - vlm_enhanced: bool, default True when vision_model_func is available.
                  If True, will parse image paths in retrieved context and replace them
                  with base64 encoded images for VLM processing.

        Returns:
            str: Query result
        """
        loop = always_get_an_event_loop()
        return loop.run_until_complete(self.aquery(query, mode=mode, **kwargs))

    def query_with_multimodal(
        self,
        query: str,
        multimodal_content: List[Dict[str, Any]] = None,
        mode: str = "mix",
        **kwargs,
    ) -> str:
        """
        Synchronous version of multimodal query

        Args:
            query: Base query text
            multimodal_content: List of multimodal content, each element contains:
                - type: Content type ("image", "table", "equation", etc.)
                - Other fields depend on type (e.g., img_path, table_data, latex, etc.)
            mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
            **kwargs: Other query parameters, will be passed to QueryParam

        Returns:
            str: Query result
        """
        loop = always_get_an_event_loop()
        return loop.run_until_complete(
            self.aquery_with_multimodal(query, multimodal_content, mode=mode, **kwargs)
        )
File diff suppressed because it is too large

raganything/utils.py (274 lines, new file)
@@ -0,0 +1,274 @@
"""
Utility functions for RAGAnything

Contains helper functions for content separation, text insertion, and other utilities
"""

import base64
from typing import Dict, List, Any, Tuple
from pathlib import Path
from lightrag.utils import logger


def separate_content(
    content_list: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Separate text content and multimodal content

    Args:
        content_list: Content list from MinerU parsing

    Returns:
        (text_content, multimodal_items): Pure text content and multimodal items list
    """
    text_parts = []
    multimodal_items = []

    for item in content_list:
        content_type = item.get("type", "text")

        if content_type == "text":
            # Text content
            text = item.get("text", "")
            if text.strip():
                text_parts.append(text)
        else:
            # Multimodal content (image, table, equation, etc.)
            multimodal_items.append(item)

    # Merge all text content
    text_content = "\n\n".join(text_parts)

    logger.info("Content separation complete:")
    logger.info(f"  - Text content length: {len(text_content)} characters")
    logger.info(f"  - Multimodal items count: {len(multimodal_items)}")

    # Count multimodal types
    modal_types = {}
    for item in multimodal_items:
        modal_type = item.get("type", "unknown")
        modal_types[modal_type] = modal_types.get(modal_type, 0) + 1

    if modal_types:
        logger.info(f"  - Multimodal type distribution: {modal_types}")

    return text_content, multimodal_items


def encode_image_to_base64(image_path: str) -> str:
    """
    Encode image file to base64 string

    Args:
        image_path: Path to the image file

    Returns:
        str: Base64 encoded string, empty string if encoding fails
    """
    try:
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
            return encoded_string
    except Exception as e:
        logger.error(f"Failed to encode image {image_path}: {e}")
        return ""


def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:
    """
    Validate if a file is a valid image file

    Args:
        image_path: Path to the image file
        max_size_mb: Maximum file size in MB

    Returns:
        bool: True if valid, False otherwise
    """
    try:
        path = Path(image_path)

        logger.debug(f"Validating image path: {image_path}")
        logger.debug(f"Resolved path object: {path}")
        logger.debug(f"Path exists check: {path.exists()}")

        # Check if file exists
        if not path.exists():
            logger.warning(f"Image file not found: {image_path}")
            return False

        # Check file extension
        image_extensions = [
            ".jpg",
            ".jpeg",
            ".png",
            ".gif",
            ".bmp",
            ".webp",
            ".tiff",
            ".tif",
        ]

        path_lower = str(path).lower()
        has_valid_extension = any(path_lower.endswith(ext) for ext in image_extensions)
        logger.debug(
            f"File extension check - path: {path_lower}, valid: {has_valid_extension}"
        )

        if not has_valid_extension:
            logger.warning(f"File does not appear to be an image: {image_path}")
            return False

        # Check file size
        file_size = path.stat().st_size
        max_size = max_size_mb * 1024 * 1024
        logger.debug(
            f"File size check - size: {file_size} bytes, max: {max_size} bytes"
        )

        if file_size > max_size:
            logger.warning(f"Image file too large ({file_size} bytes): {image_path}")
            return False

        logger.debug(f"Image validation successful: {image_path}")
        return True

    except Exception as e:
        logger.error(f"Error validating image file {image_path}: {e}")
        return False

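# Illustration (hypothetical path): the two helpers above are designed to be used
# as a pair - validate first, then encode:
#
#   if validate_image_file("output/images/figure_3.png", max_size_mb=50):
#       b64 = encode_image_to_base64("output/images/figure_3.png")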

async def insert_text_content(
    lightrag,
    input: str | list[str],
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    ids: str | list[str] | None = None,
    file_paths: str | list[str] | None = None,
):
    """
    Insert pure text content into LightRAG

    Args:
        lightrag: LightRAG instance
        input: Single document string or list of document strings
        split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
            chunk_token_size, it will be split again by token size.
        split_by_character_only: if split_by_character_only is True, split the string by character only, when
            split_by_character is None, this parameter is ignored.
        ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
        file_paths: single string of the file path or list of file paths, used for citation
    """
    logger.info("Starting text content insertion into LightRAG...")

    # Use LightRAG's insert method with all parameters
    await lightrag.ainsert(
        input=input,
        file_paths=file_paths,
        split_by_character=split_by_character,
        split_by_character_only=split_by_character_only,
        ids=ids,
    )

    logger.info("Text content insertion complete")


async def insert_text_content_with_multimodal_content(
    lightrag,
    input: str | list[str],
    multimodal_content: list[dict[str, Any]] | None = None,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    ids: str | list[str] | None = None,
    file_paths: str | list[str] | None = None,
    scheme_name: str | None = None,
):
    """
    Insert text content together with its multimodal content into LightRAG

    Args:
        lightrag: LightRAG instance
        input: Single document string or list of document strings
        multimodal_content: Multimodal content list (optional)
        split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
            chunk_token_size, it will be split again by token size.
        split_by_character_only: if split_by_character_only is True, split the string by character only, when
            split_by_character is None, this parameter is ignored.
        ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
        file_paths: single string of the file path or list of file paths, used for citation
        scheme_name: scheme name (optional)
    """
    logger.info("Starting text content insertion into LightRAG...")

    # Use LightRAG's insert method with all parameters
    try:
        await lightrag.ainsert(
            input=input,
            multimodal_content=multimodal_content,
            file_paths=file_paths,
            split_by_character=split_by_character,
            split_by_character_only=split_by_character_only,
            ids=ids,
            scheme_name=scheme_name,
        )
    except Exception as e:
        logger.error(f"Error: {e}")
        logger.error(
            "If the error is caused by the ainsert function not having a multimodal content parameter, please update the raganything branch of lightrag"
        )

    logger.info("Text content insertion complete")


def get_processor_for_type(modal_processors: Dict[str, Any], content_type: str):
    """
    Get appropriate processor based on content type

    Args:
        modal_processors: Dictionary of available processors
        content_type: Content type

    Returns:
        Corresponding processor instance
    """
    # Direct mapping to corresponding processor
    if content_type == "image":
        return modal_processors.get("image")
    elif content_type == "table":
        return modal_processors.get("table")
    elif content_type == "equation":
        return modal_processors.get("equation")
    else:
        # For other types, use generic processor
        return modal_processors.get("generic")


def get_processor_supports(proc_type: str) -> List[str]:
    """Get processor supported features"""
    supports_map = {
        "image": [
            "Image content analysis",
            "Visual understanding",
            "Image description generation",
            "Image entity extraction",
        ],
        "table": [
            "Table structure analysis",
            "Data statistics",
            "Trend identification",
            "Table entity extraction",
        ],
        "equation": [
            "Mathematical formula parsing",
            "Variable identification",
            "Formula meaning explanation",
            "Formula entity extraction",
        ],
        "generic": [
            "General content analysis",
            "Structured processing",
            "Entity extraction",
        ],
    }
    return supports_map.get(proc_type, ["Basic processing"])
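
A short usage sketch of the dispatch helper above (the modal_processors dict is the one RAGAnything exposes, as seen in query.py; the explicit fallback shown here is an assumption for cases where a type has no registered processor):

processor = get_processor_for_type(rag.modal_processors, "table")
if processor is None:
    # "table" had no registered processor; fall back to the generic one
    processor = rag.modal_processors.get("generic")
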
requirements.txt
@@ -1,10 +1,10 @@
huggingface_hub
# LightRAG packages
lightrag-hku

# MinerU 2.0 packages (replaces magic-pdf)
mineru[core]

# Progress bars for batch processing
tqdm
# Note: Optional dependencies are now defined in setup.py extras_require:
# - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion)
# - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion)
scripts/create_tiktoken_cache.py (17 lines, new file)
@@ -0,0 +1,17 @@
import tiktoken
import os

# Define the directory where you want to store the cache
cache_dir = "./tiktoken_cache"
if "TIKTOKEN_CACHE_DIR" not in os.environ:
    os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir

# Create the directory if it doesn't exist
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

print("Downloading and caching tiktoken models...")
tiktoken.get_encoding("cl100k_base")
# tiktoken.get_encoding("p50k_base")

print(f"tiktoken models have been cached in '{cache_dir}'")
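
Once the script has been run, downstream code can stay offline by pointing tiktoken at the same directory before first use (the env var name and encoding name come from the script above):

import os
os.environ["TIKTOKEN_CACHE_DIR"] = "./tiktoken_cache"  # set before the first get_encoding call
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")  # now served from the local cache
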
setup.py (5 lines changed)
@@ -64,6 +64,11 @@ extras_require = {
    "text": ["reportlab>=4.0.0"],  # For text file to PDF conversion (TXT, MD)
    "office": [],  # Office document processing requires LibreOffice (external program)
    "all": ["Pillow>=10.0.0", "reportlab>=4.0.0"],  # All optional features
    "markdown": [
        "markdown>=3.4.0",
        "weasyprint>=60.0",
        "pygments>=2.10.0",
    ],  # Enhanced markdown conversion
}

setuptools.setup(
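
With this extra in place, the enhanced markdown backends can be installed together with the package, e.g. pip install "raganything[markdown]" (assuming the distribution is published under the package's own name); markdown, weasyprint, and pygments then satisfy the converter's optional imports.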