summary

2025-09-01 21:57:31 +08:00
parent 2addb7b799
commit a87166814c
4 changed files with 47 additions and 43 deletions
--- a/raganything/base.py
+++ b/raganything/base.py
@@ -9,4 +9,4 @@ class DocStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    PROCESSED = "processed"
-    FAILED = "failed"
+    FAILED = "failed"
--- a/raganything/processor.py
+++ b/raganything/processor.py
@@ -10,7 +10,6 @@ import hashlib
 import json
 from typing import Dict, List, Any, Tuple, Optional
 from pathlib import Path
-from zipfile import error

 from raganything.base import DocStatus
 from raganything.parser import MineruParser, DoclingParser
@@ -352,7 +351,7 @@ class ProcessorMixin:
                        doc_parser.parse_image,
                        image_path=file_path,
                        output_dir=output_dir,
-                        **kwargs
+                        **kwargs,
                    )
                else:
                    # Fallback to MinerU for image parsing if current parser doesn't support it
@@ -380,7 +379,7 @@ class ProcessorMixin:
                    doc_parser.parse_office_doc,
                    doc_path=file_path,
                    output_dir=output_dir,
-                    **kwargs
+                    **kwargs,
                )
            else:
                # For other or unknown formats, use generic parser
@@ -440,7 +439,12 @@ class ProcessorMixin:
        return content_list, doc_id

    async def _process_multimodal_content(
-        self, multimodal_items: List[Dict[str, Any]], file_path: str, doc_id: str, pipeline_status: Optional[Any] = None, pipeline_status_lock: Optional[Any] = None
+        self,
+        multimodal_items: List[Dict[str, Any]],
+        file_path: str,
+        doc_id: str,
+        pipeline_status: Optional[Any] = None,
+        pipeline_status_lock: Optional[Any] = None,
    ):
        """
        Process multimodal content (using specialized processors)
@@ -496,9 +500,7 @@ class ProcessorMixin:
        if pipeline_status_lock and pipeline_status:
            async with pipeline_status_lock:
                pipeline_status["latest_message"] = log_message
-                pipeline_status["history_messages"].append(
-                    log_message
-                )
+                pipeline_status["history_messages"].append(log_message)

        try:
            # Ensure LightRAG is initialized
@@ -1431,39 +1433,40 @@ class ProcessorMixin:
            doc_pre_id = f"doc-pre-{file_name}"
            current_doc_status = await self.lightrag.doc_status.get_by_id(doc_pre_id)
            if not current_doc_status:
-                await self.lightrag.doc_status.upsert({
-                    doc_pre_id: {
-                        'status': DocStatus.READY,
-                        'content': '',
-                        'content_summary': '',
-                        'multimodal_content': [],
-                        'scheme_name': scheme_name,
-                        'content_length': 0,
-                        'created_at': '',
-                        'updated_at': '',
-                        'file_path': file_path
+                await self.lightrag.doc_status.upsert(
+                    {
+                        doc_pre_id: {
+                            "status": DocStatus.READY,
+                            "content": "",
+                            "content_summary": "",
+                            "multimodal_content": [],
+                            "scheme_name": scheme_name,
+                            "content_length": 0,
+                            "created_at": "",
+                            "updated_at": "",
+                            "file_path": file_path,
+                        }
                    }
-                })
-                current_doc_status = await self.lightrag.doc_status.get_by_id(doc_pre_id)
+                )
+                current_doc_status = await self.lightrag.doc_status.get_by_id(
+                    doc_pre_id
+                )

-            from lightrag.kg.shared_storage import get_namespace_data, get_pipeline_status_lock
-            from datetime import datetime
+            from lightrag.kg.shared_storage import (
+                get_namespace_data,
+                get_pipeline_status_lock,
+            )

            pipeline_status = await get_namespace_data("pipeline_status")
            pipeline_status_lock = get_pipeline_status_lock()

            async with pipeline_status_lock:
-                pipeline_status.update({
-                    "scan_disabled": True
-                })
-                pipeline_status["history_messages"].append(f"Now is not allowed to scan")
+                pipeline_status.update({"scan_disabled": True})
+                pipeline_status["history_messages"].append("Now is not allowed to scan")

-            await self.lightrag.doc_status.upsert({
-                doc_pre_id: {
-                    **current_doc_status,
-                    "status": DocStatus.HANDLING
-                }
-            })
+            await self.lightrag.doc_status.upsert(
+                {doc_pre_id: {**current_doc_status, "status": DocStatus.HANDLING}}
+            )

            # Step 1: Parse document
            content_list, content_based_doc_id = await self.parse_document(
@@ -1512,12 +1515,14 @@ class ProcessorMixin:
            #
            # self.logger.info(f"Document {file_path} processing complete!")
            async with pipeline_status_lock:
-                pipeline_status.update({
-                    "scan_disabled": False
-                })
-                pipeline_status["latest_message"] = f"RAGAnything processing completed for {file_name}"
-                pipeline_status["history_messages"].append(f"RAGAnything processing completed for {file_name}")
-                pipeline_status["history_messages"].append(f"Now is allowed to scan")
+                pipeline_status.update({"scan_disabled": False})
+                pipeline_status["latest_message"] = (
+                    f"RAGAnything processing completed for {file_name}"
+                )
+                pipeline_status["history_messages"].append(
+                    f"RAGAnything processing completed for {file_name}"
+                )
+                pipeline_status["history_messages"].append("Now is allowed to scan")

            return True

@@ -1531,7 +1536,6 @@ class ProcessorMixin:

            return False

-
    async def insert_content_list(
        self,
        content_list: List[Dict[str, Any]],
--- a/raganything/raganything.py
+++ b/raganything/raganything.py
@@ -385,7 +385,6 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
            self.logger.info(f"Parser '{self.config.parser}' installation verified")
        return True

-
    def get_config_info(self) -> Dict[str, Any]:
        """Get current configuration information"""
        config_info = {
--- a/raganything/utils.py
+++ b/raganything/utils.py
@@ -179,8 +179,9 @@ async def insert_text_content(
        )
    except Exception as e:
        logger.info(f"Error: {e}")
-        logger.info("If the error is caused by the ainsert function not having a multimodal content parameter, please update the raganything branch of lightrag")
-    
+        logger.info(
+            "If the error is caused by the ainsert function not having a multimodal content parameter, please update the raganything branch of lightrag"
+        )

    logger.info("Text content insertion complete")