7 changes: 2 additions & 5 deletions .gitignore
@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/

# Docker compose file (local settings)
# docker-compose.yml

WeKnora
/models/
**/__pycache__
test/data/mswag.txt
data/files/

.python-version
.venv/
**/__pycache__
.python-version

### macOS
# General
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -150,6 +150,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s
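The new `MINERU_ENDPOINT` variable is passed through docker-compose with an empty default. A minimal sketch of how a Python service could pick it up from the environment; the variable name comes from this diff, while the consuming code below is purely illustrative and not taken from the repository:

```python
import os

# Illustrative only: read the endpoint wired through docker-compose.yml.
# An empty string means no MinerU endpoint was configured.
mineru_endpoint = os.getenv("MINERU_ENDPOINT", "")
if mineru_endpoint:
    print(f"MinerU parsing backend available at {mineru_endpoint}")
else:
    print("MINERU_ENDPOINT not set; MinerU-based parsing stays disabled")
```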
10 changes: 6 additions & 4 deletions docker/Dockerfile.docreader
@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev

# Copy source code and generation scripts
COPY docreader .
COPY docreader docreader

# Generate protobuf code
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh

# Ensure the model directory exists
RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps

COPY --from=builder /app/ ./
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader

# Expose gRPC port
EXPOSE 50051

# Run the Python service directly (logs go to stdout/stderr)
CMD ["uv", "run", "main.py"]
CMD ["uv", "run", "-m", "docreader.main"]
5 changes: 5 additions & 0 deletions docreader/.pylintrc
@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr

[MESSAGES CONTROL]
; disable=W1203
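For context, these are the two logging styles the configuration distinguishes. Whether pylint accepts `fstr` for `logging-format-style` depends on the pylint version; the snippet below only illustrates the W1203 rule and is not code from this PR:

```python
import logging

logger = logging.getLogger(__name__)
file_name = "example.pdf"

# Lazy %-style formatting: the form pylint's W1203 check normally asks for.
logger.info("Parsing %s", file_name)

# f-string interpolation, the style used throughout docreader/main.py;
# W1203 would flag this unless the configuration above permits it.
logger.info(f"Parsing {file_name}")
```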
111 changes: 32 additions & 79 deletions docreader/main.py
@@ -1,37 +1,25 @@
import logging
import os
import re
import sys
import logging
from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from concurrent import futures
from typing import Optional

import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer

# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)

from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser
from docreader.parser.ocr_engine import OCREngine
from docreader.proto import docreader_pb2_grpc
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context

# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional

try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore

# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
# cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")


@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")


def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.

This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")


# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ def ReadFromFile(self, request, context):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")

@@ -124,8 +89,8 @@ def ReadFromFile(self, request, context):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request
@@ -144,7 +109,8 @@ def ReadFromFile(self, request, context):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {
@@ -170,7 +136,7 @@ def ReadFromFile(self, request, context):
)

# Parse file
logger.info(f"Starting file parsing process")
logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
@@ -184,7 +150,7 @@ def ReadFromFile(self, request, context):

# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
)

# Build response, including image info
@@ -224,8 +190,8 @@ def ReadFromURL(self, request, context):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request
@@ -243,7 +209,8 @@ def ReadFromURL(self, request, context):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {
@@ -269,7 +236,7 @@ def ReadFromURL(self, request, context):
)

# Parse URL
logger.info(f"Starting URL parsing process")
logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
@@ -282,7 +249,7 @@ def ReadFromURL(self, request, context):

# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)

response = ReadResponse(
@@ -335,29 +302,15 @@ def _convert_chunk_to_proto(self, chunk):
return proto_chunk


def init_ocr_engine(ocr_backend, ocr_config):
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {backend_type}")
OCREngine.get_instance(backend_type=backend_type, **kwargs)


def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
init_ocr_engine()

# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
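With the refactored signature, `init_ocr_engine` resolves the backend from `OCR_BACKEND` when no argument is given and forwards keyword arguments to `OCREngine.get_instance`. A small usage sketch, assuming the `docreader` package is importable; the `use_gpu` keyword below is hypothetical and only stands in for whatever options a backend might accept:

```python
from docreader.main import init_ocr_engine

# Default: backend taken from the OCR_BACKEND env var, falling back to "paddle".
init_ocr_engine()

# Explicit backend plus extra options forwarded to OCREngine.get_instance;
# "use_gpu" is a placeholder keyword, not one defined in this diff.
init_ocr_engine("paddle", use_gpu=False)
```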
Empty file added docreader/models/__init__.py
87 changes: 87 additions & 0 deletions docreader/models/document.py
@@ -0,0 +1,87 @@
"""Chunk document schema."""

import json
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""

content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)

metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""

data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data

def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)

def __hash__(self):
"""Hash function."""
return hash((self.content,))

def __eq__(self, other):
"""Equal function."""
return self.content == other.content

@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)

data.pop("class_name", None)
return cls(**data)

@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)


class Document(BaseModel):
"""Document including document content, document metadata."""

model_config = {"arbitrary_types_allowed": True}

content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)

chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content

def get_content(self) -> str:
"""Get document content."""
return self.content

def is_valid(self) -> bool:
return self.content != ""
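A short usage sketch of the new models, assuming the `docreader` package and pydantic are installed; it only exercises methods defined in the file above:

```python
from docreader.models.document import Chunk, Document

# Round-trip a chunk through JSON via to_json/from_json.
chunk = Chunk(content="Hello world", seq=0, start=0, end=11)
restored = Chunk.from_json(chunk.to_json())
assert restored == chunk  # equality compares content only

# A document is valid once it carries non-empty content.
doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()
```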
27 changes: 27 additions & 0 deletions docreader/models/read_config.py
@@ -0,0 +1,27 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""

# Maximum size of each chunk in tokens/chars
chunk_size: int = 512

# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50

# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])

# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False

# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)

# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
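A usage sketch for the dataclass, mirroring how `ReadFromFile` in `docreader/main.py` assembles it from the gRPC request; the literal values below are illustrative:

```python
from docreader.models.read_config import ChunkingConfig

# Defaults from the dataclass: 512-character chunks with a 50-character overlap.
config = ChunkingConfig()

# Request-supplied values override the defaults; the storage_config keys mirror
# the fields built in ReadFromFile (provider, bucket_name, ...).
config = ChunkingConfig(
    chunk_size=1024,
    chunk_overlap=100,
    enable_multimodal=True,
    storage_config={"provider": "minio", "bucket_name": "docs"},
)
```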