7 changes: 2 additions & 5 deletions .gitignore
@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/

# Docker compose file (local settings)
# docker-compose.yml

WeKnora
/models/
**/__pycache__
test/data/mswag.txt
data/files/

.python-version
.venv/
**/__pycache__
.python-version

### macOS
# General
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -150,6 +150,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s
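The new `MINERU_ENDPOINT` variable is passed through docker-compose with an empty default. A minimal sketch of how a Python service could pick it up from the environment; the variable name comes from this diff, while the consuming code below is purely illustrative and not taken from the repository:

```python
import os

# Illustrative only: read the endpoint wired through docker-compose.yml.
# An empty string means no MinerU endpoint was configured.
mineru_endpoint = os.getenv("MINERU_ENDPOINT", "")
if mineru_endpoint:
    print(f"MinerU parsing backend available at {mineru_endpoint}")
else:
    print("MINERU_ENDPOINT not set; MinerU-based parsing stays disabled")
```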
10 changes: 6 additions & 4 deletions docker/Dockerfile.docreader
@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev

# Copy source code and generation scripts
COPY docreader .
COPY docreader docreader

# Generate protobuf code
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh

# Ensure the model directory exists
RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps

COPY --from=builder /app/ ./
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader

# Expose gRPC port
EXPOSE 50051

# Run the Python service directly (logs go to stdout/stderr)
CMD ["uv", "run", "main.py"]
CMD ["uv", "run", "-m", "docreader.main"]
5 changes: 5 additions & 0 deletions docreader/.pylintrc
@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr

[MESSAGES CONTROL]
; disable=W1203
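For context, these are the two logging styles the configuration distinguishes. Whether pylint accepts `fstr` for `logging-format-style` depends on the pylint version; the snippet below only illustrates the W1203 rule and is not code from this PR:

```python
import logging

logger = logging.getLogger(__name__)
file_name = "example.pdf"

# Lazy %-style formatting: the form pylint's W1203 check normally asks for.
logger.info("Parsing %s", file_name)

# f-string interpolation, the style used throughout docreader/main.py;
# W1203 would flag this unless the configuration above permits it.
logger.info(f"Parsing {file_name}")
```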
111 changes: 32 additions & 79 deletions docreader/main.py
@@ -1,37 +1,25 @@
import logging
import os
import re
import sys
import logging
from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from concurrent import futures
from typing import Optional

import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer

# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)

from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser
from docreader.parser.ocr_engine import OCREngine
from docreader.proto import docreader_pb2_grpc
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context

# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional

try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore

# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
# cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")


@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")


def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.

This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")


# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ def ReadFromFile(self, request, context):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")

@@ -124,8 +89,8 @@ def ReadFromFile(self, request, context):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request
@@ -144,7 +109,8 @@ def ReadFromFile(self, request, context):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {
@@ -170,7 +136,7 @@ def ReadFromFile(self, request, context):
)

# Parse file
logger.info(f"Starting file parsing process")
logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
@@ -184,7 +150,7 @@ def ReadFromFile(self, request, context):

# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
)

# Build response, including image info
@@ -224,8 +190,8 @@ def ReadFromURL(self, request, context):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request
@@ -243,7 +209,8 @@ def ReadFromURL(self, request, context):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {
@@ -269,7 +236,7 @@ def ReadFromURL(self, request, context):
)

# Parse URL
logger.info(f"Starting URL parsing process")
logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
@@ -282,7 +249,7 @@ def ReadFromURL(self, request, context):

# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)

response = ReadResponse(
@@ -335,29 +302,15 @@ def _convert_chunk_to_proto(self, chunk):
return proto_chunk


def init_ocr_engine(ocr_backend, ocr_config):
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {backend_type}")
OCREngine.get_instance(backend_type=backend_type, **kwargs)


def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
init_ocr_engine()

# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
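With the refactored signature, `init_ocr_engine` resolves the backend from `OCR_BACKEND` when no argument is given and forwards keyword arguments to `OCREngine.get_instance`. A small usage sketch, assuming the `docreader` package is importable; the `use_gpu` keyword below is hypothetical and only stands in for whatever options a backend might accept:

```python
from docreader.main import init_ocr_engine

# Default: backend taken from the OCR_BACKEND env var, falling back to "paddle".
init_ocr_engine()

# Explicit backend plus extra options forwarded to OCREngine.get_instance;
# "use_gpu" is a placeholder keyword, not one defined in this diff.
init_ocr_engine("paddle", use_gpu=False)
```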
Empty file added docreader/models/__init__.py
87 changes: 87 additions & 0 deletions docreader/models/document.py
@@ -0,0 +1,87 @@
"""Chunk document schema."""

import json
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""

content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)

metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""

data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data

def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)

def __hash__(self):
"""Hash function."""
return hash((self.content,))

def __eq__(self, other):
"""Equal function."""
return self.content == other.content

@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)

data.pop("class_name", None)
return cls(**data)

@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)


class Document(BaseModel):
"""Document including document content, document metadata."""

model_config = {"arbitrary_types_allowed": True}

content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)

chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content

def get_content(self) -> str:
"""Get document content."""
return self.content

def is_valid(self) -> bool:
return self.content != ""
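A short usage sketch of the new models, assuming the `docreader` package and pydantic are installed; it only exercises methods defined in the file above:

```python
from docreader.models.document import Chunk, Document

# Round-trip a chunk through JSON via to_json/from_json.
chunk = Chunk(content="Hello world", seq=0, start=0, end=11)
restored = Chunk.from_json(chunk.to_json())
assert restored == chunk  # equality compares content only

# A document is valid once it carries non-empty content.
doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()
```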
27 changes: 27 additions & 0 deletions docreader/models/read_config.py
@@ -0,0 +1,27 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""

# Maximum size of each chunk in tokens/chars
chunk_size: int = 512

# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50

# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])

# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False

# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)

# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
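A usage sketch for the dataclass, mirroring how `ReadFromFile` in `docreader/main.py` assembles it from the gRPC request; the literal values below are illustrative:

```python
from docreader.models.read_config import ChunkingConfig

# Defaults from the dataclass: 512-character chunks with a 50-character overlap.
config = ChunkingConfig()

# Request-supplied values override the defaults; the storage_config keys mirror
# the fields built in ReadFromFile (provider, bucket_name, ...).
config = ChunkingConfig(
    chunk_size=1024,
    chunk_overlap=100,
    enable_multimodal=True,
    storage_config={"provider": "minio", "bucket_name": "docs"},
)
```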