
Commit 9f16ab4

Merge pull request #141 from wongzhenhao/main
Update VQAExtract
2 parents bf9879e + ffaf21b commit 9f16ab4

2 files changed: +86 −68 lines

docs/en/notes/guide/pipelines/PDFVQAExtractPipeline.md

Lines changed: 43 additions & 34 deletions
@@ -25,10 +25,9 @@ Major stages:
 ### Step 1: Install Dataflow (and MinerU)
 ```shell
 pip install open-dataflow
-pip install mineru[pipeline]
+pip install "mineru[vllm]"
 mineru-models-download
 ```
-The `vlm-vllm-engine` backend requires GPU support.
 
 ### Step 2: Create a workspace
 ```shell
@@ -54,15 +53,26 @@ $env:DF_API_KEY = "sk-xxxxx"
 In the pipeline script, set your API endpoint:
 ```python
 self.llm_serving = APILLMServing_request(
-    api_url="https://api.openai.com/v1/chat/completions",
+    api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
+    key_name_of_api_key="DF_API_KEY",
     model_name="gemini-2.5-pro",
     max_workers=100,
 )
 ```
+Then set the MinerU backend (`vlm-vllm-engine` or `vlm-transformers`) and the LLM's maximum token length. Keeping it at or below 128000 is recommended; beyond that the LLM tends to lose track of details.
+**Caution: the pipeline has only been tested with the `vlm` backend; compatibility with the `pipeline` backend is uncertain because the two produce different output formats. Using the `vlm` backend is recommended.**
+The `vlm-vllm-engine` backend requires GPU support.
+```python
+self.vqa_extractor = VQAExtractor(
+    llm_serving=self.llm_serving,
+    mineru_backend='vlm-vllm-engine',
+    max_chunk_len=128000
+)
+```
 
 ### Step 5: One-click run
 ```bash
-python pipelines/vqa_extract_optimized_pipeline.py
+python api_pipelines/pdf_vqa_extract_pipeline.py
 ```
 You can also import the operators into other workflows; the remainder of this doc explains the data flow in detail.
 
@@ -84,11 +94,11 @@ Each job is defined by a JSONL row. Two modes are supported:
 `FileStorage` handles batching/cache management:
 ```python
 self.storage = FileStorage(
-    first_entry_file_name="./examples/VQA/vqa_extract_interleaved_test.jsonl",
-    cache_path="./vqa_extract_optimized_cache",
-    file_name_prefix="vqa",
-    cache_type="jsonl",
-)
+    first_entry_file_name="../example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
+    cache_path="./cache",
+    file_name_prefix="vqa",
+    cache_type="jsonl",
+)
 ```
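For reference, a minimal sketch of two input rows (the field names follow this doc; the paths and subject values are placeholders):
```jsonl
{"question_pdf_path": "/data/algebra_questions.pdf", "answer_pdf_path": "/data/algebra_answers.pdf", "subject": "math", "output_dir": "./output/algebra"}
{"pdf_path": "/data/algebra_book.pdf", "subject": "math", "output_dir": "./output/algebra"}
```
The second row uses the single-`pdf_path` form, which triggers interleaved mode automatically.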
 
 ### 2. Document layout extraction (MinerU)
@@ -107,9 +117,10 @@ The backend can be:
 
 `VQAExtractor` chunks the layout JSON to respect token limits, builds subject-aware prompts (`QAExtractPrompt`), and batches LLM calls via `APILLMServing_request`. Key behaviors:
 
+- Groups and pairs Q&A entries, inserting images at the proper positions.
 - Supports `question_pdf_path` + `answer_pdf_path`, or a single `pdf_path` (auto-detect interleaved mode).
 - Copies rendered images into `output_dir/question_images` and/or `answer_images`.
-- Parses `<qa_pair>`, `<question>`, `<answer>`, `<solution>`, `<chapter>` tags from the LLM response, with figure references preserved as `<pic>tag:box</pic>`.
+- Parses `<qa_pair>`, `<question>`, `<answer>`, `<solution>`, `<chapter>`, `<label>` tags from the LLM response (a sketch follows this list).
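As a sketch of that parsing contract (the tag names come from the list above; the nesting, ordering, and sample content are assumptions, not verbatim model output):
```xml
<qa_pair>
  <label>Example 3</label>
  <chapter>Chapter 2: Quadratic Equations</chapter>
  <question>Solve x^2 - 5x + 6 = 0.</question>
  <solution>Factor: (x - 2)(x - 3) = 0, so x = 2 or x = 3.</solution>
  <answer>x = 2 or x = 3.</answer>
</qa_pair>
```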
 
 ### 4. Post-processing and outputs
 
@@ -128,8 +139,8 @@ Filtering keeps entries where the question exists and either `answer` or `soluti
 
 Each filtered record includes:
 
-- `question`: question text (with inline `<pic>` tags if figures are referenced)
-- `answer`: answer text (if extracted from answer PDF)
+- `question`: question text and images
+- `answer`: answer text and images (if extracted from answer PDF)
 - `solution`: optional worked solution (if present)
 - `label`: original numbering (e.g., “Example 3”, “习题2”)
 - `chapter_title`: chapter/section header detected on the same page
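A sketch of one filtered record, using the keys listed above with placeholder values:
```json
{"question": "Solve x^2 - 5x + 6 = 0.", "answer": "x = 2 or x = 3.", "solution": "Factor: (x - 2)(x - 3) = 0.", "label": "Example 3", "chapter_title": "Chapter 2: Quadratic Equations"}
```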
@@ -148,56 +159,54 @@ Example:
 ## 5. Pipeline Example
 
 ```python
-import os
-import sys
-
-parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-if parent_dir not in sys.path:
-    sys.path.insert(0, parent_dir)
-
 from dataflow.serving import APILLMServing_request
 from dataflow.utils.storage import FileStorage
-from operators.vqa_extractor import VQAExtractor
+from dataflow.operators.pdf2vqa import VQAExtractor
 
 class VQA_extract_optimized_pipeline:
     def __init__(self):
         self.storage = FileStorage(
-            first_entry_file_name="./examples/VQA/vqa_extract_interleaved_test.jsonl",
-            cache_path="./vqa_extract_optimized_cache",
+            first_entry_file_name="./example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
+            cache_path="./cache",
             file_name_prefix="vqa",
             cache_type="jsonl",
         )
 
         self.llm_serving = APILLMServing_request(
-            api_url="https://api.openai.com/v1/chat/completions",
+            api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
             key_name_of_api_key="DF_API_KEY",
-            model_name="gpt-4o",
+            model_name="gemini-2.5-pro",
             max_workers=100,
         )
 
         self.vqa_extractor = VQAExtractor(
-            llm_serving=self.llm_serving
+            llm_serving=self.llm_serving,
+            mineru_backend='vlm-vllm-engine',
+            max_chunk_len=128000
         )
 
     def forward(self):
         self.vqa_extractor.run(
             storage=self.storage.step(),
-            question_pdf_path_key="question_pdf_path",
-            answer_pdf_path_key="answer_pdf_path",
-            pdf_path_key="pdf_path",
-            subject_key="subject",
+            input_question_pdf_path_key="question_pdf_path",
+            input_answer_pdf_path_key="answer_pdf_path",
+            input_pdf_path_key="pdf_path",  # for interleaved mode
+            input_subject_key="subject",
             output_dir_key="output_dir",
             output_jsonl_key="output_jsonl_path",
-            mineru_backend='vlm-vllm-engine',
         )
 
+
 if __name__ == "__main__":
+    # Each line in the JSONL contains `question_pdf_path`, `answer_pdf_path`, `subject` (math, physics, chemistry, ...), and `output_dir`.
+    # If the question and the answer are in the same PDF, set both path keys to the same path; the pipeline will switch to interleaved mode automatically.
     pipeline = VQA_extract_optimized_pipeline()
     pipeline.forward()
 ```
 
 ---
 
-Pipeline source: `DataFlow/pipelines/vqa_extract_optimized_pipeline.py`
+Pipeline source: `DataFlow/dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py`
 
 Use this pipeline whenever you need structured QA data distilled directly from PDF textbooks with figure references intact.

docs/zh/notes/guide/pipelines/PDFVQAExtractPipeline.md

Lines changed: 43 additions & 34 deletions
@@ -25,10 +25,9 @@ icon: heroicons:document-text
 ### Step 1: Install Dataflow (and MinerU)
 ```shell
 pip install open-dataflow
-pip install mineru[pipeline]
+pip install "mineru[vllm]"
 mineru-models-download
 ```
-The `vlm-vllm-engine` mode requires a GPU.
 
 ### Step 2: Create a workspace
 ```shell
@@ -54,15 +53,25 @@ $env:DF_API_KEY = "sk-xxxxx"
 Set the API endpoint in the script:
 ```python
 self.llm_serving = APILLMServing_request(
-    api_url="https://api.openai.com/v1/chat/completions",
+    api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
+    key_name_of_api_key="DF_API_KEY",
     model_name="gemini-2.5-pro",
     max_workers=100,
 )
 ```
+Then set the MinerU backend (`vlm-vllm-engine` or `vlm-transformers`) and the LLM's maximum token count (128000 or less is recommended; above that the LLM cannot keep track of details and quality suffers). The `vlm-vllm-engine` mode requires a GPU.
+**This pipeline has so far only been tested with the `vlm` backend; it is uncertain whether the `pipeline` backend is supported, since per the official docs the two backends use different formats. The `vlm` backend is therefore recommended.**
+```python
+self.vqa_extractor = VQAExtractor(
+    llm_serving=self.llm_serving,
+    mineru_backend='vlm-vllm-engine',
+    max_chunk_len=128000
+)
+```
 
 ### Step 5: One-click run
 ```bash
-python pipelines/vqa_extract_optimized_pipeline.py
+python api_pipelines/pdf_vqa_extract_pipeline.py
 ```
 You can also embed the operators in other workflows; the data flow is detailed below.
 
@@ -84,11 +93,11 @@ python pipelines/vqa_extract_optimized_pipeline.py
 `FileStorage` handles reading and caching:
 ```python
 self.storage = FileStorage(
-    first_entry_file_name="./examples/VQA/vqa_extract_interleaved_test.jsonl",
-    cache_path="./vqa_extract_optimized_cache",
-    file_name_prefix="vqa",
-    cache_type="jsonl",
-)
+    first_entry_file_name="./example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
+    cache_path="./cache",
+    file_name_prefix="vqa",
+    cache_type="jsonl",
+)
 ```
 
 ### 2. Document layout parsing (MinerU)
@@ -107,9 +116,10 @@ self.storage = FileStorage(
 
 `VQAExtractor` chunks the layout JSON to control token usage, builds subject-aware prompts with `QAExtractPrompt`, and batches LLM calls via `APILLMServing_request`. Key features:
 
+- Groups and pairs Q&A entries and inserts images at the correct positions.
 - Supports both `question_pdf_path`/`answer_pdf_path` and a single `pdf_path` (interleaved layout is detected automatically).
 - Copies MinerU image crops into `output_dir/question_images` and/or `answer_images`.
-- Parses `<qa_pair>`, `<question>`, `<answer>`, `<solution>`, `<chapter>` tags and preserves `<pic>tag:box</pic>` figure references.
+- Parses `<qa_pair>`, `<question>`, `<answer>`, `<solution>`, `<chapter>`, `<label>` tags.
 
 ### 4. Post-processing and outputs
 
@@ -128,8 +138,8 @@ self.storage = FileStorage(
 
 Filtered records include:
 
-- `question`: question text (with `<pic>` tags)
-- `answer`: answer text (if taken from the answer PDF)
+- `question`: question text and images
+- `answer`: answer text and images (if taken from the answer PDF)
 - `solution`: optional worked solution
 - `label`: original numbering, e.g. “例 3”, “习题 2”
 - `chapter_title`: chapter/section title where the item appears
@@ -148,56 +158,55 @@ self.storage = FileStorage(
 ## 5. Pipeline Example
 
 ```python
-import os
-import sys
-
-parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-if parent_dir not in sys.path:
-    sys.path.insert(0, parent_dir)
-
 from dataflow.serving import APILLMServing_request
 from dataflow.utils.storage import FileStorage
-from operators.vqa_extractor import VQAExtractor
+from dataflow.operators.pdf2vqa import VQAExtractor
 
 class VQA_extract_optimized_pipeline:
     def __init__(self):
         self.storage = FileStorage(
-            first_entry_file_name="./examples/VQA/vqa_extract_interleaved_test.jsonl",
-            cache_path="./vqa_extract_optimized_cache",
+            first_entry_file_name="./example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
+            cache_path="./cache",
             file_name_prefix="vqa",
             cache_type="jsonl",
         )
 
         self.llm_serving = APILLMServing_request(
-            api_url="https://api.openai.com/v1/chat/completions",
+            api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
             key_name_of_api_key="DF_API_KEY",
-            model_name="gpt-4o",
+            model_name="gemini-2.5-pro",
             max_workers=100,
         )
 
         self.vqa_extractor = VQAExtractor(
-            llm_serving=self.llm_serving
+            llm_serving=self.llm_serving,
+            mineru_backend='vlm-vllm-engine',
+            max_chunk_len=128000
         )
 
     def forward(self):
+        # A single operator covering preprocessing, QA extraction, and post-processing
         self.vqa_extractor.run(
             storage=self.storage.step(),
-            question_pdf_path_key="question_pdf_path",
-            answer_pdf_path_key="answer_pdf_path",
-            pdf_path_key="pdf_path",
-            subject_key="subject",
+            input_question_pdf_path_key="question_pdf_path",
+            input_answer_pdf_path_key="answer_pdf_path",
+            input_pdf_path_key="pdf_path",  # supports interleaved mode
+            input_subject_key="subject",
             output_dir_key="output_dir",
             output_jsonl_key="output_jsonl_path",
-            mineru_backend='vlm-vllm-engine',
         )
 
+
 if __name__ == "__main__":
+    # Each JSONL line contains question_pdf_path, answer_pdf_path, subject (math, physics, chemistry, ...), and output_dir.
+    # If the question and the answer are in the same PDF, set question_pdf_path and answer_pdf_path to the same path; the pipeline will switch to interleaved mode automatically.
     pipeline = VQA_extract_optimized_pipeline()
     pipeline.forward()
 ```
 
 ---
 
-Pipeline source: `DataFlow/pipelines/vqa_extract_optimized_pipeline.py`
+Pipeline source: `DataFlow/dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py`
 
 Use this pipeline to distill structured QA data with figure references directly from PDF textbooks.
