PaddlePaddle
diff --git a/‎.github/workflows/model-unittest-gpu.yml‎
Lines changed: 5 additions & 2 deletions b/‎.github/workflows/model-unittest-gpu.yml‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎.github/workflows/unittest-gpu.yml‎
Lines changed: 5 additions & 2 deletions b/‎.github/workflows/unittest-gpu.yml‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 20 additions & 3 deletions b/‎Makefile‎
Lines changed: 20 additions & 3 deletions
diff --git a/‎README.md‎
Lines changed: 13 additions & 3 deletions b/‎README.md‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎docs/zh/datasets_format_zh.md‎
Lines changed: 127 additions & 10 deletions b/‎docs/zh/datasets_format_zh.md‎
Lines changed: 127 additions & 10 deletions
diff --git a/‎docs/zh/template.md‎
Lines changed: 15 additions & 0 deletions b/‎docs/zh/template.md‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎examples/config/dpo/full_function_call.yaml‎
Lines changed: 3 additions & 2 deletions b/‎examples/config/dpo/full_function_call.yaml‎
Lines changed: 3 additions & 2 deletions
@@ -51,6 +51,7 @@ jobs:
           work_dir: ${{ github.workspace }}
           FLAGS_dynamic_static_unified_comm: "True"
           python_version: "3.10"
+          PIP_CACHE_DIR: /home/.cache/pip
           paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
         run: |
           container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
@@ -76,6 +77,7 @@ jobs:
             -e HF_PROXY_PATH=${work_dir}/../../../proxy_huggingface \
             -e AISTUDIO_PROXY_PATH=${work_dir}/../../../proxy_aistudio \
             -e PF_HOME=/home/models/ \
+            -e PIP_CACHE_DIR \
             -w /workspace --privileged ${IMAGE_NAME}
             
       - name: Download Code
@@ -112,15 +114,16 @@ jobs:
         run: |
           docker exec -t $container_name /bin/bash -c '
           ldconfig
-          pip config set global.cache-dir "/home/.cache/pip"
+          mkdir -p /home/.cache/pip
+          pip cache dir
           set -e
           rm -rf /root/.cache/aistudio/
           cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
           echo "work_dir = ${work_dir}"
           cp -r ${work_dir}/../../../models ./models
           echo "Check whether the local model file exists:"
           ls -l ./models
-          timeout 60m bash scripts/regression/ci_model_unittest.sh ${paddle_whl} false ${AGILE_COMPILE_BRANCH}
+          timeout 60m bash -x scripts/regression/ci_model_unittest.sh ${paddle_whl} false ${AGILE_COMPILE_BRANCH}
           '
           
       - name: Upload Products
 
@@ -55,6 +55,7 @@ jobs:
           work_dir: ${{ github.workspace }}
           FLAGS_dynamic_static_unified_comm: "True"
           python_version: "3.10"
+          PIP_CACHE_DIR: /home/.cache/pip
           paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
         run: |
           container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
@@ -83,6 +84,7 @@ jobs:
             -e "AISTUDIO_PROXY_PATH=$work_dir/../../../proxy_aistudio" \
             -e "HF_DATASETS_CACHE=$work_dir/../../../paddlenlp/huggingface/datasets" \
             -e "TRANSFORMERS_CACHE=$work_dir/../../../paddlenlp/huggingface" \
+            -e PIP_CACHE_DIR \
             -w /github/workspace --privileged $IMAGE_NAME
 
       - name: Download Code
@@ -117,7 +119,8 @@ jobs:
         run: |
           docker exec -t $container_name /bin/bash -c '
           ldconfig
-          pip config set global.cache-dir "/home/.cache/pip"
+          mkdir -p /home/.cache/pip
+          pip cache dir
           set -e
           rm -rf /root/.cache/aistudio/
           cd /github/workspace/PaddleFormers && git config --global --add safe.directory $PWD
@@ -127,7 +130,7 @@ jobs:
           cp -r ${work_dir}/../../../models ./models
           echo "Check whether the local model file exists:"
           ls -l ./models
-          timeout 50m bash scripts/unit_test/ci_unittest.sh ${paddle_whl} false ${PYTEST_EXECUTE_FLAG_FILE} ${AGILE_COMPILE_BRANCH}
+          timeout 50m bash -x scripts/unit_test/ci_unittest.sh ${paddle_whl} false ${PYTEST_EXECUTE_FLAG_FILE} ${AGILE_COMPILE_BRANCH}
           '
       - name: Copy coverage.xml out of container
         run: |
 
@@ -46,9 +46,26 @@ unit-test:
 
 .PHONY: install
 install:
-	pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-	pip install -r requirements-dev.txt
-	pip install -r requirements.txt
+# Select the PaddlePaddle nightly index matching the local CUDA toolkit
+# (detected through nvcc), then install dev and runtime requirements.
+# The extra index is only passed to pip when a matching source was found.
+	@echo "Checking CUDA version and selecting pip source..."
+	@if ! command -v nvcc >/dev/null 2>&1; then \
+	    echo "ERROR: nvcc (CUDA) not found. Please install CUDA before proceeding."; \
+	    exit 1; \
+	fi; \
+	cuda_version=$$(nvcc --version | grep release | awk '{print $$5}' | sed 's/,//'); \
+	echo "Detected CUDA version: $$cuda_version"; \
+	if [ "$$cuda_version" = "12.6" ]; then \
+	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu126/"; \
+	elif [ "$$cuda_version" = "12.9" ]; then \
+	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu129/"; \
+	elif [ "$$cuda_version" = "13.0" ]; then \
+	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu130/"; \
+	else \
+	    PADDLE_SOURCE=""; \
+	    echo "Unknown CUDA version."; \
+	fi; \
+	echo "Using pip source: $$PADDLE_SOURCE"; \
+	pip install -r requirements-dev.txt; \
+	if [ -n "$$PADDLE_SOURCE" ]; then \
+	    pip install -r requirements.txt --extra-index-url "$$PADDLE_SOURCE"; \
+	else \
+	    pip install -r requirements.txt; \
+	fi; \
 	pre-commit install
 
 
 
@@ -42,12 +42,22 @@ Requires Python 3.8+ and [PaddlePaddle](https://www.paddlepaddle.org.cn/install/
 
 ```bash
 # Install via pip
-pip install paddleformers
+# cuda12.6
+pip install paddleformers --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+# cuda12.9
+pip install paddleformers --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+# cuda13.0
+pip install paddleformers --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu130/
 
 # Install development version
 git clone https://github.com/PaddlePaddle/PaddleFormers.git
 cd PaddleFormers
-pip install -e .
+# cuda12.6
+pip install -e . --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+# cuda12.9
+pip install -e . --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+# cuda13.0
+pip install -e . --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu130/
 ```
 
 ## Quickstart
@@ -89,4 +99,4 @@ trainer.train()
 We welcome all contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
 
 ## License
-This repository's source code is available under the [Apache 2.0 License](LICENSE).
+This repository's source code is available under the [Apache 2.0 License](LICENSE).
@@ -2,19 +2,23 @@
 
 ## 数据流文件格式支持
 
-当前预训练、后训练数据流只支持`jsonl`格式的数据
+当前预训练、后训练数据流支持`jsonl`、`json`、`parquet`格式的数据
 
 ## 1. 预训练数据流
 
 ### 1.1. 在线数据流
 
-预训练数据流中，每条数据都是一个字典，包含以下字段：
+#### 1.1.1. erniekit格式
 
-- `text` : `str, List(str)`, 预训练文本。
+使用 `erniekit` 格式需要在 `train(/eval)_dataset_type` 处指定为 `erniekit`
+
+erniekit格式：每条数据都是一个字典，包含以下字段：
+
+- `text` : `str, List(str)`
 
 样例数据：
 
-```text
+```json
 {"text": ["一个需要连续输入值的分类问题的示例是房屋价格预测。房屋的价格通常基于诸如平方英尺、位置、卧室和浴室数量以及像后院或车库等功能这样的因素定价。为了准确预测房屋价格，这些标准必须作为连续输入值输入到分类模型中。"]}
 ...
 ```
@@ -26,6 +30,21 @@ wget https://paddleformers.bj.bcebos.com/datasets/pt_data.tar.gz
 mkdir -p data/pt && tar -xf pt_data.tar.gz -C data/pt/
 ```
 
+#### 1.1.2. messages格式
+
+使用 `messages` 格式需要在 `train(/eval)_dataset_type` 处指定为 `messages`
+
+messages格式：每条数据都是一个字典，包含以下字段：
+
+- `messages` : `List(Dict)`
+
+样例数据：
+
+```json
+{"messages": [{"role": "assistant", "content": "一个需要连续输入值的分类问题的示例是房屋价格预测。房屋的价格通常基于诸如平方英尺、位置、卧室和浴室数量以及像后院或车库等功能这样的因素定价。为了准确预测房屋价格，这些标准必须作为连续输入值输入到分类模型中。"}]}
+...
+```
+
 ### 1.2. 离线数据流
 
 我们也可以选择使用离线的比特预训练数据流，更节省内存。
@@ -117,9 +136,9 @@ mkdir -p data/sft && tar -xf alpaca_demo.gz -C data/sft/ --strip-components=1
 ```
 
 
-### chatml格式
+### messages格式
 
-使用 `chatml` 格式需要在 `train(/eval)_dataset_type` 处指定为 `chatml`
+使用 `messages` 格式需要在 `train(/eval)_dataset_type` 处指定为 `messages`
 
 SFT数据流中，每条数据都是一个字典，包含以下字段：
 
@@ -168,7 +187,7 @@ Notes:
 ]
 ```
 
-为了方便测试，我们也提供了 `chatml` function call SFT 数据集可以直接使用：
+为了方便测试，我们也提供了 `messages` function call SFT 数据集可以直接使用：
 ```bash
 wget https://paddleformers.bj.bcebos.com/datasets/sft_function_call_demo.tar.gz
 
@@ -226,9 +245,9 @@ wget https://bj.bcebos.com/paddlenlp/datasets/examples/ultrafeedback_binarized.t
 mkdir -p data/dpo && tar -zxf ultrafeedback_binarized.tar.gz -C data/dpo/ --strip-components=1
 ```
 
-### chatml 格式
+### messages 格式
 
-使用 `chatml` 格式需要在 `train(/eval)_dataset_type` 处指定为 `chatml`
+使用 `messages` 格式需要在 `train(/eval)_dataset_type` 处指定为 `messages`
 
 DPO数据流中，每条数据都是一个字典，包含以下字段：
 - `messages` : `List(dict)`, 对话历史列表。
@@ -303,9 +322,107 @@ DPO数据流中，每条数据都是一个字典，包含以下字段：
 }
 ```
 
-为了方便测试，我们也提供了 `chatml` function call DPO 数据集可以直接使用：
+为了方便测试，我们也提供了 `messages` function call DPO 数据集可以直接使用：
 ```bash
 wget https://paddleformers.bj.bcebos.com/datasets/dpo_function_call_1k.tar.gz
 
 mkdir -p data/dpo_fc && tar -zxf dpo_function_call_1k.tar.gz -C data/dpo_fc/
 ```
+
+## 4. 多模 SFT数据流
+
+### erniekit格式
+
+使用 `erniekit` 格式需要在 `train(/eval)_dataset_type` 处指定为 `erniekit`
+
+SFT数据流中，每条数据都是一个字典，包含以下字段：
+
+* `text_info`: 纯文本的列表，每个元素包含一个 `text` 和一个 `tag`
+  * `text`: 来自使用者的问题或系统回复的文字内容
+  * `tag`: 遮挡标签 (`no_mask`=包含在训练中, `mask`=排除)
+* `image_info`: 图像组成的列表，每个元素包含一个 `image_url` 和一个 `matched_text_index`
+  * `image_url`: 线上下载图像的网址或本地存取图像的路径
+  * `matched_text_index`: `text_info` 中匹配文字的索引
+    * 预设值: `matched_text_index=0` 表示图像与第一个文字匹配，并将其放置在第一个文字之前
+* `is_system(optional)`: 系统标志 (1=系统配置 0=无系统配置)
+  * 系统配置 = 如果 `is_system=1`，则为 `text_info[0]`
+
+注意：
+* 通过将 `image_info` 替换为 `video_info` 来支持视频数据
+* 请确保 `mask` 和 `no_mask` 在 `text_info` 中交替出现
+
+这是一个 SFT VL 数据集的多图像示例：
+
+```json
+{
+    "image_info": [
+        {"matched_text_index": 0, "image_url": "./DoclingMatix/218/0.png"},
+        {"matched_text_index": 0, "image_url": "./DoclingMatix/218/1.png"}
+    ],
+    "text_info": [
+        {"text": "What is the purpose of the resolution discussed in the text?", "tag": "mask"},
+        {"text": "The purpose of the resolution is to approve the redevelopment contract of the Philadelphia Redevelopment Authority for the redevelopment and urban renewal of a portion of the Haddington Urban Renewal Area, Unit Nos. 2 and 3, and to authorize the Redevelopment Authority to execute the redevelopment contract with Danielle M. Carson-Varns.", "tag": "no_mask"},
+        {"text": "Who introduced Resolution No. 160204 to the City Council?", "tag": "mask"},
+        {"text": "Councilmember Blackwell introduced Resolution No. 160204 to the City Council.", "tag": "no_mask"},
+        ...
+    ]
+}
+```
+
+这是一个 SFT VL 数据集的单视频示例：
+```json
+{
+    "video_info": [
+        {"matched_text_index": 0, "image_url": "./NExTVideo/1027/4789497818.mp4"}
+    ],
+    "text_info": [
+        {"text": "how does the man sit on the grass?\nA. kneel\nB. one leg in the air\nC. sitting on bicycle seat\nD. legs spread out\nE. squatting down\n Answer with the option's letter from the given choices directly.", "tag": "mask"},
+        {"text": "D", "tag": "no_mask"}
+    ]
+}
+```
+
+这是一个 SFT VL 数据集的系统配置示例:
+```json
+{
+    "is_system": 1,
+    "text_info": [
+        {"text": "Your role as ...", "tag": "mask"},
+        {"text": "好的", "tag": "no_mask"},
+        {"text": "What is written...", "tag": "mask"},
+        {"text": "<think>So I've got...", "tag": "no_mask"},
+        ...
+    ],
+    "image_info": [...]
+}
+```
+
+为了方便测试，我们也提供了用于快速训练的demo数据，请根据您的需要下载[数据](https://paddleformers.bj.bcebos.com/datasets/DoclingMatix.tar.gz)，并将其解压缩到`tests/fixtures/dummy/sft-vl/`：
+
+```shell
+wget https://paddleformers.bj.bcebos.com/datasets/DoclingMatix.tar.gz
+tar -xf DoclingMatix.tar.gz -C tests/fixtures/dummy/sft-vl/ --strip-components=1
+```
+
+### messages格式
+
+使用 `messages` 格式需要在 `train(/eval)_dataset_type` 处指定为 `messages`
+
+多模messages格式需要在纯文messages格式的基础上加上`images`、`videos`、`audios`几个key，用于传入多模态资源的`url`或者`path`，同时在`messages`中插入`<image>`、`<video>`、`<audio>`标签来表述插入多模态数据的位置：
+
+纯文：
+```json
+{"messages": [{"role": "assistant", "content": "预训练的文本在这里"}]}
+```
+加入图片：
+```json
+{"messages": [{"role": "assistant", "content": "<image>是一只小狗，<image>是一只小猫"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
+```
+加入音频：
+```json
+{"messages": [{"role": "assistant", "content": "<audio>描述了今天天气真不错"}], "audios": ["/xxx/x.wav"]}
+```
+加入图片与视频：
+```json
+{"messages": [{"role": "assistant", "content": "<image>是一个大象，<video>是一只狮子在跑步"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
+```
@@ -0,0 +1,15 @@
+## 指定训练使用的template
+
+| 参数 | 类型 | 描述 |
+| --- | --- | --- |
+| `template_backend` | str | 指定为`custom`表示使用自定义的template，`jinja`表示使用apply_chat_template方法进行拼接，不适合多轮对话，不推荐使用 |
+| `template` | str | （只在 `template_backend` 为 `custom` 时生效）指定训练用的 template |
+| `split_multi_turn` | bool | （只在 `template_backend` 为 `jinja` 时生效）`True`表示将多轮数据拆成多条数据进行训练，`False`表示每次只学习最后一轮的回复 |
+| `encode_one_turn` | str | （只在 `template_backend` 为 `jinja` 时生效）`True`表示将多轮对话进行拆分，分别对每一轮对话套用`apply_chat_template`，`False`表示直接对整段对话套用`apply_chat_template` |
+
+## 自定义template
+在`paddleformers/datasets/template/template.py`文件中，通过`register_template`实现自定义template
+
+## 多模plugin接入流程
+
+在 `paddleformers/datasets/template/mm_plugin.py` 文件中实现各种多模预处理操作，基类是`BasePlugin`，已经实现了各种图片、视频、音频的预处理操作，如果需要自定义plugin，需要继承`BasePlugin`实现自定义的类，如`Qwen2VLPlugin`。在自定义的类中实现各种多模数据预处理的操作，并在`PLUGINS`里面注册template名字和类名的对应关系
@@ -1,13 +1,14 @@
 ### data
-train_dataset_type: chatml
-eval_dataset_type: chatml
+train_dataset_type: messages
+eval_dataset_type: messages
 train_dataset_path: ./tests/fixtures/dummy/dpo/function-call-train.jsonl
 train_dataset_prob: "1.0"
 eval_dataset_path: ./tests/fixtures/dummy/dpo/function-call-eval.jsonl
 eval_dataset_prob: "1.0"
 max_seq_len: 8192
 packing: false
 mix_strategy: concat
+split_multi_turn: true  # split each multi-turn conversation into multiple training samples
 
 ### model
 model_name_or_path: Qwen/Qwen3-0.6B-Base