optimize op doc for global textual search; correct beta into stable (#…

…552) * optimize op doc for global textual search; correct beta into stable * - merge main --------- Co-authored-by: lielin.hyl <[email protected]>
modelscope · Jan 17, 2025 · 129c75a · 129c75a
1 parent 20772be
commit 129c75a
Show file tree

Hide file tree

Showing 5 changed files with 226 additions and 185 deletions.
diff --git a/.pre-commit-hooks/build_op_doc.py b/.pre-commit-hooks/build_op_doc.py
@@ -25,10 +25,16 @@
 parameters of each operator. Users can refer to and run the unit tests
 (`tests/ops/...`) for [examples of operator-wise usage](../tests/ops) as well
 as the effects of each operator when applied to built-in test data samples.
+In addition, you can use an agent to automatically select suitable OPs and
+call them. For examples, see
+[Agentic Filters of DJ](../demos/api_service/react_data_filter_process.ipynb),
+ [Agentic Mappers of DJ](../demos/api_service/react_data_mapper_process.ipynb)
 
 这个页面提供了OP的基本描述，用户可以参考[API文档](https://modelscope.github.io/data-juicer/)更细致了解每个
-OP的具体参数，并且可以查看、运行单元测试 (`tests/ops/...`)，来体验[各OP的用法示例](../tests/ops)以及每个OP作用于内置
-测试数据样本时的效果。
+OP的具体参数，并且可以查看、运行单元测试 (`tests/ops/...`)，来体验
+[各OP的用法示例](../tests/ops)以及每个OP作用于内置测试数据样本时的效果。
+此外，您还可以尝试使用智能体来自动选择并调用合适的OP。例如，参考
+[Agentic Filters of DJ](../demos/api_service/react_data_filter_process.ipynb),
+ [Agentic Mappers of DJ](../demos/api_service/react_data_mapper_process.ipynb)
 '''
 
 DOC_CONTRIBUTING = '''
@@ -87,6 +93,27 @@ def replace_tags_with_icons(tags, lang='en'):
     return icons
 
 
+def remove_emojis(text):  # Return `text` with every run of characters matched by the pattern below removed
+    # Character class of the Unicode ranges treated as emoji; code points outside them (e.g. U+FE0F, U+2600-U+26FF) are kept
+    emoji_pattern = re.compile(  # built inside the function, i.e. on every call
+        '['  # Start of character class
+        '\U0001F600-\U0001F64F'  # Emoticons
+        '\U0001F300-\U0001F5FF'  # Misc Symbols and Pictographs
+        '\U0001F680-\U0001F6FF'  # Transport and Map Symbols
+        '\U0001F700-\U0001F77F'  # Alchemical Symbols
+        '\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
+        '\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
+        '\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
+        '\U0001FA00-\U0001FA6F'  # Chess Symbols
+        '\U0001F000-\U0001F02F'  # Mahjong Tiles
+        '\U0001F0A0-\U0001F0FF'  # Playing Cards
+        '\U00002700-\U000027BF'  # Dingbats
+        '\U0001F1E6-\U0001F1FF'  # Regional Indicator Symbols
+        ']+',  # End of class; `+` matches one or more consecutive characters from the ranges above
+        flags=re.UNICODE)  # NOTE(review): re.UNICODE is already the default for str patterns in Python 3
+    return emoji_pattern.sub(r'', text)  # Delete each matched run; all other characters are returned unchanged
+
+
 # OP tag analysis functions
 def analyze_modality_tag(code, op_prefix):
     """
@@ -191,6 +218,9 @@ def __eq__(self, other):
                and self.desc == other.desc and self.code == other.code \
                and self.test == other.test
 
+    def __ne__(self, other):  # Inequality, defined as the logical negation of self.__eq__(other)
+        return not self.__eq__(other)  # `not` turns whatever __eq__ returns into a plain bool
+
 
 class ClassVisitor(ast.NodeVisitor):
     """
@@ -423,7 +453,6 @@ def parse_op_record_from_current_doc():
     """
     # patterns
     tab_pattern = r'\| +(.*?) +\| +(.*?) +\| +(.*?) +\| +(.*?) +\| +(.*?) +\|'
-    tag_pattern = r'\!\[(.*?)\]\(https:\/\/img\.shields\.io\/badge\/'
     link_pattern = r'\[.*?\]\((.*?)\)'
 
     if os.path.exists(DOC_PATH):
@@ -437,7 +466,7 @@ def parse_op_record_from_current_doc():
                     continue
                 # extract tags
                 type = name.split('_')[-1]
-                tags = re.findall(tag_pattern, tags)
+                tags = [remove_emojis(tag.lower()) for tag in tags.split(' ')]
                 # only need English description
                 desc = desc.split('. ')[0] + '.'
                 code = re.findall(link_pattern, code)[0]
@@ -497,6 +526,7 @@ def check_and_update_op_record(old_op_record_list, new_op_record_list):
                     break
             if old_usability_tag and \
                     old_usability_tag == 'stable' and usability_tag == 'beta':
+                print(f'{record.name} kept stable')
                 usability_tag = 'stable'
         curr_tags = [
             tag for tag in record.tags if tag not in usability_tag_set

diff --git a/.pre-commit-hooks/tag_mappings.json b/.pre-commit-hooks/tag_mappings.json
@@ -1,61 +1,61 @@
 {
   "Modality Tags": {
     "text": {
-      "icon": "![text](https://img.shields.io/badge/text-010326?style=plastic)",
+      "icon": "🔤Text",
       "desc": "process text data specifically. 专用于处理文本。"
     },
     "image": {
-      "icon": "![image](https://img.shields.io/badge/image-07B0F2?style=plastic)",
+      "icon": "🏞Image",
       "desc": "process image data specifically. 专用于处理图像。"
     },
     "audio": {
-      "icon": "![audio](https://img.shields.io/badge/audio-0DA64F?style=plastic)",
+      "icon": "📣Audio",
       "desc": "process audio data specifically. 专用于处理音频。"
     },
     "video": {
-      "icon": "![video](https://img.shields.io/badge/video-F2B138?style=plastic)",
+      "icon": "🎬Video",
       "desc": "process video data specifically. 专用于处理视频。"
     },
     "multimodal": {
-      "icon": "![multimodal](https://img.shields.io/badge/multimodal-F25922?style=plastic)",
+      "icon": "🔮Multimodal",
       "desc": "process multimodal data. 用于处理多模态数据。"
     }
   },
   "Resource Tags": {
     "cpu": {
-      "icon": "![cpu](https://img.shields.io/badge/cpu-F2AA6B?style=plastic)",
+      "icon": "💻CPU",
       "desc": "only requires CPU resource. 只需要 CPU 资源。"
     },
     "gpu": {
-      "icon": "![gpu](https://img.shields.io/badge/gpu-F27649?style=plastic)",
+      "icon": "🚀GPU",
       "desc": "requires GPU/CUDA resource as well. 额外需要 GPU/CUDA 资源。"
     }
   },
   "Usability Tags": {
     "alpha": {
-      "icon": "![alpha](https://img.shields.io/badge/alpha-red?style=plastic)",
+      "icon": "🔴Alpha",
       "desc": "alpha version OP. Only the basic OP implementations are finished. 表示 alpha 版本算子。只完成了基础的算子实现。"
     },
     "beta": {
-      "icon": "![beta](https://img.shields.io/badge/beta-yellow?style=plastic)",
+      "icon": "🟡Beta",
       "desc": "beta version OP. Based on the alpha version, unittests for this OP are added as well. 表示 beta 版本算子。基于 alpha 版本，添加了算子的单元测试。"
     },
     "stable": {
-      "icon": "![stable](https://img.shields.io/badge/stable-green?style=plastic)",
+      "icon": "🟢Stable",
       "desc": "stable version OP. Based on the beta version, OP optimizations related to DJ (e.g. model management, batched processing, OP fusion, ...) are added to this OP. 表示 stable 版本算子。基于 beta 版本，完善了DJ相关的算子优化项（如模型管理，批处理，算子融合等）。"
     }
   },
   "Modal Tags": {
     "api": {
-      "icon": "![api](https://img.shields.io/badge/api-A64C44?style=plastic)",
+      "icon": "🔗API",
       "desc": "equipped with API-based models. (e.g. ChatGPT, GPT-4o). 支持基于 API 调用模型（如 ChatGPT，GPT-4o）。"
     },
     "vllm": {
-      "icon": "![vllm](https://img.shields.io/badge/vllm-D99379?style=plastic)",
+      "icon": "🌊vLLM",
       "desc": "equipped with models supported by vLLM. 支持基于 vLLM 进行模型推理。"
     },
     "hf": {
-      "icon": "![hf](https://img.shields.io/badge/hf-590F08?style=plastic)",
+      "icon": "🧩HF",
       "desc": "equipped with models from HuggingFace Hub. 支持来自于 HuggingFace Hub 的模型。"
     }
   }

diff --git a/README.md b/README.md
@@ -148,6 +148,8 @@ Table of Contents
 - [Refined recipes for fine-tuning text data](configs/data_juicer_recipes/README.md#before-and-after-refining-for-alpaca-cot-dataset)
 - [Refined recipes for pre-training multi-modal data](configs/data_juicer_recipes/README.md#before-and-after-refining-for-multimodal-dataset)
 - [DJ-SORA](docs/DJ_SORA.md)
+- [Agentic Filters of DJ](./demos/api_service/react_data_filter_process.ipynb)
+- [Agentic Mappers of DJ](./demos/api_service/react_data_mapper_process.ipynb)
 
 
 ### Interactive Examples
@@ -470,7 +472,7 @@ Data-Juicer thanks many community [contributers](https://github.com/modelscope/d
 
 
 ## References
-If you find our work useful for your research or development, please kindly cite the following [paper](https://arxiv.org/abs/2309.02033).
+If you find Data-Juicer useful for your research or development, please kindly cite the following [paper](https://arxiv.org/abs/2309.02033).
 ```
 @inproceedings{chen2024datajuicer,
   title={Data-Juicer: A One-Stop Data Processing System for Large Language Models},

diff --git a/README_ZH.md b/README_ZH.md
@@ -138,6 +138,9 @@ Data-Juicer正在积极更新和维护中，我们将定期强化和新增更多
 * [Fine-tuning文本数据增强菜谱](configs/data_juicer_recipes/README_ZH.md#完善前后的alpaca-cot数据集)
 * [预训练多模态数据增强菜谱](configs/data_juicer_recipes/README_ZH.md#before-and-after-refining-for-multimodal-dataset)
 * [DJ-SORA](docs/DJ_SORA_ZH.md)
+* [智能体调用DJ Filters](./demos/api_service/react_data_filter_process.ipynb)
+* [智能体调用DJ Mappers](./demos/api_service/react_data_mapper_process.ipynb)
+
 
 ### 交互类示例
 * Data-Juicer 介绍 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/overview_scan)]
@@ -452,7 +455,7 @@ Data-Juicer被许多大模型相关产品和研究工作所使用，例子阿里
 Data-Juicer 感谢社区[贡献者](https://github.com/modelscope/data-juicer/graphs/contributors) 和相关的先驱开源项目，譬如[Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), ....
 
 ## 参考文献
-如果您发现我们的工作对您的研发有帮助，请引用以下[论文](https://arxiv.org/abs/2309.02033) 。
+如果您发现Data-Juicer对您的研发有帮助，请引用以下[论文](https://arxiv.org/abs/2309.02033) 。
 
 ```
 @inproceedings{chen2024datajuicer,