# Process config example including:
# - all global arguments
# - all ops and their arguments
# global parameters
project_name: 'all' # project name to distinguish your configs
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file, with optional weights (0.0-1.0); the default weight is 1.0.
# Accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
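# A hedged sketch of the weighted-mixture format described above; the two paths below are hypothetical placeholders:
# dataset_path: '0.5 /path/to/dataset-a.jsonl 1.0 /path/to/dataset-b.jsonl'
# Here samples from dataset-a are weighted at 0.5 while dataset-b keeps the default weight of 1.0.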
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # Shard size of the exported dataset in bytes. By default it's 0, which means the whole dataset is exported into a single file. If it's set to a positive number, the exported dataset will be split into several shards, and the max size of each shard won't be larger than export_shard_size.
export_in_parallel: false # Whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to the IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
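# For instance (a hedged sketch, not a recommended value): to cap each exported shard at roughly 1 GB, set the size in bytes:
# export_shard_size: 1073741824  # 1 GB = 1024 * 1024 * 1024 bytes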
np: 4 # number of subprocesses to process your dataset
text_keys: 'content' # the key name of the field storing the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...
# Note: currently, we only support specifying ONE key for each op; for cases requiring multiple keys, users can specify the op multiple times. Only the first key of `text_keys` will be used when multiple keys are set.
suffixes: [] # the suffixes of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
ds_cache_dir: null # cache dir for Hugging Face datasets. By default it's the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir.
use_checkpoint: false # whether to use checkpoint management to save the latest version of the dataset to the work dir during processing. Rerunning the same config will reload the checkpoint and skip the ops before it. Cache will be disabled when using checkpoint. If the args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
temp_dir: null # the path to the temp directory to store intermediate caches when cache is disabled; these cache files will be removed on-the-fly. By default, it's None, so the temp dir will be specified by the system. NOTICE: you should be cautious when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
open_tracer: false # whether to enable the tracer to trace changes during processing. It might take more time when the tracer is enabled.
op_list_to_trace: [] # only ops in this list will be traced by the tracer. If it's empty, all ops will be traced. Only available when the tracer is enabled.
trace_num: 10 # number of samples to show the differences between datasets before and after each op. Only available when the tracer is enabled.
op_fusion: false # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce memory requirements slightly and speed up the whole process.
cache_compress: null # The compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend enabling this argument when your input dataset is larger than tens of GB and your disk space is limited.
# for multimodal data processing
image_key: 'images' # Key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>' # The special token that represents an image in the text. By default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios' # Key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>' # The special token that represents an audio in the text. By default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. By default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
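# Illustrative sketch of how these special tokens appear inside a sample's text field (the sentence itself is made up):
# text: '<__dj__image> A photo of a cat on a sofa. <|__dj__eoc|> <__dj__audio> A short meow. <|__dj__eoc|>'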
# for distributed processing
executor_type: default # Type of executor, support "default" or "ray" for now.
ray_address: auto # The address of the Ray cluster.
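# A hedged sketch of a distributed setup; the address below is a hypothetical placeholder for an existing Ray cluster:
# executor_type: ray
# ray_address: 'ray://<head-node-ip>:10001'  # or keep 'auto' to connect to a locally started cluster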
# only for data analysis
save_stats_in_one_file: false # whether to store all stats results in one file
# process schedule: a list of several process operators with their arguments
process:
# Mapper ops. Most of these ops need no arguments.
- chinese_convert_mapper: # convert Chinese text among Traditional Chinese, Simplified Chinese and Japanese Kanji.
mode: 's2t' # Choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
- clean_email_mapper: # remove emails from text.
- clean_html_mapper: # remove HTML formats from text.
- clean_ip_mapper: # remove IP addresses from text.
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in LaTeX text.
- fix_unicode_mapper: # fix unicode errors in text.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all enabled augmentation methods sequentially. If it's False, each enabled augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be aug_num augmented samples generated in total. If it's False, there will be (aug_num * #enabled_aug_methods) augmented samples generated.
delete_random_word: false # whether to enable the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
swap_random_word: false # whether to enable the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
spelling_error_word: false # whether to enable the augmentation method of simulating spelling errors for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
split_random_word: false # whether to enable the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
keyboard_error_char: false # whether to enable the augmentation method of simulating keyboard errors for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
ocr_error_char: false # whether to enable the augmentation method of simulating OCR errors for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
delete_random_char: false # whether to enable the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
swap_random_char: false # whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
insert_random_char: false # whether to enable the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
- nlpcda_zh_mapper: # simply augment texts in Chinese based on the nlpcda library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all enabled augmentation methods sequentially. If it's False, each enabled augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be aug_num augmented samples generated in total. If it's False, there will be (aug_num * #enabled_aug_methods) augmented samples generated.
replace_similar_word: false # whether to enable the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
replace_homophone_char: false # whether to enable the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
delete_random_char: false # whether to enable the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to enable the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
- remove_bibliography_mapper: # remove bibliography from LaTeX text.
- remove_comments_mapper: # remove comments from LaTeX text, code, etc.
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
inline: true # whether to remove inline comments
multiline: true # whether to remove multiline comments
- remove_header_mapper: # remove header texts from LaTeX text.
drop_no_head: true # whether to drop sample texts without headers
- remove_long_words_mapper: # remove overly long words from text.
min_len: 1 # the min word length to keep words.
max_len: 128 # the max word length to keep words.
- remove_non_chinese_character_mapper: # remove non-Chinese characters in text samples.
keep_alphabet: true # whether to keep alphabet characters
keep_number: true # whether to keep numbers
keep_punc: true # whether to keep punctuation
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
min_col: 2 # the min num of columns in tables to remove
max_col: 20 # the max num of columns in tables to remove
- remove_words_with_incorrect_substrings_mapper: # remove words with incorrect substrings from text.
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
substrings: ['http', 'www', '.com', 'href', '//'] # incorrect substrings to remove
- sentence_split_mapper: # split text into multiple sentences and join them with '\n'
lang: 'en' # split text in what language
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
# Filter ops
- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.9 # the max ratio of filter range
- average_line_length_filter: # filter text with the average length of lines out of specific range.
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- character_repetition_filter: # filter text with the character repetition ratio out of specific range
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- face_area_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
upsample_num_times: 0 # optional argument passing to the underlying dlib face detector
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
max_ratio: 0.0045 # the max ratio to filter text
flagged_words_dir: ./assets # directory to store flagged words dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (the ratio of width to height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
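# Worked example for the range above: a 1920x1080 image has r = 1920/1080 ≈ 1.78, which falls within [0.333, 3.0] and passes;
# a 300x2000 banner has r = 0.15 and fails, so whether the sample is kept then depends on the any/all strategy.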
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
min_height: 200 # the min height of height filter range
max_height: 5000 # the max height of height filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_matching_filter: # filter samples according to the matching score between image and text.
hf_blip: Salesforce/blip-itm-base-coco # name of used Hugging Face blip
min_score: 0.003 # the min matching score of filter range
max_score: 1.0 # the max matching score of filter range
horizontal_flip: false # Flip image horizontally (left to right).
vertical_flip: false # Flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_similarity_filter: # filter samples according to the similarity between image and text.
hf_clip: openai/clip-vit-base-patch32 # name of used Hugging Face clip
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
horizontal_flip: false # Flip image horizontally (left to right).
vertical_flip: false # Flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in a specific language, keeping samples with language identification scores larger than a specific min value
lang: en # keep text in what language
min_score: 0.8 # the min language score to keep text
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- perplexity_filter: # filter text with perplexity score out of specific range
lang: en # compute perplexity in what language
max_ppl: 1500 # the max perplexity score to filter text
- special_characters_filter: # filter text with special-char ratio out of specific range
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.25 # the max ratio of filter range
- stopwords_filter: # filter text with stopword ratio smaller than a specific min value
lang: en # consider stopwords in what language
tokenization: false # whether to use model to tokenize documents
min_ratio: 0.3 # the min ratio to filter text
stopwords_dir: ./assets # directory to store stopwords dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- text_action_filter: # filter text according to the number of action verbs
lang: en # consider the words in what language
min_action_num: 1 # texts whose number of action verbs is less than this min value will be filtered
- text_entity_dependency_filter: # filter text without non-independent entity nouns
lang: en # consider the words in what language
min_dependency_num: 1 # the min number of adjacent edges of a non-independent noun in the dependency tree
any_or_all: any # keep this sample when any/all entity nouns are non-independent
- text_length_filter: # filter text with length out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- token_num_filter: # filter text with total token number out of specific range
hf_tokenizer: EleutherAI/pythia-6.9b-deduped # name of used Hugging Face tokenizer
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- word_repetition_filter: # filter text with the word repetition ratio out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
rep_len: 10 # repetition length for word-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffixes of samples that will be kept. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- specified_field_filter: # filter text with the specified field info out of specific range
field_key: '' # the target key of the specified field; multi-level keys are separated by '.'
target_value: [] # the range of the specified field values corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
field_key: '' # the target key of the specified numeric field; multi-level keys are separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op
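# A hedged sketch of the '.'-separated field_key syntax; the 'meta.language' field and its values are hypothetical:
# - specified_field_filter:
#     field_key: 'meta.language'   # reaches sample['meta']['language']
#     target_value: ['en', 'zh']   # keep samples whose value falls in this list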
# Deduplicator ops
- document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
- document_minhash_deduplicator: # deduplicate text samples using MinHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
jaccard_threshold: 0.7 # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
num_bands: null # number of bands in LSH. By default it's None, and it will be determined by an optimal-parameter computation algorithm that minimizes the weighted sum of the probabilities of false positives and false negatives
num_rows_per_band: null # number of rows in each band in LSH. By default it's None, and it will be determined by an optimal-parameter computation algorithm
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing minhash.
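# Background note on the band/row trade-off (standard MinHash-LSH math, not specific to this implementation):
# with b bands of r rows each (b * r = num_permutations), two texts with Jaccard similarity s become candidate
# duplicates with probability 1 - (1 - s^r)^b, so larger r makes matching stricter and larger b makes it more permissive.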
- document_simhash_deduplicator: # deduplicate text samples using SimHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 6 # window size of shingling
num_blocks: 6 # number of blocks in SimHash computing
hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
- image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
# Selector ops
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
field_key: '' # the target key of the field to sort by; multi-level keys are separated by '.'
top_ratio: # ratio of selected top samples
topk: # number of selected top samples
reverse: True # determine the sorting rule; if reverse=True, then sort in descending order
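# A hedged usage sketch of this selector; 'meta.score' is a hypothetical nested key:
# - topk_specified_field_selector:
#     field_key: 'meta.score'
#     topk: 1000       # keep the 1000 samples with the largest 'meta.score'
#     reverse: True    # sort in descending order so the largest values come first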
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
field_key: '' # the target key of the specified field whose value frequencies are counted; multi-level keys are separated by '.'
top_ratio: # ratio of selected top specified field values
topk: # number of selected top specified field values
reverse: True # determine the sorting rule; if reverse=True, then sort in descending order