# Process config example including:
# - all global arguments
# - all ops and their arguments
# global parameters
project_name: 'all' # project name to distinguish your configs
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file, with optional weights (0.0-1.0); the default weight is 1.0.
# Accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
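# A hedged sketch of the weighted-mixture format described above; the two paths below are hypothetical placeholders:
# dataset_path: '0.5 /path/to/dataset-a.jsonl 1.0 /path/to/dataset-b.jsonl'
# Here samples from dataset-a are weighted at 0.5 while dataset-b keeps the default weight of 1.0.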
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # Shard size of the exported dataset in bytes. By default it's 0, which means the whole dataset is exported into a single file. If it's set to a positive number, the exported dataset will be split into several shards, and the max size of each shard won't be larger than export_shard_size.
export_in_parallel: false # Whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to the IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
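# For instance (a hedged sketch, not a recommended value): to cap each exported shard at roughly 1 GB, set the size in bytes:
# export_shard_size: 1073741824  # 1 GB = 1024 * 1024 * 1024 bytes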
np: 4 # number of subprocesses to process your dataset
text_keys: 'content' # the key name of the field storing the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...
# Note: currently, we only support specifying ONE key for each op; for cases requiring multiple keys, users can specify the op multiple times. Only the first key of `text_keys` will be used when multiple keys are set.
suffixes: [] # the suffixes of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
ds_cache_dir: null # cache dir for Hugging Face datasets. By default it's the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir.
use_checkpoint: false # whether to use checkpoint management to save the latest version of the dataset to the work dir during processing. Rerunning the same config will reload the checkpoint and skip the ops before it. Cache will be disabled when using checkpoint. If the args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
temp_dir: null # the path to the temp directory to store intermediate caches when cache is disabled; these cache files will be removed on-the-fly. By default, it's None, so the temp dir will be specified by the system. NOTICE: you should be cautious when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
open_tracer: false # whether to enable the tracer to trace changes during processing. It might take more time when the tracer is enabled.
op_list_to_trace: [] # only ops in this list will be traced by the tracer. If it's empty, all ops will be traced. Only available when the tracer is enabled.
trace_num: 10 # number of samples to show the differences between datasets before and after each op. Only available when the tracer is enabled.
op_fusion: false # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce memory requirements slightly and speed up the whole process.
cache_compress: null # The compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend enabling this argument when your input dataset is larger than tens of GB and your disk space is limited.
# for multimodal data processing
image_key: 'images' # Key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>' # The special token that represents an image in the text. By default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios' # Key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>' # The special token that represents an audio in the text. By default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. By default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
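# Illustrative sketch of how these special tokens appear inside a sample's text field (the sentence itself is made up):
# text: '<__dj__image> A photo of a cat on a sofa. <|__dj__eoc|> <__dj__audio> A short meow. <|__dj__eoc|>'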
# for distributed processing
executor_type: default # Type of executor, support "default" or "ray" for now.
ray_address: auto # The address of the Ray cluster.
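# A hedged sketch of a distributed setup; the address below is a hypothetical placeholder for an existing Ray cluster:
# executor_type: ray
# ray_address: 'ray://<head-node-ip>:10001'  # or keep 'auto' to connect to a locally started cluster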
# only for data analysis
save_stats_in_one_file: false # whether to store all stats results in one file
# process schedule: a list of several process operators with their arguments
process:
# Mapper ops. Most of these ops need no arguments.
- chinese_convert_mapper: # convert Chinese text among Traditional Chinese, Simplified Chinese and Japanese Kanji.
mode: 's2t' # Choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
- clean_email_mapper: # remove emails from text.
- clean_html_mapper: # remove HTML formats from text.
- clean_ip_mapper: # remove IP addresses from text.
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in LaTeX text.
- fix_unicode_mapper: # fix unicode errors in text.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all enabled augmentation methods sequentially. If it's False, each enabled augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be aug_num augmented samples generated in total. If it's False, there will be (aug_num * #enabled_aug_methods) augmented samples generated.
delete_random_word: false # whether to enable the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
swap_random_word: false # whether to enable the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
spelling_error_word: false # whether to enable the augmentation method of simulating spelling errors for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
split_random_word: false # whether to enable the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
keyboard_error_char: false # whether to enable the augmentation method of simulating keyboard errors for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
ocr_error_char: false # whether to enable the augmentation method of simulating OCR errors for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
delete_random_char: false # whether to enable the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
swap_random_char: false # whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
insert_random_char: false # whether to enable the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
- nlpcda_zh_mapper: # simply augment texts in Chinese based on the nlpcda library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all enabled augmentation methods sequentially. If it's False, each enabled augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be aug_num augmented samples generated in total. If it's False, there will be (aug_num * #enabled_aug_methods) augmented samples generated.
replace_similar_word: false # whether to enable the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
replace_homophone_char: false # whether to enable the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
delete_random_char: false # whether to enable the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to enable the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
- remove_bibliography_mapper: # remove bibliography from LaTeX text.
- remove_comments_mapper: # remove comments from LaTeX text, code, etc.
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
inline: true # whether to remove inline comments
multiline: true # whether to remove multiline comments
- remove_header_mapper: # remove header texts from LaTeX text.
drop_no_head: true # whether to drop sample texts without headers
- remove_long_words_mapper: # remove overly long words from text.
min_len: 1 # the min word length to keep words.
max_len: 128 # the max word length to keep words.
- remove_non_chinese_character_mapper: # remove non-Chinese characters in text samples.
keep_alphabet: true # whether to keep alphabet characters
keep_number: true # whether to keep numbers
keep_punc: true # whether to keep punctuation
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
min_col: 2 # the min num of columns in tables to remove
max_col: 20 # the max num of columns in tables to remove
- remove_words_with_incorrect_substrings_mapper: # remove words with incorrect substrings from text.
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
substrings: ['http', 'www', '.com', 'href', '//'] # incorrect substrings to remove
- sentence_split_mapper: # split text into multiple sentences and join them with '\n'
lang: 'en' # split text in what language
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
# Filter ops
- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.9 # the max ratio of filter range
- average_line_length_filter: # filter text with the average length of lines out of specific range.
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- character_repetition_filter: # filter text with the character repetition ratio out of specific range
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- face_area_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
upsample_num_times: 0 # optional argument passing to the underlying dlib face detector
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
max_ratio: 0.0045 # the max ratio to filter text
flagged_words_dir: ./assets # directory to store flagged words dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (the ratio of width to height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
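# Worked example for the range above: a 1920x1080 image has r = 1920/1080 ≈ 1.78, which falls within [0.333, 3.0] and passes;
# a 300x2000 banner has r = 0.15 and fails, so whether the sample is kept then depends on the any/all strategy.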
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
min_height: 200 # the min height of height filter range
max_height: 5000 # the max height of height filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_matching_filter: # filter samples according to the matching score between image and text.
hf_blip: Salesforce/blip-itm-base-coco # name of used Hugging Face blip
min_score: 0.003 # the min matching score of filter range
max_score: 1.0 # the max matching score of filter range
horizontal_flip: false # Flip image horizontally (left to right).
vertical_flip: false # Flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_similarity_filter: # filter samples according to the similarity between image and text.
hf_clip: openai/clip-vit-base-patch32 # name of used Hugging Face clip
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
horizontal_flip: false # Flip image horizontally (left to right).
vertical_flip: false # Flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in a specific language, keeping samples with language identification scores larger than a specific min value
lang: en # keep text in what language
min_score: 0.8 # the min language score to keep text
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- perplexity_filter: # filter text with perplexity score out of specific range
lang: en # compute perplexity in what language
max_ppl: 1500 # the max perplexity score to filter text
- special_characters_filter: # filter text with special-char ratio out of specific range
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.25 # the max ratio of filter range
- stopwords_filter: # filter text with stopword ratio smaller than a specific min value
lang: en # consider stopwords in what language
tokenization: false # whether to use model to tokenize documents
min_ratio: 0.3 # the min ratio to filter text
stopwords_dir: ./assets # directory to store stopwords dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- text_action_filter: # filter text according to the number of action verbs
lang: en # consider the words in what language
min_action_num: 1 # texts whose number of action verbs is less than this min value will be filtered
- text_entity_dependency_filter: # filter text without non-independent entity nouns
lang: en # consider the words in what language
min_dependency_num: 1 # the min number of adjacent edges of a non-independent noun in the dependency tree
any_or_all: any # keep this sample when any/all entity nouns are non-independent
- text_length_filter: # filter text with length out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- token_num_filter: # filter text with total token number out of specific range
hf_tokenizer: EleutherAI/pythia-6.9b-deduped # name of used Hugging Face tokenizer
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- word_repetition_filter: # filter text with the word repetition ratio out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
rep_len: 10 # repetition length for word-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffixes of samples that will be kept. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- specified_field_filter: # filter text with the specified field info out of specific range
field_key: '' # the target key of the specified field; multi-level keys are separated by '.'
target_value: [] # the range of the specified field values corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
field_key: '' # the target key of the specified numeric field; multi-level keys are separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op
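# A hedged sketch of the '.'-separated field_key syntax; the 'meta.language' field and its values are hypothetical:
# - specified_field_filter:
#     field_key: 'meta.language'   # reaches sample['meta']['language']
#     target_value: ['en', 'zh']   # keep samples whose value falls in this list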
# Deduplicator ops
- document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
- document_minhash_deduplicator: # deduplicate text samples using MinHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
jaccard_threshold: 0.7 # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
num_bands: null # number of bands in LSH. By default it's None, and it will be determined by an optimal-parameter computation algorithm that minimizes the weighted sum of the probabilities of false positives and false negatives
num_rows_per_band: null # number of rows in each band in LSH. By default it's None, and it will be determined by an optimal-parameter computation algorithm
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing minhash.
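# Background note on the band/row trade-off (standard MinHash-LSH math, not specific to this implementation):
# with b bands of r rows each (b * r = num_permutations), two texts with Jaccard similarity s become candidate
# duplicates with probability 1 - (1 - s^r)^b, so larger r makes matching stricter and larger b makes it more permissive.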
- document_simhash_deduplicator: # deduplicate text samples using SimHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 6 # window size of shingling
num_blocks: 6 # number of blocks in SimHash computing
hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
- image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
# Selector ops
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
field_key: '' # the target key of the field to sort by; multi-level keys are separated by '.'
top_ratio: # ratio of selected top samples
topk: # number of selected top samples
reverse: True # determine the sorting rule; if reverse=True, then sort in descending order
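# A hedged usage sketch of this selector; 'meta.score' is a hypothetical nested key:
# - topk_specified_field_selector:
#     field_key: 'meta.score'
#     topk: 1000       # keep the 1000 samples with the largest 'meta.score'
#     reverse: True    # sort in descending order so the largest values come first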
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
field_key: '' # the target key of the specified field whose value frequencies are counted; multi-level keys are separated by '.'
top_ratio: # ratio of selected top specified field values
topk: # number of selected top specified field values
reverse: True # determine the sorting rule; if reverse=True, then sort in descending order