allenai · rodneykinney · Oct 18, 2023 · Nov 27, 2023 · Nov 27, 2023 · Dec 20, 2023
diff --git a/configs/c4-replication/mixer.yaml b/configs/c4-replication/mixer.yaml
@@ -34,15 +34,16 @@ streams:
         # exclude documents that contain one or more naughty words
         - $.attributes[?(@.c4_v2__c4_v2__has_naughty_word && @.c4_v2__c4_v2__has_naughty_word[0] && @.c4_v2__c4_v2__has_naughty_word[0][2] > 0.5)]
 
-    span_replacement:
-      # remove lines that do not end in punctuation
-      - span: $.attributes.c4_v2__c4_v2__lines_with_no_ending_punctuation
-        min_score: 0.5
-        replacement: ""
-
-      # remove lines that are too short (less than 3 words as defined by C4 rules)
-      - span: $.attributes.c4_v2__c4_v2__lines_with_too_few_words
-        min_score: 0.5
-        replacement: ""
+    text_modification:
+      span_replacement:
+        # remove lines that do not end in punctuation
+        - span: $.attributes.c4_v2__c4_v2__lines_with_no_ending_punctuation
+          min_score: 0.5
+          replacement: ""
+
+        # remove lines that are too short (less than 3 words as defined by C4 rules)
+        - span: $.attributes.c4_v2__c4_v2__lines_with_too_few_words
+          min_score: 0.5
+          replacement: ""
 
 processes: 8
diff --git a/docs/examples/wikipedia-mixer.yaml b/docs/examples/wikipedia-mixer.yaml
@@ -17,9 +17,10 @@ streams:
           - "$.attributes[?(@.exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] <= 0.5)]"
           - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
 
-      span_replacement:
-        - span: "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en"
-          min_score: 0.1
-          replacement: ''
+      text_modification:
+        span_replacement:
+          - span: "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en"
+            min_score: 0.1
+            replacement: ''
 
 processes: 1
diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -157,15 +157,17 @@ Further, we override the number of processes to use to 96 using the `--processes
           "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
         ]
       },
-      # span replacement allows you to replace spans of text with a different string
-      "span_replacement": [
-        {
-          # remove paragraphs whose not-English cld2 socre is below 0.9 in a document
-          "span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
-          "min_score": 0.1,
-          "replacement": ""
-        }
-      ]
+      "text_modification": {
+        # span replacement allows you to replace spans of text with a different string
+        "span_replacement": [
+          {
+            # remove paragraphs whose not-English cld2 score is at least 0.1 in a document
+            "span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
+            "min_score": 0.1,
+            "replacement": ""
+          }
+        ]
+      }
     }
   ],
   # this process option is overridden by the command line flag

diff --git a/docs/mixer.md b/docs/mixer.md
@@ -22,10 +22,11 @@ The following parameters are supported either via CLI (e.g. `dolma mix --paramet
 |`streams[].output.discard_fields`|No| Top-level fields in the `discard_fields` list will be dropped from the output documents. |
 |`streams[].filter.include`|No| Optional content-based filtering. Default = keep everything. Documents are retained if they match any of the `include` patterns (or if no `include` patterns are specified) AND if they match none of the `exclude` patterns. Pattern syntax is [jsonpath](https://support.smartbear.com/alertsite/docs/monitors/api/endpoint/jsonpath.html#filters). |
 |`streams[].filter.exclude`|No| Optional content-based filtering. Default = keep everything. Documents are retained if they match any of the `include` patterns (or if no `include` patterns are specified) AND if they match none of the `exclude` patterns. Pattern syntax is [jsonpath](https://support.smartbear.com/alertsite/docs/monitors/api/endpoint/jsonpath.html#filters). |
-|`streams[].span_replacement`|No| A list of objects specifying spans of text to be replaced. |
-|`streams[].span_replacement[].span`|No| A json-path expression for an attribute that contains an array of spans. Each span should be list of length three:  `[start, end, score]`. |
-|`streams[].span_replacement[].min_score`|No| If the span score is less than this value, the span will not be replaced. |
-|`streams[].span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. |
+|`streams[].text_modification.trim_whitespace`|No| Remove leading and trailing whitespace from document text. |
+|`streams[].text_modification.minimum_text_length`|No| Skip writing the document if the final text is shorter than this size (in bytes). |
+|`streams[].text_modification.span_replacement[].span`|No| A json-path expression for an attribute that contains an array of spans. Each span should be a list of length three: `[start, end, score]`. |
+|`streams[].text_modification.span_replacement[].min_score`|No| If the span score is less than this value, the span will not be replaced. |
+|`streams[].text_modification.span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. |
 |`work_dir.input`|No| Path to a local scratch directory where temporary input files can be placed. If not provided, Dolma will make one for you and delete it upon completion. |
 |`work_dir.output`|No| Path to a local scratch directory where temporary output files can be placed. If not provided, Dolma will make one for you and delete it upon completion. |
 |`processes`|No| Number of processes to use for mixing. By default 1 process is used. |

diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py
@@ -31,6 +31,17 @@ class SpanReplacementConfig:
     replacement: str = field(default="", help="Replacement for the span")
 
 
+@dataclass
+class TextModificationConfig:
+    span_replacement: List[SpanReplacementConfig] = field(default=[], help="Configuration for replacing spans.")
+    trim_whitespace: bool = field(
+        default=False, help="If true, trim leading and trailing whitespace from text (after span replacement)"
+    )
+    minimum_text_length: int = field(
+        default=0, help="Skip writing the document if the final text is shorter than this size (in bytes)"
+    )
+
+
 @dataclass
 class StreamConfig:
     name: str = field(help="Name of the stream. Required.")
@@ -42,7 +53,9 @@ class StreamConfig:
     filter: Optional[FilterConfig] = field(  # pyright: ignore
         default=None, help="Configuration for filtering documents."
     )
-    span_replacement: List[SpanReplacementConfig] = field(default=[], help="Configuration for replacing spans.")
+    text_modification: Optional[TextModificationConfig] = field(
+        default=None, help="Configuration for modifying the document text"
+    )
 
 
 @dataclass
@@ -83,17 +96,23 @@ def run(cls, parsed_config: MixerConfig):
                         "exclude": [str(i) for i in stream_config.filter.exclude],
                     }
 
-                for span_replacement in stream_config.span_replacement:
-                    stream_config_dict.setdefault("span_replacement", []).append(
-                        {
-                            "span": str(span_replacement.span),
-                            "min_score": float(span_replacement.min_score),
-                            "replacement": str(span_replacement.replacement),
-                        }
-                    )
-
-                if "span_replacement" not in stream_config_dict and "filter" not in stream_config_dict:
-                    raise DolmaConfigError("Either `filter` or `span_replacement` must be specified")
+                if stream_config.text_modification is not None:
+                    text_modification_dict = {
+                        "trim_whitespace": stream_config.text_modification.trim_whitespace,
+                        "minimum_text_length": stream_config.text_modification.minimum_text_length,
+                    }
+                    stream_config_dict["text_modification"] = text_modification_dict
+                    for span_replacement in stream_config.text_modification.span_replacement:
+                        text_modification_dict.setdefault("span_replacement", []).append(
+                            {
+                                "span": str(span_replacement.span),
+                                "min_score": float(span_replacement.min_score),
+                                "replacement": str(span_replacement.replacement),
+                            }
+                        )
+
+                if "text_modification" not in stream_config_dict and "filter" not in stream_config_dict:
+                    raise DolmaConfigError("Either `filter` or `text_modification` must be specified")
 
                 # perform some path validation to make sure we don't call the mixer with invalid config
                 total_matching_documents = 0