Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
7c1fb5a
Minor simplification
ppinchuk Mar 9, 2026
d8f5292
Allow source and year in schema outputs
ppinchuk Mar 9, 2026
ebb244e
update column name
ppinchuk Mar 9, 2026
41b358b
Minor re-order for consistency
ppinchuk Mar 9, 2026
79ae125
Don't overwrite `source` and `ord_year` if they are already in the ou…
ppinchuk Mar 9, 2026
59efc80
Update gitignore
ppinchuk Mar 9, 2026
4ca8df9
`ord_year` -> `year`
ppinchuk Mar 9, 2026
b47742d
Update functionality
ppinchuk Mar 9, 2026
9ff53ea
Minor updates to context text
ppinchuk Mar 9, 2026
bad6980
Update context and tests
ppinchuk Mar 9, 2026
8810609
update
ppinchuk Mar 9, 2026
6b1bc11
Add test
ppinchuk Mar 9, 2026
9c45dd7
Adding source and year no longer responsibility of `_concat_scrape_re…
ppinchuk Mar 9, 2026
002401e
Add `parse_multi_doc_context_for_structured_data` method
ppinchuk Mar 9, 2026
8cc24f2
Minor update to function/arg names
ppinchuk Mar 9, 2026
d19d4b2
Allow plugins to specify whether they allow multi-doc contexts
ppinchuk Mar 9, 2026
bb9a53e
Update the info in the context
ppinchuk Mar 9, 2026
77f0d72
Bump ruff version
ppinchuk Mar 9, 2026
d38e84d
Move files
ppinchuk Mar 10, 2026
bd26a14
Add one-shot config
ppinchuk Mar 10, 2026
fc6e716
Add plugin config
ppinchuk Mar 10, 2026
2fc92fe
Bug fix
ppinchuk Mar 10, 2026
01b1d8c
Make `$qualitative_features` explicit
ppinchuk Mar 10, 2026
8869d7c
Update permissions
ppinchuk Mar 10, 2026
cd36b4f
Add example schema
ppinchuk Mar 10, 2026
bfb5b4d
Update permissions
ppinchuk Mar 10, 2026
271e518
Minor update
ppinchuk Mar 10, 2026
e40d087
Update message
ppinchuk Mar 10, 2026
aa3db96
Fixed handling of source inds
ppinchuk Mar 10, 2026
d5c6658
Add files for example
ppinchuk Mar 10, 2026
15231b4
Minor config updates
ppinchuk Mar 10, 2026
333b80c
Add top-level README
ppinchuk Mar 10, 2026
995a64f
Update text to explain new key
ppinchuk Mar 10, 2026
0c16673
Add README
ppinchuk Mar 10, 2026
983d077
Add local docs config
ppinchuk Mar 10, 2026
eca244f
Update tech key
ppinchuk Mar 10, 2026
ec582aa
Fix docs
ppinchuk Mar 10, 2026
e2e7528
PR review
ppinchuk Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

# VSCode
.vscode
*.code-workspace

# Environments
.env
Expand Down
39 changes: 38 additions & 1 deletion compass/extraction/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, documents=None, attrs=None):
@property
def text(self):
    """str: Text of every document, joined as one multi-doc context"""
    return self.multi_doc_context()

@property
def pages(self):
Expand Down Expand Up @@ -124,6 +124,38 @@ async def mark_doc_as_data_source(self, doc, out_fn_stem=None):
if out_fn_stem is not None:
await _move_file_to_out_dir(doc, out_fn_stem)

def multi_doc_context(self, attr_text_key=None):
    """Concatenate the documents into one labeled text blob.

    Each document is rendered with its source index followed by its
    content, and all sections are joined under a single
    multi-document header.

    Parameters
    ----------
    attr_text_key : str, optional
        The key used to look up the document's `.attrs` dictionary
        for the text to concatenate. If ``None``, the full document
        text is used for concatenation. By default, ``None``.

    Returns
    -------
    str
        Concatenated text representation of the documents in this
        context. Empty string if the context holds no documents.
    """
    if not self.documents:
        return ""

    doc_sections = []
    for ind, doc in enumerate(self.documents):
        content = _text_from_doc(doc, attr_text_key)
        doc_sections.append(
            f"# SOURCE INDEX #: {ind}\n# CONTENT #:\n{content}"
        )
    serialized = "\n\n".join(doc_sections)
    return f"## MULTI-DOCUMENT CONTEXT ##\n\n{serialized}"


async def _move_file_to_out_dir(doc, out_fn):
"""Move PDF or HTML text file to output directory"""
Expand Down Expand Up @@ -178,3 +210,8 @@ def _attrs_repr(attrs):

attrs = "\n".join(to_join)
return f"Attrs:\n{attrs}"


def _text_from_doc(doc, key):
"""Get text from key or full doc"""
return doc.text if key is None else doc.attrs[key]
28 changes: 12 additions & 16 deletions compass/extraction/water/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from compass.extraction import extract_date
from compass.plugin import BaseExtractionPlugin, register_plugin
from compass.utilities.enums import LLMTasks
from compass.utilities.parsing import extract_ord_year_from_doc_attrs
from compass.utilities.parsing import extract_year_from_doc_attrs
from compass.exceptions import COMPASSRuntimeError
from compass.extraction.water.parse import StructuredWaterParser

Expand Down Expand Up @@ -271,18 +271,17 @@ def save_structured_data(cls, doc_infos, out_dir):

def _set_data_year(data_df, extraction_context):
    """Set the "year" column in the data DataFrame.

    The year is pulled from each document's ``.attrs``; if several
    documents yield a year, the maximum (most recent) one is used.

    Parameters
    ----------
    data_df : pd.DataFrame
        DataFrame in which to set the "year" column (mutated in place).
    extraction_context : iterable
        Iterable of documents whose ``.attrs`` may contain a year.

    Returns
    -------
    pd.DataFrame
        The input DataFrame with the "year" column set (``None`` when
        no document provided a year).
    """
    extracted = (
        extract_year_from_doc_attrs(doc.attrs) for doc in extraction_context
    )
    # drop falsy results (documents with no detectable year)
    years = [year for year in extracted if year]
    # TODO: is `max` the right one to use here?
    data_df["year"] = max(years) if years else None
    return data_df


Expand All @@ -291,10 +290,7 @@ def _set_data_sources(data_df, extraction_context):
sources = filter(
None, [doc.attrs.get("source") for doc in extraction_context]
)
if not sources:
data_df["source"] = None
else:
data_df["source"] = " ;\n".join(sources)
data_df["source"] = " ;\n".join(sources) or None
return data_df


Expand Down
2 changes: 1 addition & 1 deletion compass/llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(

@cached_property
def text_splitter(self):
""":class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter`: Text splitter for ordinance text""" # noqa: W505, E501
"""`TextSplitter <https://reference.langchain.com/python/langchain-text-splitters/base/TextSplitter>`_: Text splitter for ordinance text""" # noqa: W505, E501
return RecursiveCharacterTextSplitter(
RTS_SEPARATORS,
chunk_size=self.text_splitter_chunk_size,
Expand Down
16 changes: 15 additions & 1 deletion compass/plugin/one_shot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901
may provide a custom system prompt if you want to provide
more specific instructions to the LLM for the structured
data extraction step.
- `allow_multi_doc_extraction`: Boolean flag indicating
whether to allow multiple documents to be used for the
extraction context simultaneously. By default, ``False``,
which means the first document that returns some extracted
data will be marked as the source.

tech : str
Technology identifier to use for the plugin (e.g., "wind",
Expand All @@ -135,6 +140,9 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901
if isinstance(config["schema"], str):
config["schema"] = load_config(config["schema"])

config["qual_feats"] = {
f.casefold() for f in config["schema"].pop("$qualitative_features", [])
}
text_collectors = _collectors_from_config(config)
text_extractors = _extractors_from_config(
config, in_label=text_collectors[-1].OUT_LABEL, tech=tech
Expand All @@ -147,6 +155,11 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
SCHEMA = config["schema"]
"""dict: Schema for the output of the text extraction step"""

ALLOW_MULTI_DOC_EXTRACTION = config.get(
"allow_multi_doc_extraction", False
)
"""bool: Whether to allow extraction over multiple documents"""

IDENTIFIER = tech
"""str: Identifier for extraction task """

Expand All @@ -166,7 +179,7 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
"""Classes for parsing structured ordinance data from text"""

QUERY_TEMPLATES = [] # set by user or LLM-generated
"""List: List of search engine query templates"""
"""list: List of search engine query templates"""

WEBSITE_KEYWORDS = {} # set by user or LLM-generated
"""dict: Keyword weight mapping for link crawl prioritization"""
Expand Down Expand Up @@ -459,6 +472,7 @@ class PluginParser(SchemaOrdinanceParser):
IN_LABEL = in_label
OUT_LABEL = "structured_data"
SCHEMA = config["schema"]
QUALITATIVE_FEATURES = config["qual_feats"]
DATA_TYPE_SHORT_DESC = config.get("data_type_short_desc")
SYSTEM_PROMPT = new_sys_prompt

Expand Down
27 changes: 19 additions & 8 deletions compass/plugin/one_shot/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,12 @@ def SCHEMA(self): # noqa: N802
"""dict: Extraction schema"""
raise NotImplementedError

@property
@abstractmethod
def QUALITATIVE_FEATURES(self):  # noqa: N802
    """set: **Lowercase** feature names of qualitative features

    Names must already be casefolded: they are compared against the
    casefolded schema feature names to mark which output rows are
    non-quantitative.
    """
    raise NotImplementedError

async def parse(self, text):
"""Parse text and extract structured data

Expand Down Expand Up @@ -352,19 +358,24 @@ def _to_dataframe(self, data):
output_items = self.SCHEMA["properties"]["outputs"]["items"]
all_features = output_items["properties"]["feature"]["enum"]

known_qual_features = set(
self.SCHEMA.get("$definitions", {})
.get("qualitative_restrictions", {})
.get("properties", {})
)
quant = [feat not in known_qual_features for feat in all_features]
quant = [
feat.casefold() not in self.QUALITATIVE_FEATURES
for feat in all_features
]

df = pd.DataFrame(data)
full_df = pd.DataFrame(
{"feature": all_features, "quantitative": quant}
)
full_df = full_df.merge(df, on="feature", how="left")

return full_df[
["feature", "value", "units", "section", "summary", "quantitative"]
possible_out_cols = [
"value",
"units",
"summary",
"year",
"section",
"source",
]
out_cols = [col for col in possible_out_cols if col in full_df.columns]
return full_df[["feature", *out_cols, "quantitative"]]
Loading
Loading