Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
7c1fb5a
Minor simplification
ppinchuk Mar 9, 2026
d8f5292
Allow source and year in schema outputs
ppinchuk Mar 9, 2026
ebb244e
update column name
ppinchuk Mar 9, 2026
41b358b
Minor re-order for consistency
ppinchuk Mar 9, 2026
79ae125
Don't overwrite `source` and `ord_year` if they are already in the ou…
ppinchuk Mar 9, 2026
59efc80
Update gitignore
ppinchuk Mar 9, 2026
4ca8df9
`ord_year` -> `year`
ppinchuk Mar 9, 2026
b47742d
Update functionality
ppinchuk Mar 9, 2026
9ff53ea
Minor updates to context text
ppinchuk Mar 9, 2026
bad6980
Update context and tests
ppinchuk Mar 9, 2026
8810609
update
ppinchuk Mar 9, 2026
6b1bc11
Add test
ppinchuk Mar 9, 2026
9c45dd7
Adding source and year no longer responsibility of `_concat_scrape_re…
ppinchuk Mar 9, 2026
002401e
Add `parse_multi_doc_context_for_structured_data` method
ppinchuk Mar 9, 2026
8cc24f2
Minor update to function/arg names
ppinchuk Mar 9, 2026
d19d4b2
Allow plugins to specify whether they allow multi-doc contexts
ppinchuk Mar 9, 2026
bb9a53e
Update the info in the context
ppinchuk Mar 9, 2026
77f0d72
Bump ruff version
ppinchuk Mar 9, 2026
d38e84d
Move files
ppinchuk Mar 10, 2026
bd26a14
Add one-shot config
ppinchuk Mar 10, 2026
fc6e716
Add plugin config
ppinchuk Mar 10, 2026
2fc92fe
Bug fix
ppinchuk Mar 10, 2026
01b1d8c
Make `$qualitative_features` explicit
ppinchuk Mar 10, 2026
8869d7c
Update permissions
ppinchuk Mar 10, 2026
cd36b4f
Add example schema
ppinchuk Mar 10, 2026
bfb5b4d
Update permissions
ppinchuk Mar 10, 2026
271e518
Minor update
ppinchuk Mar 10, 2026
e40d087
Update message
ppinchuk Mar 10, 2026
aa3db96
Fixed handling of source inds
ppinchuk Mar 10, 2026
d5c6658
Add files for example
ppinchuk Mar 10, 2026
15231b4
Minor config updates
ppinchuk Mar 10, 2026
333b80c
Add top-level README
ppinchuk Mar 10, 2026
995a64f
Update text to explain new key
ppinchuk Mar 10, 2026
0c16673
Add README
ppinchuk Mar 10, 2026
983d077
Add local docs config
ppinchuk Mar 10, 2026
eca244f
Update tech key
ppinchuk Mar 10, 2026
ec582aa
Fix docs
ppinchuk Mar 10, 2026
e2e7528
PR review
ppinchuk Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

# VSCode
.vscode
*.code-workspace

# Environments
.env
Expand Down
39 changes: 38 additions & 1 deletion compass/extraction/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, documents=None, attrs=None):
@property
def text(self):
    """str: Text of every document, joined as one multi-doc context"""
    return self.multi_doc_context()

@property
def pages(self):
Expand Down Expand Up @@ -124,6 +124,38 @@ async def mark_doc_as_data_source(self, doc, out_fn_stem=None):
if out_fn_stem is not None:
await _move_file_to_out_dir(doc, out_fn_stem)

def multi_doc_context(self, attr_text_key=None):
    """Concatenate the documents into one labeled text blob.

    Each document is rendered with its source index followed by its
    content, and all sections are joined under a single
    multi-document header.

    Parameters
    ----------
    attr_text_key : str, optional
        The key used to look up the document's `.attrs` dictionary
        for the text to concatenate. If ``None``, the full document
        text is used for concatenation. By default, ``None``.

    Returns
    -------
    str
        Concatenated text representation of the documents in this
        context. Empty string if the context holds no documents.
    """
    if not self.documents:
        return ""

    doc_sections = []
    for ind, doc in enumerate(self.documents):
        content = _text_from_doc(doc, attr_text_key)
        doc_sections.append(
            f"# SOURCE INDEX #: {ind}\n# CONTENT #:\n{content}"
        )
    serialized = "\n\n".join(doc_sections)
    return f"## MULTI-DOCUMENT CONTEXT ##\n\n{serialized}"


async def _move_file_to_out_dir(doc, out_fn):
"""Move PDF or HTML text file to output directory"""
Expand Down Expand Up @@ -178,3 +210,8 @@ def _attrs_repr(attrs):

attrs = "\n".join(to_join)
return f"Attrs:\n{attrs}"


def _text_from_doc(doc, key):
"""Get text from key or full doc"""
return doc.text if key is None else doc.attrs[key]
28 changes: 12 additions & 16 deletions compass/extraction/water/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from compass.extraction import extract_date
from compass.plugin import BaseExtractionPlugin, register_plugin
from compass.utilities.enums import LLMTasks
from compass.utilities.parsing import extract_ord_year_from_doc_attrs
from compass.utilities.parsing import extract_year_from_doc_attrs
from compass.exceptions import COMPASSRuntimeError
from compass.extraction.water.parse import StructuredWaterParser

Expand Down Expand Up @@ -271,18 +271,17 @@ def save_structured_data(cls, doc_infos, out_dir):

def _set_data_year(data_df, extraction_context):
    """Set the "year" column in the data DataFrame.

    The year is pulled from each document's ``.attrs``; if several
    documents yield a year, the maximum (most recent) one is used.

    Parameters
    ----------
    data_df : pd.DataFrame
        DataFrame in which to set the "year" column (mutated in place).
    extraction_context : iterable
        Iterable of documents whose ``.attrs`` may contain a year.

    Returns
    -------
    pd.DataFrame
        The input DataFrame with the "year" column set (``None`` when
        no document provided a year).
    """
    extracted = (
        extract_year_from_doc_attrs(doc.attrs) for doc in extraction_context
    )
    # drop falsy results (documents with no detectable year)
    years = [year for year in extracted if year]
    # TODO: is `max` the right one to use here?
    data_df["year"] = max(years) if years else None
    return data_df


Expand All @@ -291,10 +290,7 @@ def _set_data_sources(data_df, extraction_context):
sources = filter(
None, [doc.attrs.get("source") for doc in extraction_context]
)
if not sources:
data_df["source"] = None
else:
data_df["source"] = " ;\n".join(sources)
data_df["source"] = " ;\n".join(sources) or None
return data_df


Expand Down
2 changes: 1 addition & 1 deletion compass/llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(

@cached_property
def text_splitter(self):
""":class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter`: Text splitter for ordinance text""" # noqa: W505, E501
"""`TextSplitter <https://reference.langchain.com/python/langchain-text-splitters/base/TextSplitter>`_: Text splitter for ordinance text""" # noqa: W505, E501
return RecursiveCharacterTextSplitter(
RTS_SEPARATORS,
chunk_size=self.text_splitter_chunk_size,
Expand Down
16 changes: 15 additions & 1 deletion compass/plugin/one_shot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901
may provide a custom system prompt if you want to provide
more specific instructions to the LLM for the structured
data extraction step.
- `allow_multi_doc_extraction`: Boolean flag indicating
whether to allow multiple documents to be used for the
extraction context simultaneously. By default, ``False``,
which means the first document that returns some extracted
data will be marked as the source.

tech : str
Technology identifier to use for the plugin (e.g., "wind",
Expand All @@ -135,6 +140,9 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901
if isinstance(config["schema"], str):
config["schema"] = load_config(config["schema"])

config["qual_feats"] = {
f.casefold() for f in config["schema"].pop("$qualitative_features", [])
}
text_collectors = _collectors_from_config(config)
text_extractors = _extractors_from_config(
config, in_label=text_collectors[-1].OUT_LABEL, tech=tech
Expand All @@ -147,6 +155,11 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
SCHEMA = config["schema"]
"""dict: Schema for the output of the text extraction step"""

ALLOW_MULTI_DOC_EXTRACTION = config.get(
"allow_multi_doc_extraction", False
)
"""bool: Whether to allow extraction over multiple documents"""

IDENTIFIER = tech
"""str: Identifier for extraction task """

Expand All @@ -166,7 +179,7 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
"""Classes for parsing structured ordinance data from text"""

QUERY_TEMPLATES = [] # set by user or LLM-generated
"""List: List of search engine query templates"""
"""list: List of search engine query templates"""

WEBSITE_KEYWORDS = {} # set by user or LLM-generated
"""dict: Keyword weight mapping for link crawl prioritization"""
Expand Down Expand Up @@ -459,6 +472,7 @@ class PluginParser(SchemaOrdinanceParser):
IN_LABEL = in_label
OUT_LABEL = "structured_data"
SCHEMA = config["schema"]
QUALITATIVE_FEATURES = config["qual_feats"]
DATA_TYPE_SHORT_DESC = config.get("data_type_short_desc")
SYSTEM_PROMPT = new_sys_prompt

Expand Down
27 changes: 19 additions & 8 deletions compass/plugin/one_shot/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,12 @@ def SCHEMA(self): # noqa: N802
"""dict: Extraction schema"""
raise NotImplementedError

@property
@abstractmethod
def QUALITATIVE_FEATURES(self):  # noqa: N802
    """set: **Lowercase** feature names of qualitative features

    Names must already be casefolded: they are compared against the
    casefolded schema feature names to mark which output rows are
    non-quantitative.
    """
    raise NotImplementedError

async def parse(self, text):
"""Parse text and extract structured data

Expand Down Expand Up @@ -352,19 +358,24 @@ def _to_dataframe(self, data):
output_items = self.SCHEMA["properties"]["outputs"]["items"]
all_features = output_items["properties"]["feature"]["enum"]

known_qual_features = set(
self.SCHEMA.get("$definitions", {})
.get("qualitative_restrictions", {})
.get("properties", {})
)
quant = [feat not in known_qual_features for feat in all_features]
quant = [
feat.casefold() not in self.QUALITATIVE_FEATURES
for feat in all_features
]

df = pd.DataFrame(data)
full_df = pd.DataFrame(
{"feature": all_features, "quantitative": quant}
)
full_df = full_df.merge(df, on="feature", how="left")

return full_df[
["feature", "value", "units", "section", "summary", "quantitative"]
possible_out_cols = [
"value",
"units",
"summary",
"year",
"section",
"source",
]
out_cols = [col for col in possible_out_cols if col in full_df.columns]
return full_df[["feature", *out_cols, "quantitative"]]
Loading
Loading