diff --git a/.gitignore b/.gitignore index 25e3f8c0..9c9a4ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # VSCode .vscode +*.code-workspace # Environments .env diff --git a/compass/extraction/context.py b/compass/extraction/context.py index 7b6b9c83..f70aaa28 100644 --- a/compass/extraction/context.py +++ b/compass/extraction/context.py @@ -40,7 +40,7 @@ def __init__(self, documents=None, attrs=None): @property def text(self): """str: Concatenated text from all documents""" - return "\n\n".join(doc.text for doc in self.documents) + return self.multi_doc_context() @property def pages(self): @@ -124,6 +124,38 @@ async def mark_doc_as_data_source(self, doc, out_fn_stem=None): if out_fn_stem is not None: await _move_file_to_out_dir(doc, out_fn_stem) + def multi_doc_context(self, attr_text_key=None): + """Get concatenated text representation of documents + + This method creates a concatenated text representation of the + documents in this context, optionally pulling the text from the + documents' `attr_text_key`. + + Parameters + ---------- + attr_text_key : str, optional + The key used to look up the document's `.attrs` dictionary + for the text to concatenate. If ``None``, the full document + text is used for concatenation. + + Returns + ------- + str + Concatenated text representation of the documents in this + context. 
+ """ + if not self.documents: + return "" + + serialized = "\n\n".join( + ( + f"# SOURCE INDEX #: {ind}\n" + f"# CONTENT #:\n{_text_from_doc(doc, attr_text_key)}" + ) + for ind, doc in enumerate(self.documents) + ) + return f"## MULTI-DOCUMENT CONTEXT ##\n\n{serialized}" + async def _move_file_to_out_dir(doc, out_fn): """Move PDF or HTML text file to output directory""" @@ -178,3 +210,8 @@ def _attrs_repr(attrs): attrs = "\n".join(to_join) return f"Attrs:\n{attrs}" + + +def _text_from_doc(doc, key): + """Get text from key or full doc""" + return doc.text if key is None else doc.attrs[key] diff --git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index c1826e6d..c1bb1b9e 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -11,7 +11,7 @@ from compass.extraction import extract_date from compass.plugin import BaseExtractionPlugin, register_plugin from compass.utilities.enums import LLMTasks -from compass.utilities.parsing import extract_ord_year_from_doc_attrs +from compass.utilities.parsing import extract_year_from_doc_attrs from compass.exceptions import COMPASSRuntimeError from compass.extraction.water.parse import StructuredWaterParser @@ -271,18 +271,17 @@ def save_structured_data(cls, doc_infos, out_dir): def _set_data_year(data_df, extraction_context): """Set the ordinance year column in the data DataFrame""" - years = filter( - None, - [ - extract_ord_year_from_doc_attrs(doc.attrs) - for doc in extraction_context - ], + years = list( + filter( + None, + [ + extract_year_from_doc_attrs(doc.attrs) + for doc in extraction_context + ], + ) ) - if not years: - data_df["ord_year"] = None - else: - # TODO: is `max` the right one to use here? - data_df["ord_year"] = max(years) + # TODO: is `max` the right one to use here? 
+ data_df["year"] = max(years) if years else None return data_df @@ -291,10 +290,7 @@ def _set_data_sources(data_df, extraction_context): sources = filter( None, [doc.attrs.get("source") for doc in extraction_context] ) - if not sources: - data_df["source"] = None - else: - data_df["source"] = " ;\n".join(sources) + data_df["source"] = " ;\n".join(sources) or None return data_df diff --git a/compass/llm/config.py b/compass/llm/config.py index 15a7a902..487f14a5 100644 --- a/compass/llm/config.py +++ b/compass/llm/config.py @@ -68,7 +68,7 @@ def __init__( @cached_property def text_splitter(self): - """:class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter`: Text splitter for ordinance text""" # noqa: W505, E501 + """`TextSplitter `_: Text splitter for ordinance text""" # noqa: W505, E501 return RecursiveCharacterTextSplitter( RTS_SEPARATORS, chunk_size=self.text_splitter_chunk_size, diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index 4ab68576..5570f182 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -123,6 +123,11 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 may provide a custom system prompt if you want to provide more specific instructions to the LLM for the structured data extraction step. + - `allow_multi_doc_extraction`: Boolean flag indicating + whether to allow multiple documents to be used for the + extraction context simultaneously. By default, ``False``, + which means the first document that returns some extracted + data will be marked as the source. 
tech : str Technology identifier to use for the plugin (e.g., "wind", @@ -135,6 +140,9 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 if isinstance(config["schema"], str): config["schema"] = load_config(config["schema"]) + config["qual_feats"] = { + f.casefold() for f in config["schema"].pop("$qualitative_features", []) + } text_collectors = _collectors_from_config(config) text_extractors = _extractors_from_config( config, in_label=text_collectors[-1].OUT_LABEL, tech=tech @@ -147,6 +155,11 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): SCHEMA = config["schema"] """dict: Schema for the output of the text extraction step""" + ALLOW_MULTI_DOC_EXTRACTION = config.get( + "allow_multi_doc_extraction", False + ) + """bool: Whether to allow extraction over multiple documents""" + IDENTIFIER = tech """str: Identifier for extraction task """ @@ -166,7 +179,7 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): """Classes for parsing structured ordinance data from text""" QUERY_TEMPLATES = [] # set by user or LLM-generated - """List: List of search engine query templates""" + """list: List of search engine query templates""" WEBSITE_KEYWORDS = {} # set by user or LLM-generated """dict: Keyword weight mapping for link crawl prioritization""" @@ -459,6 +472,7 @@ class PluginParser(SchemaOrdinanceParser): IN_LABEL = in_label OUT_LABEL = "structured_data" SCHEMA = config["schema"] + QUALITATIVE_FEATURES = config["qual_feats"] DATA_TYPE_SHORT_DESC = config.get("data_type_short_desc") SYSTEM_PROMPT = new_sys_prompt diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 6274725a..3edefaa8 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -302,6 +302,12 @@ def SCHEMA(self): # noqa: N802 """dict: Extraction schema""" raise NotImplementedError + @property + @abstractmethod + def QUALITATIVE_FEATURES(self): # noqa: N802 + """set: 
**Lowercase** feature names of qualitative features""" + raise NotImplementedError + async def parse(self, text): """Parse text and extract structured data @@ -352,12 +358,10 @@ def _to_dataframe(self, data): output_items = self.SCHEMA["properties"]["outputs"]["items"] all_features = output_items["properties"]["feature"]["enum"] - known_qual_features = set( - self.SCHEMA.get("$definitions", {}) - .get("qualitative_restrictions", {}) - .get("properties", {}) - ) - quant = [feat not in known_qual_features for feat in all_features] + quant = [ + feat.casefold() not in self.QUALITATIVE_FEATURES + for feat in all_features + ] df = pd.DataFrame(data) full_df = pd.DataFrame( @@ -365,6 +369,13 @@ def _to_dataframe(self, data): ) full_df = full_df.merge(df, on="feature", how="left") - return full_df[ - ["feature", "value", "units", "section", "summary", "quantitative"] + possible_out_cols = [ + "value", + "units", + "summary", + "year", + "section", + "source", ] + out_cols = [col for col in possible_out_cols if col in full_df.columns] + return full_df[["feature", *out_cols, "quantitative"]] diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 92d233e9..3980e5ab 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -29,12 +29,15 @@ from compass.utilities.ngrams import convert_text_to_sentence_ngrams from compass.utilities.parsing import ( clean_backticks_from_llm_response, - extract_ord_year_from_doc_attrs, + extract_year_from_doc_attrs, merge_overlapping_texts, ) from compass.utilities import num_ordinances_dataframe from compass.warn import COMPASSWarning -from compass.exceptions import COMPASSPluginConfigurationError +from compass.exceptions import ( + COMPASSPluginConfigurationError, + COMPASSRuntimeError, +) from compass.pb import COMPASS_PB @@ -593,6 +596,9 @@ class OrdinanceExtractionPlugin(FilteredExtractionPlugin): methods as needed. 
""" + ALLOW_MULTI_DOC_EXTRACTION = False + """bool: Whether to allow extraction over multiple documents""" + @property @abstractmethod def TEXT_EXTRACTORS(self): # noqa: N802 @@ -695,12 +701,85 @@ async def parse_docs_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. """ - for doc_for_extraction in extraction_context: - data_df = await self.parse_single_doc_for_structured_data( - doc_for_extraction + if self.ALLOW_MULTI_DOC_EXTRACTION: + return await self.parse_multi_doc_context_for_structured_data( + extraction_context + ) + return await self.parse_single_doc_for_structured_data( + extraction_context + ) + + async def parse_multi_doc_context_for_structured_data( + self, extraction_context + ): + """Parse all documents to extract structured data/information + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. The text + from all documents will be concatenated to create the + context for the extraction. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + key = self.TEXT_COLLECTORS[-1].OUT_LABEL + extraction_context.attrs[key] = extraction_context.multi_doc_context( + attr_text_key=key + ) + data_df = await self.parse_for_structured_data(extraction_context) + row_count = self.get_structured_data_row_count(data_df) + if row_count == 0: + logger.debug( + "No extracted data; searched %d docs", + extraction_context.num_documents, ) + return None + + data_df = await _fill_out_multi_file_sources( + data_df, + extraction_context, + out_fn_stem=self.jurisdiction.full_name, + ) + + extraction_context.attrs["structured_data"] = data_df + logger.info( + "%d ordinance value(s) found in %d docs for %s. 
", + num_ordinances_dataframe(data_df), + extraction_context.num_documents, + self.jurisdiction.full_name, + ) + return extraction_context + + async def parse_single_doc_for_structured_data(self, extraction_context): + """Parse documents one at a time to extract structured data + + The first document to return some extracted data will be marked + as the source and will be returned from this method. + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + for doc_for_extraction in extraction_context: + data_df = await self.parse_for_structured_data(doc_for_extraction) row_count = self.get_structured_data_row_count(data_df) if row_count > 0: + data_df["source"] = doc_for_extraction.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs( + doc_for_extraction.attrs + ) await extraction_context.mark_doc_as_data_source( doc_for_extraction, out_fn_stem=self.jurisdiction.full_name ) @@ -719,38 +798,39 @@ async def parse_docs_for_structured_data(self, extraction_context): ) return None - async def parse_single_doc_for_structured_data(self, doc_for_extraction): + async def parse_for_structured_data(self, source): """Extract all possible structured data from a document This method is called from the default implementation of - `parse_docs_for_structured_data()` for each document that passed - filtering. If you overwrite`parse_docs_for_structured_data()``, - you can ignore this method. + `parse_single_doc_for_structured_data()` for each document that + passed filtering. If you overwrite + ``parse_single_doc_for_structured_data()``, you can ignore this + method. Parameters ---------- - doc_for_extraction : BaseDocument - Document to extract structured data from. 
+ source : BaseDocument or ExtractionContext + Source to extract structured data from. Must have an + `.attrs` attribute that contains text from which data should + be extracted. Returns ------- - BaseDocument - Document with extracted structured data stored in the - ``.attrs`` dictionary. + pandas.DataFrame or None + DataFrame containing extracted structured data, or None if + no structured data were extracted. """ with self._tracked_progress(): tasks = [ asyncio.create_task( - self._try_extract_ordinances( - doc_for_extraction, parser_class - ), + self._try_extract_ordinances(source, parser_class), name=self.jurisdiction.full_name, ) for parser_class in filter(None, self.PARSERS) ] await asyncio.gather(*tasks) - return self._concat_scrape_results(doc_for_extraction) + return self._concat_scrape_results(source) async def _try_extract_ordinances(self, doc_for_extraction, parser_class): """Apply a single extractor and parser to legal text""" @@ -810,17 +890,14 @@ def _tracked_progress(self): self._jsp = None - def _concat_scrape_results(self, doc): + def _concat_scrape_results(self, source): """Concatenate structured data from all parsers""" - data = [doc.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] + data = [source.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] data = [df for df in data if df is not None and not df.empty] if len(data) == 0: return None - data = data[0] if len(data) == 1 else pd.concat(data) - data["source"] = doc.attrs.get("source") - data["ord_year"] = extract_ord_year_from_doc_attrs(doc.attrs) - return data + return data[0] if len(data) == 1 else pd.concat(data) def _get_model_config(self, primary_key, secondary_key): """Get model config: primary_key -> secondary_key -> default""" @@ -1011,3 +1088,104 @@ def _validate_in_out_keys(consumers, producers): f"processing class: {formatted}" ) raise COMPASSPluginConfigurationError(msg) + + +async def _fill_out_multi_file_sources( + data_df, extraction_context, out_fn_stem +): + """Fill out 
source column for multi-doc extraction + + This method implements a "report all document" fallback for the + following scenarios: + + - source inds not given in output + - source inds not integers + - source inds are invalid indices for the actual documents + + If the source inds are all valid, each row in the dataframe gets its + own unique source and year combo. + """ + try: + source_inds = _get_source_inds( + data_df, extraction_context.num_documents + ) + except COMPASSRuntimeError: + return await _fill_in_all_sources( + data_df, extraction_context, out_fn_stem + ) + + year_map = {} + source_map = {} + for source_ind in source_inds: + doc = extraction_context[source_ind] + year_map[source_ind] = extract_year_from_doc_attrs(doc.attrs) + source_map[source_ind] = doc.attrs.get("source") + await extraction_context.mark_doc_as_data_source( + doc, out_fn_stem=f"{out_fn_stem}_{source_ind + 1}" + ) + + data_df["year"] = data_df["source"].map( + lambda source_ind: ( + year_map.get(int(source_ind)) if pd.notna(source_ind) else None + ) + ) + + data_df["source"] = data_df["source"].map( + lambda source_ind: ( + source_map.get(int(source_ind)) if pd.notna(source_ind) else None + ) + ) + return data_df + + +def _get_source_inds(data_df, num_docs): + """Try to extract source document indices""" + if "source" not in data_df.columns: + msg = "'source' column not found in extracted outputs" + raise COMPASSRuntimeError(msg) + + try: + source_inds = data_df["source"].dropna().unique().astype(int) + except (TypeError, ValueError): + msg = "'source' column contains non-integer values" + raise COMPASSRuntimeError(msg) from None + + if any( + source_ind < 0 or source_ind >= num_docs for source_ind in source_inds + ): + msg = "'source' column contains out-of-bounds indices" + raise COMPASSRuntimeError(msg) + + return source_inds + + +async def _fill_in_all_sources(data_df, extraction_context, out_fn_stem): + """Fill in source and year columns using all sources""" + logger.debug( + 
"Filling in sources using all %d documents in context due to " + "invalid or missing source indices", + extraction_context.num_documents, + ) + all_sources = filter( + None, [doc.attrs.get("source") for doc in extraction_context] + ) + concat_sources = " ;\n".join(all_sources) or None + data_df["source"] = concat_sources + + years = list( + filter( + None, + [ + extract_year_from_doc_attrs(doc.attrs) + for doc in extraction_context + ], + ) + ) + data_df["year"] = max(years) if years else None + + for ind, doc in enumerate(extraction_context, start=1): + await extraction_context.mark_doc_as_data_source( + doc, out_fn_stem=f"{out_fn_stem}_{ind}" + ) + + return data_df diff --git a/compass/utilities/__init__.py b/compass/utilities/__init__.py index 30c6cda8..d2cb3209 100644 --- a/compass/utilities/__init__.py +++ b/compass/utilities/__init__.py @@ -18,7 +18,7 @@ load_jurisdictions_from_fp, ) from .parsing import ( - extract_ord_year_from_doc_attrs, + extract_year_from_doc_attrs, llm_response_as_json, merge_overlapping_texts, num_ordinances_dataframe, diff --git a/compass/utilities/finalize.py b/compass/utilities/finalize.py index b4b9696e..c9e2b7bc 100644 --- a/compass/utilities/finalize.py +++ b/compass/utilities/finalize.py @@ -30,7 +30,7 @@ "min_dist", "max_dist", "summary", - "ord_year", + "year", "section", "source", "quantitative", @@ -319,7 +319,7 @@ def compile_run_summary_message( return ( f"✅ Scraping complete!\nOutput Directory: {out_dir}\n" f"Total runtime: {runtime} {total_cost}\n" - f"Number of documents found: {document_count}" + f"Number of jurisdictions with extracted data: {document_count}" ) diff --git a/compass/utilities/parsing.py b/compass/utilities/parsing.py index 11261c31..ce7e79f4 100644 --- a/compass/utilities/parsing.py +++ b/compass/utilities/parsing.py @@ -111,7 +111,7 @@ def merge_overlapping_texts(text_chunks, n=300): return out_text -def extract_ord_year_from_doc_attrs(doc_attrs): +def extract_year_from_doc_attrs(doc_attrs): 
"""Extract the ordinance year stored in document attributes Parameters @@ -130,9 +130,9 @@ def extract_ord_year_from_doc_attrs(doc_attrs): Examples -------- - >>> extract_ord_year_from_doc_attrs({"date": (2024, 5, 17)}) + >>> extract_year_from_doc_attrs({"date": (2024, 5, 17)}) 2024 - >>> extract_ord_year_from_doc_attrs({"date": (None, None, None)}) + >>> extract_year_from_doc_attrs({"date": (None, None, None)}) None """ year, *__ = doc_attrs.get("date") or (None, None, None) diff --git a/crates/compass/src/lib.rs b/crates/compass/src/lib.rs index 2a373f72..8211bf06 100644 --- a/crates/compass/src/lib.rs +++ b/crates/compass/src/lib.rs @@ -129,7 +129,7 @@ pub fn load_ordinance + std::fmt::Debug>( /* let mut rdr = csv::Reader::from_path(raw_filename).unwrap(); - let mut stmt = conn.prepare_cached("INSERT INTO property (county, state, FIPS, feature, fixed_value, mult_value, mult_type, adder, min_dist, max_dist, value, units, ord_year, last_updated, section, source, comments) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)").unwrap(); + let mut stmt = conn.prepare_cached("INSERT INTO property (county, state, FIPS, feature, fixed_value, mult_value, mult_type, adder, min_dist, max_dist, value, units, year, last_updated, section, source, comments) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)").unwrap(); for result in rdr.records() { let record = result.unwrap(); // println!("{:?}", record); diff --git a/crates/compass/src/scraper/ordinance/qualitative.rs b/crates/compass/src/scraper/ordinance/qualitative.rs index 8804baa6..006b8694 100644 --- a/crates/compass/src/scraper/ordinance/qualitative.rs +++ b/crates/compass/src/scraper/ordinance/qualitative.rs @@ -17,7 +17,7 @@ pub(super) struct QualitativeRecord { FIPS: u64, feature: String, summary: String, - ord_year: Option, + year: Option, section: Option, source: Option, } @@ -40,7 +40,7 @@ impl Qualitative { FIPS UBIGINT, feature TEXT, summary TEXT, - ord_year INTEGER, + year INTEGER, section 
TEXT, source TEXT );", @@ -103,7 +103,7 @@ impl Qualitative { .prepare( r"INSERT INTO qualitative (bookkeeper_lnk, county, state, subdivison, - jurisdiction_type, FIPS, feature, summary, ord_year, + jurisdiction_type, FIPS, feature, summary, year, section, source) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? ) ", @@ -121,7 +121,7 @@ impl Qualitative { record.FIPS, record.feature, record.summary, - record.ord_year, + record.year, record.section, record.source, ])?; @@ -140,7 +140,7 @@ pub(crate) mod sample { pub(crate) fn basic() -> String { let mut output = String::new(); - output.push_str("county,state,subdivison,jurisdiction_type,FIPS,feature,summary,ord_year,section,source\n"); + output.push_str("county,state,subdivison,jurisdiction_type,FIPS,feature,summary,year,section,source\n"); output.push_str( "county-1,state-1,,jurisdiction_type-1,11111,feature-1,summary-1,2001,section-1,source-1\n", ); diff --git a/crates/compass/src/scraper/ordinance/quantitative.rs b/crates/compass/src/scraper/ordinance/quantitative.rs index 91fb0512..2a9a5895 100644 --- a/crates/compass/src/scraper/ordinance/quantitative.rs +++ b/crates/compass/src/scraper/ordinance/quantitative.rs @@ -22,7 +22,7 @@ pub(super) struct QuantitativeRecord { min_dist: Option, max_dist: Option, summary: Option, - ord_year: Option, + year: Option, section: Option, source: Option, } @@ -50,7 +50,7 @@ impl Quantitative { min_dist REAL, max_dist REAL, summary TEXT, - ord_year INTEGER, + year INTEGER, section TEXT, source TEXT );", @@ -114,7 +114,7 @@ impl Quantitative { r"INSERT INTO quantitative (bookkeeper_lnk, county, state, subdivison, jurisdiction_type, FIPS, feature, value, units, adder, - min_dist, max_dist, summary, ord_year, section, source) + min_dist, max_dist, summary, year, section, source) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
", ) @@ -136,7 +136,7 @@ impl Quantitative { record.min_dist, record.max_dist, record.summary, - record.ord_year, + record.year, record.section, record.source, ])?; @@ -155,7 +155,7 @@ pub(crate) mod sample { pub(crate) fn basic() -> String { let mut output = String::new(); - output.push_str("county,state,subdivison,jurisdiction_type,FIPS,feature,value,units,offset,min_dist,max_dist,summary,ord_year,section,source\n"); + output.push_str("county,state,subdivison,jurisdiction_type,FIPS,feature,value,units,offset,min_dist,max_dist,summary,year,section,source\n"); output.push_str( "county-1,state-1,,jurisdiction_type-1,11111,feature-1,,,,,,,2001,,source-1\n", ); diff --git a/docs/diagram/compass-db.dot b/docs/diagram/compass-db.dot index a02b4582..c12c5894 100755 --- a/docs/diagram/compass-db.dot +++ b/docs/diagram/compass-db.dot @@ -480,7 +480,7 @@ digraph g { - ord_year + year integer @@ -606,7 +606,7 @@ digraph g { - ord_year + year integer diff --git a/docs/source/conf.py b/docs/source/conf.py index 87948bd2..9c365357 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -61,7 +61,6 @@ intersphinx_mapping = { "elm": ("https://natlabrockies.github.io/elm", None), - "lc": ("https://reference.langchain.com/python/", None), "matplotlib": ("https://matplotlib.org/stable", None), "networkx": ("https://networkx.org/documentation/stable", None), "numpy": ("https://numpy.org/doc/stable/", None), @@ -369,8 +368,8 @@ def setup(app): "pd.Index": "~pandas.Index", "pd.NaT": "~pandas.NaT", # Langchain - "LCTextSplitter": ":class:`~langchain_text_splitters.base.TextSplitter`", - "RCTextSplitter": ":class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter`", + "LCTextSplitter": "`TextSplitter `_", + "RCTextSplitter": "`RecursiveCharacterTextSplitter `_", # fixing ELM docstrings "nx.DiGraph": ":class:`networkx.DiGraph`", "ApiBase": ":class:`~elm.base.ApiBase`", diff --git a/docs/source/dev/README.rst b/docs/source/dev/README.rst index 1d1bd2f1..957e2f2f 
100644 --- a/docs/source/dev/README.rst +++ b/docs/source/dev/README.rst @@ -200,9 +200,6 @@ As such, please adhere to these guidelines: For example, use ``:class:`plotly.graph_objects.Figure```, which renders as :class:`plotly.graph_objects.Figure` * Networkx: ``networkx`` For example, use ``:class:`~networkx.MultiDiGraph```, which renders as :class:`~networkx.MultiDiGraph` - * langchain_text_splitters : ``langchain_text_splitters`` - For example, use ``:class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter```, - which renders as :class:`~langchain_text_splitters.character.RecursiveCharacterTextSplitter` * elm: ``elm`` For example, use ``:class:`elm.web.document.PDFDocument```, which renders as :class:`elm.web.document.PDFDocument` diff --git a/examples/README.md b/examples/README.md old mode 100644 new mode 100755 diff --git a/examples/one_shot_schema_extraction/README.rst b/examples/one_shot_schema_extraction/README.rst index 8c5a06e1..9dfb9095 100644 --- a/examples/one_shot_schema_extraction/README.rst +++ b/examples/one_shot_schema_extraction/README.rst @@ -80,23 +80,24 @@ at extraction time. This approach allows you to encode complex edge case handlin logic without having to write any code, and it also allows you to easily update the logic by simply editing the schema. -If your schema contains a ``qualitative_restrictions`` key within the ``$definitions``, -the ``properties`` of the ``qualitative_restrictions`` will be marked as qualitative -outputs and the outputs will be separated correspondingly in the final output. Note -that qualitative outputs are expected to only contain extracted data in the ``summary`` -field. The ``value`` and ``units`` fields for qualitative outputs will be dropped -from the output. - The schema also includes a ``$examples`` key with example extractions that the model can refer to when deciding how to parse the text. 
You can be as detailed as you want in these instructions, and you can experiment with different outputs to tune the model's understanding of the task and the desired output format. -Finally, the same schema includes a ``$instructions`` key with general instructions +The same schema includes a ``$instructions`` key with general instructions for the model to follow when parsing the text. This is a good place to reinforce the importance of following the schema and to provide any additional context that might be helpful for the model to know when performing the extraction. +Finally, the schema contains an (optional) ``$qualitative_features`` key, which +contains a subset of the features defined in the schema. This list tells COMPASS +to categorize these features as "qualitative", which means that they are expected to +only contain textual summaries in the ``summary`` field of the output, and that the +``value`` and ``units`` fields for these features can be ignored and should (will) be +dropped from the final output. This input is not shown to the LLM, so the LLM response +will not be influenced by this key. + You can add or remove as many of these extra keys as you want, and you can experiment with different ways of encoding the instructions and examples to see what works best for your particular use case. The main thing to keep in mind is that the core structure of the diff --git a/examples/one_shot_schema_extraction/wind_schema.json b/examples/one_shot_schema_extraction/wind_schema.json index 636cfe02..7163b7e5 100644 --- a/examples/one_shot_schema_extraction/wind_schema.json +++ b/examples/one_shot_schema_extraction/wind_schema.json @@ -274,5 +274,16 @@ "For prohibited use districts: Only include districts with unconditional prohibitions currently in effect.", "Use the exact district names/codes as they appear in the ordinance text." 
] - } + }, + "$qualitative_features": [ + "color", + "decommissioning", + "lighting", + "prohibitions", + "visual impact", + "repowering", + "climbing prevention", + "signage", + "soil" + ] } diff --git a/examples/water_rights_demo/README.md b/examples/water_rights_demo/README.md new file mode 100755 index 00000000..542a2e79 --- /dev/null +++ b/examples/water_rights_demo/README.md @@ -0,0 +1,7 @@ +# INFRA-COMPASS Texas Water Rights + + +This directory shows you two ways to run COMPASS to extract groundwater rights for several +groundwater conservation districts in Texas. The first is using a [traditional RAG-based +approach](./rag-based), and the second is using a [one-shot schema extraction +approach](./one-shot/). \ No newline at end of file diff --git a/examples/water_rights_demo/one-shot/README.rst b/examples/water_rights_demo/one-shot/README.rst new file mode 100755 index 00000000..623ddc0e --- /dev/null +++ b/examples/water_rights_demo/one-shot/README.rst @@ -0,0 +1,18 @@ +************************************************* +INFRA-COMPASS Texas Water Rights One-Shot Example +************************************************* + +This directory contains an example configuration for extracting groundwater rights +for several districts in Texas using a one-shot plugin config. To execute this run, +fill out the config file with the appropriate paths and API keys, +then run the following command: + +.. code-block:: shell + + compass process -c config.json5 -p plugin_config.yaml + + +Note that the one-shot plugin will still run location and document type validation, +which may not be desirable in this case. To disable this validation, you would need to +implement your own plugin and manually disable the validation by setting the appropriate +document attributes after collection. 
diff --git a/examples/water_rights_demo/one-shot/config.json5 b/examples/water_rights_demo/one-shot/config.json5 new file mode 100644 index 00000000..8d0a32b4 --- /dev/null +++ b/examples/water_rights_demo/one-shot/config.json5 @@ -0,0 +1,20 @@ +{ + out_dir: "./outputs", + tech: "tx water rights one shot", + jurisdiction_fp: "../jurisdictions.csv", + "known_local_docs": "./local_docs.json5", // We have Panola County docs locally + model: [ + { + name: "egswaterord-gpt4.1-mini", + llm_call_kwargs: {temperature: 0, timeout: 300}, + llm_service_rate_limit: 500000, + text_splitter_chunk_size: 10000, + text_splitter_chunk_overlap: 500, + "client_kwargs": { + "api_key": "", + "api_version": "", + "azure_endpoint": "", + }, + }, + ], +} diff --git a/examples/water_rights_demo/one-shot/existing_docs/Panola County District Management Plan.pdf b/examples/water_rights_demo/one-shot/existing_docs/Panola County District Management Plan.pdf new file mode 100644 index 00000000..e4b42f88 Binary files /dev/null and b/examples/water_rights_demo/one-shot/existing_docs/Panola County District Management Plan.pdf differ diff --git a/examples/water_rights_demo/one-shot/existing_docs/Panola County District Rules.pdf b/examples/water_rights_demo/one-shot/existing_docs/Panola County District Rules.pdf new file mode 100644 index 00000000..8a039600 Binary files /dev/null and b/examples/water_rights_demo/one-shot/existing_docs/Panola County District Rules.pdf differ diff --git a/examples/water_rights_demo/one-shot/local_docs.json5 b/examples/water_rights_demo/one-shot/local_docs.json5 new file mode 100755 index 00000000..3274b82b --- /dev/null +++ b/examples/water_rights_demo/one-shot/local_docs.json5 @@ -0,0 +1,22 @@ +{ + "64": [ + { + "source_fp": "./existing_docs/Panola County District Management Plan.pdf", + "source": "https://www.twdb.texas.gov/groundwater/docs/GCD/pcgcd/pcgcd_mgmt_plan2023.pdf", + "date": [2023, 4, 27], // [year, month, day] - Skips date extraction if given + 
"check_if_legal_doc": false, // Skip legal doc check + + // Optional metadata fields - not required but can be helpful for metadata in the run output + "from_ocr": true, + }, + { + "source_fp": "./existing_docs/Panola County District Rules.pdf", + "source": "https://www.beg.utexas.edu/files/content/beg/research/swr/mgmtplans/PANOLA_CNTY_GCD_RULES.pdf", + "date": [2009, 1, 20], // [year, month, day] - Skips date extraction if given + "check_if_legal_doc": false, // Skip legal doc check + + // Optional metadata fields - not required but can be helpful for metadata in the run output + "from_ocr": false, + }, + ], +} \ No newline at end of file diff --git a/examples/water_rights_demo/one-shot/plugin_config.yaml b/examples/water_rights_demo/one-shot/plugin_config.yaml new file mode 100755 index 00000000..b5f7e47a --- /dev/null +++ b/examples/water_rights_demo/one-shot/plugin_config.yaml @@ -0,0 +1,38 @@ +schema: ./water_rights_schema.json5 + +data_type_short_desc: water rights and regulations + +allow_multi_doc_extraction: True # Important for water rights! + +query_templates: + - "{jurisdiction} rules" + - "{jurisdiction} management plan" + - "{jurisdiction} well permits" + - "{jurisdiction} well permit requirements" + - "requirements to drill a water well in {jurisdiction}" + +website_keywords: + pdf: 92160 + water: 46080 + rights: 23040 + zoning: 11520 + ordinance: 5760 + renewable energy: 1440 + planning: 720 + plan: 360 + government: 180 + code: 60 + area: 60 + land development: 15 + land: 3 + environment: 3 + energy: 3 + renewable: 3 + municipal: 1 + department: 1 + +collection_prompts: True # Can disable this to pass entire document to context + +extraction_system_prompt: |- + You are a legal scholar extracting structured data from water rights and groundwater conservation district regulations. + Follow all instructions in the schema descriptions carefully. 
diff --git a/examples/water_rights_demo/one-shot/water_rights_schema.json5 b/examples/water_rights_demo/one-shot/water_rights_schema.json5 new file mode 100755 index 00000000..3f66c100 --- /dev/null +++ b/examples/water_rights_demo/one-shot/water_rights_schema.json5 @@ -0,0 +1,242 @@ +{ + "title": "Texas Water Rights Extraction Schema", + "description": "Single-shot structured extraction schema for Texas groundwater conservation district water-rights ordinances and management plans. This schema encodes the decision-tree logic from COMPASS water-rights extraction into comprehensive field descriptions that guide LLMs to extract all supported features in one call. Output is a flat array of extraction objects, where each object represents one row in the final CSV/DataFrame output.", + "version": "1.0.0", + "type": "object", + "required": ["outputs"], + "additionalProperties": false, + "properties": { + "outputs": { + "type": "array", + "items": { + "type": "object", + "required": ["feature", "value", "units", "section", "summary", "source"], + "additionalProperties": false, + "properties": { + "feature": { + "type": "string", + "description": "The water-rights feature being extracted. Must be one of the enumerated feature IDs.", + "enum": [ + "permit requirements", + "extraction permit requirements", + "daily extraction limits", + "monthly extraction limits", + "annual extraction limits", + "well spacing", + "drilling window", + "metering device", + "district drought management plan", + "well drought management plan", + "plugging requirements", + "external transfer restrictions", + "production reporting", + "production cost", + "setback restrictions", + "redrilling restrictions", + "geothermal requirements", + "oil and gas requirements" + ] + }, + "value": { + "description": "The extracted value for this feature. Use booleans for yes/no requirements, numbers for explicit numerical limits, and strings for permit-specific/rate-structure values. 
Use null when no enacted requirement is found.", + "anyOf": [ + {"type": "boolean"}, + {"type": "number"}, + {"type": "string"}, + { + "type": "array", + "items": { + "anyOf": [ + {"type": "number"}, + {"type": "string"} + ] + } + }, + {"type": "null"} + ] + }, + "units": { + "type": ["string", "null"], + "description": "Units associated with value. Examples: 'feet', 'yards', 'days', 'months', 'gallons/day', 'acre-feet/year', 'dollars/acre-foot', or null when units are not applicable.", + "default": null + }, + "section": { + "type": ["string", "null"], + "description": "Section title/number where the requirement is found (for example, 'Rule 8.4 - Well Spacing'). Include exact numbering/labels when available. Null when section is unavailable.", + "default": null + }, + "summary": { + "type": ["string", "null"], + "description": "Short requirement summary using direct excerpts/quotes whenever possible. Include key conditions, exemptions, qualifiers, application scope (permit/well/aquifer), and explanation for selected value. Null when no requirement exists.", + "default": null + }, + "source": { + "type": ["number", "null"], + "description": "Integer indicating the source index from which this information was pulled. If not applicable or unavailable, use null.", + "default": null + } + } + } + } + }, + "$definitions": { + "scope": { + "description": "Extract enacted Texas groundwater district water-rights rules for groundwater wells, permitting, extraction, production, transfer, and related compliance controls. Ignore unrelated municipal utility policy unless it directly governs groundwater well permitting/operation in the district text." + }, + "permit_and_authorization": { + "description": "Permit and authorization features mapped from permit decision trees.", + "properties": { + "permit requirements": { + "description": "Whether a permit/application is required to drill a groundwater well. VALUE: boolean. 
SUMMARY: include core permit requirements, filing conditions, and any explicit well exemptions from permitting. UNITS: null." + }, + "extraction permit requirements": { + "description": "Whether a permit is required to extract/produce groundwater from a well. VALUE: boolean. SUMMARY: include extraction-specific permit language (if drilling and extraction are treated differently, preserve that distinction). UNITS: null." + } + } + }, + "interval_limits": { + "description": "Interval-specific production/extraction/withdrawal limits for groundwater wells.", + "properties": { + "daily extraction limits": { + "description": "Daily production/withdrawal limit. VALUE: explicit numeric limit when stated, otherwise string such as 'permit specific'. If multiple explicit limits exist by permit/well/aquifer, value may be an array of numbers/strings. UNITS: include interval-aware units such as 'gallons/day' or 'acre-feet/day'. SUMMARY: include application scope and whether limit is explicit or permit specific." + }, + "monthly extraction limits": { + "description": "Monthly production/withdrawal limit. Same extraction logic as daily limits, but restricted to monthly interval references only." + }, + "annual extraction limits": { + "description": "Annual production/withdrawal limit. Same extraction logic as daily limits, but restricted to annual interval references only." + } + } + }, + "well_and_operations": { + "description": "Operational controls and well-construction or operation constraints.", + "properties": { + "well spacing": { + "description": "Minimum required distance between wells. VALUE: numeric distance when explicitly stated. UNITS: 'feet', 'yards', or similar distance unit. SUMMARY: include any qualifier tied to depth, production capacity, permit class, or well type. Ignore spacing that is only for property lines, septic systems, or non-well infrastructure." 
+ }, + "drilling window": { + "description": "Time period after permit issuance/approval by which drilling must commence. VALUE: numeric time period. UNITS: 'days', 'months', etc. SUMMARY: include trigger event (for example, permit issuance), restart/extension clauses, and expiration implications." + }, + "metering device": { + "description": "Requirement for metering/measurement devices to monitor usage/production. VALUE: boolean. UNITS: null. SUMMARY: include device type, well thresholds (for example, >35 gpm), and reporting or calibration obligations if provided." + }, + "plugging requirements": { + "description": "Whether district imposes water-well plugging requirements. VALUE: boolean. UNITS: null. SUMMARY: include abandonment triggers, methods/standards, notification/reporting, and responsible party language." + }, + "setback restrictions": { + "description": "Whether district imposes well setback/location restrictions (property lines, buildings, septic systems, contamination sources). VALUE: boolean. UNITS: null unless an explicit single distance is extracted as value. SUMMARY: include restricted reference features and any listed minimum distances." + }, + "redrilling restrictions": { + "description": "Whether district imposes restrictions on redrilling/deepening/widening existing wells. VALUE: boolean. UNITS: null. SUMMARY: include approval triggers, technical conditions, and distinctions between maintenance and substantive redrilling." + } + } + }, + "drought_and_planning": { + "description": "District-level and well-owner-level drought planning obligations.", + "properties": { + "district drought management plan": { + "description": "Whether district has a drought management/contingency plan at policy level. VALUE: boolean. UNITS: null. 
SUMMARY: include district drought stages, triggers, and response actions.", + "x-tree-key": "drought_plan" + }, + "well drought management plan": { + "description": "Whether well owners/users/applicants must develop contingency or drought plans. VALUE: boolean. UNITS: null. SUMMARY: include who must prepare plans and required components." + } + } + }, + "transfer_reporting_and_cost": { + "description": "Transfer restrictions, production reporting, and monetary charges.", + "properties": { + "external transfer restrictions": { + "description": "Restrictions or costs for transporting/exporting water outside district boundaries. VALUE: boolean indicating whether transfer restrictions/cost framework exists. UNITS: null unless encoding explicit transfer-rate value. SUMMARY: include restrictions, permit conditions, whether transfer costs exist, and any explicit cost amount/rate structure." + }, + "production reporting": { + "description": "Requirement for reporting production/extraction volumes. VALUE: boolean. UNITS: null. SUMMARY: include reporting cadence, threshold triggers, forms, and deadlines." + }, + "production cost": { + "description": "Production/extraction fee charged per unit water extracted (not one-time permit fee). VALUE: numeric cost when explicit, or string such as 'permit specific' / rate structure. UNITS: cost-rate units such as 'dollars/gallon' or 'dollars/acre-foot'. SUMMARY: include applicability (all users vs permit-specific) and any tiers." + } + } + }, + "resource_specific": { + "description": "Optional resource-specific policy features present in the water-rights decision-tree set.", + "properties": { + "geothermal requirements": { + "description": "Whether district has requirements specific to geothermal systems. VALUE: boolean. UNITS: null. SUMMARY: include geothermal-specific requirements when present." + }, + "oil and gas requirements": { + "description": "Whether district has requirements specific to oil and gas operations. VALUE: boolean. 
UNITS: null. SUMMARY: include oil-and-gas-specific provisions when present." + } + } + } + }, + "$examples": [ + { + "feature": "permit requirements", + "value": true, + "units": null, + "section": "Rule 3.2 - Permit Required", + "summary": "'No person may drill a non-exempt well without first obtaining a permit from the District.' Exempt domestic wells are listed separately under Rule 3.5.", "source": 0 + }, + { + "feature": "annual extraction limits", + "value": 2, + "units": "acre-feet/year", + "section": "Rule 8.1 - Production Limits", + "summary": "The district sets an explicit annual production cap of '2 acre-feet per acre per year' for this permit class; text also notes permit-specific adjustments by aquifer conditions.", "source": 0 + }, + { + "feature": "well spacing", + "value": 500, + "units": "feet", + "section": "Rule 8.4 - Well Spacing", + "summary": "'A new non-exempt well must be at least 500 feet from any existing non-exempt well.' Reduced spacing may be allowed for low-capacity wells below stated gpm threshold.", "source": 0 + }, + { + "feature": "production cost", + "value": "permit specific", + "units": "dollars/acre-foot", + "section": "Fee Schedule - Production Fees", + "summary": "Production fees are assessed by permit class and use category rather than one universal rate; schedule provides tiered dollar-per-acre-foot charges.", "source": 0 
+ } + ], + "$instructions": { + "general": [ + "Extract only enacted district requirements, not proposed language or general background text.", + "Use direct excerpts/quotes in summary whenever possible.", + "If a feature has no requirement, set value, units, section, summary, and source to null or omit the feature row.", + "When multiple values exist for one feature, choose the value applicable to the primary general rule and describe alternatives/conditions in summary.", + "Preserve distinctions between drilling permits, extraction permits, transfer permits, and reporting obligations.", + "Do not convert one-time application fees into production or transfer rate values." + ], + "boolean_features": [ + "Use value=true only when text clearly establishes that the requirement/policy exists.", + "Use value=false only when text clearly states it is not required or not implemented.", + "If the text is ambiguous, use null and explain ambiguity in summary." + ], + "numeric_and_units": [ + "For extraction limits, ensure units include interval context (daily/monthly/annual).", + "For spacing and drilling window values, extract explicit numbers and corresponding units from text.", + "For cost features, preserve explicit rate units (for example, dollars per gallon, dollars per acre-foot).", + "For permit-specific or tiered values where a single number is not valid, use a descriptive string value and capture details in summary." + ], + "scope_filters": [ + "For well spacing, ignore distances that only reference septic systems, structures, or property lines unless the text explicitly frames them as well-to-well spacing rules.", + "For production cost, ignore one-time filing/application fees unless the text explicitly treats them as a per-unit production charge.", + "For external transfer restrictions, focus on export/transport outside district boundaries and related permit/cost terms." 
+ ] + }, + "$qualitative_features": [ + "permit requirements", + "extraction permit requirements", + "metering device", + "district drought management plan", + "well drought management plan", + "plugging requirements", + "external transfer restrictions", + "production reporting", + "setback restrictions", + "redrilling restrictions", + "geothermal requirements", + "oil and gas requirements" + ] +} diff --git a/examples/water_rights_demo/README.rst b/examples/water_rights_demo/rag-based/README.rst old mode 100644 new mode 100755 similarity index 67% rename from examples/water_rights_demo/README.rst rename to examples/water_rights_demo/rag-based/README.rst index 7b10862b..d465ec9f --- a/examples/water_rights_demo/README.rst +++ b/examples/water_rights_demo/rag-based/README.rst @@ -1,9 +1,9 @@ -***************************************** -INFRA-COMPASS Texas Water Rights Demo Run -***************************************** +************************************************** +INFRA-COMPASS Texas Water Rights RAG-Based Example +************************************************** This directory contains an example configuration for extracting groundwater rights -for several districtis in Texas. To execute this run, fill out the confg file with +for several districts in Texas. To execute this run, fill out the config file with the appropriate paths and API keys, then run the following command: .. 
code-block:: shell diff --git a/examples/water_rights_demo/config.json5 b/examples/water_rights_demo/rag-based/config.json5 old mode 100644 new mode 100755 similarity index 94% rename from examples/water_rights_demo/config.json5 rename to examples/water_rights_demo/rag-based/config.json5 index 177bf230..59074b7a --- a/examples/water_rights_demo/config.json5 +++ b/examples/water_rights_demo/rag-based/config.json5 @@ -1,7 +1,7 @@ { out_dir: "./outputs", tech: "water rights", - jurisdiction_fp: "jurisdictions.csv", + jurisdiction_fp: "../jurisdictions.csv", model: [ { name: "text-embedding-ada-002", diff --git a/pixi.lock b/pixi.lock index cf1d43a2..7110ecea 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3365,7 +3365,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/roman-numerals-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.30.0-py313h843e2db_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rsa-4.9.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.14.11-h4196e79_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.15.5-h40fa522_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/s3transfer-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/scikit-learn-1.8.0-np2py313h16d504d_1.conda @@ -3854,7 +3854,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/roman-numerals-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rpds-py-0.30.0-py313h8f1d341_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rsa-4.9.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.14.11-hc0dabaa_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.15.5-he9a2e21_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rust-1.89.0-h6cf38e9_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rust-std-aarch64-unknown-linux-gnu-1.89.0-hbe8e118_0.conda @@ -4307,7 +4307,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/roman-numerals-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/rpds-py-0.30.0-py313hcc225dc_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rsa-4.9.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.14.11-hb17bafe_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.15.5-h8ee721d_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/s3transfer-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/scikit-learn-1.8.0-np2py313he2891f2_1.conda @@ -4753,7 +4753,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/roman-numerals-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/rpds-py-0.30.0-py313h2c089d5_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rsa-4.9.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.14.11-hb0cad00_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.15.5-h279115b_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/s3transfer-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/scikit-learn-1.8.0-np2py313h3b23316_1.conda @@ -5184,7 +5184,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/roman-numerals-4.1.0-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/win-64/rpds-py-0.30.0-py313hfbe8231_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/rsa-4.9.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/ruff-0.14.11-h37e10c4_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ruff-0.15.5-h5739096_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/s3transfer-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/scikit-learn-1.8.0-np2py313h4ce4a18_1.conda @@ -16692,8 +16692,8 @@ packages: timestamp: 1736252433366 - pypi: ./ name: infra-compass - version: 0.13.2.dev5+g2a2cf74.d20260210 - sha256: 14c80efebd7b5a7937d34c6e5de47a023a4ffc6e29953d0d7267c4c258816d68 + version: 0.14.3.dev20+gbb9a53e6.d20260309 + sha256: cef76fd52eb2f0853ed265397d062b290dfc3c508bfccc8d19c5e0bd95574e34 requires_dist: - beautifulsoup4>=4.12.3,<5 - click>=8.1.7,<9 @@ -16713,7 +16713,7 @@ packages: - pytesseract>=0.3.13,<0.4 ; extra == 'ocr' - jupyter>=1.0.0,<1.1 ; extra == 'dev' - pipreqs>=0.4.13,<0.5 ; extra == 'dev' - - ruff>=0.14.11,<0.15 ; extra == 'dev' + - ruff>=0.15.5,<0.16 ; extra == 'dev' - ruff-lsp>=0.0.62,<0.0.63 ; extra == 'dev' - flaky>=3.8.1,<4 ; extra == 'test' - pytest>=8.3.3,<9 ; extra == 'test' @@ -29218,26 +29218,26 @@ packages: version: 1.4.1 sha256: a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.14.11-h4196e79_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.15.5-h40fa522_0.conda noarch: python - sha256: a11a028fa1e5e273111330298b98005307eee32af69b0b5b09d8b7b23f6062ce - md5: da06de874b1e1e2029772f9c319d164e + sha256: da5d47b231a590257b4ee0f3459e6ec30012ae549e3c29601cd15de178dafe9c + md5: 2ad709f7abc95e934d96e7a20b837b6e depends: - python - - libgcc >=14 - __glibc >=2.17,<3.0.a0 + - 
libgcc >=14 constrains: - __glibc >=2.17 license: MIT license_family: MIT purls: - - pkg:pypi/ruff?source=compressed-mapping - size: 11460448 - timestamp: 1767948559731 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.14.11-hc0dabaa_0.conda + - pkg:pypi/ruff?source=hash-mapping + size: 9273260 + timestamp: 1772780208047 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.15.5-he9a2e21_0.conda noarch: python - sha256: 251d1dd73e424d48b293eb9aaff9b0f5e63c34affa3b34da1693310450e0e550 - md5: 3a912737778789d94695529739c888e5 + sha256: 7f3de3fe5a104ddc41da7e1f15d10a83b13d712d622ca10a3e6eacfcaa9af2f5 + md5: 5423b0d4e00191340f2aa5b517110682 depends: - python - libgcc >=14 @@ -29247,27 +29247,27 @@ packages: license_family: MIT purls: - pkg:pypi/ruff?source=hash-mapping - size: 10946211 - timestamp: 1767948572467 -- conda: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.14.11-hb17bafe_0.conda + size: 8917323 + timestamp: 1772780223372 +- conda: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.15.5-h8ee721d_0.conda noarch: python - sha256: 932ba833151abad4f910e4301c7f69600ba6025effd4d2f56d9dca23b31fe4ad - md5: ce558e6855bb2ff55477d6c1ffd845e2 + sha256: 4968ba59a12f211106b2b9026bb867a0e6a6abc0f6b3be67259d616e5c79357b + md5: 81083c6d1627d9394b6f669d2931a407 depends: - python - - __osx >=10.13 + - __osx >=11.0 constrains: - __osx >=10.13 license: MIT license_family: MIT purls: - - pkg:pypi/ruff?source=compressed-mapping - size: 11356233 - timestamp: 1767948666150 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.14.11-hb0cad00_0.conda + - pkg:pypi/ruff?source=hash-mapping + size: 9288390 + timestamp: 1772780432283 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.15.5-h279115b_0.conda noarch: python - sha256: b76ff21a7a466ed1f80a640d437ef0f6345240ce2150972584e4f97b0c7db955 - md5: aabef64b30defea8e9166b0b2248fa85 + sha256: 89cb3edc0239200b83cece7e27dce096facf2db0d3370dc46f047db65b1f1126 + md5: 
02f7f9ffb450e26b44120c1cc8e543a4 depends: - python - __osx >=11.0 @@ -29276,13 +29276,13 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/ruff?source=compressed-mapping - size: 10375602 - timestamp: 1767948656938 -- conda: https://conda.anaconda.org/conda-forge/win-64/ruff-0.14.11-h37e10c4_0.conda + - pkg:pypi/ruff?source=hash-mapping + size: 8476952 + timestamp: 1772780440888 +- conda: https://conda.anaconda.org/conda-forge/win-64/ruff-0.15.5-h5739096_0.conda noarch: python - sha256: 86181b96eb46ca085c721c66a9e961e8646917fff6469f0836e90371a793ea27 - md5: 36a7142884f26724a3d33b2f80d13c2d + sha256: d0bba8615cc662c684bb79a30145b8d4940a80d6906d12ef5ed074f1f1879bc8 + md5: 4b5fef0aa16f91d1286205ff19f2123c depends: - python - vc >=14.3,<15 @@ -29291,9 +29291,9 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/ruff?source=compressed-mapping - size: 11870088 - timestamp: 1767948584867 + - pkg:pypi/ruff?source=hash-mapping + size: 9732695 + timestamp: 1772780220897 - conda: https://conda.anaconda.org/conda-forge/noarch/ruff-lsp-0.0.62-pyhd8ed1ab_0.conda sha256: 2640f3ae1cd31209c26c70b0413730fb4e903aefc4649dc21f9dd28b08e97a61 md5: 5962a27993ab1b25dd2c8e87a3365753 diff --git a/pyproject.toml b/pyproject.toml index 1aa80168..7735d75e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ ocr = [ dev = [ "jupyter>=1.0.0,<1.1", "pipreqs>=0.4.13,<0.5", - "ruff>=0.14.11,<0.15", + "ruff>=0.15.5,<0.16", "ruff-lsp>=0.0.62,<0.0.63", ] test = [ @@ -223,7 +223,7 @@ geopandas = ">=1.0.1,<2" ipykernel = ">=7.1.0,<8" jupyter = ">=1.0.0,<1.1" pipreqs = ">=0.4.13,<0.5" -ruff = ">=0.14.11,<0.15" +ruff = ">=0.15.5,<0.16" ruff-lsp = ">=0.0.62,<0.0.63" seaborn = ">=0.13.2,<0.14" diff --git a/tests/python/unit/extraction/test_extraction_context.py b/tests/python/unit/extraction/test_extraction_context.py index 57ad9ff6..672c930d 100644 --- a/tests/python/unit/extraction/test_extraction_context.py +++ 
b/tests/python/unit/extraction/test_extraction_context.py @@ -57,17 +57,34 @@ def test_extraction_context_text_empty(): def test_extraction_context_text_single_doc(): """Test text property with single document""" doc = PDFDocument(["page one", "page two"]) + doc.attrs["year"] = 2024 + doc.attrs["source"] = "single_doc.pdf" ctx = ExtractionContext(doc) - assert ctx.text == "page one\npage two" + expected = ( + "## MULTI-DOCUMENT CONTEXT ##" + "\n\n" + "# SOURCE INDEX #: 0" + "\n" + "# CONTENT #:\npage one\npage two" + ) + assert ctx.text == expected def test_extraction_context_text_multiple_docs(): """Test text property concatenates multiple documents""" doc1 = PDFDocument(["doc1 page1", "doc1 page2"]) + doc1.attrs["year"] = 2020 + doc1.attrs["source"] = "doc1.pdf" doc2 = HTMLDocument(["

doc2 content

"]) + doc2.attrs["year"] = 2021 + doc2.attrs["source"] = "doc2.html" ctx = ExtractionContext([doc1, doc2]) - expected = "doc1 page1\ndoc1 page2\n\ndoc2 content\n\n" - assert ctx.text == expected + text = ctx.text + assert "## MULTI-DOCUMENT CONTEXT ##" in text + assert "# SOURCE INDEX #: 0" in text + assert "# CONTENT #:\ndoc1 page1\ndoc1 page2" in text + assert "# SOURCE INDEX #: 1" in text + assert f"# CONTENT #:\n{doc2.text}" in text def test_extraction_context_pages_empty(): @@ -316,6 +333,43 @@ async def fake_mover(doc_arg, out_fn): # noqa assert doc.attrs["out_fp"] == output_path +def test_multi_doc_context(): + """Test multi_doc_context""" + doc1 = PDFDocument(["doc 1"]) + doc1.attrs["source"] = "doc1.pdf" + doc2 = PDFDocument(["doc 2"]) + doc2.attrs["source"] = "doc2.pdf" + ctx = ExtractionContext([doc1, doc2], attrs={"existing": "keep"}) + + combined_text = ctx.multi_doc_context() + + assert ctx.attrs["existing"] == "keep" + assert combined_text == ctx.text + assert ctx.data_docs == [] + assert "out_fp" not in doc1.attrs + assert "out_fp" not in doc2.attrs + + +def test_multi_doc_context_with_attr_text_key(): + """Test multi_doc_context with attr_text_key""" + doc1 = PDFDocument(["doc 1 full text"]) + doc1.attrs["source"] = "doc1.pdf" + doc1.attrs["summary"] = "doc 1 summary" + doc2 = PDFDocument(["doc 2 full text"]) + doc2.attrs["source"] = "doc2.pdf" + doc2.attrs["summary"] = "doc 2 summary" + ctx = ExtractionContext([doc1, doc2]) + + combined_text = ctx.multi_doc_context(attr_text_key="summary") + + assert "# SOURCE INDEX #: 0" in combined_text + assert "# SOURCE INDEX #: 1" in combined_text + assert "# CONTENT #:\ndoc 1 summary" in combined_text + assert "# CONTENT #:\ndoc 2 summary" in combined_text + assert "doc 1 full text" not in combined_text + assert "doc 2 full text" not in combined_text + + @pytest.mark.parametrize( "input_val", [ diff --git a/tests/python/unit/utilities/test_utilities_finalize.py 
b/tests/python/unit/utilities/test_utilities_finalize.py index 1fd45645..ae8c7eee 100644 --- a/tests/python/unit/utilities/test_utilities_finalize.py +++ b/tests/python/unit/utilities/test_utilities_finalize.py @@ -174,7 +174,7 @@ def test_doc_infos_to_db_compiles_and_formats(tmp_path): "units": "ft", "adder": 300, "source": "http://example.com/valid", - "ord_year": 2022, + "year": 2022, } ] ).to_csv(valid_csv, index=False) @@ -208,7 +208,7 @@ def test_doc_infos_to_db_compiles_and_formats(tmp_path): row = db.iloc[0] assert row["source"] == "http://example.com/valid" - assert row["ord_year"] == 2022 + assert row["year"] == 2022 assert row["FIPS"] == "12345" assert bool(row["quantitative"]) is True assert pd.isna(row["adder"]) @@ -229,7 +229,7 @@ def test_save_db_writes_csvs(tmp_path): "value": 100, "units": "ft", "summary": "Maximum height", - "ord_year": 2020, + "year": 2020, "source": "http://source", "quantitative": True, } @@ -373,7 +373,7 @@ def test_compile_run_summary_message_includes_cost(tmp_path): assert "Total runtime: 1:01:01" in message assert "Total cost" in message assert "$42.50" in message - assert "Number of documents found: 3" in message + assert "Number of jurisdictions with extracted data: 3" in message def test_compile_run_summary_message_without_cost(tmp_path): diff --git a/tests/python/unit/utilities/test_utilities_parsing.py b/tests/python/unit/utilities/test_utilities_parsing.py index fb916e68..36afd4b0 100644 --- a/tests/python/unit/utilities/test_utilities_parsing.py +++ b/tests/python/unit/utilities/test_utilities_parsing.py @@ -9,7 +9,7 @@ from compass.utilities.parsing import ( clean_backticks_from_llm_response, convert_paths_to_strings, - extract_ord_year_from_doc_attrs, + extract_year_from_doc_attrs, llm_response_as_json, merge_overlapping_texts, num_ordinances_dataframe, @@ -93,10 +93,10 @@ def test_merge_overlapping_texts(text_chunks, n, expected): ({"other_key": "value"}, None), ], ) -def 
test_extract_ord_year_from_doc_attrs(doc_attrs, expected): - """Test the `extract_ord_year_from_doc_attrs` function""" +def test_extract_year_from_doc_attrs(doc_attrs, expected): + """Test the `extract_year_from_doc_attrs` function""" - assert extract_ord_year_from_doc_attrs(doc_attrs) == expected + assert extract_year_from_doc_attrs(doc_attrs) == expected def test_num_ordinances_dataframe_empty():