diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 36b999fd1..39c58e02d 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -866,6 +866,7 @@ async def aprocess_html(
         ################################
         # Structured Content Extraction #
         ################################
+        token_usage = None
         if (
             not bool(extracted_content)
             and config.extraction_strategy
@@ -914,6 +915,18 @@ async def aprocess_html(
                 extracted_content, indent=4, default=str, ensure_ascii=False
             )
 
+            # Capture token usage from the extraction strategy, when it exposes any.
+            # token_usage was initialised to None above, so no else-branch is needed.
+            # NOTE(review): total_usage is read off the strategy object; if one strategy
+            # instance is shared across crawls this may be cumulative -- confirm.
+            _token_usage = getattr(config.extraction_strategy, "total_usage", None)
+            if _token_usage and hasattr(_token_usage, "__dict__"):
+                # Drop unset (None) and zero counters; an empty result becomes None.
+                token_usage = {
+                    k: v for k, v in vars(_token_usage).items()
+                    if v is not None and v != 0
+                } or None
+
             # Log extraction completion
             self.logger.url_status(
                 url=_url,
@@ -940,6 +953,7 @@ async def aprocess_html(
             screenshot=screenshot_data,
             pdf=pdf_data,
             extracted_content=extracted_content,
+            token_usage=token_usage,
             success=True,
             error_message="",
         )
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 506538970..415997f72 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -160,6 +160,8 @@ class CrawlResult(BaseModel):
     cache_status: Optional[str] = None  # "hit", "hit_validated", "hit_fallback", "miss"
     # Anti-bot retry/proxy usage stats
     crawl_stats: Optional[Dict[str, Any]] = None
+    # LLM token usage (populated when using LLMExtractionStrategy)
+    token_usage: Optional[Dict[str, Any]] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 