# utils.py (forked from unclecode/crawl4ai)
import time
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
import json
import html
import re
import os
import platform
from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
from pathlib import Path
from typing import Optional, Tuple, Dict, Any
import requests
from requests.exceptions import InvalidSchema
import xxhash
from colorama import Fore, Style, init
import textwrap
import cProfile
import pstats
from functools import wraps
import asyncio


class InvalidCSSSelectorError(Exception):
    pass

def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
    """
    Create a styled message box with colored borders and formatted text.

    How it works:
    1. Determines box style and colors based on the message type (e.g., info, warning).
    2. Wraps text to fit within the specified width.
    3. Constructs a box using characters (single or double lines) with appropriate formatting.
    4. Adds optional newlines before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    """
    init()

    # Define border and text colors for different types
    styles = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = styles.get(type.lower(), styles["info"])

    # Define box characters based on line style
    box_chars = {
        "single": ("─", "│", "┌", "┐", "└", "┘"),
        "double": ("═", "║", "╔", "╗", "╚", "╝")
    }
    line_style = "double" if double_line else "single"
    h_line, v_line, tl, tr, bl, br = box_chars[line_style]

    # Process lines with lighter text color
    formatted_lines = []
    raw_lines = message.split('\n')
    if raw_lines:
        first_line = f"{prefix} {raw_lines[0].strip()}"
        wrapped_first = textwrap.fill(first_line, width=width - 4)
        formatted_lines.extend(wrapped_first.split('\n'))
        for line in raw_lines[1:]:
            if line.strip():
                wrapped = textwrap.fill(f" {line.strip()}", width=width - 4)
                formatted_lines.extend(wrapped.split('\n'))
            else:
                formatted_lines.append("")

    # Create the box with colored borders and lighter text
    horizontal_line = h_line * (width - 1)
    box = [
        f"{border_color}{tl}{horizontal_line}{tr}",
        *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines],
        f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
    ]
    result = "\n".join(box)
    if add_newlines:
        result = f"\n{result}\n"
    return result
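
# Illustrative usage (not part of the original module): a quick sanity check of
# create_box_message. Assumes a terminal that can render the colorama codes and
# box-drawing characters used above.
def _demo_create_box_message():
    print(create_box_message("Crawl started\nFetching 3 URLs...", type="info", width=60))
    print(create_box_message("Rate limit hit, backing off", type="warning", double_line=True))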

def calculate_semaphore_count():
    """
    Calculate the optimal semaphore count based on system resources.

    How it works:
    1. Determines the number of CPU cores and total system memory.
    2. Sets a base count as half of the available CPU cores.
    3. Limits the count based on memory, assuming 2GB per semaphore instance.
    4. Returns the minimum value between CPU and memory-based limits.

    Returns:
        int: The calculated semaphore count.
    """
    cpu_count = os.cpu_count()
    memory_gb = get_system_memory() / (1024 ** 3)  # Convert to GB
    base_count = max(1, cpu_count // 2)
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    return min(base_count, memory_based_cap)
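
# Illustrative usage (not in the original file): a sketch of how the calculated
# count could bound concurrent crawler tasks with asyncio.Semaphore.
def _demo_semaphore_usage():
    sem = asyncio.Semaphore(calculate_semaphore_count())

    async def bounded_task(i):
        async with sem:
            await asyncio.sleep(0.1)  # stand-in for real crawling work
            return i

    async def run():
        return await asyncio.gather(*(bounded_task(i) for i in range(10)))

    return asyncio.run(run())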

def get_system_memory():
    """
    Get the total system memory in bytes.

    How it works:
    1. Detects the operating system.
    2. Reads memory information from system-specific commands or files.
    3. Converts the memory to bytes for uniformity.

    Returns:
        int: The total system memory in bytes.

    Raises:
        OSError: If the operating system is unsupported.
    """
    system = platform.system()
    if system == "Linux":
        with open('/proc/meminfo', 'r') as mem:
            for line in mem:
                if line.startswith('MemTotal:'):
                    return int(line.split()[1]) * 1024  # Convert KB to bytes
    elif system == "Darwin":  # macOS
        import subprocess
        output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8')
        return int(output.strip())
    elif system == "Windows":
        import ctypes
        kernel32 = ctypes.windll.kernel32
        c_ulonglong = ctypes.c_ulonglong

        class MEMORYSTATUSEX(ctypes.Structure):
            _fields_ = [
                ('dwLength', ctypes.c_ulong),
                ('dwMemoryLoad', ctypes.c_ulong),
                ('ullTotalPhys', c_ulonglong),
                ('ullAvailPhys', c_ulonglong),
                ('ullTotalPageFile', c_ulonglong),
                ('ullAvailPageFile', c_ulonglong),
                ('ullTotalVirtual', c_ulonglong),
                ('ullAvailVirtual', c_ulonglong),
                ('ullAvailExtendedVirtual', c_ulonglong),
            ]

        memoryStatus = MEMORYSTATUSEX()
        memoryStatus.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
        kernel32.GlobalMemoryStatusEx(ctypes.byref(memoryStatus))
        return memoryStatus.ullTotalPhys
    else:
        raise OSError("Unsupported operating system")
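
# Illustrative usage (not in the original file): report total RAM in gigabytes.
def _demo_get_system_memory():
    total_gb = get_system_memory() / (1024 ** 3)
    return f"Total system memory: {total_gb:.1f} GB"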

def get_home_folder():
    """
    Get or create the home folder for Crawl4AI configuration and cache.

    How it works:
    1. Uses the CRAWL4_AI_BASE_DIRECTORY environment variable, defaulting to the user's home directory.
    2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
    3. Returns the path to the home folder.

    Returns:
        str: The path to the Crawl4AI home folder.
    """
    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
    os.makedirs(f"{home_folder}/cache", exist_ok=True)
    os.makedirs(f"{home_folder}/models", exist_ok=True)
    return home_folder

def beautify_html(escaped_html):
    """
    Beautifies an escaped HTML string.

    Parameters:
        escaped_html (str): A string containing escaped HTML.

    Returns:
        str: A beautifully formatted HTML string.
    """
    # Unescape the HTML string
    unescaped_html = html.unescape(escaped_html)

    # Use BeautifulSoup to parse and prettify the HTML
    soup = BeautifulSoup(unescaped_html, 'html.parser')
    pretty_html = soup.prettify()
    return pretty_html

def split_and_parse_json_objects(json_string):
    """
    Splits a JSON string which is a list of objects and tries to parse each object.

    Parameters:
        json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.

    Returns:
        tuple: A tuple containing two lists:
            - First list contains all successfully parsed JSON objects.
            - Second list contains the string representations of all segments that couldn't be parsed.
    """
    # Trim the leading '[' and trailing ']'
    if json_string.startswith('[') and json_string.endswith(']'):
        json_string = json_string[1:-1].strip()

    # Split the string into segments that look like individual JSON objects
    segments = []
    depth = 0
    start_index = 0
    for i, char in enumerate(json_string):
        if char == '{':
            if depth == 0:
                start_index = i
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                segments.append(json_string[start_index:i + 1])

    # Try parsing each segment
    parsed_objects = []
    unparsed_segments = []
    for segment in segments:
        try:
            obj = json.loads(segment)
            parsed_objects.append(obj)
        except json.JSONDecodeError:
            unparsed_segments.append(segment)
    return parsed_objects, unparsed_segments
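
# Illustrative usage (not in the original file): brace-depth splitting lets
# partially valid LLM output be recovered object by object.
def _demo_split_and_parse_json_objects():
    raw = '[{"title": "A"}, {"title": "B", "broken": }, {"title": "C"}]'
    parsed, failed = split_and_parse_json_objects(raw)
    # parsed -> [{'title': 'A'}, {'title': 'C'}]
    # failed -> ['{"title": "B", "broken": }']
    return parsed, failed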

def sanitize_html(html):
    """
    Sanitize an HTML string by escaping quotes.

    How it works:
    1. A regex pass that would strip unwanted/special characters is currently disabled (left commented out).
    2. Escapes double and single quotes for safe usage.

    Args:
        html (str): The HTML string to sanitize.

    Returns:
        str: The sanitized HTML string.
    """
    # The character-stripping pass below is currently commented out; only quote escaping is applied
    sanitized_html = html
    # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)

    # Escape all double and single quotes
    sanitized_html = sanitized_html.replace('"', '\\"').replace("'", "\\'")
    return sanitized_html

def sanitize_input_encode(text: str) -> str:
    """Sanitize input to handle potential encoding issues."""
    try:
        try:
            if not text:
                return ''
            # Attempt to encode and decode as UTF-8 to handle potential encoding issues
            return text.encode('utf-8', errors='ignore').decode('utf-8')
        except UnicodeEncodeError as e:
            print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
            # Fall back to ASCII if UTF-8 fails
            return text.encode('ascii', errors='ignore').decode('ascii')
    except Exception as e:
        raise ValueError(f"Error sanitizing input: {str(e)}") from e

def escape_json_string(s):
    """
    Escapes characters in a string to be JSON safe.

    Parameters:
        s (str): The input string to be escaped.

    Returns:
        str: The escaped string, safe for JSON encoding.
    """
    # Replace problematic backslash first
    s = s.replace('\\', '\\\\')
    # Replace the double quote
    s = s.replace('"', '\\"')
    # Escape control characters
    s = s.replace('\b', '\\b')
    s = s.replace('\f', '\\f')
    s = s.replace('\n', '\\n')
    s = s.replace('\r', '\\r')
    s = s.replace('\t', '\\t')
    # Additional problematic characters: Unicode control characters
    s = re.sub(r'[\x00-\x1f\x7f-\x9f]', lambda x: '\\u{:04x}'.format(ord(x.group())), s)
    return s
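
# Illustrative usage (not in the original file): the escaped text can be embedded
# in a hand-built JSON document and parsed back losslessly.
def _demo_escape_json_string():
    raw = 'He said "hi"\nand left\t(tab)'
    doc = '{"msg": "' + escape_json_string(raw) + '"}'
    assert json.loads(doc)["msg"] == raw
    return doc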

def replace_inline_tags(soup, tags, only_text=False):
    """
    Replace inline HTML tags with Markdown-style equivalents.

    How it works:
    1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
    2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
    3. Optionally replaces tags with their text content only.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tags (List[str]): List of tags to replace.
        only_text (bool): Whether to replace tags with plain text. Defaults to False.

    Returns:
        BeautifulSoup: Updated BeautifulSoup object with replaced tags.
    """
    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"<small>{tag.text}</small>",
        'mark': lambda tag: f"=={tag.text}=="
    }

    replacement_data = [(tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags]

    for tag_name, replacement_func in replacement_data:
        for tag in soup.find_all(tag_name):
            replacement_text = tag.text if only_text else replacement_func(tag)
            tag.replace_with(replacement_text)
    return soup

    # for tag_name in tags:
    #     for tag in soup.find_all(tag_name):
    #         if not only_text:
    #             replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
    #             tag.replace_with(replacement_text)
    #         else:
    #             tag.replace_with(tag.text)
    # return soup
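
# Illustrative usage (not in the original file): inline tags collapse to
# Markdown-style text before the heavier HTML-to-Markdown pass.
def _demo_replace_inline_tags():
    soup = BeautifulSoup("<p>This is <b>bold</b> and <code>inline()</code>.</p>", "html.parser")
    replace_inline_tags(soup, ['b', 'code'])
    return soup.get_text()  # "This is **bold** and `inline()`."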

def get_content_of_website(url, html, word_count_threshold=MIN_WORD_THRESHOLD, css_selector=None, **kwargs):
    """
    Extract structured content, media, and links from website HTML.

    How it works:
    1. Parses the HTML content using BeautifulSoup.
    2. Extracts internal/external links and media (images, videos, audios).
    3. Cleans the content by removing unwanted tags and attributes.
    4. Converts cleaned HTML to Markdown.
    5. Collects metadata and returns the extracted information.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    """
    try:
        if not html:
            return None

        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Get the content within the <body> tag
        body = soup.body

        # If css_selector is provided, extract content based on the selector
        if css_selector:
            selected_elements = body.select(css_selector)
            if not selected_elements:
                raise InvalidCSSSelectorError(f"Invalid CSS selector, no elements found for CSS selector: {css_selector}")
            div_tag = soup.new_tag('div')
            for el in selected_elements:
                div_tag.append(el)
            body = div_tag

        links = {
            'internal': [],
            'external': []
        }

        # Extract all internal and external links
        for a in body.find_all('a', href=True):
            href = a['href']
            url_base = url.split('/')[2]
            if href.startswith('http') and url_base not in href:
                links['external'].append({
                    'href': href,
                    'text': a.get_text()
                })
            else:
                links['internal'].append({
                    'href': href,
                    'text': a.get_text()
                })

        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
            tag.decompose()

        # Remove all attributes from remaining tags in body, except for img tags
        for tag in body.find_all():
            if tag.name != 'img':
                tag.attrs = {}

        # Extract all img tags into [{src: '', alt: ''}]
        media = {
            'images': [],
            'videos': [],
            'audios': []
        }
        for img in body.find_all('img'):
            media['images'].append({
                'src': img.get('src'),
                'alt': img.get('alt'),
                "type": "image"
            })

        # Extract all video tags into [{src: '', alt: ''}]
        for video in body.find_all('video'):
            media['videos'].append({
                'src': video.get('src'),
                'alt': video.get('alt'),
                "type": "video"
            })

        # Extract all audio tags into [{src: '', alt: ''}]
        for audio in body.find_all('audio'):
            media['audios'].append({
                'src': audio.get('src'),
                'alt': audio.get('alt'),
                "type": "audio"
            })

        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
            if alt_text:
                img.replace_with(soup.new_string(alt_text))
            else:
                img.decompose()

        # Helper that replaces the content of every "pre" tag with its inner text
        def replace_pre_tags_with_text(node):
            for child in node.find_all('pre'):
                # Set the child's inner HTML to its text
                child.string = child.get_text()
            return node

        # Replace all "pre" tags with their inner text
        body = replace_pre_tags_with_text(body)

        # Replace inline tags with their text content
        body = replace_inline_tags(
            body,
            ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'],
            only_text=kwargs.get('only_text', False)
        )

        # Recursively remove empty elements, their parent elements, and elements with word count below threshold
        def remove_empty_and_low_word_count_elements(node, word_count_threshold):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    remove_empty_and_low_word_count_elements(child, word_count_threshold)
                    word_count = len(child.get_text(strip=True).split())
                    if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
                        child.decompose()
            return node

        body = remove_empty_and_low_word_count_elements(body, word_count_threshold)

        def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
            # We'll use a list to collect all tags that don't meet the word count requirement
            tags_to_remove = []

            # Traverse all tags in the body
            for tag in body.find_all(True):  # True here means all tags
                # Check if the tag contains text and if it's not just whitespace
                if tag.string and tag.string.strip():
                    # Split the text by spaces and count the words
                    word_count = len(tag.string.strip().split())
                    # If the word count is less than the threshold, mark the tag for removal
                    if word_count < word_count_threshold:
                        tags_to_remove.append(tag)

            # Remove all marked tags from the tree
            for tag in tags_to_remove:
                tag.decompose()  # or tag.extract() to remove and get the element

            return body

        # Remove small text tags
        body = remove_small_text_tags(body, word_count_threshold)

        def is_empty_or_whitespace(tag: Tag):
            if isinstance(tag, NavigableString):
                return not tag.strip()
            # Check if the tag itself is empty or all its children are empty/whitespace
            if not tag.contents:
                return True
            return all(is_empty_or_whitespace(child) for child in tag.contents)

        def remove_empty_tags(body: Tag):
            # Continue processing until no more changes are made
            changes = True
            while changes:
                changes = False
                # Collect all tags that are empty or contain only whitespace
                empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)]
                for tag in empty_tags:
                    # If a tag is empty, decompose it
                    tag.decompose()
                    changes = True  # Mark that a change was made
            return body

        # Remove empty tags
        body = remove_empty_tags(body)

        # Flatten nested elements with only one child of the same type
        def flatten_nested_elements(node):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    flatten_nested_elements(child)
                    if len(child.contents) == 1 and child.contents[0].name == child.name:
                        # print('Flattening:', child.name)
                        child_content = child.contents[0]
                        child.replace_with(child_content)
            return node

        body = flatten_nested_elements(body)

        # Remove comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove consecutive empty newlines and replace multiple spaces with a single space
        cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')

        # Sanitize the cleaned HTML content
        cleaned_html = sanitize_html(cleaned_html)
        # sanitized_html = escape_json_string(cleaned_html)

        # Convert cleaned HTML to Markdown
        h = CustomHTML2Text()
        h.ignore_links = True
        markdown = h.handle(cleaned_html)
        markdown = markdown.replace('    ```', '```')

        try:
            meta = extract_metadata(html, soup)
        except Exception as e:
            print('Error extracting metadata:', str(e))
            meta = {}

        # Return the Markdown content
        return {
            'markdown': markdown,
            'cleaned_html': cleaned_html,
            'success': True,
            'media': media,
            'links': links,
            'metadata': meta
        }
    except Exception as e:
        print('Error processing HTML content:', str(e))
        raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
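
# Illustrative usage (not in the original file): run the extractor on a small
# hand-written page, passing a low word_count_threshold so the short sample text
# survives filtering.
def _demo_get_content_of_website():
    sample = """
    <html><body>
      <h1>Release notes</h1>
      <p>Version 0.3 adds an <b>async crawler</b> and better <code>markdown</code> output
         so downstream pipelines can consume pages more easily.</p>
      <a href="https://example.com/docs">docs</a>
      <img src="/logo.png" alt="project logo">
    </body></html>
    """
    result = get_content_of_website("https://example.com/blog", sample, word_count_threshold=1)
    return result['markdown'], result['links'], result['media']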

def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: Optional[str] = None, **kwargs) -> Dict[str, Any]:
    """
    Extract optimized content, media, and links from website HTML.

    How it works:
    1. Similar to `get_content_of_website`, but optimized for performance.
    2. Filters and scores images for usefulness.
    3. Extracts contextual descriptions for media files.
    4. Handles excluded tags and CSS selectors.
    5. Cleans HTML and converts it to Markdown.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
        **kwargs: Additional options for customization.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body

    image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

    for tag in kwargs.get('excluded_tags', []) or []:
        for el in body.select(tag):
            el.decompose()

    if css_selector:
        selected_elements = body.select(css_selector)
        if not selected_elements:
            raise InvalidCSSSelectorError(f"Invalid CSS selector, no elements found for CSS selector: {css_selector}")
        body = soup.new_tag('div')
        for el in selected_elements:
            body.append(el)

    links = {'internal': [], 'external': []}
    media = {'images': [], 'videos': [], 'audios': []}

    # Extract meaningful text for media files from the closest parent
    def find_closest_parent_with_useful_text(tag):
        current_tag = tag
        while current_tag:
            current_tag = current_tag.parent
            # Get the text content from the parent tag
            if current_tag:
                text_content = current_tag.get_text(separator=' ', strip=True)
                # Check if the text content has at least the minimum word count
                if len(text_content.split()) >= image_description_min_word_threshold:
                    return text_content
        return None

    def process_image(img, url, index, total_images):
        # Check that an image is actually displayed and not inside an undesired HTML element
        def is_valid_image(img, parent, parent_classes):
            style = img.get('style', '')
            src = img.get('src', '')
            classes_to_check = ['button', 'icon', 'logo']
            tags_to_check = ['button', 'input']
            return all([
                'display:none' not in style,
                src,
                not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
                parent.name not in tags_to_check
            ])

        # Score an image for its usefulness
        def score_image_for_usefulness(img, base_url, index, images_count):
            # Parse an image height/width value and its unit
            def parse_dimension(dimension):
                if dimension:
                    match = re.match(r"(\d+)(\D*)", dimension)
                    if match:
                        number = int(match.group(1))
                        unit = match.group(2) or 'px'  # Default unit is 'px' if not specified
                        return number, unit
                return None, None

            # Fetch image file metadata to extract size and extension
            def fetch_image_file_size(img, base_url):
                # If src is a relative path, construct the full URL; otherwise it may be a CDN URL
                img_url = urljoin(base_url, img.get('src'))
                try:
                    response = requests.head(img_url)
                    if response.status_code == 200:
                        return response.headers.get('Content-Length', None)
                    else:
                        print(f"Failed to retrieve file size for {img_url}")
                        return None
                except InvalidSchema:
                    return None

            image_height = img.get('height')
            height_value, height_unit = parse_dimension(image_height)
            image_width = img.get('width')
            width_value, width_unit = parse_dimension(image_width)
            image_size = 0  # int(fetch_image_file_size(img, base_url) or 0)
            image_format = os.path.splitext(img.get('src', ''))[1].lower()
            # Remove the leading '.' from the format
            image_format = image_format.strip('.')

            score = 0
            if height_value:
                if height_unit == 'px' and height_value > 150:
                    score += 1
                if height_unit in ['%', 'vh', 'vmin', 'vmax'] and height_value > 30:
                    score += 1
            if width_value:
                if width_unit == 'px' and width_value > 150:
                    score += 1
                if width_unit in ['%', 'vh', 'vmin', 'vmax'] and width_value > 30:
                    score += 1
            if image_size > 10000:
                score += 1
            if img.get('alt') != '':
                score += 1
            if image_format in ('jpg', 'png', 'webp'):
                score += 1
            if index / images_count < 0.5:
                score += 1
            return score

        if not is_valid_image(img, img.parent, img.parent.get('class', [])):
            return None
        score = score_image_for_usefulness(img, url, index, total_images)
        if score <= IMAGE_SCORE_THRESHOLD:
            return None
        return {
            'src': img.get('src', '').replace('\\"', '"').strip(),
            'alt': img.get('alt', ''),
            'desc': find_closest_parent_with_useful_text(img),
            'score': score,
            'type': 'image'
        }

    def process_element(element: element.PageElement) -> bool:
        try:
            if isinstance(element, NavigableString):
                if isinstance(element, Comment):
                    element.extract()
                return False

            if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
                element.decompose()
                return False

            keep_element = False

            if element.name == 'a' and element.get('href'):
                href = element['href']
                url_base = url.split('/')[2]
                link_data = {'href': href, 'text': element.get_text()}
                if href.startswith('http') and url_base not in href:
                    links['external'].append(link_data)
                else:
                    links['internal'].append(link_data)
                keep_element = True

            elif element.name == 'img':
                return True  # Always keep image elements

            elif element.name in ['video', 'audio']:
                media[f"{element.name}s"].append({
                    'src': element.get('src'),
                    'alt': element.get('alt'),
                    'type': element.name,
                    'description': find_closest_parent_with_useful_text(element)
                })
                source_tags = element.find_all('source')
                for source_tag in source_tags:
                    media[f"{element.name}s"].append({
                        'src': source_tag.get('src'),
                        'alt': element.get('alt'),
                        'type': element.name,
                        'description': find_closest_parent_with_useful_text(element)
                    })
                return True  # Always keep video and audio elements

            if element.name != 'pre':
                if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
                    if kwargs.get('only_text', False):
                        element.replace_with(element.get_text())
                    else:
                        element.unwrap()
                elif element.name != 'img':
                    element.attrs = {}

            # Process children
            for child in list(element.children):
                if isinstance(child, NavigableString) and not isinstance(child, Comment):
                    if len(child.strip()) > 0:
                        keep_element = True
                else:
                    if process_element(child):
                        keep_element = True

            # Check word count
            if not keep_element:
                word_count = len(element.get_text(strip=True).split())
                keep_element = word_count >= word_count_threshold

            if not keep_element:
                element.decompose()

            return keep_element
        except Exception as e:
            print('Error processing element:', str(e))
            return False

    # Process images by filtering and extracting contextual text from the page
    imgs = body.find_all('img')
    media['images'] = [
        result for result in
        (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
        if result is not None
    ]

    process_element(body)

    def flatten_nested_elements(node):
        if isinstance(node, NavigableString):
            return node
        if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name:
            return flatten_nested_elements(node.contents[0])
        node.contents = [flatten_nested_elements(child) for child in node.contents]
        return node

    body = flatten_nested_elements(body)

    base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
    for img in imgs:
        try:
            src = img.get('src', '')
            if base64_pattern.match(src):
                img['src'] = base64_pattern.sub('', src)
        except Exception:
            pass

    cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
    cleaned_html = sanitize_html(cleaned_html)

    h = CustomHTML2Text()
    h.ignore_links = True
    markdown = h.handle(cleaned_html)
    markdown = markdown.replace('    ```', '```')

    try:
        meta = extract_metadata(html, soup)
    except Exception as e:
        print('Error extracting metadata:', str(e))
        meta = {}

    return {
        'markdown': markdown,
        'cleaned_html': cleaned_html,
        'success': True,
        'media': media,
        'links': links,
        'metadata': meta
    }
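
# Illustrative usage (not in the original file): same call shape as
# get_content_of_website, with the extra knobs this function reads from **kwargs.
# The local "page.html" snapshot is hypothetical.
def _demo_get_content_of_website_optimized():
    with open("page.html", "r", encoding="utf-8") as f:
        raw = f.read()
    return get_content_of_website_optimized(
        "https://example.com/article",
        raw,
        word_count_threshold=5,
        css_selector="article",
        excluded_tags=["nav", "footer"],
        image_description_min_word_threshold=10,
        only_text=False,
    )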

def extract_metadata(html, soup=None):
    """
    Extract page metadata (title, description, keywords, author, Open Graph, and
    Twitter Card tags) from the <head> of an HTML document.

    Args:
        html (str): The HTML content of the page.
        soup (Optional[BeautifulSoup]): An already-parsed soup to reuse. Defaults to None.

    Returns:
        Dict[str, Any]: A dictionary of metadata fields found in the document head.
    """
    metadata = {}
    if not html and not soup:
        return {}
    if not soup:
        soup = BeautifulSoup(html, 'lxml')

    head = soup.head
    if not head:
        return metadata

    # Title
    title_tag = head.find('title')
    metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None

    # Meta description
    description_tag = head.find('meta', attrs={'name': 'description'})
    metadata['description'] = description_tag.get('content', '').strip() if description_tag else None

    # Meta keywords
    keywords_tag = head.find('meta', attrs={'name': 'keywords'})
    metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None

    # Meta author
    author_tag = head.find('meta', attrs={'name': 'author'})
    metadata['author'] = author_tag.get('content', '').strip() if author_tag else None

    # Open Graph metadata
    og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
    for tag in og_tags:
        property_name = tag.get('property', '').strip()
        content = tag.get('content', '').strip()
        if property_name and content:
            metadata[property_name] = content

    # Twitter Card metadata
    twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
    for tag in twitter_tags:
        property_name = tag.get('name', '').strip()
        content = tag.get('content', '').strip()
        if property_name and content:
            metadata[property_name] = content

    return metadata
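
# Illustrative usage (not in the original file): Open Graph and Twitter Card
# properties are returned under their original names ("og:title", "twitter:card").
def _demo_extract_metadata():
    page = """
    <html><head>
      <title>Crawl4AI Utils</title>
      <meta name="description" content="Helper functions for crawling.">
      <meta property="og:title" content="Crawl4AI Utils">
      <meta name="twitter:card" content="summary">
    </head><body></body></html>
    """
    return extract_metadata(page)
    # -> {'title': 'Crawl4AI Utils', 'description': 'Helper functions for crawling.',
    #     'keywords': None, 'author': None, 'og:title': 'Crawl4AI Utils', 'twitter:card': 'summary'}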

def extract_xml_tags(string):
    """
    Extracts XML tags from a string.

    Args:
        string (str): The input string containing XML tags.

    Returns:
        List[str]: A list of unique XML tag names extracted from the input string.
    """
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))

def extract_xml_data(tags, string):
    """
    Extract data for specified XML tags from a string.

    How it works:
    1. Searches the string for each tag using regex.
    2. Extracts the content within the tags.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
    """
    data = {}
    for tag in tags:
        pattern = f"<{tag}>(.*?)</{tag}>"
        match = re.search(pattern, string, re.DOTALL)
        if match:
            data[tag] = match.group(1).strip()
        else:
            data[tag] = ""
    return data
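
# Illustrative usage (not in the original file): a typical pattern for pulling
# structured fields out of an LLM response that wraps answers in XML-style tags.
def _demo_extract_xml_data():
    response = "<blocks>[1, 2, 3]</blocks><summary>Two short paragraphs about pricing.</summary>"
    tags = extract_xml_tags(response)        # unique tag names, e.g. ['blocks', 'summary']
    return extract_xml_data(tags, response)  # {'blocks': '[1, 2, 3]', 'summary': 'Two short ...'}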

def perform_completion_with_backoff(
    provider,
    prompt_with_variables,
    api_token,
    json_response=False,
    base_url=None,
    **kwargs
):
    """
    Perform an API completion request with exponential backoff.

    How it works:
    1. Sends a completion request to the API.
    2. Retries on rate-limit errors with exponential delays.
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.