# sitemap.py
"""Loader that fetches a sitemap and loads those URLs."""
import itertools
import re
from typing import Any, Callable, Generator, Iterable, List, Optional
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.schema import Document
def _default_parsing_function(content: Any) -> str:
return str(content.get_text())
def _default_meta_function(meta: dict, _content: Any) -> dict:
return {"source": meta["loc"], **meta}
def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, None]:
it = iter(iterable)
while item := list(itertools.islice(it, size)):
yield item
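
# Illustration of _batch_block (hypothetical values): blocks hold at most
# `size` items, and the final block may be shorter.
#
#   >>> list(_batch_block([1, 2, 3, 4, 5], 2))
#   [[1, 2], [3, 4], [5]]
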
class SitemapLoader(WebBaseLoader):
    """Loader that fetches a sitemap and loads those URLs."""

    def __init__(
        self,
        web_path: str,
        filter_urls: Optional[List[str]] = None,
        parsing_function: Optional[Callable] = None,
        blocksize: Optional[int] = None,
        blocknum: int = 0,
        meta_function: Optional[Callable] = None,
        is_local: bool = False,
    ):
        """Initialize with webpage path and optional filter URLs.

        Args:
            web_path: URL of the sitemap. Can also be a local path.
            filter_urls: list of strings or regexes applied to filter the URLs
                that are parsed and loaded.
            parsing_function: Function to parse bs4.Soup output.
            blocksize: number of sitemap locations per block.
            blocknum: the number of the block that should be loaded - zero indexed.
            meta_function: Function to parse bs4.Soup output for metadata.
                When overriding this function, remember to copy metadata["loc"]
                to metadata["source"] if you are using this field.
            is_local: whether the sitemap is a local file.
        """
        if blocksize is not None and blocksize < 1:
            raise ValueError("Sitemap blocksize should be at least 1")

        if blocknum < 0:
            raise ValueError("Sitemap blocknum cannot be lower than 0")

        try:
            import lxml  # noqa:F401
        except ImportError:
            raise ImportError(
                "lxml package not found, please install it with `pip install lxml`"
            )

        super().__init__(web_path)

        self.filter_urls = filter_urls
        self.parsing_function = parsing_function or _default_parsing_function
        self.meta_function = meta_function or _default_meta_function
        self.blocksize = blocksize
        self.blocknum = blocknum
        self.is_local = is_local
    def parse_sitemap(self, soup: Any) -> List[dict]:
        """Parse sitemap xml and load into a list of dicts."""
        els = []

        for url in soup.find_all("url"):
            loc = url.find("loc")
            if not loc:
                continue

            # Strip leading and trailing whitespace and newlines
            loc_text = loc.text.strip()

            if self.filter_urls and not any(
                re.match(r, loc_text) for r in self.filter_urls
            ):
                continue

            els.append(
                {
                    tag: prop.text
                    for tag in ["loc", "lastmod", "changefreq", "priority"]
                    if (prop := url.find(tag))
                }
            )

        # Recurse into nested sitemaps referenced by <sitemap> tags.
        for sitemap in soup.find_all("sitemap"):
            loc = sitemap.find("loc")
            if not loc:
                continue
            soup_child = self.scrape_all([loc.text], "xml")[0]
            els.extend(self.parse_sitemap(soup_child))

        return els
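
    # Shape of the entries parse_sitemap returns (values illustrative, not
    # from a real sitemap); keys are omitted when the tag is absent:
    #
    #   {"loc": "https://example.com/page", "lastmod": "2023-01-01",
    #    "changefreq": "weekly", "priority": "0.5"}
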
    def load(self) -> List[Document]:
        """Load sitemap."""
        if self.is_local:
            try:
                import bs4
            except ImportError:
                raise ImportError(
                    "beautifulsoup4 package not found, please install it with "
                    "`pip install beautifulsoup4`"
                )
            with open(self.web_path) as fp:
                soup = bs4.BeautifulSoup(fp, "xml")
        else:
            soup = self.scrape("xml")

        els = self.parse_sitemap(soup)

        if self.blocksize is not None:
            elblocks = list(_batch_block(els, self.blocksize))
            blockcount = len(elblocks)
            if blockcount - 1 < self.blocknum:
                raise ValueError(
                    "Selected sitemap does not contain enough blocks for given blocknum"
                )
            else:
                els = elblocks[self.blocknum]

        results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])

        return [
            Document(
                page_content=self.parsing_function(results[i]),
                metadata=self.meta_function(els[i], results[i]),
            )
            for i in range(len(results))
        ]
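

if __name__ == "__main__":
    # Minimal usage sketch. The sitemap URL and filter pattern below are
    # placeholders, not real endpoints; adjust them to your own site.
    loader = SitemapLoader(
        "https://example.com/sitemap.xml",
        filter_urls=["https://example.com/blog/.*"],
        blocksize=10,  # optional: split the sitemap into blocks of 10 URLs
        blocknum=0,  # optional: load only the first of those blocks
    )
    docs = loader.load()
    for doc in docs:
        print(doc.metadata["source"])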