Skip to content

Commit 40e1450

Browse files
Scraper: Parse Remarks sections (popular) as a Note
1 parent 5b278b5 commit 40e1450

File tree

1 file changed

+14
-1
lines changed

1 file changed

+14
-1
lines changed

migrate/oldwiki/scrape.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ def parse_links(source_label: str, url: str) -> dict:
102102
if not foundInCat:
103103
result[current_category].append((page_url, name))
104104
else:
105-
log(f"!!! Duplicate found in {foundInCat} when parsing {current_category}: {page_url}")
105+
log(f"=> Function {name} already found in category {foundInCat}, skipping duplicate in {current_category}.")
106106

107107
return result
108108

@@ -267,6 +267,18 @@ def parse_notes(content_div):
267267
"text": text
268268
})
269269

270+
# Additional hack: find section 'Remarks' and extract content into an info note
271+
remarks_header = content_div.find("span", id="Remarks")
272+
if remarks_header:
273+
remarks_paragraph = remarks_header.find_next("p")
274+
if remarks_paragraph:
275+
remarks_text = remarks_paragraph.get_text(" ", strip=True)
276+
if remarks_text:
277+
note_boxes.append({
278+
"type": "note",
279+
"text": remarks_text
280+
})
281+
270282
the_notes = []
271283
the_meta = []
272284
for note in note_boxes:
@@ -604,6 +616,7 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) ->
604616
func_pair = i_tag.a.text.strip()
605617

606618
func_notes, func_meta = parse_notes(content_div)
619+
handled_header_names.append("Remarks")
607620

608621
# Syntax: parameters and returns TODO
609622
handled_header_names.append("Syntax")

0 commit comments

Comments
 (0)