22
33import logging
44import os
5- import uuid
65import tempfile
6+ import uuid
7+ from io import BytesIO
78from typing import Callable , Dict , List , Optional
89from urllib .parse import unquote
910
1011import requests
12+ from llama_index .core .instrumentation import DispatcherSpanMixin , get_dispatcher
1113from llama_index .core .readers .base import BaseReader
1214from llama_index .core .schema import Document
13- from llama_index .core .instrumentation import DispatcherSpanMixin , get_dispatcher
1415from retrying import retry
15- from io import BytesIO
1616
1717from .event import (
18+ AttachmentFailedEvent ,
19+ AttachmentProcessedEvent ,
20+ AttachmentProcessingStartedEvent ,
21+ AttachmentSkippedEvent ,
1822 FileType ,
19- TotalPagesToProcessEvent ,
20- PageDataFetchStartedEvent ,
2123 PageDataFetchCompletedEvent ,
22- PageSkippedEvent ,
24+ PageDataFetchStartedEvent ,
2325 PageFailedEvent ,
24- AttachmentProcessingStartedEvent ,
25- AttachmentProcessedEvent ,
26- AttachmentSkippedEvent ,
27- AttachmentFailedEvent ,
26+ PageSkippedEvent ,
27+ TotalPagesToProcessEvent ,
2828)
2929
3030CONFLUENCE_API_TOKEN = "CONFLUENCE_API_TOKEN"
@@ -286,8 +286,7 @@ def load_data(
286286 != 1
287287 ):
288288 raise ValueError (
289- "Must specify exactly one among `space_key`, `page_ids`, `label`, `cql`"
290- " parameters."
289+ "Must specify exactly one among `space_key`, `page_ids`, `label`, `cql` parameters."
291290 )
292291
293292 if cursor and start :
@@ -314,14 +313,9 @@ def load_data(
314313 " please use `max_num_results` instead."
315314 )
316315
317- try :
318- import html2text # type: ignore
319- except ImportError :
320- raise ImportError (
321- "`html2text` package not found, please run `pip install html2text`"
322- )
316+ from .html_parser import HtmlTextParser
323317
324- text_maker = html2text . HTML2Text ()
318+ text_maker = HtmlTextParser ()
325319
326320 if not start :
327321 start = 0
@@ -603,7 +597,7 @@ def process_page(self, page, include_attachments, text_maker):
603597 except OSError :
604598 pass
605599 else :
606- text = text_maker .handle (page ["body" ]["export_view" ]["value" ]) + "" .join (
600+ text = text_maker .convert (page ["body" ]["export_view" ]["value" ]) + "" .join (
607601 attachment_texts
608602 )
609603
@@ -626,8 +620,7 @@ def process_attachment(self, page_id):
626620 pass
627621 except ImportError :
628622 raise ImportError (
629- "`pytesseract` or `pdf2image` or `Pillow` package not found, please run"
630- " `pip install pytesseract pdf2image Pillow`"
623+ "`pytesseract` or `pdf2image` or `Pillow` package not found, please run `pip install pytesseract pdf2image Pillow`"
631624 )
632625
633626 # depending on setup you may also need to set the correct path for poppler and tesseract
@@ -815,8 +808,7 @@ def process_pdf(self, link):
815808 from pdf2image import convert_from_bytes # type: ignore
816809 except ImportError :
817810 raise ImportError (
818- "`pytesseract` or `pdf2image` package not found, please run `pip"
819- " install pytesseract pdf2image`"
811+ "`pytesseract` or `pdf2image` package not found, please run `pip install pytesseract pdf2image`"
820812 )
821813
822814 response = self .confluence .request (path = link , absolute = True )
@@ -926,8 +918,7 @@ def process_image(self, link):
926918 from PIL import Image # type: ignore
927919 except ImportError :
928920 raise ImportError (
929- "`pytesseract` or `Pillow` package not found, please run `pip install"
930- " pytesseract Pillow`"
921+ "`pytesseract` or `Pillow` package not found, please run `pip install pytesseract Pillow`"
931922 )
932923
933924 text = ""
@@ -1166,8 +1157,7 @@ def process_svg(self, link):
11661157 from svglib .svglib import svg2rlg # type: ignore
11671158 except ImportError :
11681159 raise ImportError (
1169- "`pytesseract`, `Pillow`, or `svglib` package not found, please run"
1170- " `pip install pytesseract Pillow svglib`"
1160+ "`pytesseract`, `Pillow`, or `svglib` package not found, please run `pip install pytesseract Pillow svglib`"
11711161 )
11721162
11731163 response = self .confluence .request (path = link , absolute = True )
0 commit comments