Skip to content

Commit b56ea46

Browse files
authored
Refactor Confluence integration: Update license to MIT, remove requirements.txt, and implement HtmlTextParser for HTML to Markdown conversion. Update dependencies and tests accordingly. (run-llama#20262)
1 parent 7b7430f commit b56ea46

File tree

10 files changed

+394
-685
lines changed

10 files changed

+394
-685
lines changed

llama-index-integrations/readers/llama-index-readers-confluence/LICENSE

Lines changed: 21 additions & 595 deletions
Large diffs are not rendered by default.

llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,29 @@
22

33
import logging
44
import os
5-
import uuid
65
import tempfile
6+
import uuid
7+
from io import BytesIO
78
from typing import Callable, Dict, List, Optional
89
from urllib.parse import unquote
910

1011
import requests
12+
from llama_index.core.instrumentation import DispatcherSpanMixin, get_dispatcher
1113
from llama_index.core.readers.base import BaseReader
1214
from llama_index.core.schema import Document
13-
from llama_index.core.instrumentation import DispatcherSpanMixin, get_dispatcher
1415
from retrying import retry
15-
from io import BytesIO
1616

1717
from .event import (
18+
AttachmentFailedEvent,
19+
AttachmentProcessedEvent,
20+
AttachmentProcessingStartedEvent,
21+
AttachmentSkippedEvent,
1822
FileType,
19-
TotalPagesToProcessEvent,
20-
PageDataFetchStartedEvent,
2123
PageDataFetchCompletedEvent,
22-
PageSkippedEvent,
24+
PageDataFetchStartedEvent,
2325
PageFailedEvent,
24-
AttachmentProcessingStartedEvent,
25-
AttachmentProcessedEvent,
26-
AttachmentSkippedEvent,
27-
AttachmentFailedEvent,
26+
PageSkippedEvent,
27+
TotalPagesToProcessEvent,
2828
)
2929

3030
CONFLUENCE_API_TOKEN = "CONFLUENCE_API_TOKEN"
@@ -286,8 +286,7 @@ def load_data(
286286
!= 1
287287
):
288288
raise ValueError(
289-
"Must specify exactly one among `space_key`, `page_ids`, `label`, `cql`"
290-
" parameters."
289+
"Must specify exactly one among `space_key`, `page_ids`, `label`, `cql` parameters."
291290
)
292291

293292
if cursor and start:
@@ -314,14 +313,9 @@ def load_data(
314313
" please use `max_num_results` instead."
315314
)
316315

317-
try:
318-
import html2text # type: ignore
319-
except ImportError:
320-
raise ImportError(
321-
"`html2text` package not found, please run `pip install html2text`"
322-
)
316+
from .html_parser import HtmlTextParser
323317

324-
text_maker = html2text.HTML2Text()
318+
text_maker = HtmlTextParser()
325319

326320
if not start:
327321
start = 0
@@ -603,7 +597,7 @@ def process_page(self, page, include_attachments, text_maker):
603597
except OSError:
604598
pass
605599
else:
606-
text = text_maker.handle(page["body"]["export_view"]["value"]) + "".join(
600+
text = text_maker.convert(page["body"]["export_view"]["value"]) + "".join(
607601
attachment_texts
608602
)
609603

@@ -626,8 +620,7 @@ def process_attachment(self, page_id):
626620
pass
627621
except ImportError:
628622
raise ImportError(
629-
"`pytesseract` or `pdf2image` or `Pillow` package not found, please run"
630-
" `pip install pytesseract pdf2image Pillow`"
623+
"`pytesseract` or `pdf2image` or `Pillow` package not found, please run `pip install pytesseract pdf2image Pillow`"
631624
)
632625

633626
# depending on setup you may also need to set the correct path for poppler and tesseract
@@ -815,8 +808,7 @@ def process_pdf(self, link):
815808
from pdf2image import convert_from_bytes # type: ignore
816809
except ImportError:
817810
raise ImportError(
818-
"`pytesseract` or `pdf2image` package not found, please run `pip"
819-
" install pytesseract pdf2image`"
811+
"`pytesseract` or `pdf2image` package not found, please run `pip install pytesseract pdf2image`"
820812
)
821813

822814
response = self.confluence.request(path=link, absolute=True)
@@ -926,8 +918,7 @@ def process_image(self, link):
926918
from PIL import Image # type: ignore
927919
except ImportError:
928920
raise ImportError(
929-
"`pytesseract` or `Pillow` package not found, please run `pip install"
930-
" pytesseract Pillow`"
921+
"`pytesseract` or `Pillow` package not found, please run `pip install pytesseract Pillow`"
931922
)
932923

933924
text = ""
@@ -1166,8 +1157,7 @@ def process_svg(self, link):
11661157
from svglib.svglib import svg2rlg # type: ignore
11671158
except ImportError:
11681159
raise ImportError(
1169-
"`pytesseract`, `Pillow`, or `svglib` package not found, please run"
1170-
" `pip install pytesseract Pillow svglib`"
1160+
"`pytesseract`, `Pillow`, or `svglib` package not found, please run `pip install pytesseract Pillow svglib`"
11711161
)
11721162

11731163
response = self.confluence.request(path=link, absolute=True)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
class HtmlTextParser:
    """Convert HTML markup to Markdown text using the ``markdownify`` package.

    The dependency is imported lazily in the constructor so that a missing
    package is reported with an actionable install hint as soon as a parser
    is created, rather than on the first conversion.
    """

    def __init__(self) -> None:
        """Verify that ``markdownify`` is available and keep a handle to it.

        Raises:
            ImportError: If the ``markdownify`` package is not installed. The
                original import failure is chained as ``__cause__``.
        """
        try:
            from markdownify import markdownify
        except ImportError as err:
            raise ImportError(
                "`markdownify` package not found, please run `pip install markdownify`"
            ) from err

        # Bind once so convert() does not need to repeat the import.
        self._markdownify = markdownify

    def convert(self, html: str) -> str:
        """Return *html* rendered as Markdown.

        Args:
            html: HTML markup to convert. Falsy input (``""`` or ``None``)
                short-circuits to an empty string.

        Returns:
            The Markdown text, with ATX (``#``) headings and ``*`` bullets.
        """
        if not html:
            return ""

        # `strip` is deliberately NOT passed for script/style: in markdownify
        # that option only drops the tag markup and keeps the inner text, so
        # listing those tags would leak JavaScript/CSS source into the output.
        # NOTE(review): left alone, markdownify's built-in script/style
        # handlers are expected to discard that content entirely -- confirm
        # against the markdownify version pinned by this package.
        return self._markdownify(
            html,
            heading_style="ATX",  # Use # for headings instead of underlines
            bullets="*",  # Use * for unordered lists
        )

llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,15 @@ dev = [
2626

2727
[project]
2828
name = "llama-index-readers-confluence"
29-
version = "0.5.0"
29+
version = "0.6.0"
3030
description = "llama-index readers confluence integration"
3131
authors = [{name = "Your Name", email = "you@example.com"}]
32-
requires-python = ">=3.9,<4.0"
32+
requires-python = ">=3.9,<3.14"
3333
readme = "README.md"
34-
license = "GPL-3.0-or-later"
34+
license = "MIT"
3535
maintainers = [{name = "zywilliamli"}]
3636
dependencies = [
3737
"atlassian-python-api>=3.41.9,<5",
38-
"html2text>=2024.2.26,<2025",
3938
"pytesseract>=0.3.10,<0.4",
4039
"pdf2image>=1.17.0,<2",
4140
"pillow>=10.2.0,<11",
@@ -44,6 +43,7 @@ dependencies = [
4443
"svglib>=1.5,<1.6",
4544
"retrying>=1.3.4,<2",
4645
"llama-index-core>=0.13.0,<0.15",
46+
"markdownify>=1.2.0,<2.0.0",
4747
]
4848

4949
[tool.codespell]

llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt

Lines changed: 0 additions & 9 deletions
This file was deleted.

0 commit comments

Comments
 (0)