Skip to content

Commit eabe81a

Browse files
add PyPDF2 for PDF parsing and enhance DocumentLoader to support various file types and URLs
1 parent fc53b81 commit eabe81a

File tree

11 files changed

+805
-726
lines changed

11 files changed

+805
-726
lines changed

build/lib/semantio/agent.py

Lines changed: 218 additions & 313 deletions
Large diffs are not rendered by default.
Lines changed: 176 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,191 @@
1-
from typing import List, Dict, Any
1+
import os
2+
import json
3+
import csv
4+
import re
25
from pathlib import Path
6+
from typing import List, Dict, Any
7+
from io import BytesIO
38

4-
class DocumentLoader:
5-
"""
6-
A class to load documents from various sources (e.g., files, URLs) into the knowledge base.
7-
"""
9+
import requests
10+
from bs4 import BeautifulSoup
811

9-
def __init__(self):
10-
"""
11-
Initialize the DocumentLoader.
12-
"""
13-
pass
12+
# Optional: Import pandas for XLSX support and PyPDF2 for PDF support
13+
try:
14+
import pandas as pd
15+
except ImportError:
16+
pd = None
1417

15-
def load_from_file(self, file_path: str) -> List[Dict[str, Any]]:
16-
"""
17-
Load documents from a file.
18+
try:
19+
from PyPDF2 import PdfReader
20+
except ImportError:
21+
PdfReader = None
1822

19-
Args:
20-
file_path (str): The path to the file.
2123

22-
Returns:
23-
List[Dict[str, Any]]: A list of documents, where each document is a dictionary.
24-
"""
25-
file_path = Path(file_path)
26-
if not file_path.exists():
27-
raise FileNotFoundError(f"File not found: {file_path}")
24+
def flatten_json(data: Any, parent_key: str = "", separator: str = "_") -> List[Dict[str, Any]]:
    """
    Recursively flatten a JSON structure into a list of single-entry dicts.

    Each scalar leaf yields an entry mapping its flattened key (path parts
    joined by *separator*) to the value. String leaves additionally yield a
    reverse entry mapping the value back to its flattened key. Non-container
    top-level input yields an empty list.
    """
    entries: List[Dict[str, Any]] = []

    def _join(prefix: str, part: str) -> str:
        # Prepend the parent path only when one exists.
        return f"{prefix}{separator}{part}" if prefix else part

    # Normalize both container kinds into (flattened_key, value) pairs.
    if isinstance(data, dict):
        pairs = [(_join(parent_key, key), value) for key, value in data.items()]
    elif isinstance(data, list):
        pairs = [(_join(parent_key, str(idx)), value) for idx, value in enumerate(data)]
    else:
        pairs = []

    for flat_key, value in pairs:
        if isinstance(value, (dict, list)):
            entries.extend(flatten_json(value, flat_key, separator))
        else:
            entries.append({flat_key: value})
            if isinstance(value, str):
                # Reverse mapping: value -> its flattened key.
                entries.append({value: flat_key})
    return entries
2850

29-
# Example: Load a JSON file
30-
if file_path.suffix == ".json":
31-
import json
32-
with open(file_path, "r") as f:
33-
return json.load(f)
34-
# Example: Load a text file
35-
elif file_path.suffix == ".txt":
36-
with open(file_path, "r") as f:
37-
return [{"text": f.read()}]
38-
else:
39-
raise ValueError(f"Unsupported file type: {file_path.suffix}")
4051

41-
def load_from_url(self, url: str) -> List[Dict[str, Any]]:
52+
class DocumentLoader:
    """
    A dynamic document loader that supports multiple source types:

    - Local files: CSV, TXT, JSON, XLSX, PDF
    - URL sources: HTML websites (text extraction), JSON APIs, PDF URLs
    - YouTube links: extracts transcripts using youtube_transcript_api

    For JSON sources, if flatten is True (default), the returned document is a
    dictionary with two keys:
        "original": the raw JSON data,
        "flattened": a list of flattened key/value pairs (including reverse mappings).
    """

    def load(self, source: str, flatten: bool = True) -> List[Dict[str, Any]]:
        """
        Load documents from the given source.

        Args:
            source: A URL (``http://`` / ``https://``) or a local file path.
            flatten: Whether JSON content should also be flattened.

        Returns:
            A list of document dictionaries.
        """
        # Match an explicit URL scheme rather than the bare "http" prefix so
        # that a local file named e.g. "http_notes.txt" is not mistaken for a URL.
        if source.startswith(("http://", "https://")):
            return self.load_from_url(source, flatten=flatten)
        return self.load_from_file(source, flatten=flatten)

    def load_from_file(self, file_path: str, flatten: bool = True) -> List[Dict[str, Any]]:
        """
        Load documents from a local file, dispatching on the file extension.

        Args:
            file_path: Path to a .json, .txt, .csv, .xlsx, or .pdf file.
            flatten: For JSON files, also emit flattened key/value pairs.

        Returns:
            A list of document dictionaries.

        Raises:
            FileNotFoundError: If the file does not exist.
            ImportError: If an optional dependency (pandas, PyPDF2) is missing.
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        ext = path.suffix.lower()
        if ext == ".json":
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if flatten:
                return [{"original": data, "flattened": flatten_json(data)}]
            return data if isinstance(data, list) else [data]
        if ext == ".txt":
            with open(path, "r", encoding="utf-8") as f:
                return [{"text": f.read()}]
        if ext == ".csv":
            # newline="" is required by the csv module so embedded newlines
            # inside quoted fields are parsed correctly.
            with open(path, "r", encoding="utf-8", newline="") as f:
                return list(csv.DictReader(f))
        if ext == ".xlsx":
            if pd is None:
                raise ImportError("pandas is required to load XLSX files")
            return pd.read_excel(path).to_dict(orient="records")
        if ext == ".pdf":
            if PdfReader is None:
                raise ImportError("PyPDF2 is required to load PDF files")
            reader = PdfReader(str(path))
            # extract_text() may return None for pages with no extractable text.
            content = "".join(page.extract_text() or "" for page in reader.pages)
            return [{"text": content}]
        raise ValueError(f"Unsupported file type: {ext}")

    def load_from_url(self, url: str, flatten: bool = True) -> List[Dict[str, Any]]:
        """
        Load documents from a URL, dispatching on the response Content-Type.

        YouTube links are routed to transcript extraction before any HTTP
        request is made. Unknown content types fall back to raw text.

        Raises:
            ValueError: If the HTTP request does not return status 200.
            ImportError: If PyPDF2 is missing for a PDF response.
        """
        if "youtube.com" in url or "youtu.be" in url:
            return self._load_youtube(url)
        # A bounded timeout prevents the loader from hanging forever on an
        # unresponsive server.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            raise ValueError(f"Failed to fetch data from URL: {url}")
        content_type = response.headers.get("Content-Type", "").lower()
        if "application/json" in content_type:
            data = response.json()
            if flatten:
                return [{"original": data, "flattened": flatten_json(data)}]
            return data if isinstance(data, list) else [data]
        if "text/html" in content_type:
            # First, try plain requests + BeautifulSoup.
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator="\n").strip()
            # If the text seems too short (less than 50 words), assume the
            # content is rendered via JavaScript and retry with a headless browser.
            if len(text.split()) < 50:
                try:
                    text = self._fetch_with_headless_browser(url)
                except Exception as e:
                    # Best-effort: log and fall back to the short static text.
                    print(f"Headless fetch failed: {e}")
            return [{"text": text}]
        if "application/pdf" in content_type:
            if PdfReader is None:
                raise ImportError("PyPDF2 is required to load PDF files")
            reader = PdfReader(BytesIO(response.content))
            text = "".join(page.extract_text() or "" for page in reader.pages)
            return [{"text": text}]
        # Fallback: treat any other content type as plain text.
        return [{"text": response.text}]

    def _fetch_with_headless_browser(self, url: str) -> str:
        """
        Use a headless browser (Playwright) to fetch fully rendered content.

        Raises:
            ImportError: If playwright is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            raise ImportError("playwright is required for JS-rendered pages. Install it with 'pip install playwright' and run 'playwright install'.")
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            # Wait for network idle so JS-populated content is present.
            page.goto(url, wait_until="networkidle")
            html = page.content()
            browser.close()
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(separator="\n").strip()

    def _load_youtube(self, url: str) -> List[Dict[str, Any]]:
        """
        Extract a transcript document from a YouTube URL.

        Returns a single-document list; on transcript failure, returns a
        fallback document describing the error instead of raising.

        Raises:
            ImportError: If youtube_transcript_api is not installed.
            ValueError: If no video ID can be extracted from the URL.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError:
            raise ImportError("youtube_transcript_api is required to load YouTube transcripts")

        # Support both "watch?v=<id>" and "youtu.be/<id>" URL shapes.
        video_id = None
        patterns = [r"v=([^&]+)", r"youtu\.be/([^?&]+)"]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                video_id = match.group(1)
                break
        if not video_id:
            raise ValueError("Could not extract video ID from URL")

        # Define a prioritized list of language codes to try.
        preferred_languages = ["en", "hi", "es", "fr", "de", "ru"]

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=preferred_languages)
            text = " ".join(segment["text"] for segment in transcript)
            return [{"text": text}]
        except Exception as e:
            # Return a fallback document indicating transcript retrieval failed.
            return [{"text": f"Transcript not available for video {url}: {str(e)}"}]
191+
44.1 KB
Binary file not shown.

dist/semantio-0.0.8.tar.gz

33.8 KB
Binary file not shown.

requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,7 @@ duckduckgo-search
1717
yfinance
1818
beautifulsoup4
1919
webdriver-manager
20-
validators
20+
validators
21+
PyPDF2
22+
youtube-transcript-api
23+
pandas

semantio.egg-info/PKG-INFO

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 2.1
22
Name: semantio
3-
Version: 0.0.7
3+
Version: 0.0.8
44
Summary: A powerful SDK for building AI agents
55
Home-page: https://github.com/Syenah/semantio
66
Author: Rakesh

semantio.egg-info/requires.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,6 @@ yfinance
1818
beautifulsoup4
1919
webdriver-manager
2020
validators
21+
PyPDF2
22+
youtube-transcript-api
23+
pandas

0 commit comments

Comments (0)