-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Llama index readers gitbook (#16862)
- Loading branch information
Showing
19 changed files
with
5,651 additions
and
0 deletions.
There are no files selected for viewing
153 changes: 153 additions & 0 deletions
153
llama-index-integrations/readers/llama-index-readers-gitbook/.gitignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
llama_index/_static | ||
.DS_Store | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
etc/ | ||
include/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
share/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
.ruff_cache | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
notebooks/ | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
pyvenv.cfg | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# Jetbrains | ||
.idea | ||
modules/ | ||
*.swp | ||
|
||
# VsCode | ||
.vscode | ||
|
||
# pipenv | ||
Pipfile | ||
Pipfile.lock | ||
|
||
# pyright | ||
pyrightconfig.json |
3 changes: 3 additions & 0 deletions
3
llama-index-integrations/readers/llama-index-readers-gitbook/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
poetry_requirements( | ||
name="poetry", | ||
) |
1 change: 1 addition & 0 deletions
1
llama-index-integrations/readers/llama-index-readers-gitbook/CHANGELOG.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# CHANGELOG |
17 changes: 17 additions & 0 deletions
17
llama-index-integrations/readers/llama-index-readers-gitbook/Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
GIT_ROOT ?= $(shell git rev-parse --show-toplevel) | ||
|
||
help: ## Show all Makefile targets. | ||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' | ||
|
||
format: ## Run code autoformatters (black). | ||
pre-commit install | ||
git ls-files | xargs pre-commit run black --files | ||
|
||
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy | ||
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files | ||
|
||
test: ## Run tests via unittest. | ||
python -m unittest discover tests | ||
|
||
watch-docs: ## Build and watch documentation. | ||
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ |
30 changes: 30 additions & 0 deletions
30
llama-index-integrations/readers/llama-index-readers-gitbook/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# LlamaIndex Readers Integration: Gitbook | ||
|
||
## Overview | ||
|
||
The Simple Gitbook Reader loads data from a Gitbook space. It collects the contents of the space and converts them into documents used by LlamaIndex. | ||
|
||
### Installation | ||
|
||
You can install Gitbook Reader via pip: | ||
|
||
```bash | ||
pip install llama-index-readers-gitbook | ||
``` | ||
|
||
### Usage | ||
|
||
```python | ||
from llama_index.readers.gitbook import SimpleGitbookReader | ||
|
||
# Initialize SimpleGitbookReader | ||
reader = SimpleGitbookReader( | ||
api_token="<Gitbook API Token>", # Gitbook API Token | ||
) | ||
|
||
# load data from Gitbook | ||
documents = reader.load_data( | ||
space_id="<Gitbook Space Id>", # Id of the gitbook space | ||
metadata_names=None, # Names of the fields to add to metadata attribute (available: 'path', 'title', 'description', 'parent') | ||
) | ||
``` |
1 change: 1 addition & 0 deletions
1
...-index-integrations/readers/llama-index-readers-gitbook/llama_index/readers/gitbook/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
python_sources() |
3 changes: 3 additions & 0 deletions
3
...-integrations/readers/llama-index-readers-gitbook/llama_index/readers/gitbook/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from llama_index.readers.gitbook.base import SimpleGitbookReader | ||
|
||
__all__ = ["SimpleGitbookReader"] |
110 changes: 110 additions & 0 deletions
110
...ndex-integrations/readers/llama-index-readers-gitbook/llama_index/readers/gitbook/base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from typing import List, Optional | ||
|
||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
|
||
from llama_index.readers.gitbook.gitbook_client import GitbookClient | ||
|
||
VALID_METADATA_FIELDS = {"path", "title", "description", "parent"} | ||
|
||
|
||
class SimpleGitbookReader(BaseReader):
    """Simple gitbook reader.

    Convert each gitbook page into a Document used by LlamaIndex.

    Args:
        api_token (str): Gitbook API Token.
        api_url (Optional[str]): Gitbook API Endpoint. Defaults to None; the
            value is forwarded to GitbookClient as-is.
    """

    def __init__(self, api_token: str, api_url: Optional[str] = None) -> None:
        """Initialize with parameters."""
        self.client = GitbookClient(api_token, api_url)

    def load_data(
        self,
        space_id: str,
        metadata_names: Optional[List[str]] = None,
        show_progress: bool = False,
    ) -> List[Document]:
        """Load the pages of a gitbook space as documents.

        Args:
            space_id (str): Gitbook space id
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document.
                only 'path', 'title', 'description', 'parent' are available
                Defaults to None, in which case only 'path' is stored.
            show_progress (bool, optional): Show progress bar. Defaults to False

        Returns:
            List[Document]: A list of documents, one per page that has content.

        Raises:
            ValueError: If metadata_names contains a field that is not one of
                the available metadata fields.
        """
        if metadata_names:
            invalid_fields = set(metadata_names) - VALID_METADATA_FIELDS
            if invalid_fields:
                # Sort so the error message is deterministic (set order is not).
                raise ValueError(
                    f"Invalid metadata fields: {', '.join(sorted(invalid_fields))}"
                )

        pages = self.client.list_pages(space_id)

        if show_progress:
            # Imported lazily so tqdm is only needed when a progress bar is requested.
            from tqdm import tqdm

            iterator = tqdm(pages, desc="Downloading pages")
        else:
            iterator = pages

        documents = []
        for page in iterator:
            # NOTE(review): pages are accessed with .get(), so they are presumably
            # dicts returned by the Gitbook API -- confirm against GitbookClient.
            page_id = page.get("id")
            content = self.client.get_page_markdown(space_id, page_id)
            if not content:
                print(f"Warning: No content found for page ID {page_id}. Skipping...")
                continue

            if metadata_names is None:
                metadata = {"path": page.get("path")}
            else:
                # .get() yields None for a missing field instead of raising, so
                # no KeyError handling is needed here; field names were
                # validated above.
                metadata = {name: page.get(name) for name in metadata_names}

            documents.append(Document(text=content, id_=page_id, metadata=metadata))

        return documents
|
||
|
||
if __name__ == "__main__":
    import os
    import sys

    def load_env_file():
        """Load environment variables from .env file.

        Reads KEY=VALUE lines from the .env file located three directories
        above this module and copies them into os.environ. Blank lines,
        lines starting with '#', and lines without an '=' are ignored.
        Does nothing when the file does not exist.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        env_path = os.path.join(current_dir, "../../../.env")
        if not os.path.exists(env_path):
            return

        with open(env_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                # partition() never raises; a line without '=' is skipped
                # instead of crashing on tuple unpacking.
                key, sep, value = line.partition("=")
                if not sep:
                    continue
                os.environ[key.strip()] = value.strip()

    load_env_file()
    api_token = os.getenv("GITBOOK_API_TOKEN")
    space_id = os.getenv("GITBOOK_SPACE_ID")

    if not api_token or not space_id:
        print("Error: GITBOOK_API_TOKEN and GITBOOK_SPACE_ID must be set in .env file")
        sys.exit(1)

    # Manual smoke test: download every page of the configured space and print the result.
    reader = SimpleGitbookReader(api_token)
    print(reader.load_data(space_id, show_progress=True))
Oops, something went wrong.