470 changes: 470 additions & 0 deletions graph_rag/KG_web/KG_web_example.ipynb

Large diffs are not rendered by default.

Binary file added graph_rag/KG_web/Output.png
Binary file added graph_rag/KG_web/OutputNotebook.png
62 changes: 62 additions & 0 deletions graph_rag/KG_web/Scraper.py
@@ -0,0 +1,62 @@
"""
This file Scrapes the documentation from a given website including all sublinks using BeautifulSoup

Functions:
scrape_page: Scrapes page to extract relevant details
scrape_website: A function to recursively scrape all pages starting from the main documentation URL
build_index_from_documents: Builds an index from scraped documents
"""


import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

import time

def scrape_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.get_text(strip=True)
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        return content, links
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None, []

def scrape_website(base_url, max_depth=3):
    """Breadth-first crawl starting from base_url.

    Note: max_depth caps the total number of pages visited, not the link depth.
    """
    visited = set()
    to_visit = [base_url]
    all_documents = []

    while to_visit:
        current_url = to_visit.pop(0)
        # Skip pages already seen and stop collecting once the page limit is reached.
        if current_url in visited or len(visited) >= max_depth:
            continue

        visited.add(current_url)

        content, links = scrape_page(current_url)
        if content:
            document = Document(
                text=content,
                metadata={'url': current_url}
            )
            all_documents.append(document)

        to_visit.extend(links)

        # Be polite to the server between requests.
        time.sleep(1)
    return all_documents

def build_index_from_documents(documents):
    """Configure the embedding model and LLM, then build a vector index from the documents."""
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
    Settings.llm = Ollama(model="llama3", request_timeout=360.0)

    index = VectorStoreIndex.from_documents(documents)
    return index
18 changes: 18 additions & 0 deletions graph_rag/KG_web/build.py
@@ -0,0 +1,18 @@
import Scraper
import pickle

base_url = input("Enter the URL of the documentation to scrape: ")

scraped_documents = Scraper.scrape_website(base_url)

# Show a short preview of each scraped document.
for i, doc in enumerate(scraped_documents, 1):
    print(f"\nDocument {i}")
    print(f"URL: {doc.metadata.get('url', 'No URL found')}")
    print(f"Content Snippet: {doc.text[:200]}...\n")

index = Scraper.build_index_from_documents(scraped_documents)

# Persist the index so it can be reloaded later without re-scraping.
with open("documentation_index.pkl", "wb") as file:
    pickle.dump(index, file)

print("\nIndexing complete. The index has been saved to 'documentation_index.pkl'.")
196 changes: 196 additions & 0 deletions graph_rag/evaluation/README.MD
@@ -0,0 +1,196 @@

# Knowledge Graph Evaluation

This module provides methods to evaluate the performance of GraphRag. The following integrations are available for evaluation:

- **Llama-Index Evaluation Pack**
- **Ragas Evaluation Pack**

Additionally, this module includes scripts for creating custom test datasets to benchmark and evaluate GraphRag.

## Getting Started
This section demonstrates how to use the functions provided in the module:

---

### 1. QA Generation and Critique

This module offers tools to generate question-answer (QA) pairs from input documents using a language model and critique them based on various criteria like groundedness, relevance, and standalone quality.

> #### Generate and Critique QA Pairs

To use this module, follow these steps:

#### 1. Generate QA Pairs

First, we prepare our dataset for generating QA pairs. In this example, we'll use Keras-IO documentation and Llama-Index's `SimpleDirectoryReader` to obtain `Document` objects.

```python
!git clone https://github.com/keras-team/keras-io.git

from llama_index.core import Document, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

def get_data(input_dir="path/to/keras-io/templates"):
    reader = SimpleDirectoryReader(
        input_dir,
        recursive=True,
        exclude=["path/to/keras-io/templates/examples"]
    )
    docs = reader.load_data()

    # Split the raw documents into ~300-token chunks and wrap each chunk
    # back into a Document so QA generation works per chunk.
    splitter = SentenceSplitter(
        chunk_size=300,
        chunk_overlap=20,
    )
    nodes = splitter.get_nodes_from_documents(docs)
    documents = [Document(text=node.text, metadata=node.metadata) for node in nodes]

    return documents

# Load the documents
documents = get_data()
```

Use the `qa_generator` function to generate QA pairs from your input documents.

```python
from evaluation.ragas_evaluation.QA_graphrag_testdataset import qa_generator

N_GENERATIONS = 20

# Generate the QA pairs
qa_pairs = qa_generator(documents, N_GENERATIONS)
```

#### 2. Critique the Generated QA Pairs

Once you have generated the QA pairs, critique them using the `critique_qa` function.

```python
from evaluation.ragas_evaluation.QA_graphrag_testdataset import critique_qa

# Critique the generated QA pairs
critiqued_qa_pairs = critique_qa(qa_pairs)

# The critiqued pairs will include scores and evaluations for groundedness, relevance, and standalone quality
```
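The exact schema returned by `critique_qa` depends on its implementation; assuming each critiqued pair is a dict carrying numeric scores under hypothetical keys such as `groundedness_score`, `relevance_score`, and `standalone_score`, a minimal sketch for keeping only the strongest pairs could look like this:

```python
# Hypothetical post-processing: keep only QA pairs whose critique scores
# clear a minimum threshold. The key names below are assumptions, not the
# module's documented schema.
MIN_SCORE = 3

filtered_qa_pairs = [
    pair
    for pair in critiqued_qa_pairs
    if all(
        pair.get(key, 0) >= MIN_SCORE
        for key in ("groundedness_score", "relevance_score", "standalone_score")
    )
]

print(f"Kept {len(filtered_qa_pairs)} of {len(critiqued_qa_pairs)} QA pairs")
```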

---
### 2. Evaluating Your Knowledge Graph with Llama-Index Evaluator Pack

This section demonstrates how to evaluate the performance of your query engine using the Llama-Index RAG evaluator pack.

> #### Evaluate Your Knowledge Graph with llama-index

To evaluate your query engine, first download a labelled RAG dataset to benchmark against (here, the Paul Graham Essay dataset):
```shell
llamaindex-cli download-llamadataset PaulGrahamEssayDataset --download-dir ./data
```

```python
from evaluation.evaluation_llama_index import evaluate


# Path to your labeled RAG dataset
RAG_DATASET = "./data/rag_dataset.json"

# The judge LLM and embedding model are configured inside `evaluate` itself
# via its `ollama_model` and `embedd_model` arguments (defaults: "llama3" and
# "microsoft/codebert-base"), so they do not need to be constructed here.

# Your query engine instance
from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine

index = get_index_from_pickle("path/to/graphIndex.pkl")
query_engine = get_query_engine(index)

# Evaluate the dataset
evaluation_results = evaluate(RAG_DATASET, query_engine)

# Review the results
print(evaluation_results)
```
| Metrics | RAG | Base RAG |
|------------------------------|------------|-----------|
| **Mean Correctness Score** | 3.340909 | 0.934 |
| **Mean Relevancy Score** | 0.750000 | 4.239 |
| **Mean Faithfulness Score** | 0.386364 | 0.977 |
| **Mean Context Similarity Score** | 0.948765 | 0.977 |



This example shows how to quickly evaluate your query engine's performance using the Llama-Index RAG evaluator pack.
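Since `evaluate` returns the benchmark as a pandas DataFrame, the scores can be persisted for comparison across runs, for example:

```python
# Save the benchmark scores so different index builds can be compared later.
evaluation_results.to_csv("llamaindex_benchmark.csv")
```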


---
### 3. Evaluating Your Knowledge Graph with Ragas backend

You can easily evaluate the performance of your query engine using this module.

> #### Load and Evaluate Your Dataset with ragas

Use the `load_test_dataset` function to load your dataset and directly evaluate it using the `evaluate` function. This method handles all necessary steps, including batching the data.

```python
from evaluation.ragas_evaluation.evaluation_ragas import load_test_dataset, evaluate

# Step 1: Load the dataset from a pickle file
dataset_path = "/content/keras_docs_embedded.pkl"
test_dataset = load_test_dataset(dataset_path)
```

> **Note:** `test_dataset` is a list of Llama-Index `Document` objects.

```python
# Step 2: Define the language model and embedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

llm = Ollama(base_url="http://localhost:11434", model="codellama")
embedding = HuggingFaceEmbedding(model_name="microsoft/codebert-base")

# Step 3: Specify the metrics for evaluation
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

metrics = [faithfulness, answer_relevancy, context_precision, context_recall]

# Step 4: Load the query engine (Llama-Index)
from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine

index = get_index_from_pickle("path/to/graphIndex.pkl")
query_engine = get_query_engine(index)

# Step 5: Evaluate the dataset
evaluation_results = evaluate(
    query_engine=query_engine,
    dataset=test_dataset,
    llm=llm,
    embeddings=embedding,
    metrics=metrics,
    # Default batch size is 4
)
```

**Output:**
```python
{'faithfulness': 0.0333, 'answer_relevancy': 0.9834, 'context_precision': 0.2000, 'context_recall': 0.8048}
```

```python
rdf = evaluation_results.to_pandas()
rdf.to_csv("results.csv", index=False)
```
---
**Detailed Result:**

| question | contexts | answer | ground_truth | faithfulness | answer_relevancy | context_precision | context_recall |
|-----------------------------------------------|---------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|--------------|------------------|-------------------|----------------|
| What is mixed precision in computing? | [Examples GPT-2 text generation Parameter…] | Mixed precision is a technique used to improve… | A combination of different numerical precision… | 0.166667 | 0.981859 | 0.0 | 0.666667 |
| What is the title of the guide discussed in th... | [Available guides… Hyperparameter T…] | The title of the guide discussed in the given… | How to distribute training | 0.000000 | 1.000000 | 0.0 | 1.000000 |
| What is Keras 3? | [No relationships found.] | Keras 3 is a new version of the popular deep l… | A deep learning framework that works with Tensor… | 0.000000 | 0.974711 | 0.0 | 0.500000 |
| What was the percentage boost in StableDiffusion... | [A first example: A MNIST convnet…] | The percentage boost in StableDiffusion traini… | Over 150% | 0.000000 | 0.970565 | 1.0 | 1.000000 |
| What are some examples of pretrained models av... | [No relationships found.] | Some examples of pre-trained models available… | BERT, OPT, Whisper, T5, StableDiffusion, YOLOv8… | 0.000000 | 0.989769 | 0.0 | 0.857143 |
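Because the per-question results are available as a pandas DataFrame (`rdf` above), a short sketch for inspecting weak spots, such as answers with low faithfulness, might look like this:

```python
# Flag answers that are poorly grounded in the retrieved context.
low_faithfulness = rdf[rdf["faithfulness"] < 0.5]
print(f"{len(low_faithfulness)} of {len(rdf)} answers score below 0.5 on faithfulness")
print(low_faithfulness[["question", "faithfulness", "context_recall"]])

# Per-metric averages for a quick overall summary.
print(rdf[["faithfulness", "answer_relevancy", "context_precision", "context_recall"]].mean())
```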





49 changes: 49 additions & 0 deletions graph_rag/evaluation/evaluation_llama_index.py
@@ -0,0 +1,49 @@
"""
This script evaluates a RagDataset using a RagEvaluatorPack, which assesses query engines by benchmarking against
labeled data using LLMs and embeddings.

Functions:
- evaluate: Evaluates the query engine using a labeled RAG dataset and specified models for both the LLM and embeddings.
"""

import asyncio

from llama_index.core.llama_dataset import LabelledRagDataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


def evaluate(
    RAG_DATASET: str,
    query_engine: object,
    ollama_model: str = "llama3",
    embedd_model: str = "microsoft/codebert-base",
):
    """
    Evaluates a RAG dataset by using a query engine and benchmarks it using LLM and embedding models.

    Args:
        RAG_DATASET: Path to the JSON file containing the labeled RAG dataset.
        query_engine: The query engine to evaluate.
        ollama_model: The LLM model to use for evaluation (default: "llama3").
        embedd_model: The Hugging Face embedding model to use for evaluation (default: "microsoft/codebert-base").

    Returns:
        A DataFrame containing the benchmarking results, including LLM calls and evaluations.
    """

    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./rag_evaluator_pack")
    rag_dataset = LabelledRagDataset.from_json(RAG_DATASET)
    rag_evaluator_pack = RagEvaluatorPack(
        rag_dataset=rag_dataset,
        query_engine=query_engine,
        judge_llm=Ollama(base_url="http://localhost:11434", model=ollama_model),
        embed_model=HuggingFaceEmbedding(model_name=embedd_model),
    )
    # The evaluator pack exposes an async runner; drive it to completion here
    # so callers can use this function synchronously.
    benchmark_df = asyncio.run(
        rag_evaluator_pack.arun(
            batch_size=5,  # batches the number of llm calls to make
            sleep_time_in_seconds=1,  # seconds to sleep before making an api call
        )
    )
    return benchmark_df