Improve performance and precision on movies AI

vintasoftware · Oct 2, 2024 · 854c952 · 854c952
1 parent c98970d
commit 854c952
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 25 deletions.
diff --git a/example/README.md b/example/README.md
@@ -34,6 +34,7 @@ Fill the `.env` file with the necessary API keys. You'll need accounts on:
 - [OpenAI](https://platform.openai.com/)
 - [Weather API](https://www.weatherapi.com/)
 - [Brave Search API](https://brave.com/search/api/)
+- [Jina Reader API](https://jina.ai/)
 
 Activate the poetry shell:
 

diff --git a/example/example/settings.py b/example/example/settings.py
@@ -180,6 +180,7 @@
 # Example specific settings:
 
 WEATHER_API_KEY = os.getenv("WEATHER_API_KEY")  # get for free at https://www.weatherapi.com/
+JINA_API_KEY = os.getenv("JINA_API_KEY")  # get for free at https://jina.ai/
 BRAVE_SEARCH_API_KEY = os.getenv(
     "BRAVE_SEARCH_API_KEY"
 )  # get for free at https://brave.com/search/api/

diff --git a/example/movies/ai_assistants.py b/example/movies/ai_assistants.py
@@ -1,3 +1,5 @@
+import threading
+import time
 from typing import Sequence
 
 from django.conf import settings
@@ -8,16 +10,30 @@
 import requests
 from langchain_community.tools import BraveSearch
 from langchain_core.tools import BaseTool
-from pydantic import BaseModel
 
 from django_ai_assistant import AIAssistant, method_tool
 from movies.models import MovieBacklogItem
 
 
-class IMDbMovie(BaseModel):
-    imdb_url: str
-    imdb_rating: float
-    scrapped_imdb_page_markdown: str
+brave_search_lock = threading.Lock()
+
+
+class RateLimitedBraveSearch(BraveSearch):
+    """BraveSearch tool whose calls are serialized through `brave_search_lock`."""
+
+    def _run(self, query: str, **kwargs) -> str:
+        """Run the Brave search for `query`, spacing calls at least ~1 second apart.
+
+        Waits up to 10 seconds for `brave_search_lock`. If the wait times out,
+        the search still runs (best effort), but the lock is NOT released by
+        this call, since this thread never owned it.
+
+        `kwargs` is accepted for interface compatibility and is not used.
+        Returns whatever `self.search_wrapper.run(query)` returns.
+        """
+
+        # brave_search_lock is necessary to ensure 1 request/second,
+        # due to free plan limitations of Brave Search API.
+        # acquire() returns False on timeout; remember the outcome so the
+        # `finally` below never releases a lock held by another thread
+        # (or raises RuntimeError by releasing an unlocked lock).
+        lock_acquired = brave_search_lock.acquire(timeout=10)
+        try:
+            # Monotonic clock: elapsed time is immune to system clock changes.
+            start_time = time.monotonic()
+            result = self.search_wrapper.run(query)
+            elapsed_time = time.monotonic() - start_time
+            if elapsed_time < 1:
+                # Pad the call to 1 second, plus a fixed 0.2s safety margin.
+                time.sleep(1 - elapsed_time + 0.2)
+            return result
+        finally:
+            if lock_acquired:
+                brave_search_lock.release()
 
 
 # Note this assistant is not registered, but we'll use it as a tool on the other.
@@ -28,31 +44,38 @@ class IMDbScraper(AIAssistant):
         "You're a function to find the IMDb URL of a given movie, "
         "and scrape this URL to get the movie rating and other information.\n"
         "Use the search function to find the IMDb URL. "
-        "Make search queries like: \n"
-        "- IMDb page of The Matrix\n"
-        "- IMDb page of The Godfather\n"
-        "- IMDb page of The Shawshank Redemption\n"
-        "Then check results, scape the IMDb URL, process the page, and produce a JSON output."
+        "Make search queries like:\n"
+        "- IMDb page of <queried movie here>\n"
+        "Then check results, scrape the IMDb URL, process the page, and produce an output like this: \n"
+        "- IMDb URL: ...\n"
+        "- IMDb Rating: ...\n"
+        "- IMDb Page: <Markdown content of the IMDb page>"
     )
     name = "IMDb Scraper"
     model = "gpt-4o-mini"
-    structured_output = IMDbMovie
+    tool_max_concurrency = 4
 
     def get_instructions(self):
         # Warning: this will use the server's timezone
         # See: https://docs.djangoproject.com/en/5.0/topics/i18n/timezones/#default-time-zone-and-current-time-zone
         # In a real application, you should use the user's timezone
         current_date_str = timezone.now().date().isoformat()
-        return f"{self.instructions} Today is: {current_date_str}."
+        return f"{self.instructions}.\n Today is: {current_date_str}."
 
     @method_tool
     def scrape_imdb_url(self, url: str) -> str:
-        """Scrape the IMDb URL and return the content as markdown."""
-        return requests.get("https://r.jina.ai/" + url, timeout=20).text[:10000]
+        """Scrape the IMDb URL and return the content as Markdown."""
+        return requests.get(
+            "https://r.jina.ai/" + url,
+            headers={
+                "Authorization": "Bearer " + settings.JINA_API_KEY,
+            },
+            timeout=20,
+        ).text[:30000]
 
     def get_tools(self) -> Sequence[BaseTool]:
         return [
-            BraveSearch.from_api_key(
+            RateLimitedBraveSearch.from_api_key(
                 api_key=settings.BRAVE_SEARCH_API_KEY, search_kwargs={"count": 5}
             ),
             *super().get_tools(),
@@ -63,18 +86,23 @@ class MovieRecommendationAIAssistant(AIAssistant):
     id = "movie_recommendation_assistant"  # noqa: A003
     instructions = (
         "You're a helpful movie recommendation assistant. "
-        "Help the user find movies to watch and manage their movie backlogs. "
-        "Use the provided functions to answer questions and run operations.\n"
+        "Use the provided functions to answer queries and run operations.\n"
+        "Use the search function to find movie recommendations based on user's query.\n"
+        "Then, use the IMDb Scraper to get the IMDb URL and rating of the movies you're recommending. "
+        "Both the IMDb URL and rating are necessary to add a movie to the user's backlog. "
         "Note the backlog is stored in a DB. "
-        "When managing the backlog, you must call the functions, to keep the sync with the DB. "
+        "When managing the backlog, you must call the functions, to keep your answers in sync with the DB. "
         "The backlog has an order, and you should respect it. Call `reorder_backlog` when necessary.\n"
-        "Include the IMDb URL and rating of the movies when displaying the backlog. "
-        "You must use the IMDb Scraper to get the IMDb URL and rating of the movies. \n"
-        "Ask the user if they want to add your recommended movies to their backlog, "
-        "but only if the movie is not on the user's backlog yet."
+        "When showing the backlog, show the movies in the order they are stored in the DB, "
+        "and include the IMDb URL and rating.\n"
+        "Ask the user if they want to add your recommended movies to their backlog.\n"
+        "User may talk to you in any language. Respond with the same language, "
+        "but refer to movies and call functions with their English name.\n"
+        "Do not include images in your response."
     )
     name = "Movie Recommendation Assistant"
     model = "gpt-4o-mini"
+    tool_max_concurrency = 4
 
     def get_instructions(self):
         # Warning: this will use the server's timezone
@@ -93,10 +121,14 @@ def get_instructions(self):
 
     def get_tools(self) -> Sequence[BaseTool]:
         return [
-            BraveSearch.from_api_key(
+            RateLimitedBraveSearch.from_api_key(
                 api_key=settings.BRAVE_SEARCH_API_KEY, search_kwargs={"count": 5}
             ),
-            IMDbScraper().as_tool(description="IMDb Scraper to get the IMDb data a given movie."),
+            IMDbScraper().as_tool(
+                description="IMDb Scraper to get the IMDb data a given movie. "
+                "Given a movie name (in English), "
+                "finds the movie URL, rating, and scrapes the IMDb page (as Markdown)."
+            ),
             *super().get_tools(),
         ]
 
@@ -116,7 +148,10 @@ def get_movies_backlog(self) -> str:
 
     @method_tool
     def add_movie_to_backlog(self, movie_name: str, imdb_url: str, imdb_rating: float) -> str:
-        """Add a movie to user's backlog. Must pass the movie_name, imdb_url, and imdb_rating."""
+        """
+        Add a movie to user's backlog. Must pass the movie_name, imdb_url, and imdb_rating.
+        Set imdb_rating to 0.0 if not available.
+        """
 
         with transaction.atomic():
             MovieBacklogItem.objects.update_or_create(