Skip to content

Commit

Permalink
Added functionality for Google Colab use case in Crawler Module (#1436)
Browse files Browse the repository at this point in the history
* Added functionality for Google Colab use case

* Corrected typo in installation guide of driver

* Corrected typo in installation guide of driver

* Corrected the copy command
  • Loading branch information
prikmm authored Sep 13, 2021
1 parent b53ad7a commit 389f6b6
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion haystack/connector/crawler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import json
import logging
import re
Expand Down Expand Up @@ -39,6 +40,8 @@ def __init__(self, output_dir: str, urls: Optional[List[str]] = None, crawler_de
All URLs not matching at least one of the regular expressions will be dropped.
:param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
"""
IN_COLAB = "google.colab" in sys.modules

try:
from webdriver_manager.chrome import ChromeDriverManager
except ImportError:
Expand All @@ -53,7 +56,23 @@ def __init__(self, output_dir: str, urls: Optional[List[str]] = None, crawler_de

options = webdriver.chrome.options.Options()
options.add_argument('--headless')
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
if IN_COLAB:
try:
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
self.driver = webdriver.Chrome('chromedriver', options=options)
except :
raise Exception(
"""
\'chromium-driver\' needs to be installed manually when running colab. Follow the below given commands:
!apt-get update
!apt install chromium-driver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
If it has already been installed, please check if it has been copied to the right directory i.e. to \'/usr/bin\'"""
)
else:
logger.info("'chrome-driver' will be automatically installed.")
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
self.urls = urls
self.output_dir = output_dir
self.crawler_depth = crawler_depth
Expand Down

0 comments on commit 389f6b6

Please sign in to comment.