# Install the tesseract-ocr and poppler-utils system packages with apt (requires root via sudo).
# NOTE(review): presumably these are the OCR / PDF tooling that lemonpdf relies on at runtime — confirm.
sudo apt install tesseract-ocr poppler-utils
# Install the lemonpdf Python package with pip.
pip install lemonpdf
# Command-line usage of lemonpdf on file.pdf.
# NOTE(review): flag meanings below are inferred from the example file names, not
# from the tool's documentation — confirm against the CLI help before relying on them.

# -u: presumably extracts URLs from the PDF (the next example writes to urls.txt).
lemonpdf -u file.pdf
# Same run with -o urls.txt and -s: presumably -o names the output text file and
# -s saves the results to it — confirm.
lemonpdf -u file.pdf -o urls.txt -s
# -d: presumably extracts domains from the PDF (the next example writes to domains.txt).
lemonpdf -d file.pdf
# Same run with -o domains.txt and -s (see the note on -o / -s above the urls.txt command).
lemonpdf -d file.pdf -o domains.txt -s
# Example: use lemonpdf's Extractor from Python to pull URLs out of a PDF.
from lemonpdf import Extractor

# Input PDF and the text file handed to the extractor as its output path.
pdf_path, output_txt_path = 'file.pdf', 'out_file.txt'

extractor = Extractor(
    pdf_path=pdf_path,
    output_txt_path=output_txt_path,
)

# NOTE(review): save=True presumably also writes the result to output_txt_path — confirm.
urls = extractor.extract_urls(save=True)
print(urls)
# Example: use lemonpdf's Extractor from Python to pull domains out of a PDF.
from lemonpdf import Extractor

# Input PDF and the text file handed to the extractor as its output path.
pdf_path = 'file.pdf'
output_txt_path = 'domains.txt'

extractor = Extractor(pdf_path=pdf_path, output_txt_path=output_txt_path)

# The result is bound to `domains` (not `urls`): it comes from extract_domains,
# and a separate name avoids overwriting a URL list from an earlier snippet.
# NOTE(review): save=True presumably also writes the result to output_txt_path — confirm.
domains = extractor.extract_domains(save=True)
print(domains)