Skip to content

Commit

Permalink
Commit commited
Browse files Browse the repository at this point in the history
  • Loading branch information
ridiculouswaffle committed Mar 29, 2024
0 parents commit 3773e52
Show file tree
Hide file tree
Showing 8 changed files with 288 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
target/
.lsp/
.cpcache/
.clj-kondo/
drivers/
Empty file added .projectile
Empty file.
7 changes: 7 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Copyright 2024 Dereck Smith

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
64 changes: 64 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Mail Harvester

A Clojure app that scrapes emails and links from any website.

Browsers supported:
* Chrome
* Firefox
* Safari

## How to use?

### Note

If you are using Chrome or Firefox, skip this section.

If you want to use Safari, you must first enable Safari's Remote Automation feature so that this application can control the browser.

For Sonoma:
* Open Safari > Preferences from the menu bar (or use the shortcut Command + ,)
* Go to the Advanced section
* Check the "Show features for web developers" checkbox
* Go to the Developer section
* Check the "Allow Remote Automation" checkbox

For Ventura and below:
* Open Safari > Preferences from the menu bar (or use the shortcut Command + ,)
* Go to the Advanced Section
* Check the "Show Develop menu in menu bar" checkbox
* Click Safari > Develop > Allow Remote Automation from the menu bar

### Setup

Before using this app, you need to install Java from [here](https://adoptium.net).

After you have installed Java, download the latest release from the Releases section on the right side of this page.

After you have downloaded the archive, unzip it and double-click the .jar file to start the app!

# For Developers

## Prerequisites

While running locally, this project expects the drivers to be in a folder named `drivers`. For users' convenience, these drivers are packaged in the Releases, but not in the repository.

If you run it with the `clj` tool, the `drivers` folder should be in the root of the repository. If you run it after compiling it into a JAR, the folder needs to be in the `target` directory (or wherever the JAR is).

The browsers this application supports are:
* Chrome
* Firefox
* Safari

You can download the drivers for them at:
* [Chrome](https://chromedriver.chromium.org/downloads)
* [Firefox](https://github.com/mozilla/geckodriver/releases)
* Safari doesn't need a driver. Check the [note](#note) above

## How to run

* To run from the `clj` tool, use `clj -M -m mail-harvester.core`
* To compile a JAR, use `clj -T:build uber`

# License

This project is licensed under the MIT License.
30 changes: 30 additions & 0 deletions build.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
(ns build
(:require [clojure.tools.build.api :as b]))

;; Where build output goes, and where compiled classes are staged before packaging
(def build-directory "target")
(def jar-content (str build-directory "/resources"))

;; Identity of the artifact being built
(def app-name "mail-harvester")
(def version "1.0.0")

;; Dependency basis computed from the project's deps.edn
(def basis (b/create-basis {:project "deps.edn"}))

;; Full path of the standalone JAR, e.g. target/mail-harvester-1.0.0-standalone.jar
(def uber-file-name
  (str build-directory "/" app-name "-" version "-standalone.jar"))

(defn clean
  "Deletes the build directory and prints a confirmation.
  Takes the (ignored) argument that build tasks are invoked with."
  [_]
  (b/delete {:path build-directory})
  (println (str "Build directory \"" build-directory "\" removed")))

(defn uber
  "Builds the standalone (uber) JAR.

  Steps, in order:
  1. wipe the build directory,
  2. copy README.md next to the JAR,
  3. AOT-compile the sources under `src` into the staging directory,
  4. package the staging directory and all dependencies into the uber JAR
     with `mail-harvester.core` as the main class.

  Takes the (ignored) argument that build tasks are invoked with."
  [_]
  (clean nil)

  ;; b/copy-file expects :src and :target (a file path). The :target-dir key
  ;; belongs to b/copy-dir, so the destination file is spelled out here.
  (b/copy-file {:src "README.md"
                :target (str build-directory "/README.md")})

  (b/compile-clj {:basis basis
                  :src-dirs ["src"]
                  :class-dir jar-content})

  (b/uber {:class-dir jar-content
           :uber-file uber-file-name
           :basis basis
           :main 'mail-harvester.core})

  (println (format "Uber file created: \"%s\"" uber-file-name)))
9 changes: 9 additions & 0 deletions deps.edn
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
;; Project dependency manifest.
;; :paths - directories put on the classpath.
;; NOTE(review): "classes" is listed but nothing visible here creates it - confirm it is still needed.
{:paths ["src" "classes"]
;; Runtime dependencies: GUI (cljfx), browser automation (etaoin),
;; background threads (core.async) and CSV export (data.csv)
:deps {org.clojure/clojure {:mvn/version "1.11.2"}
cljfx/cljfx {:mvn/version "1.8.0"}
etaoin/etaoin {:mvn/version "1.0.40"}
org.clojure/core.async {:mvn/version "1.6.681"}
org.clojure/data.csv {:mvn/version "1.1.0"}}
:aliases {
;; :build - tool alias for the tasks in build.clj (invoked as `clj -T:build <task>`)
:build {:deps {io.github.clojure/tools.build {:mvn/version "0.10.0"}}
:ns-default build}}}
120 changes: 120 additions & 0 deletions src/mail_harvester/core.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
(ns mail-harvester.core
(:gen-class)
(:require [cljfx.api :as fx]
[mail-harvester.scraper :as scraper]
[clojure.core.async :refer [thread]])
(:import [javafx.application Platform]))

;; Single source of truth for the UI: the status line shown to the user,
;; the URL typed into the text field, and the browser picked in the menu
(def *state
  (atom {:browser "Chrome"
         :url ""
         :status "Not in use"}))

;; Separate widgets so that they are easier to read.

(defn url-field
  "Describes a centered row holding a \"URL: \" label and a text field.
  Every change to the text is stored under :url in the app state."
  [_]
  (let [caption {:fx/type :label
                 :text "URL: "}
        input {:fx/type :text-field
               :on-text-changed (fn [text]
                                  (swap! *state assoc :url text))}]
    {:fx/type :h-box
     :spacing 5
     :alignment :center
     :children [caption input]}))

(defn browser-picker
  "Describes a centered row holding a \"Browser: \" label and a choice box.
  The chosen browser name is stored under :browser in the app state."
  [_]
  (let [caption {:fx/type :label
                 :text "Browser: "}
        remember-choice #(swap! *state assoc :browser %)
        menu {:fx/type :choice-box
              :items ["Chrome" "Firefox" "Safari"]
              :value "Chrome"
              :on-value-changed remember-choice}]
    {:fx/type :h-box
     :spacing 5
     :alignment :center
     :children [caption menu]}))

(defn scrape-emails-button
  "Describes the 'Scrape URL for emails' button.
  Clicking it sets the status to \"Scraping\", then scrapes and exports the
  emails on a background thread, updating the status when done or on error."
  [_]
  (let [on-click (fn [_event]
                   ;; Show progress right away
                   (swap! *state assoc :status "Scraping")
                   ;; The scrape runs off the UI thread so the window stays responsive
                   (thread
                     (try
                       (let [emails (scraper/scrape-url (:url @*state)
                                                        (:browser @*state)
                                                        "emails")]
                         ;; Save the result as a CSV file...
                         (scraper/write-to-exports emails "emails")
                         ;; ...then report success
                         (swap! *state assoc :status "Scraping Done!"))
                       (catch Exception ex
                         ;; Keep the failure in an error log
                         (spit "error.txt" ex)
                         ;; And tell the user an error occurred
                         (swap! *state assoc :status "Error! Please file an issue on GitHub")))))]
    {:fx/type :button
     :text "Scrape URL for emails"
     :on-action on-click}))

(defn scrape-links-button
  "Describes the 'Scrape URL for links' button.
  Clicking it sets the status to \"Scraping\", then scrapes and exports the
  links on a background thread. On success the status becomes
  \"Scraping Done!\"; on failure the exception is written to error.txt and the
  status asks the user to file an issue."
  [{}]
  {:fx/type :button
   :on-action (fn [_]
                ;; Let the user know that scraping is going on
                (swap! *state assoc :status "Scraping")
                ;; Run the scraper function in another thread to prevent locking the UI
                (thread (try
                          (let [res (scraper/scrape-url (-> @*state :url)
                                                        (-> @*state :browser)
                                                        "links")]
                            ;; Export it to a CSV
                            (scraper/write-to-exports res "links")
                            ;; ...and let the user know that we have exported it
                            (swap! *state assoc :status "Scraping Done!"))
                          (catch Exception e
                            ;; Write an error log. This used to be a println, which is
                            ;; invisible when the JAR is started by double-clicking it;
                            ;; a file matches what the emails button does.
                            (spit "error.txt" e)
                            ;; And tell the user an error occurred
                            (swap! *state assoc :status "Error! Please file an issue on GitHub")))))
   :text "Scrape URL for links"})

(defn root
  "Describes the application window: the URL field, the browser picker, the
  two scrape buttons and a status line built from the :status key."
  [{:keys [status]}]
  (let [action-prompt {:fx/type :label
                       :text "Choose the action you would like to perform"}
        action-buttons {:fx/type :v-box
                        :alignment :center
                        :spacing 5
                        :children [{:fx/type scrape-links-button}
                                   {:fx/type scrape-emails-button}]}
        status-line {:fx/type :label
                     :text (str "Status: " status)}
        docs-hint {:fx/type :label
                   :text "Visit the documentation for details on usage"}
        layout {:fx/type :v-box
                :padding 15
                :alignment :center
                :spacing 5
                :children [{:fx/type url-field}
                           {:fx/type browser-picker}
                           action-prompt
                           action-buttons
                           status-line
                           docs-hint]}]
    {:fx/type :stage
     :showing true
     :title "Mail Harvester"
     :scene {:fx/type :scene
             :root layout}}))

;; Renderer whose middleware tags the state map with the `root` component,
;; so the whole state becomes the props that `root` is rendered with
(def renderer
  (fx/create-renderer
   :middleware (fx/wrap-map-desc (fn [state]
                                   (assoc state :fx/type root)))))

(defn -main
  "Application entry point: enables implicit exit and mounts the renderer on
  the state atom. Command-line arguments are ignored."
  [& _args]
  ;; Exit the JavaFX runtime once the main window is closed
  (Platform/setImplicitExit true)
  (fx/mount-renderer *state renderer))
53 changes: 53 additions & 0 deletions src/mail_harvester/scraper.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
(ns mail-harvester.scraper
(:require [etaoin.api :as e]
[clojure.java.io :as io]
[clojure.data.csv :as csv]
[clojure.string :as string])
(:import [java.util UUID]))

(defn fetch-emails
  "Collects the addresses of all mailto: links on the page currently loaded
  in `driver`.

  For each matching link the leading \"mailto:\" is removed and anything from
  the first \"?\" onwards (e.g. \"?subject=Hello\") is dropped, so only the
  address part remains. Returns a fully realized sequence of strings, which
  stays usable after the driver session has ended."
  [driver]
  (doall
   (for [anchor (e/query-all driver {:css "a[href^=\"mailto:\"]"})
         :let [href (e/get-element-attr-el driver anchor :href)]]
     (-> href
         ;; Anchored, single replacement: only the prefix is stripped, never a
         ;; "mailto:" that happens to appear later in the value
         (string/replace-first #"^mailto:" "")
         ;; Header fields such as ?subject=... are not part of the address
         (string/split #"\?" 2)
         first))))

(defn fetch-links
  "Collects the href of every link on the page currently loaded in `driver`.

  Only anchors that actually carry an href attribute are queried; anchors
  without one would yield nil and end up as blank rows in the export.
  Returns a fully realized sequence of strings, which stays usable after the
  driver session has ended."
  [driver]
  (doall
   (for [rawlink (e/query-all driver {:css "a[href]"})]
     ;; Return the raw link from the href attribute
     (e/get-element-attr-el driver rawlink :href))))

(defn scrape-url
  "Opens `url` in a headless browser and scrapes it.

  `browser` is one of \"Firefox\", \"Chrome\" or \"Safari\"; any other value
  throws an Exception before a browser is started. Firefox and Chrome use the
  driver binaries under ./drivers.

  `action` is \"emails\" or \"links\" and selects what is collected. An unknown
  action prints a message and yields nil.

  The browser session is always shut down before returning, whether the
  scrape succeeded or threw, so no driver processes are left behind. The
  result is realized before shutdown because the fetch functions need the
  live session."
  [url browser action]
  (let [driver (case browser
                 "Firefox" (e/firefox-headless {:path-driver "./drivers/geckodriver"})
                 "Chrome" (e/chrome-headless {:path-driver "./drivers/chromedriver"})
                 "Safari" (e/safari)
                 (throw (Exception. (str "Browser is not valid: " browser))))]
    (try
      (e/go driver url)
      (case action
        ;; doall: force the (possibly lazy) result while the session is alive
        "emails" (doall (fetch-emails driver))
        "links" (doall (fetch-links driver))
        (println (format "Unknown action \"%s\" ignored" action)))
      (finally
        ;; Ends the WebDriver session and stops the driver process
        (e/quit driver)))))

(defn write-to-exports
  "Writes `links` (emails or links) to a CSV file, one value per row.
  The file is named `<type>-<random UUID>.csv` and is created relative to the
  current working directory."
  [links type]
  (let [file-name (str type "-" (UUID/randomUUID) ".csv")
        ;; write-csv wants a sequence of rows, so each value becomes a one-column row
        rows (map vector links)]
    ;; with-open closes the writer once the rows have been written
    (with-open [out (io/writer file-name)]
      (csv/write-csv out rows))))

0 comments on commit 3773e52

Please sign in to comment.