# pinboard-to-kindle.recipe

#!/usr/bin/env python

# To test this recipe run:
# echo "PINBOARD_TOKEN=..." > config.env
# eval $(egrep -v "^#" config.env | xargs) ebook-convert pinboard-to-kindle.recipe pinboard.epub

__license__ = "MIT"
__copyright__ = "2020, Christian Hans"
__website__ = "https://github.com/christianhans/pinboard-to-kindle"

#
# Config
#
## Maximum number of articles to put into the ebook. Bookmarks whose page
## could not be fetched are skipped and do not count towards this limit.
MAX_ARTICLES = 15

## If set, the recipe will only consider unread Pinboard bookmarks that have this tag.
## If empty, no tag filter is requested and bookmark tags are never updated.
KINDLE_TO_TAG = "kindle-to"

## If set (and KINDLE_TO_TAG is set as well), the recipe will update every
## successfully fetched Pinboard bookmark: KINDLE_TO_TAG is removed and this
## tag is added.
KINDLE_SENT_TAG = "kindle-sent"

## File path to fetch-article-moz-readability/index.js, relative to the
## directory of the recipe file that is passed to ebook-convert
FETCH_ARTICLE_MOZ_READABILITY_RELATIVE_PATH = "fetch-article-moz-readability/index.js"

## Temporary folder the downloaded page content of each bookmark is written to
## (one uniquely named .html file per bookmark, removed again during cleanup)
FETCH_ARTICLE_TMP_PATH = "/tmp"

## Node binary used to run the fetch-article-moz-readability script
NODE_BIN = "node"

import os
import sys
import re
import json
import subprocess
import uuid

from urllib.parse import urlencode
from urllib.request import urlopen

class PinboardRecipe(BasicNewsRecipe):
    """Calibre recipe that builds an ebook from unread Pinboard bookmarks.

    Each bookmark is rendered to a local HTML file by the external
    fetch-article-moz-readability Node script. The local files are handed to
    Calibre through get_obfuscated_article() and deleted again in cleanup().
    """

    title = "Pinboard"
    description = "Generate an ebook of unread Pinboard bookmarks."
    __author__ = "Christian Hans"

    auto_cleanup = False
    no_stylesheets = True
    remove_javascript = True
    articles_are_obfuscated = True
    encoding = "utf8"

    # Maps a bookmark URL to the path of its locally downloaded HTML file.
    downloaded_file_paths = {}

    def _get_pinboard_token(self):
        """Return the Pinboard API token from the PINBOARD_TOKEN env variable.

        Aborts recipe processing when the variable is missing or empty.
        """
        pinboard_token = os.environ.get("PINBOARD_TOKEN")
        if not pinboard_token:
            self.abort_recipe_processing(
                "Please set PINBOARD_TOKEN environment variable."
            )
        return pinboard_token

    def _get_bookmarks(self):
        """Fetch the unread Pinboard bookmarks, filtered by KINDLE_TO_TAG if set.

        Returns:
            A non-empty list of bookmark dicts as returned by the Pinboard API.
            Recipe processing is aborted when there is no unread bookmark.
        """
        params = urlencode(
            {
                "tag": KINDLE_TO_TAG or "",
                "format": "json",
                "auth_token": self._get_pinboard_token(),
            }
        )
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen("https://api.pinboard.in/v1/posts/all?" + params) as response:
            payload = response.read()
        # "utf-8-sig" tolerates a leading byte order mark in the response body.
        bookmarks = json.loads(payload.decode("utf-8-sig"))
        unread = [b for b in bookmarks if b.get("toread") == "yes"]
        if not unread:
            self.abort_recipe_processing("No unread Pinboard bookmarks.")
        return unread

    def _mark_bookmark_as_sent(self, bookmark):
        """Replace KINDLE_TO_TAG with KINDLE_SENT_TAG on the given bookmark.

        Does nothing unless both tags are configured. All other bookmark
        fields are sent back unchanged because the bookmark is replaced as a
        whole ("replace": "yes").
        """
        if not (KINDLE_TO_TAG and KINDLE_SENT_TAG):
            return
        print("Updating Pinboard tags: {}".format(bookmark["href"]))
        tags = bookmark["tags"].split()
        if KINDLE_TO_TAG in tags:
            tags.remove(KINDLE_TO_TAG)
        if KINDLE_SENT_TAG not in tags:
            tags.append(KINDLE_SENT_TAG)
        params = urlencode(
            {
                "url": bookmark["href"],
                "description": bookmark["description"],
                "extended": bookmark["extended"],
                "tags": " ".join(tags),
                "dt": bookmark["time"],
                "shared": bookmark["shared"],
                "toread": bookmark["toread"],
                "replace": "yes",
                "format": "json",
                "auth_token": self._get_pinboard_token(),
            }
        )
        # The response body is not inspected; only make sure it gets closed.
        with urlopen("https://api.pinboard.in/v1/posts/add?" + params):
            pass

    def _fetch_article_moz_readability(self, url):
        """Download url into a temporary HTML file via the Node script.

        Returns:
            The path of the HTML file the script was asked to write.

        Raises:
            subprocess.CalledProcessError: the Node script exited with a
                non-zero status.
            OSError: the Node binary could not be started.
        """
        downloaded_file_path = os.path.join(
            FETCH_ARTICLE_TMP_PATH, "{}.html".format(uuid.uuid1())
        )
        # Register the path before running the script so that cleanup() also
        # removes a file left behind by a failed download.
        self.downloaded_file_paths[url] = downloaded_file_path

        print("Downloading: {}".format(url))

        # check_call() turns a failing script into an exception instead of
        # silently ignoring its exit status.
        subprocess.check_call(
            [
                NODE_BIN,
                "--unhandled-rejections=strict",
                self._get_fetch_article_moz_readability_script_path(),
                url,
                "--output_file={}".format(downloaded_file_path),
            ]
        )

        return downloaded_file_path

    def _get_fetch_article_moz_readability_script_path(self):
        """Return the path of the Node script, resolved next to the recipe file.

        The recipe location is derived from the ebook-convert command line
        (sys.argv[1]) and the PWD environment variable. Recipe processing is
        aborted when either is unavailable.
        """
        if len(sys.argv) < 3 or "ebook-convert" not in sys.argv[0]:
            self.abort_recipe_processing(
                "Please invoke this Calibre recipe via: "
                "ebook-convert /path/to/pinboard-to-kindle.recipe output_file [options]"
            )
        if "PWD" not in os.environ:
            self.abort_recipe_processing("PWD environment variable not set.")
        recipe_base_path = os.path.normpath(
            os.path.join(os.environ["PWD"], os.path.dirname(sys.argv[1]))
        )
        return os.path.join(recipe_base_path, FETCH_ARTICLE_MOZ_READABILITY_RELATIVE_PATH)

    def _get_article_metadata(self, article_file_path):
        """Extract title and metadata line from a downloaded article file.

        Returns:
            A dict that contains "title" and/or "metadata" for each marker
            element found in the file; missing markers are simply left out.
        """
        res = {}
        # Read as UTF-8 explicitly (matching the recipe's declared encoding)
        # rather than depending on the locale's default encoding.
        with open(article_file_path, "r", encoding="utf-8") as f:
            html = f.read()
        match = re.search(
            r'<[^>]+ class="pb-to-kindle-article-title">(.*?)</[^>]+>',
            html,
            re.IGNORECASE,
        )
        if match:
            res["title"] = match.group(1)
        match = re.search(
            r'<[^>]+ class="pb-to-kindle-article-metadata">(.*?)</[^>]+>',
            html,
            re.IGNORECASE,
        )
        if match:
            res["metadata"] = match.group(1)
        return res

    def get_obfuscated_article(self, url):
        """ Let Calibre download images and other media in downloaded html files """
        return self.downloaded_file_paths.get(url)

    def parse_index(self):
        """Download the bookmarked pages and build the article index.

        At most MAX_ARTICLES articles are collected; bookmarks that fail to
        download are skipped. Every collected bookmark is marked as sent.

        Returns:
            A list with a single ("Pinboard", articles) tuple, where each
            article is a dict with "title", "url" and "description" keys.
        """
        bookmarks = self._get_bookmarks()

        # Validate the invocation once, outside the per-article error handling
        # below, so that a misconfiguration aborts with its own message
        # instead of being reported as a failed download for every bookmark.
        self._get_fetch_article_moz_readability_script_path()

        articles = []
        for bookmark in bookmarks:
            if len(articles) >= MAX_ARTICLES:
                break

            url = bookmark["href"]
            try:
                article_file_path = self._fetch_article_moz_readability(url)
                article_info = self._get_article_metadata(article_file_path)
            except Exception as err:
                # Best effort: a single broken page must not abort the whole
                # ebook. KeyboardInterrupt/SystemExit are deliberately not
                # caught so the run can still be cancelled.
                print("Error fetching URL: {}".format(url))
                print("Reason: {!r}".format(err))
                continue

            self._mark_bookmark_as_sent(bookmark)

            articles.append(
                {
                    "title": article_info.get("title", ""),
                    "url": url,
                    "description": article_info.get("metadata", ""),
                }
            )

        if not articles:
            self.abort_recipe_processing("No articles were fetched.")

        return [("Pinboard", articles)]

    def cleanup(self):
        """Delete every temporary article file that still exists on disk."""
        for file_path in self.downloaded_file_paths.values():
            if not os.path.exists(file_path):
                continue
            print("Removing temporary file: {}".format(file_path))
            os.remove(file_path)