# net.py

# Copyright (C) IBM Corporation 2008
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import os
import shutil
import urllib
import logging
from gettext import gettext as _

from sugar3.activity.activity import get_bundle_path

import book
from infoslicer.processing.NewtifulSoup import NewtifulStoneSoup \
        as BeautifulStoneSoup
from infoslicer.processing.MediaWiki_Parser import MediaWiki_Parser
from infoslicer.processing.MediaWiki_Helper import MediaWiki_Helper
from infoslicer.processing.MediaWiki_Helper import PageNotFoundError

# general debug output of this module
logger = logging.getLogger('infoslicer')
# separate logger used when reporting caught exceptions
elogger = logging.getLogger('infoslicer::except')

# proxy settings used by _open_url(); stays None unless the proxy
# configuration read at the bottom of this file yields at least one entry
proxies = None

def download_wiki_article(title, wiki, progress):
    """
        Fetches the article called title from wiki, parses it and hands
        the parsed contents to book.wiki.create(), reporting each stage
        through progress.
        Errors are not propagated: they are logged to the exception
        logger and shown to the user as a progress label instead.
        @param title: title of the article to fetch
        @param wiki: wiki identifier passed to getArticleAsHTMLByTitle();
                     also appended to the name the article is stored under
        @param progress: object whose set_label() receives status messages
    """
    def report(template):
        # every status message takes the article title as its only argument
        progress.set_label(template % title)

    try:
        report(_('"%s" download in progress...'))
        helper = MediaWiki_Helper()
        html, source_url = helper.getArticleAsHTMLByTitle(title, wiki)

        report(_('Processing "%s"...'))
        parsed = MediaWiki_Parser(html, title, source_url).parse()

        report(_('Downloading "%s" images...'))
        stored_title = title + _(' (from %s)') % wiki
        book.wiki.create(stored_title, parsed)

        report(_('"%s" successfully downloaded'))

    except PageNotFoundError as err:
        elogger.debug('download_and_add: %s' % err)
        report(_('"%s" could not be found'))

    except Exception as err:
        # any other failure is reported to the user as a connection problem
        elogger.debug('download_and_add: %s' % err)
        report(_('Error downloading "%s"; check your connection'))

def image_handler(root, uid, document):
    """
        Takes a DITA article and downloads images referenced in it
        (finding all <image> tags).
        Attempts to fix incomplete paths using source url.
        Images whose download fails are removed from the document and
        leave no file behind.
        @param root: base directory; stored hrefs are made relative to it
        @param uid: name of the article's directory inside root
        @param document: DITA to work on
        @return: The document with image tags adjusted to point to local paths
    """
    document = BeautifulStoneSoup(document)
    dir_path = os.path.join(root, uid, "images")

    logger.debug('image_handler: %s' % dir_path)

    if not os.path.exists(dir_path):
        os.makedirs(dir_path, 0o777)

    # image directory expressed relative to root, used for the rewritten hrefs
    relative_dir = dir_path.replace(os.path.join(root, ""), "", 1)

    for image in document.findAll("image"):
        fail = False
        path = image['href']
        if "#DEMOLIBRARY#" in path:
            # bundled example image: copy it from the activity bundle
            path = path.replace("#DEMOLIBRARY#",
                    os.path.join(get_bundle_path(), 'examples'))
            image_title = os.path.split(path)[1]
            shutil.copyfile(path, os.path.join(dir_path, image_title))
        else:
            image_title = path.rsplit("/", 1)[-1]
            # attempt to fix incomplete paths; http and https URLs are
            # already complete and must not be glued onto the source url
            source = document.source
            is_complete = path.startswith(("http://", "https://"))
            if (not is_complete and source is not None
                    and source.has_key("href")):
                if path.startswith("//"):
                    # protocol-relative URL: only the scheme is missing
                    path = 'http:' + path
                elif path.startswith("/"):
                    path = source['href'].rsplit("/", 1)[0] + path
                else:
                    path = source['href'].rsplit("/", 1)[0] + "/" + path
            logger.debug("Retrieving image: " + path)
            # download first and create the file only on success, so a
            # failed download does not leave an empty image file on disk
            image_contents = _open_url(path)
            if image_contents is None:
                fail = True
            else:
                target = os.path.join(dir_path, image_title)
                with open(target, 'wb') as image_file:
                    image_file.write(image_contents)
        if fail:
            # drop images that could not be retrieved
            image.extract()
        else:
            # change to relative paths, remembering where the image came from
            image['href'] = os.path.join(relative_dir, image_title)
            image['orig_href'] = path

    return document.prettify()

def _open_url(url):
    """
        Retrieves content from specified url, going through the
        configured proxies (if any) and identifying itself with the
        User-Agent of _new_url_opener.
        @param url: address to fetch
        @return: the content read from url, or None if an IOError occurred
    """
    # kept from the original behaviour: plain urllib.urlopen() calls made
    # without a proxies argument pick up this opener
    urllib._urlopener = _new_url_opener()
    try:
        logger.debug("opening " + url)
        logger.debug("proxies: " + str(proxies))
        # urllib.urlopen(url, proxies=...) builds a plain FancyURLopener
        # whenever proxies is not None, which silently drops our custom
        # User-Agent; use our own opener directly so both apply.
        # proxies=None makes the opener fall back to the environment.
        opener = _new_url_opener(proxies=proxies)
        doc = opener.open(url)
        try:
            output = doc.read()
        finally:
            # release the connection even if reading fails
            doc.close()
        logger.debug("url opened succesfully")
        return output
    except IOError as e:
        elogger.debug('_open_url: %s' % e)
        return None

class _new_url_opener(urllib.FancyURLopener):
    """
        FancyURLopener that sends a browser-like User-Agent (the
        "version" attribute) instead of urllib's default one.
    """
    # the trailing space keeps "rv:1.9.1b2)" and "Gecko/..." from being
    # fused together when the two literals are concatenated
    version = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1b2) " \
              "Gecko/20081218 Gentoo Iceweasel/3.1b2"

# http proxy
#
# Optional proxy settings are read from a "proxy.cfg" file located one
# directory above the one containing this module. Each line is expected to
# look like "<name>: <value>"; the resulting mapping replaces the module-level
# "proxies" when it is not empty.

def _parse_proxy_lines(lines):
    """
        Builds a proxy mapping from the lines of a proxy configuration.
        Each line is split at its first ':'; surrounding whitespace is
        stripped from both halves. Lines without a ':' (blank lines
        included) are skipped so that they cannot break module import.
        @param lines: iterable of text lines
        @return: dict mapping the text before ':' to the text after it
    """
    parsed = {}
    for line in lines:
        name, separator, value = line.partition(':')
        if not separator:
            continue
        parsed[name.strip()] = value.strip()
    return parsed

_proxy_file = os.path.join(os.path.split(os.path.split(__file__)[0])[0],
        'proxy.cfg')
_proxylist = {}

if os.access(_proxy_file, os.F_OK):
    # the with-block closes the file even if reading fails
    with open(_proxy_file, "r") as proxy_file_handle:
        _proxylist = _parse_proxy_lines(proxy_file_handle)

if _proxylist:
    proxies = _proxylist