Skip to content

Commit 4d4b31d

Browse files
committed
Make URL part joining aware of absolute URLs.
Previously, with RELATIVE_URLS disabled, when both SITEURL and STATIC_URL were absolute, the final generate data URLs looked wrong like this (two absolute URLs joined by `/`): http://your.site/http://static.your.site/image.png With this patch, the data URLs are correctly: http://static.your.site/image.png This also applies to all *_URL configuration options (for example, ability to have pages and articles on different domains) and behaves like one expects even with URLs starting with just `//`, thanks to making use of urllib.parse.urljoin(). However, when RELATIVE_URLS are enabled, urllib.parse.urljoin() doesn't handle the relative base correctly. In that case, simple os.path.join() is used. That, however, breaks the above case, but as RELATIVE_URLS are meant for local development (thus no data scattered across multiple domains), I don't see any problem. Just to clarify, this is a fully backwards-compatible change, it only enables new use cases that were impossible before.
1 parent a9bdcdf commit 4d4b31d

File tree

2 files changed

+73
-6
lines changed

2 files changed

+73
-6
lines changed

pelican/contents.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pytz
1212

1313
import six
14-
from six.moves.urllib.parse import urlparse, urlunparse
14+
from six.moves.urllib.parse import urljoin, urlparse, urlunparse
1515

1616
from pelican import signals
1717
from pelican.settings import DEFAULT_CONFIG
@@ -234,6 +234,25 @@ def _link_replacer(self, siteurl, m):
234234
path = value.path
235235
origin = m.group('path')
236236

237+
# urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
238+
# so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
239+
# properly get `../a.html`. However, os.path.join() produces
240+
# `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
241+
# instead of correct "http://foo/bar.html", so one has to pick a side
242+
# as there is no silver bullet.
243+
if self.settings['RELATIVE_URLS']:
244+
joiner = os.path.join
245+
else:
246+
joiner = urljoin
247+
248+
# However, it's not *that* simple: urljoin("blog", "index.html")
249+
# produces just `index.html` instead of `blog/index.html` (unlike
250+
# os.path.join()), so in order to get a correct answer one needs to
251+
# append a trailing slash to siteurl in that case. This also makes
252+
# the new behavior fully compatible with Pelican 3.7.1.
253+
if not siteurl.endswith('/'):
254+
siteurl += '/'
255+
237256
# XXX Put this in a different location.
238257
if what in {'filename', 'attach'}:
239258
if path.startswith('/'):
@@ -260,7 +279,7 @@ def _link_replacer(self, siteurl, m):
260279
"%s used {attach} link syntax on a "
261280
"non-static file. Use {filename} instead.",
262281
self.get_relative_source_path())
263-
origin = '/'.join((siteurl, linked_content.url))
282+
origin = joiner(siteurl, linked_content.url)
264283
origin = origin.replace('\\', '/') # for Windows paths.
265284
else:
266285
logger.warning(
@@ -269,13 +288,13 @@ def _link_replacer(self, siteurl, m):
269288
'limit_msg': ("Other resources were not found "
270289
"and their urls not replaced")})
271290
elif what == 'category':
272-
origin = '/'.join((siteurl, Category(path, self.settings).url))
291+
origin = joiner(siteurl, Category(path, self.settings).url)
273292
elif what == 'tag':
274-
origin = '/'.join((siteurl, Tag(path, self.settings).url))
293+
origin = joiner(siteurl, Tag(path, self.settings).url)
275294
elif what == 'index':
276-
origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
295+
origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
277296
elif what == 'author':
278-
origin = '/'.join((siteurl, Author(path, self.settings).url))
297+
origin = joiner(siteurl, Author(path, self.settings).url)
279298
else:
280299
logger.warning(
281300
"Replacement Indicator '%s' not recognized, "

pelican/tests/test_contents.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,54 @@ def test_intrasite_link_more(self):
397397
'</blockquote>'
398398
)
399399

400+
def test_intrasite_link_absolute(self):
401+
"""Test that absolute URLs are merged properly."""
402+
403+
args = self.page_kwargs.copy()
404+
args['settings'] = get_settings(
405+
STATIC_URL='http://static.cool.site/{path}',
406+
ARTICLE_URL='http://blog.cool.site/{slug}.html')
407+
args['source_path'] = 'content'
408+
args['context']['filenames'] = {
409+
'images/poster.jpg': Static('',
410+
settings=args['settings'],
411+
source_path='images/poster.jpg'),
412+
'article.rst': Article('',
413+
settings=args['settings'],
414+
metadata={'slug': 'article',
415+
'title': 'Article'})
416+
}
417+
418+
# Article link will go to blog
419+
args['content'] = (
420+
'<a href="{filename}article.rst">Article</a>'
421+
)
422+
content = Page(**args).get_content('http://cool.site')
423+
self.assertEqual(
424+
content,
425+
'<a href="http://blog.cool.site/article.html">Article</a>'
426+
)
427+
428+
# Page link will go to the main site
429+
args['content'] = (
430+
'<a href="{index}">Index</a>'
431+
)
432+
content = Page(**args).get_content('http://cool.site')
433+
self.assertEqual(
434+
content,
435+
'<a href="http://cool.site/index.html">Index</a>'
436+
)
437+
438+
# Image link will go to static
439+
args['content'] = (
440+
'<img src="{filename}/images/poster.jpg"/>'
441+
)
442+
content = Page(**args).get_content('http://cool.site')
443+
self.assertEqual(
444+
content,
445+
'<img src="http://static.cool.site/images/poster.jpg"/>'
446+
)
447+
400448
def test_intrasite_link_markdown_spaces(self):
401449
# Markdown introduces %20 instead of spaces, this tests that
402450
# we support markdown doing this.

0 commit comments

Comments
 (0)