Skip to content

Commit

Permalink
Improve handling empty segments in urls according to RFC3986
Browse files Browse the repository at this point in the history
The patch makes `yarl.URL()` objects preserve empty segments
when joining additional parts in cases like

```python
URL("https://web.archive.org/web/") / "https://github.com/"
```

PR #1026

Fixes #926
Fixes #984
Closes #1023

Co-authored-by: Sam Bull <[email protected]>
Co-authored-by: Sviatoslav Sydorenko <[email protected]>
Co-authored-by: Alessio Bogon <[email protected]>
  • Loading branch information
4 people authored Jul 3, 2024
1 parent 433beaf commit 4bededd
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 22 deletions.
6 changes: 3 additions & 3 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Bug fixes
---------

- Stopped dropping trailing slashes in :py:meth:`~yarl.URL.joinpath` -- by :user:`gmacon`. (:issue:`862`, :issue:`866`)
- Started accepting string subclasses in ``__truediv__()`` operations (``URL / segment``) -- by :user:`mjpieters`. (:issue:`871`, :issue:`884`)
- Started accepting string subclasses in :meth:`~yarl.URL.__truediv__` operations (``URL / segment``) -- by :user:`mjpieters`. (:issue:`871`, :issue:`884`)
- Fixed the human representation of URLs with square brackets in usernames and passwords -- by :user:`mjpieters`. (:issue:`876`, :issue:`882`)
- Updated type hints to include ``URL.missing_port()``, ``URL.__bytes__()``
and the ``encoding`` argument to :py:meth:`~yarl.URL.joinpath`
Expand Down Expand Up @@ -174,7 +174,7 @@ Contributor-facing changes
Bugfixes
--------

- Fix regression with ``__truediv__`` and absolute URLs with empty paths causing the raw path to lack the leading ``/``.
- Fix regression with :meth:`~yarl.URL.__truediv__` and absolute URLs with empty paths causing the raw path to lack the leading ``/``.
(`#854 <https://github.com/aio-libs/yarl/issues/854>`_)


Expand All @@ -196,7 +196,7 @@ Features
--------

- Added ``URL.joinpath(*elements)``, to create a new URL appending multiple path elements. (`#704 <https://github.com/aio-libs/yarl/issues/704>`_)
- Made ``URL.__truediv__()`` return ``NotImplemented`` if called with an
- Made :meth:`URL.__truediv__() <yarl.URL.__truediv__>` return ``NotImplemented`` if called with an
unsupported type — by :user:`michaeljpeters`.
(`#832 <https://github.com/aio-libs/yarl/issues/832>`_)

Expand Down
21 changes: 21 additions & 0 deletions CHANGES/1026.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Joining URLs with empty segments has been changed
to match :rfc:`3986`.

Previously empty segments would be removed from path,
breaking use-cases such as

.. code-block:: python
URL("https://web.archive.org/web/") / "https://github.com/"
Now :meth:`/ operation <yarl.URL.__truediv__>` and :meth:`URL.joinpath() <yarl.URL.joinpath>`
keep empty segments, but do not introduce new empty segments.
e.g.

.. code-block:: python
URL("https://example.org/") / ""
does not introduce an empty segment.

-- by :user:`commonism` and :user:`youtux`
2 changes: 2 additions & 0 deletions CHANGES/1026.doc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The pre-existing :meth:`/ magic method <yarl.URL.__truediv__>`
has been documented in the API reference -- by :user:`commonism`.
14 changes: 14 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,20 @@ The path is encoded if needed.

.. versionadded:: 1.9

.. method:: URL.__truediv__(url)

Shortcut for :meth:`URL.joinpath` with a single element and ``encoded=False``.

.. doctest::

>>> url = URL('http://example.com/path?arg#frag') / 'to'
>>> url
URL('http://example.com/path/to')
>>> url.parts
('/', 'path', 'to')

.. versionadded:: 0.9

.. method:: URL.join(url)

Construct a full (“absolute”) URL by combining a “base URL”
Expand Down
45 changes: 45 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,13 +799,25 @@ def test_div_with_dots():
pytest.param(
"/path/", ("to",), "http://example.com/path/to", id="path-with-slash"
),
pytest.param(
"/path", ("",), "http://example.com/path/", id="path-add-trailing-slash"
),
pytest.param(
"/path?a=1#frag",
("to",),
"http://example.com/path/to",
id="cleanup-query-and-fragment",
),
pytest.param("", ("path/",), "http://example.com/path/", id="trailing-slash"),
pytest.param(
"",
(
"path",
"",
),
"http://example.com/path/",
id="trailing-slash-empty-string",
),
pytest.param(
"", ("path/", "to/"), "http://example.com/path/to/", id="duplicate-slash"
),
Expand All @@ -827,6 +839,39 @@ def test_joinpath(base, to_join, expected):
assert str(url.joinpath(*to_join)) == expected


@pytest.mark.parametrize(
"base,to_join,expected",
[
pytest.param("path", "a", "path/a", id="default_default"),
pytest.param("path", "./a", "path/a", id="default_relative"),
pytest.param("path/", "a", "path/a", id="empty-segment_default"),
pytest.param("path/", "./a", "path/a", id="empty-segment_relative"),
pytest.param("path", ".//a", "path//a", id="default_empty-segment"),
pytest.param("path/", ".//a", "path//a", id="empty-segment_empty_segment"),
pytest.param("path//", "a", "path//a", id="empty-segments_default"),
pytest.param("path//", "./a", "path//a", id="empty-segments_relative"),
pytest.param("path//", ".//a", "path///a", id="empty-segments_empty-segment"),
pytest.param("path", "a/", "path/a/", id="default_trailing-empty-segment"),
pytest.param("path", "a//", "path/a//", id="default_trailing-empty-segments"),
pytest.param("path", "a//b", "path/a//b", id="default_embedded-empty-segment"),
],
)
def test_joinpath_empty_segments(base, to_join, expected):
url = URL(f"http://example.com/{base}")
assert (
f"http://example.com/{expected}" == str(url.joinpath(to_join))
and str(url / to_join) == f"http://example.com/{expected}"
)


def test_joinpath_single_empty_segments():
"""joining standalone empty segments does not create empty segments"""
a = URL("/1//2///3")
assert a.parts == ("/", "1", "", "2", "", "", "3")
b = URL("scheme://host").joinpath(*a.parts[1:])
assert b.path == "/1/2/3"


@pytest.mark.parametrize(
"url,to_join,expected",
[
Expand Down
44 changes: 25 additions & 19 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,28 +713,34 @@ def _validate_authority_uri_abs_path(host, path):
"Path in a URL with authority should start with a slash ('/') if set"
)

def _make_child(self, segments, encoded=False):
"""add segments to self._val.path, accounting for absolute vs relative paths"""
# keep the trailing slash if the last segment ends with /
parsed = [""] if segments and segments[-1][-1:] == "/" else []
for seg in reversed(segments):
if not seg:
continue
if seg[0] == "/":
def _make_child(self, paths, encoded=False):
"""
add paths to self._val.path, accounting for absolute vs relative paths,
keep existing, but do not create new, empty segments
"""
parsed = []
for idx, path in enumerate(reversed(paths)):
# empty segment of last is not removed
last = idx == 0
if path and path[0] == "/":
raise ValueError(
f"Appending path {seg!r} starting from slash is forbidden"
)
seg = seg if encoded else self._PATH_QUOTER(seg)
if "/" in seg:
parsed += (
sub for sub in reversed(seg.split("/")) if sub and sub != "."
f"Appending path {path!r} starting from slash is forbidden"
)
elif seg != ".":
parsed.append(seg)
path = path if encoded else self._PATH_QUOTER(path)
segments = [
segment for segment in reversed(path.split("/")) if segment != "."
]
if not segments:
continue
# remove trailing empty segment for all but the last path
segment_slice_start = int(not last and segments[0] == "")
parsed += segments[segment_slice_start:]
parsed.reverse()
old_path = self._val.path
if old_path:
parsed = [*old_path.rstrip("/").split("/"), *parsed]

if self._val.path and (old_path_segments := self._val.path.split("/")):
old_path_cutoff = -1 if old_path_segments[-1] == "" else None
parsed = [*old_path_segments[:old_path_cutoff], *parsed]

if self.is_absolute():
parsed = _normalize_path_segments(parsed)
if parsed and parsed[0] != "":
Expand Down

0 comments on commit 4bededd

Please sign in to comment.