diff --git a/src/requests/_internal_utils.py b/src/requests/_internal_utils.py
index f2cf635e29..8d43cc5796 100644
--- a/src/requests/_internal_utils.py
+++ b/src/requests/_internal_utils.py
@@ -23,25 +23,21 @@ def to_native_string(string, encoding="ascii"):
-    """Given a string object, regardless of type, returns a representation of
-    that string in the native string type, encoding and decoding where
-    necessary. This assumes ASCII unless told otherwise.
-    """
-    if isinstance(string, builtin_str):
-        out = string
-    else:
-        out = string.decode(encoding)
-
-    return out
+    """Given a string object, regardless of type, returns a representation
+    of that string in the native string type, decoding ``bytes`` with
+    ``encoding`` (ASCII unless told otherwise).
+    """
+    if isinstance(string, bytes):
+        return string.decode(encoding)
+    return string
 
 
 def unicode_is_ascii(u_string):
-    """Determine if unicode string only contains ASCII characters.
-
-    :param str u_string: unicode string to check. Must be unicode
-    and not Python 2 `str`.
-    :rtype: bool
-    """
+    """Determine if unicode string only contains ASCII characters.
+
+    :param str u_string: unicode string to check.
+    :rtype: bool
+    """
     assert isinstance(u_string, str)
     try:
         u_string.encode("ascii")
         return True
     except UnicodeEncodeError:
         return False
diff --git a/src/requests/models.py b/src/requests/models.py
index 8f56ca7d23..25dc5136de 100644
--- a/src/requests/models.py
+++ b/src/requests/models.py
@@ -65,6 +65,7 @@
     super_len,
     to_key_val_list,
 )
+from urllib.parse import urlparse, urlunparse
 
 #: The set of HTTP status codes that indicate an automatically
 #: processable redirect.
@@ -408,32 +409,41 @@ def _get_idna_encoded_host(host):
     def prepare_url(self, url, params):
         """Prepares the given HTTP URL."""
-        #: Accept objects that have string representations.
-        #: We're unable to blindly call unicode/str functions
-        #: as this will include the bytestring indicator (b'')
-        #: on python 3.x.
-        #: https://github.com/psf/requests/pull/2238
+        #: Accept objects that have string representations
+        #: (https://github.com/psf/requests/pull/2238).
         if isinstance(url, bytes):
             url = url.decode("utf8")
         else:
             url = str(url)
 
-        # Remove leading whitespaces from url
         url = url.lstrip()
 
-        # Don't do any URL preparation for non-HTTP schemes like `mailto`,
-        # `data` etc to work around exceptions from `url_parse`, which
-        # handles RFC 3986 only.
+        # Skip URL preparation for non-HTTP schemes (`mailto`, `data`, ...)
+        # since RFC 3986 parsing cannot handle them.
         if ":" in url and not url.lower().startswith("http"):
             self.url = url
             return
 
-        # Support for unicode domain names and paths.
         try:
-            scheme, auth, host, port, path, query, fragment = parse_url(url)
-        except LocationParseError as e:
+            parsed = urlparse(url)
+            # ``ParseResult.port`` is computed lazily and raises ValueError
+            # for an invalid port, so it must be read inside the ``try``.
+            port = parsed.port
+        except ValueError as e:
             raise InvalidURL(*e.args)
 
+        scheme = parsed.scheme
+        host = parsed.hostname
+        path = parsed.path
+        query = parsed.query
+        fragment = parsed.fragment
+        # Keep username *and* password; ``parsed.username`` alone would
+        # silently drop the password component of the URL.
+        if parsed.password is not None:
+            auth = f"{parsed.username or ''}:{parsed.password}"
+        else:
+            auth = parsed.username
+
         if not scheme:
             raise MissingSchema(
                 f"Invalid URL {url!r}: No scheme supplied. "
                 f"Perhaps you meant https://{url}?"
@@ -443,41 +453,35 @@ def prepare_url(self, url, params):
         if not host:
             raise InvalidURL(f"Invalid URL {url!r}: No host supplied")
 
-        # In general, we want to try IDNA encoding the hostname if the string contains
-        # non-ASCII characters. This allows users to automatically get the correct IDNA
-        # behaviour. For strings containing only ASCII characters, we need to also verify
-        # it doesn't start with a wildcard (*), before allowing the unencoded hostname.
+        # IDNA-encode non-ASCII hostnames via ``_get_idna_encoded_host``
+        # (the ``idna`` package, IDNA 2008 — NOT the stdlib IDNA-2003
+        # codec); ASCII hostnames must not start with a wildcard or dot.
         if not unicode_is_ascii(host):
             try:
                 host = self._get_idna_encoded_host(host)
             except UnicodeError:
                 raise InvalidURL("URL has an invalid label.")
         elif host.startswith(("*", ".")):
             raise InvalidURL("URL has an invalid label.")
 
-        # Carefully reconstruct the network location
-        netloc = auth or ""
-        if netloc:
-            netloc += "@"
-        netloc += host
-        if port:
-            netloc += f":{port}"
+        # Carefully reconstruct the network location: [auth@]host[:port]
+        netloc = f"{auth + '@' if auth else ''}{host}{f':{port}' if port else ''}"
 
-        # Bare domains aren't valid URLs.
-        if not path:
-            path = "/"
+        # Bare domains aren't valid URLs.
+        path = path or "/"
 
         if isinstance(params, (str, bytes)):
             params = to_native_string(params)
 
         enc_params = self._encode_params(params)
         if enc_params:
-            if query:
-                query = f"{query}&{enc_params}"
-            else:
-                query = enc_params
+            query = f"{query}&{enc_params}" if query else enc_params
 
-        url = requote_uri(urlunparse([scheme, netloc, path, None, query, fragment]))
+        # Pass ``parsed.params`` through so any ``;params`` segment that
+        # urlparse split off the path is not lost on reassembly.
+        url = requote_uri(
+            urlunparse((scheme, netloc, path, parsed.params, query, fragment))
+        )
         self.url = url
 
     def prepare_headers(self, headers):
diff --git a/src/requests/utils.py b/src/requests/utils.py
index 699683e5d9..5e78cbe7bd 100644
--- a/src/requests/utils.py
+++ b/src/requests/utils.py
@@ -661,24 +661,20 @@ def unquote_unreserved(uri):
 
 
 def requote_uri(uri):
-    """Re-quote the given URI.
-
-    This function passes the given URI through an unquote/quote cycle to
-    ensure that it is fully and consistently quoted.
-
-    :rtype: str
-    """
+    """Re-quote the given URI via an unquote/quote cycle so that it is
+    fully and consistently quoted.
+
+    :rtype: str
+    """
     safe_with_percent = "!#$%&'()*+,/:;=?@[]~"
     safe_without_percent = "!#$&'()*+,/:;=?@[]~"
     try:
-        # Unquote only the unreserved characters
-        # Then quote only illegal characters (do not quote reserved,
-        # unreserved, or '%')
+        # Unquote only unreserved characters, then quote only illegal ones
+        # (reserved, unreserved, and '%' are left untouched).
         return quote(unquote_unreserved(uri), safe=safe_with_percent)
     except InvalidURL:
-        # We couldn't unquote the given URI, so let's try quoting it, but
-        # there may be unquoted '%'s in the URI. We need to make sure they're
-        # properly quoted so they do not cause issues elsewhere.
+        # The URI has an invalid '%' escape; quote '%' as well so stray
+        # percent signs can't cause issues downstream.
         return quote(uri, safe=safe_without_percent)