This repository has been archived on 2024-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
Bebop/bebop/navigation.py

194 lines
6 KiB
Python

"""URI (RFC 3986) helpers for Gemini navigation.
It was supposed to be just thin fixes around urllib.parse functions but as
gemini is not recognized as a valid scheme it breaks a lot of things, so it
turned into a basic re-implementation of the RFC.
"""
import re
from typing import Any, Dict, Optional
from urllib.parse import quote
# Splits a URI reference into its five components (scheme, netloc, path, query
# and fragment). No component may contain a newline; every group except "path"
# is optional and is None when absent from the matched string.
URI_RE = re.compile(
    "".join((
        "^",
        r"(?:(?P<scheme>[^:/?#\n]+):)?",   # "scheme:" prefix, colon not captured
        r"(?://(?P<netloc>[^/?#\n]*))?",   # "//netloc", slashes not captured
        r"(?P<path>[^?#\n]*)",             # path, always matches (may be empty)
        r"(?:\?(?P<query>[^#\n]*))?",      # "?query", question mark not captured
        r"(?:#(?P<fragment>.*))?",         # "#fragment", hash not captured
        "$",
    ))
)
class InvalidUrlException(Exception):
    """Generic exception for invalid URLs used in this module.

    Attributes:
    - url: the offending URL, exactly as it was passed to the constructor.
    """

    def __init__(self, url):
        # Forward the URL to the base class so it lands in `args`: this gives
        # the exception a non-empty string form and lets copy/pickle rebuild
        # it, as they re-create exceptions by calling the class with `args`.
        super().__init__(url)
        self.url = url
def parse_url(
    url: str,
    absolute: bool = False,
    default_scheme: Optional[str] = None
) -> Dict[str, Any]:
    """Return URL parts from this URL.

    Use the RFC regex to get parts from URL. This function can be used on
    regular URLs but also on not-so-compliant URLs, e.g. "capsule.org/page",
    which might be typed by a user (see `absolute` argument).

    Arguments:
    - url: URL to parse.
    - absolute: assume the URL is absolute, e.g. in the case we are trying to
      parse a URL a user has written, which is most of the time an absolute
      URL even if not perfectly so. This only has an effect if, after the
      initial parsing, the URL has no netloc marker ("//") at all; an empty
      netloc (as in "file:///path") counts as present and is left untouched.
    - default_scheme: specify the scheme to use if the URL does not specify
      one. If `absolute` is true and neither the URL nor `default_scheme`
      provides a scheme, a netloc marker ("//") is prefixed without scheme.

    Returns:
    URL parts, as a dictionary with the following keys: "scheme", "netloc",
    "path", "query" and "fragment". All keys are present, but all values can be
    None, except path which is always a string (but can be empty).

    Raises:
    InvalidUrlException if the string does not match the URI regex (e.g. it
    contains a newline in the middle).
    """
    match = URI_RE.match(url)
    if not match:
        raise InvalidUrlException(url)
    parts = {
        key: match.group(key)
        for key in ("scheme", "netloc", "path", "query", "fragment")
    }

    # Smol hack: if we assume it's an absolute URL and no netloc marker has
    # been found, insert "//" so that what follows is read as the netloc.
    if absolute and parts["netloc"] is None:
        scheme = parts["scheme"]
        if scheme is not None:
            # The URL already starts with "scheme:"; strip it before
            # rebuilding so the scheme does not end up inside the netloc.
            remainder = url[len(scheme) + 1:]
        else:
            scheme = default_scheme
            remainder = url
        prefix = scheme + "://" if scheme else "//"
        return parse_url(prefix + remainder)

    # Another smol hack: if there is no scheme, use `default_scheme` as default.
    if default_scheme and parts["scheme"] is None:
        parts["scheme"] = default_scheme
    return parts
def unparse_url(parts) -> str:
    """Rebuild a URL string from the parts produced by `parse_url`.

    Each part whose value is not None is emitted in order, together with its
    delimiter: "scheme:", "//netloc", "path", "?query" and "#fragment". Parts
    set to None are skipped entirely; empty strings still get their delimiter.
    """
    # (key, text placed before the value, text placed after the value)
    layout = (
        ("scheme", "", ":"),
        ("netloc", "//", ""),
        ("path", "", ""),
        ("query", "?", ""),
        ("fragment", "#", ""),
    )
    pieces = []
    for key, before, after in layout:
        value = parts[key]
        if value is not None:
            pieces.append(before + value + after)
    return "".join(pieces)
def clear_post_path(parts) -> None:
    """Reset the parts that come after the path (query and fragment) to None.

    The mapping is modified in place; nothing is returned.
    """
    for key in ("query", "fragment"):
        parts[key] = None
def join_url(base_url: str, rel_url: str) -> str:
    """Resolve a URL reference against a base URL.

    This follows the reference resolution steps of RFC 3986, section 5.2.2:
    - a reference with its own scheme is returned on its own;
    - a reference with its own netloc ("//host/path") only takes the scheme
      from the base;
    - a reference with an empty path ("?query", "#fragment" or "") keeps the
      base path, and also the base query unless it provides one;
    - any other reference is a path, either absolute (leading "/") or
      relative to the directory of the base path.

    The fragment always comes from the reference. Dot segments are removed
    from the resulting path, except when the base path is kept as is.

    Arguments:
    - base_url: URL the reference is relative to.
    - rel_url: reference to resolve, usually a relative path.

    Returns:
    The resolved URL as a string.
    """
    parts = parse_url(base_url)
    rel_parts = parse_url(rel_url)

    # The reference is a full URL by itself: nothing to take from the base.
    if rel_parts["scheme"] is not None:
        rel_parts["path"] = remove_dot_segments(rel_parts["path"])
        return unparse_url(rel_parts)

    rel_path = rel_parts["path"]
    if rel_parts["netloc"] is not None:
        parts["netloc"] = rel_parts["netloc"]
        parts["path"] = remove_dot_segments(rel_path)
        parts["query"] = rel_parts["query"]
    elif not rel_path:
        # Same document: keep the base path untouched, and only override the
        # base query if the reference carries one.
        if rel_parts["query"] is not None:
            parts["query"] = rel_parts["query"]
    else:
        if rel_path.startswith("/"):
            new_path = rel_path
        else:
            # Replace the last segment of the base path with the reference.
            base_path = parts["path"] or ""
            new_path = remove_last_segment(base_path) + "/" + rel_path
        parts["path"] = remove_dot_segments(new_path)
        parts["query"] = rel_parts["query"]
    parts["fragment"] = rel_parts["fragment"]
    return unparse_url(parts)
def remove_dot_segments(path: str):
    """Return `path` with its "." and ".." segments resolved.

    The path is consumed from the left, one prefix at a time; regular segments
    are moved to the result, "." segments are dropped and ".." segments also
    drop the segment previously moved to the result.
    """
    resolved = ""
    remaining = path
    while remaining:
        # Leading "../": nothing to climb out of, just drop it.
        if remaining.startswith("../"):
            remaining = remaining[3:]
            continue
        # Either strip a leading "./", or turn "/./" into a single "/".
        if remaining.startswith(("./", "/./")):
            remaining = remaining[2:]
            continue
        # Trailing "/." becomes a bare "/".
        if remaining == "/.":
            remaining = "/"
            continue
        # "/../" prefix or trailing "/..": replace it with "/" and go up one
        # level in what has been resolved so far.
        if remaining.startswith("/../") or remaining == "/..":
            remaining = "/" + remaining[4:]
            resolved = remove_last_segment(resolved)
            continue
        # Only a lone dot segment is left: discard it and stop.
        if remaining in (".", ".."):
            break
        # Regular segment: move it to the result.
        segment, remaining = pop_first_segment(remaining)
        resolved += segment
    return resolved
def remove_last_segment(path: str) -> str:
    """Remove last path segment, including preceding "/" if any.

    If the path contains no "/" at all, the whole path is its last segment and
    an empty string is returned.
    """
    # rpartition yields an empty head when there is no "/", whereas slicing
    # with the -1 returned by rfind would only chop the last character.
    return path.rpartition("/")[0]
def pop_first_segment(path: str):
    """Split a path into its first segment and the rest.

    The first segment includes the initial "/" if any, and stops right before
    the next "/" or at the end of the string. The rest starts with that next
    "/", or is empty when there is none.

    Returns:
    A (first_segment, rest) tuple.
    """
    # Start looking at index 1 so that a leading "/" stays with the segment.
    boundary = path.find("/", 1)
    if boundary < 0:
        return path, ""
    return path[:boundary], path[boundary:]
def set_parameter(url: str, user_input: str) -> str:
    """Return `url` with the escaped user input set as its query.

    The input is percent-encoded with `quote` and replaces any query the URL
    already had; the other parts of the URL are kept as parsed.
    """
    updated_parts = {**parse_url(url), "query": quote(user_input)}
    return unparse_url(updated_parts)
def get_parent_path(path: str) -> str:
    """Return the parent of `path`, with a trailing "/".

    Trailing slashes are ignored when looking for the last segment. If no "/"
    is left once they are stripped, the path is returned unchanged.
    """
    head, slash, _ = path.rstrip("/").rpartition("/")
    if not slash:
        return path
    return head + slash
def get_parent_url(url: str) -> str:
    """Return the URL one level above `url`.

    The path is replaced by its parent path, and the query and fragment are
    dropped.
    """
    url_parts = parse_url(url)
    clear_post_path(url_parts)
    parent_path = get_parent_path(url_parts["path"])  # type: ignore
    url_parts["path"] = parent_path
    return unparse_url(url_parts)
def get_root_url(url: str) -> str:
    """Return the root of `url`.

    The path is replaced with "/", and the query and fragment are dropped.
    """
    root_parts = parse_url(url)
    clear_post_path(root_parts)
    root_parts["path"] = "/"
    return unparse_url(root_parts)