This repository has been archived on 2024-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
Bebop/bebop/navigation.py

179 lines
5.2 KiB
Python
Raw Normal View History

"""URI (RFC 3986) helpers for Gemini navigation.

It was supposed to be just thin fixes around urllib.parse functions, but as
"gemini" is not recognized there as a valid scheme it breaks a lot of things,
so it turned into a basic re-implementation of the RFC.
"""
2021-02-12 19:01:42 +01:00
import re
from typing import Any, Dict, Optional
from urllib.parse import quote
# Regular expression splitting a URI reference into its five components.
# Every component except the path is optional. No component can contain a
# newline, so a string with an embedded newline does not match at all.
URI_RE = re.compile(
    "".join((
        "^",
        # Scheme: everything before the first ":", if there is one.
        r"(?:(?P<scheme>[^:/?#\n]+):)?",
        # Network location: introduced by "//", ends at "/", "?" or "#".
        r"(?://(?P<netloc>[^/?#\n]*))?",
        # Path: always captured, possibly as an empty string.
        r"(?P<path>[^?#\n]*)",
        # Query: introduced by "?", ends at "#".
        r"(?:\?(?P<query>[^#\n]*))?",
        # Fragment: introduced by "#", runs to the end.
        r"(?:#(?P<fragment>.*))?",
        "$",
    ))
)
2021-02-12 19:01:42 +01:00
# Schemes flagged as "no netloc"; the only entry is "bebop".
# NOTE(review): presumably these are internal schemes whose URLs carry no
# network location. Nothing in the visible part of this module reads this
# tuple, so confirm the exact meaning against its callers.
NO_NETLOC_SCHEMES = ("bebop",)
2021-02-12 19:01:42 +01:00
class InvalidUrlException(Exception):
    """Generic exception for invalid URLs used in this module.

    Attributes:
    - url: the offending URL, exactly as passed to the constructor.
    """

    def __init__(self, url):
        # Hand the URL to Exception so it lands in `args`: str()/repr() and
        # tracebacks then show which URL was rejected, and the exception can
        # be rebuilt from its args (which is what copy and pickle do).
        super().__init__(url)
        self.url = url
2021-02-12 19:01:42 +01:00
def parse_url(url: str, default_scheme: Optional[str] = None) -> Dict[str, Any]:
    """Split an URL into its parts.

    The URL is matched against `URI_RE`, so this works on regular URLs as well
    as on not-so-compliant ones such as "capsule.org/page" that a user might
    type: in that example everything ends up in the path.

    Arguments:
    - url: URL to parse.
    - default_scheme: scheme to report when the URL itself has none. It is
      ignored when empty or None, or when the URL already has a scheme.

    Returns:
    A new dictionary with the keys "scheme", "netloc", "path", "query" and
    "fragment". All keys are always present; every value can be None except
    the path, which is always a string (but can be empty).

    Raises:
    InvalidUrlException if the URL does not match `URI_RE`.
    """
    matched = URI_RE.match(url)
    if matched is None:
        raise InvalidUrlException(url)

    # Unmatched optional groups come back as None.
    parts = {
        "scheme": matched.group("scheme"),
        "netloc": matched.group("netloc"),
        "path": matched.group("path"),
        "query": matched.group("query"),
        "fragment": matched.group("fragment"),
    }

    # Fall back on the caller-provided scheme only when the URL has none.
    if parts["scheme"] is None and default_scheme:
        parts["scheme"] = default_scheme
    return parts
2021-02-16 19:10:11 +01:00
def unparse_url(parts) -> str:
    """Rebuild an URL string from parts as produced by `parse_url`.

    Arguments:
    - parts: mapping with the keys "scheme", "netloc", "path", "query" and
      "fragment"; a None value means the component is left out entirely.

    Returns:
    The components concatenated in that order, each one with its delimiter
    (":" after the scheme, "//" before the netloc, "?" before the query and
    "#" before the fragment).
    """
    pieces = []

    scheme = parts["scheme"]
    if scheme is not None:
        pieces.append(scheme + ":")

    netloc = parts["netloc"]
    if netloc is not None:
        pieces.append("//" + netloc)

    path = parts["path"]
    if path is not None:
        pieces.append(path)

    query = parts["query"]
    if query is not None:
        pieces.append("?" + query)

    fragment = parts["fragment"]
    if fragment is not None:
        pieces.append("#" + fragment)

    return "".join(pieces)
def clear_post_path(parts) -> None:
    """Reset the optional parts that follow the path, in place.

    Both "query" and "fragment" are set to None on the given `parts` mapping;
    nothing is returned.
    """
    for key in ("query", "fragment"):
        parts[key] = None
def join_url(base_url: str, rel_url: str) -> str:
    """Resolve a relative reference against a base URL.

    This follows the reference resolution algorithm of RFC 3986, section
    5.2.2.

    Arguments:
    - base_url: URL the reference is relative to.
    - rel_url: reference to resolve; usually a relative or absolute path,
      optionally followed by a query and/or a fragment.

    Returns:
    The resolved URL, with dot segments removed from its path. The fragment
    always comes from `rel_url`.

    Raises:
    InvalidUrlException if either URL cannot be parsed by `parse_url`.
    """
    base = parse_url(base_url)
    ref = parse_url(rel_url)

    if ref["scheme"] is not None:
        # The reference is already absolute: only normalise its path.
        target = dict(ref)
        target["path"] = remove_dot_segments(ref["path"])
        return unparse_url(target)

    target = dict(base)
    target["fragment"] = ref["fragment"]

    if ref["netloc"] is not None:
        # Network-path reference ("//host/path"): only the scheme is
        # inherited from the base.
        target["netloc"] = ref["netloc"]
        target["path"] = remove_dot_segments(ref["path"])
        target["query"] = ref["query"]
    elif not ref["path"]:
        # Empty path ("", "?query" or "#fragment"): stay on the base document.
        # The base query is kept unless the reference brings its own.
        if ref["query"] is not None:
            target["query"] = ref["query"]
    else:
        ref_path = ref["path"]
        base_path = base["path"] or ""
        if ref_path.startswith("/"):
            merged = ref_path
        elif base["netloc"] is not None and not base_path:
            # A base such as "gemini://host" has an implicit root path.
            merged = "/" + ref_path
        else:
            # Keep the base path up to and including its last "/"; when it
            # has no "/" at all, nothing of it is kept.
            merged = base_path[:base_path.rfind("/") + 1] + ref_path
        target["path"] = remove_dot_segments(merged)
        target["query"] = ref["query"]

    return unparse_url(target)
def remove_dot_segments(path: str):
    """Return `path` with its "." and ".." segments resolved.

    The input is consumed from the left while the result is built up segment
    by segment: "." segments are dropped, and ".." segments additionally drop
    the last segment already placed in the result (via
    `remove_last_segment`). Leading "../" and "./" prefixes are discarded.
    """
    remaining = path
    result = ""
    while remaining:
        if remaining.startswith("../"):
            remaining = remaining[3:]
            continue
        if remaining.startswith(("./", "/./")):
            # Either strips a leading "./" or turns "/./" into a single "/".
            remaining = remaining[2:]
            continue
        if remaining == "/.":
            remaining = "/"
            continue
        if remaining.startswith("/../") or remaining == "/..":
            # For a bare "/..", the slice below is empty and leaves just "/".
            remaining = "/" + remaining[4:]
            result = remove_last_segment(result)
            continue
        if remaining in (".", ".."):
            remaining = ""
            continue
        # Regular segment: move it from the input to the result.
        segment, remaining = pop_first_segment(remaining)
        result += segment
    return result
def remove_last_segment(path: str):
    """Remove last path segment, including preceding "/" if any.

    Everything from the right-most "/" onwards is dropped. When the path
    contains no "/" at all, the whole path is its last segment and the result
    is an empty string.
    """
    # rpartition yields an empty head when there is no "/", whereas slicing
    # with rfind's -1 would only chop off the last character.
    return path.rpartition("/")[0]
def pop_first_segment(path: str):
    """Split a path into its first segment and the remainder.

    The first segment includes the initial "/" if any, and runs up to, but
    not including, the next "/" (or to the end of the string). The remainder
    is whatever follows, starting with that next "/"; it is empty when the
    path holds a single segment.

    Returns:
    A (first_segment, rest) tuple.
    """
    # Start searching at index 1 so that a leading "/" is not taken as the
    # end of the first segment.
    cut = path.find("/", 1)
    if cut < 0:
        return path, ""
    return path[:cut], path[cut:]
def set_parameter(url: str, user_input: str) -> str:
    """Return a new URL whose query is the percent-escaped user input.

    Any query already present in `url` is replaced; the other parts of the
    URL are kept as parsed.
    """
    escaped_input = quote(user_input)
    new_parts = {**parse_url(url), "query": escaped_input}
    return unparse_url(new_parts)
def get_parent_path(path: str) -> str:
    """Return the parent of this path, with its trailing "/".

    Trailing slashes are ignored when looking for the last segment, so
    "/a/b" and "/a/b/" both give "/a/". When no "/" is left after ignoring
    them (e.g. "/", "" or "page"), the path is returned unchanged.
    """
    head, slash, _ = path.rstrip("/").rpartition("/")
    if not slash:
        return path
    return head + slash
def get_parent_url(url: str) -> str:
    """Return the URL one level above this one.

    The path is replaced by its parent path (see `get_parent_path`), and the
    query and fragment are discarded.
    """
    url_parts = parse_url(url)
    parent_path = get_parent_path(url_parts["path"])  # type: ignore
    url_parts["path"] = parent_path
    clear_post_path(url_parts)
    return unparse_url(url_parts)
def get_root_url(url: str) -> str:
    """Return the root URL for this URL.

    Scheme and netloc are kept as parsed, the path becomes "/", and the query
    and fragment are discarded.
    """
    root_parts = parse_url(url)
    clear_post_path(root_parts)
    root_parts["path"] = "/"
    return unparse_url(root_parts)