"""URI (RFC 3986) helpers for Gemini navigation.

It was supposed to be just a thin layer of fixes around the urllib.parse
functions, but as "gemini" is not recognized as a valid scheme there, it
breaks a lot of things, so it turned into a basic re-implementation of the RFC.
"""

import re
from typing import Any, Dict, Optional
from urllib.parse import quote


URI_RE = re.compile(
    "^"
    r"(?:(?P<scheme>[^:/?#\n]+):)?"
    r"(?://(?P<netloc>[^/?#\n]*))?"
    r"(?P<path>[^?#\n]*)"
    r"(?:\?(?P<query>[^#\n]*))?"
    r"(?:#(?P<fragment>.*))?"
    "$"
)

NO_NETLOC_SCHEMES = ("bebop",)


class InvalidUrlException(Exception):
    """Generic exception for invalid URLs used in this module."""

    def __init__(self, url):
        super().__init__()
        self.url = url


def parse_url(url: str, default_scheme: Optional[str] = None) -> Dict[str, Any]:
    """Return URL parts from this URL.

    Use the RFC regex to get parts from the URL. This function can be used on
    regular URLs but also on not-so-compliant URLs, e.g. "capsule.org/page",
    which might be typed by a user (see the `default_scheme` argument).

    Arguments:
    - url: URL to parse.
    - default_scheme: scheme to use if the URL does not specify one.

    Returns:
    URL parts, as a dictionary with the following keys: "scheme", "netloc",
    "path", "query" and "fragment". All keys are present, but all values can be
    None, except path which is always a string (but can be empty).

    Raises:
    InvalidUrlException if the URL cannot be parsed at all (e.g. it spans
    several lines).
    """
    match = URI_RE.match(url)
    if not match:
        raise InvalidUrlException(url)

    match_dict = match.groupdict()
    parts = {
        k: match_dict.get(k)
        for k in ("scheme", "netloc", "path", "query", "fragment")
    }

    # Smol hack: if there is no scheme, use `default_scheme` as default.
    if default_scheme and parts["scheme"] is None:
        parts["scheme"] = default_scheme

    return parts
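
# Examples (illustrative only; hypothetical URLs):
#   parse_url("gemini://example.org/doc.gmi?v=1#top")
#   -> {"scheme": "gemini", "netloc": "example.org", "path": "/doc.gmi",
#       "query": "v=1", "fragment": "top"}
#   parse_url("example.org/doc.gmi", default_scheme="gemini")
#   -> scheme "gemini", netloc None, path "example.org/doc.gmi"
#      (no "//", so the host stays in the path part).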


def unparse_url(parts) -> str:
    """Unparse parts of a URL produced by `parse_url`."""
    url = ""
    if parts["scheme"] is not None:
        url += parts["scheme"] + ":"
    if parts["netloc"] is not None:
        url += "//" + parts["netloc"]
    if parts["path"] is not None:
        url += parts["path"]
    if parts["query"] is not None:
        url += "?" + parts["query"]
    if parts["fragment"] is not None:
        url += "#" + parts["fragment"]
    return url
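
# Example (illustrative only): unparse_url round-trips parse_url output for
# well-formed URLs, e.g.
#   unparse_url(parse_url("gemini://example.org/doc.gmi?v=1"))
#   -> "gemini://example.org/doc.gmi?v=1"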


def clear_post_path(parts) -> None:
    """Clear optional post-path parts (query and fragment)."""
    parts["query"] = None
    parts["fragment"] = None


def join_url(base_url: str, rel_url: str) -> str:
    """Join a base URL with a relative path."""
    parts = parse_url(base_url)
    rel_parts = parse_url(rel_url)
    if rel_url.startswith("/"):
        new_path = rel_parts["path"]
    else:
        base_path = parts["path"] or ""
        new_path = remove_last_segment(base_path) + "/" + rel_parts["path"]
    parts["path"] = remove_dot_segments(new_path)
    parts["query"] = rel_parts["query"]
    parts["fragment"] = rel_parts["fragment"]
    return unparse_url(parts)
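
# Examples (illustrative only; hypothetical capsule URLs):
#   join_url("gemini://example.org/dir/doc.gmi", "other.gmi")
#   -> "gemini://example.org/dir/other.gmi"
#   join_url("gemini://example.org/dir/doc.gmi", "../top.gmi")
#   -> "gemini://example.org/top.gmi"
#   join_url("gemini://example.org/dir/doc.gmi", "/abs.gmi")
#   -> "gemini://example.org/abs.gmi"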


def remove_dot_segments(path: str) -> str:
    """Remove dot segments in a URL path (RFC 3986, section 5.2.4)."""
    output = ""
    while path:
        if path.startswith("../"):
            path = path[3:]
        elif path.startswith("./") or path.startswith("/./"):
            path = path[2:]  # Either strip "./" or leave a single "/".
        elif path == "/.":
            path = "/"
        elif path.startswith("/../"):
            path = "/" + path[4:]
            output = remove_last_segment(output)
        elif path == "/..":
            path = "/"
            output = remove_last_segment(output)
        elif path in (".", ".."):
            path = ""
        else:
            first_segment, path = pop_first_segment(path)
            output += first_segment
    return output
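
# Example (illustrative only):
#   remove_dot_segments("/a/b/../c/./d.gmi") -> "/a/c/d.gmi"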


def remove_last_segment(path: str) -> str:
    """Remove the last path segment, including the preceding "/" if any."""
    last_slash = path.rfind("/")
    return path[:last_slash] if last_slash > -1 else ""
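
# Examples (illustrative only):
#   remove_last_segment("/a/b/c") -> "/a/b"
#   remove_last_segment("/a") -> ""
#   remove_last_segment("nodir") -> ""  (no "/" at all)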


def pop_first_segment(path: str):
    """Return the first segment and the rest of the path.

    The first segment includes the initial "/" if any and extends up to, but
    not including, the next "/"; the rest of the path starts at that "/" and
    may be an empty string.
    """
    next_slash = path[1:].find("/")
    if next_slash == -1:
        return path, ""
    next_slash += 1
    return path[:next_slash], path[next_slash:]
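
# Examples (illustrative only):
#   pop_first_segment("/a/b/c") -> ("/a", "/b/c")
#   pop_first_segment("/a") -> ("/a", "")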


def set_parameter(url: str, user_input: str) -> str:
    """Return a new URL with the escaped user input as its query string."""
    parts = parse_url(url)
    parts["query"] = quote(user_input)
    return unparse_url(parts)
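
# Example (illustrative only; e.g. answering a Gemini input prompt):
#   set_parameter("gemini://example.org/search", "two words")
#   -> "gemini://example.org/search?two%20words"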


def get_parent_path(path: str) -> str:
    """Return the parent path."""
    last_slash = path.rstrip("/").rfind("/")
    if last_slash > -1:
        path = path[:last_slash + 1]
    return path
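
# Examples (illustrative only):
#   get_parent_path("/docs/page.gmi") -> "/docs/"
#   get_parent_path("/docs/") -> "/"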


def get_parent_url(url: str) -> str:
    """Return the parent URL (one level up)."""
    parts = parse_url(url)
    parts["path"] = get_parent_path(parts["path"])  # type: ignore
    clear_post_path(parts)
    return unparse_url(parts)
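
# Example (illustrative only; hypothetical capsule URL):
#   get_parent_url("gemini://example.org/docs/page.gmi?x#y")
#   -> "gemini://example.org/docs/"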


def get_root_url(url: str) -> str:
    """Return the root URL (basically discards path)."""
    parts = parse_url(url)
    parts["path"] = "/"
    clear_post_path(parts)
    return unparse_url(parts)
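
# Example (illustrative only; hypothetical capsule URL):
#   get_root_url("gemini://example.org/docs/page.gmi?x")
#   -> "gemini://example.org/"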