# Bebop/bebop/navigation.py

"""URI (RFC 3986) helpers for Gemini navigation.

It was supposed to be just thin fixes around urllib.parse functions but as
gemini is not recognized as a valid scheme it breaks a lot of things, so it
turned into a basic re-implementation of the RFC.
"""

import re
from typing import Any, Dict, Optional
from urllib.parse import quote


# Regex splitting a URI reference into its five generic components. Every
# component is optional and captured in a named group; the path group always
# matches (possibly as an empty string), the other groups are None when the
# corresponding delimiter is absent.
#
# Newlines are excluded from every character class, and "." does not match
# them either, so a string with an embedded newline does not match at all.
# NOTE: since re.MULTILINE is not set, "$" also matches right before a single
# trailing newline, so such a newline is tolerated and left out of the groups.
URI_RE = re.compile(
    "^"
    # "scheme:" prefix; the scheme itself cannot contain ":", "/", "?" or "#".
    r"(?:(?P<scheme>[^:/?#\n]+):)?"
    # "//netloc"; the netloc stops at the first "/", "?" or "#" and may be
    # empty (e.g. "///path").
    r"(?://(?P<netloc>[^/?#\n]*))?"
    # Path: everything up to the query or fragment delimiter.
    r"(?P<path>[^?#\n]*)"
    # "?query", up to the fragment delimiter.
    r"(?:\?(?P<query>[^#\n]*))?"
    # "#fragment", up to the end of the line.
    r"(?:#(?P<fragment>.*))?"
    "$"
)


class InvalidUrlException(Exception):
    """Generic exception for invalid URLs used in this module.

    Attributes:
    - url: the URL that was rejected.
    """

    def __init__(self, url):
        # Forward the URL to Exception so that str(exc), tracebacks and
        # exc.args are not empty, and so that rebuilding the exception from
        # its args (as copy and pickle do) does not fail on a missing
        # argument.
        super().__init__(url)
        self.url = url


def parse_url(
    url: str,
    absolute: bool = False,
    default_scheme: Optional[str] = None
) -> Dict[str, Any]:
    """Split an URL into its parts.

    The URL is matched against the module's URI regex. Besides well-formed
    URLs, this also copes with sloppy ones such as "capsule.org/page" that an
    user may type by hand (see the `absolute` argument).

    Arguments:
    - url: URL to parse.
    - absolute: treat the URL as an absolute one, which is what an user
      usually means when typing an URL. It only matters when the first parsing
      pass finds no netloc (or an empty one): the URL is then parsed a second
      time with a prefix made of the scheme and "//".
    - default_scheme: scheme to use when the URL does not provide one, either
      as the returned "scheme" value or, when `absolute` is true, in the
      prefix described above; if `absolute` is true and no scheme is known at
      all, only the netloc marker ("//") is prefixed.

    Returns:
    A dictionary with the keys "scheme", "netloc", "path", "query" and
    "fragment". Every key is always present; values are None when the part is
    missing, except "path" which is always a string (possibly empty).

    Raises:
    InvalidUrlException if the URL does not match the regex at all.
    """
    matched = URI_RE.match(url)
    if matched is None:
        raise InvalidUrlException(url)

    groups = matched.groupdict()
    parts: Dict[str, Any] = {}
    for key in ("scheme", "netloc", "path", "query", "fragment"):
        parts[key] = groups.get(key)

    # Small hack: an URL assumed to be absolute but parsed without netloc is
    # parsed again, prefixed with the scheme we know (if any) and "//".
    if absolute and not parts["netloc"]:
        known_scheme = parts["scheme"] or default_scheme
        if known_scheme:
            prefix = known_scheme + "://"
        else:
            prefix = "//"
        return parse_url(prefix + url)

    # Other small hack: fall back on `default_scheme` when no scheme was found.
    if parts["scheme"] is None and default_scheme:
        parts["scheme"] = default_scheme

    return parts


def unparse_url(parts) -> str:
    """Build an URL string back from parts produced by `parse_url`.

    Parts whose value is None are skipped entirely; parts that are empty
    strings still get their delimiter (":" after the scheme, "//" before the
    netloc, "?" before the query, "#" before the fragment).
    """
    # (key, text put before the value, text put after the value), in the
    # order the parts appear in an URL.
    layout = (
        ("scheme", "", ":"),
        ("netloc", "//", ""),
        ("path", "", ""),
        ("query", "?", ""),
        ("fragment", "#", ""),
    )
    chunks = []
    for key, before, after in layout:
        value = parts[key]
        if value is not None:
            chunks.append(before + value + after)
    return "".join(chunks)


def clear_post_path(parts) -> None:
    """Reset in place the parts that follow the path (query and fragment)."""
    for key in ("query", "fragment"):
        parts[key] = None


def join_url(base_url: str, rel_url: str) -> str:
    """Join a base URL with a relative reference.

    This follows the reference resolution algorithm of RFC 3986, section
    5.2.2:
    - a reference with its own scheme or netloc ("gemini://other/x",
      "//other/x") replaces the base location instead of being glued to the
      base path;
    - a reference with an empty path ("", "?query", "#fragment") keeps the
      base path untouched, and keeps the base query unless the reference
      provides one;
    - a path starting with "/" replaces the base path;
    - any other path is merged with the base path minus its last segment.

    Dot segments are removed from the resulting path, except when the base
    path is reused as is. The fragment always comes from the reference.

    Arguments:
    - base_url: URL the reference is relative to.
    - rel_url: the reference to resolve, usually a relative path.

    Returns:
    The resolved URL as a string.

    Raises:
    InvalidUrlException if one of the URLs can not be parsed by `parse_url`.
    """
    base = parse_url(base_url)
    ref = parse_url(rel_url)
    ref_path = ref["path"] or ""

    scheme = base["scheme"]
    netloc = base["netloc"]
    query = ref["query"]

    if ref["scheme"] is not None or ref["netloc"] is not None:
        # The reference carries its own location: only a missing scheme is
        # inherited from the base.
        if ref["scheme"] is not None:
            scheme = ref["scheme"]
        netloc = ref["netloc"]
        path = remove_dot_segments(ref_path)
    elif not ref_path:
        # Same document: keep the base path, and the base query if the
        # reference has none.
        path = base["path"] or ""
        if query is None:
            query = base["query"]
    else:
        if ref_path.startswith("/"):
            merged = ref_path
        else:
            base_path = base["path"] or ""
            if not base_path:
                # The RFC only requires the leading "/" when the base has a
                # netloc; it is always added here, as this function has
                # always done for an empty base path.
                merged = "/" + ref_path
            else:
                # Keep everything up to and including the last "/"; a base
                # path without any "/" is dropped entirely.
                merged = base_path[:base_path.rfind("/") + 1] + ref_path
        path = remove_dot_segments(merged)

    return unparse_url({
        "scheme": scheme,
        "netloc": netloc,
        "path": path,
        "query": query,
        "fragment": ref["fragment"],
    })


def remove_dot_segments(path: str):
    """Return the path with its "." and ".." segments resolved.

    The input is consumed from the left, one prefix at a time, while the
    result is built up segment by segment; a ".." segment drops the last
    segment added to the result so far.
    """
    pending = path
    cleaned = ""
    while pending:
        # Leading relative markers have nothing to apply to: drop them.
        if pending.startswith("../"):
            pending = pending[3:]
            continue
        # Drop "./", or turn "/./" into a single "/".
        if pending.startswith(("./", "/./")):
            pending = pending[2:]
            continue
        if pending == "/.":
            pending = "/"
            continue
        # Going up: replace the prefix with "/" and forget the last segment
        # of the result. For "/.." the slice below is empty, leaving "/".
        if pending == "/.." or pending.startswith("/../"):
            cleaned = remove_last_segment(cleaned)
            pending = "/" + pending[4:]
            continue
        # A lone dot segment is all that is left: nothing more to process.
        if pending == "." or pending == "..":
            break
        # Regular segment: move it to the result.
        head, pending = pop_first_segment(pending)
        cleaned += head
    return cleaned


def remove_last_segment(path: str) -> str:
    """Remove last path segment, including preceding "/" if any.

    When the path contains no "/" at all, the whole path is its last segment
    and an empty string is returned.
    """
    last_slash = path.rfind("/")
    if last_slash == -1:
        # Without this check, the -1 returned by rfind would be used as a
        # slice bound and only strip the last character ("abc" -> "ab").
        return ""
    return path[:last_slash]


def pop_first_segment(path: str):
    """Split the path into its first segment and what follows.

    The first segment includes the initial "/" if any and stops right before
    the next "/"; the rest starts at that "/". If there is no other "/", the
    whole path is the first segment and the rest is an empty string.
    """
    # Start searching at index 1 so that an initial "/" is not the split point.
    split_at = path.find("/", 1)
    if split_at < 0:
        return path, ""
    return path[:split_at], path[split_at:]


def set_parameter(url: str, user_input: str) -> str:
    """Return a new URL whose query is the percent-escaped user input.

    Any query already present in the URL is replaced; the other parts are
    left as they are.
    """
    url_parts = parse_url(url)
    escaped_input = quote(user_input)
    url_parts["query"] = escaped_input
    return unparse_url(url_parts)


def get_parent_path(path: str) -> str:
    """Return the parent path, with its trailing "/".

    Trailing slashes are ignored when looking for the parent. A path that has
    no "/" left once they are ignored is returned unchanged.
    """
    parent, slash, _ = path.rstrip("/").rpartition("/")
    if not slash:
        return path
    return parent + slash


def get_parent_url(url: str) -> str:
    """Return the URL one level up, without query nor fragment."""
    url_parts = parse_url(url)
    parent_path = get_parent_path(url_parts["path"])  # type: ignore
    clear_post_path(url_parts)
    url_parts["path"] = parent_path
    return unparse_url(url_parts)


def get_root_url(url: str) -> str:
    """Return the root URL: same scheme and netloc, "/" as the only path."""
    url_parts = parse_url(url)
    clear_post_path(url_parts)
    url_parts["path"] = "/"
    return unparse_url(url_parts)