# Bebop/bebop/navigation.py

"""URI (RFC 3986) helpers for Gemini navigation.

It was supposed to be just thin fixes around urllib.parse functions but as
gemini is not recognized as a valid scheme it breaks a lot of things, so it
turned into a basic re-implementation of the RFC.
"""

import re
from typing import Any, Dict, Optional
from urllib.parse import quote


# Generic URI splitter: the regular expression from RFC 3986 appendix B, with
# named groups instead of numbered ones and with "\n" excluded from every
# component. Every component except the path is optional, and the path itself
# may be empty; optional groups that do not take part in the match are None.
URI_RE = re.compile(
    "^"
    r"(?:(?P<scheme>[^:/?#\n]+):)?"  # "scheme:" - anything up to the first ":".
    r"(?://(?P<netloc>[^/?#\n]*))?"  # "//netloc" - may be present but empty.
    r"(?P<path>[^?#\n]*)"            # path - always captured, possibly "".
    r"(?:\?(?P<query>[^#\n]*))?"     # "?query" - up to "#" or the end.
    r"(?:#(?P<fragment>.*))?"        # "#fragment" - the rest of the line.
    "$"
)


class InvalidUrlException(Exception):
    """Generic exception for invalid URLs used in this module.

    Attributes:
    - url: the offending URL, exactly as passed to the constructor.
    """

    def __init__(self, url):
        # Forward the URL to Exception so that it lands in `args`: this gives
        # the exception a meaningful str()/traceback message and lets it be
        # rebuilt from `args` (copy, pickle), which needs the `url` argument.
        super().__init__(url)
        self.url = url


def parse_url(
    url: str,
    absolute: bool = False,
    default_scheme: Optional[str] = None,
) -> Dict[str, Any]:
    """Split an URL into its components.

    The URL is matched against the module's RFC-style regex, so both proper
    URLs and sloppy ones typed by a user (e.g. "capsule.org/page") are
    accepted; see `absolute` for how the latter are handled.

    Arguments:
    - url: the URL to split.
    - absolute: treat the URL as an absolute one even if it does not look like
      it. Only matters when the first parsing pass finds neither a scheme nor
      a netloc: the URL is then prefixed with a scheme and "//" and parsed
      again.
    - default_scheme: scheme to use when the URL has none. When `absolute`
      triggers a re-parse and no `default_scheme` is given, "gemini" is used.

    Returns:
    A dictionary with exactly the keys "scheme", "netloc", "path", "query" and
    "fragment". Any value may be None except "path", which is always a string
    (possibly empty).

    Raises:
    InvalidUrlException if the URL does not match the regex at all.
    """
    matched = URI_RE.match(url)
    if matched is None:
        raise InvalidUrlException(url)

    scheme, netloc, path, query, fragment = matched.group(
        "scheme", "netloc", "path", "query", "fragment"
    )

    # Bare "host/path" input: prepend scheme and "//", then parse it again.
    if absolute and not scheme and not netloc:
        return parse_url((default_scheme or "gemini") + "://" + url)

    # Still no scheme: fall back to the caller-provided default, if any.
    if scheme is None and default_scheme:
        scheme = default_scheme

    return {
        "scheme": scheme,
        "netloc": netloc,
        "path": path,
        "query": query,
        "fragment": fragment,
    }


def unparse_url(parts) -> str:
    """Rebuild an URL string from parts as produced by `parse_url`.

    Components whose value is None are left out entirely; an empty string
    still produces its delimiter (e.g. an empty query yields a bare "?").
    """
    # (key, text put before the value, text put after the value), in URL order.
    layout = (
        ("scheme", "", ":"),
        ("netloc", "//", ""),
        ("path", "", ""),
        ("query", "?", ""),
        ("fragment", "#", ""),
    )
    pieces = []
    for key, prefix, suffix in layout:
        value = parts[key]
        if value is not None:
            pieces.append(prefix + value + suffix)
    return "".join(pieces)


def clear_post_path(parts) -> None:
    """Reset in place what follows the path: query and fragment become None."""
    for key in ("query", "fragment"):
        parts[key] = None


def join_url(base_url: str, rel_url: str) -> str:
    """Resolve a relative reference against a base URL.

    This follows the reference resolution algorithm of RFC 3986 (sections
    5.2.2 and 5.2.3):
    - a reference with its own scheme is returned as is, dot segments removed;
    - a reference with its own netloc ("//host/path") keeps the base scheme
      only;
    - a reference with an empty path ("", "?query", "#fragment") keeps the
      base path untouched, and the base query too unless it brings its own;
    - a reference path starting with "/" replaces the base path;
    - any other path is appended to the base path minus its last segment.

    Arguments:
    - base_url: URL the reference is relative to.
    - rel_url: the reference to resolve, usually a relative path.

    Returns:
    The resolved URL as a string. The fragment always comes from `rel_url`.

    Raises:
    InvalidUrlException if either URL cannot be parsed by `parse_url`.
    """
    base = parse_url(base_url)
    ref = parse_url(rel_url)
    ref_path = ref["path"] or ""

    # The reference is a complete URL on its own: the base is irrelevant.
    if ref["scheme"] is not None:
        target = dict(ref)
        target["path"] = remove_dot_segments(ref_path)
        return unparse_url(target)

    target = {"scheme": base["scheme"], "fragment": ref["fragment"]}
    if ref["netloc"] is not None:
        # Network-path reference: only the scheme is inherited.
        target["netloc"] = ref["netloc"]
        target["path"] = remove_dot_segments(ref_path)
        target["query"] = ref["query"]
    elif not ref_path:
        # Same-document reference: keep the base path as is, and its query
        # unless the reference overrides it.
        target["netloc"] = base["netloc"]
        target["path"] = base["path"]
        if ref["query"] is not None:
            target["query"] = ref["query"]
        else:
            target["query"] = base["query"]
    else:
        if ref_path.startswith("/"):
            merged = ref_path
        else:
            base_path = base["path"] or ""
            if base["netloc"] is not None and not base_path:
                # "gemini://host" + "page" has to give "/page".
                merged = "/" + ref_path
            else:
                # Keep everything up to and including the right-most "/" of
                # the base path (nothing at all if it has no "/").
                merged = base_path[:base_path.rfind("/") + 1] + ref_path
        target["netloc"] = base["netloc"]
        target["path"] = remove_dot_segments(merged)
        target["query"] = ref["query"]
    return unparse_url(target)


def remove_dot_segments(path: str):
    """Resolve the "." and ".." segments of an URL path.

    The input is consumed from the left, one rule per iteration, while the
    result is built up on the side: "." segments are dropped, ".." segments
    additionally drop the last segment already moved to the result, and any
    other segment is moved to the result unchanged.
    """
    resolved = ""
    remaining = path
    while remaining:
        # Leading "../": nothing to go up from, just drop it.
        if remaining.startswith("../"):
            remaining = remaining[3:]
            continue
        # Leading "./" is dropped; leading "/./" is reduced to "/".
        if remaining.startswith(("./", "/./")):
            remaining = remaining[2:]
            continue
        # Trailing "/." is reduced to "/".
        if remaining == "/.":
            remaining = "/"
            continue
        # "/../" or a trailing "/..": reduce to "/" and go up one level.
        if remaining.startswith("/../") or remaining == "/..":
            remaining = "/" + remaining[4:]
            resolved = remove_last_segment(resolved)
            continue
        # Nothing left but a lone dot segment.
        if remaining in (".", ".."):
            remaining = ""
            continue
        # Regular segment: move it to the result.
        segment, remaining = pop_first_segment(remaining)
        resolved += segment
    return resolved


def remove_last_segment(path: str) -> str:
    """Remove last path segment, including preceding "/" if any.

    Arguments:
    - path: URL path, possibly empty.

    Returns:
    Everything before the right-most "/" of the path, or an empty string if
    the path contains no "/" (the whole path is then the last segment).
    """
    # rpartition yields an empty head when there is no "/", whereas slicing
    # with the result of rfind (-1) would only chop the last character off.
    head, _, _ = path.rpartition("/")
    return head


def pop_first_segment(path: str):
    """Split a path into its first segment and whatever follows.

    The first segment includes the initial "/" if any and stops right before
    the next "/"; the rest starts at that "/". If there is no further "/", the
    whole path is the first segment and the rest is an empty string.
    """
    # Start looking at index 1 so that an initial "/" is not a boundary.
    boundary = path.find("/", 1)
    if boundary < 0:
        return path, ""
    return path[:boundary], path[boundary:]


def set_parameter(url: str, user_input: str) -> str:
    """Return the URL with its query replaced by the escaped user input.

    The input is percent-encoded with `urllib.parse.quote`; any query already
    present in the URL is discarded, the other components are kept.
    """
    escaped_input = quote(user_input)
    return unparse_url({**parse_url(url), "query": escaped_input})


def get_parent_path(path: str) -> str:
    """Return the path one level up, with its trailing "/".

    Trailing slashes are ignored when looking for the last segment. A path
    with no "/" left after that (e.g. "", "/" or "page") is returned unchanged.
    """
    cut = path.rstrip("/").rfind("/")
    return path if cut < 0 else path[:cut + 1]


def get_parent_url(url: str) -> str:
    """Return the URL one level up from this one.

    The path is replaced by its parent path; query and fragment are dropped.
    """
    components = parse_url(url)
    parent_path = get_parent_path(components["path"])  # type: ignore
    components.update(path=parent_path, query=None, fragment=None)
    return unparse_url(components)


def get_root_url(url: str) -> str:
    """Return the root of this URL.

    Scheme and netloc are kept, the path becomes "/", and both query and
    fragment are dropped.
    """
    root_parts = {
        **parse_url(url),
        "path": "/",
        "query": None,
        "fragment": None,
    }
    return unparse_url(root_parts)
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`"""URI (RFC 3986) helpers for Gemini navigation.`
add module docstrings 2021-03-11 19:16:15 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`It was supposed to be just thin fixes around urllib.parse functions but as`
			`gemini is not recognized as a valid scheme it breaks a lot of things, so it`
			`turned into a basic re-implementation of the RFC.`
			`"""`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`import re`
			`from typing import Any, Dict, Optional`
			`from urllib.parse import quote`


			`URI_RE = re.compile(`
			`"^"`
			`r"(?:(?P<scheme>[^:/?#\n]+):)?"`
			`r"(?://(?P<netloc>[^/?#\n]*))?"`
			`r"(?P<path>[^?#\n]*)"`
			`r"(?:\?(?P<query>[^#\n]*))?"`
			`r"(?:#(?P<fragment>.*))?"`
			`"$"`
			`)`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00

navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`class InvalidUrlException(Exception):`
			`"""Generic exception for invalid URLs used in this module."""`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`def __init__(self, url):`
			`super().__init__()`
			`self.url = url`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00
			`def parse_url(`
			`url: str,`
			`absolute: bool =False,`
			`default_scheme: Optional[str] =None`
			`) -> Dict[str, Any]:`
			`"""Return URL parts from this URL.`

			`Use the RFC regex to get parts from URL. This function can be used on`
			`regular URLs but also on not-so-compliant URLs, e.g. "capsule.org/page",`
			which might be typed by an user (see `absolute` argument).

			`Arguments:`
			`- url: URL to parse.`
			`- absolute: assume the URL is absolute, e.g. in the case we are trying to`
			`parse an URL an user has written, which is most of the time an absolute`
			`URL even if not perfectly so. This only has an effect if, after the`
			`initial parsing, there is no scheme or netloc available.`
			`- default_scheme: specify the scheme to use if the URL either does not`
			specify it and we need it (e.g. there is a location), or `absolute` is
			true; if absolute is true but `default_scheme` is not specified, use the
			`gemini scheme.`

			`Returns:`
			`URL parts, as a dictionary with the following keys: "scheme", "netloc",`
			`"path", "query" and "fragment". All keys are present, but all values can be`
			`None, except path which is always a string (but can be empty).`

			`Raises:`
			`InvalidUrlException if you put really really stupid strings in there.`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00			`"""`
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`match = URI_RE.match(url)`
			`if not match:`
			`raise InvalidUrlException(url)`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`match_dict = match.groupdict()`
			`parts = {`
			`k: match_dict.get(k)`
			`for k in ("scheme", "netloc", "path", "query", "fragment")`
			`}`
init: basic protocol/nav/rendering 2021-02-12 19:01:42 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`# Smol hack: if we assume it's an absolute URL, just prefix scheme and "//".`
			`if absolute and not parts["scheme"] and not parts["netloc"]:`
			`scheme = default_scheme or "gemini"`
			`return parse_url(scheme + "://" + url)`
screen: link disambiguation, indent wraps, etc 2021-02-13 23:34:45 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			# Another smol hack: if there is no scheme, use `default_scheme` as default.
			`if default_scheme and parts["scheme"] is None:`
			`parts["scheme"] = default_scheme`
screen: link disambiguation, indent wraps, etc 2021-02-13 23:34:45 +01:00
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`return parts`
screen: handle input request (code 10) 2021-02-16 19:10:11 +01:00

navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`def unparse_url(parts) -> str:`
			"""Unparse parts of an URL produced by `parse_url`."""
			`url = ""`
			`if parts["scheme"] is not None:`
			`url += parts["scheme"] + ":"`
			`if parts["netloc"] is not None:`
			`url += "//" + parts["netloc"]`
			`if parts["path"] is not None:`
			`url += parts["path"]`
			`if parts["query"] is not None:`
			`url += "?" + parts["query"]`
			`if parts["fragment"] is not None:`
			`url += "#" + parts["fragment"]`
			`return url`


			`def clear_post_path(parts) -> None:`
			`"""Clear optional post-path parts (query and fragment)."""`
			`parts["query"] = None`
			`parts["fragment"] = None`


			`def join_url(base_url: str, rel_url: str) -> str:`
			`"""Join a base URL with a relative path."""`
			`parts = parse_url(base_url)`
			`rel_parts = parse_url(rel_url)`
			`if rel_url.startswith("/"):`
			`new_path = rel_parts["path"]`
			`else:`
			`base_path = parts["path"] or ""`
			`new_path = remove_last_segment(base_path) + "/" + rel_parts["path"]`
			`parts["path"] = remove_dot_segments(new_path)`
			`parts["query"] = rel_parts["query"]`
			`parts["fragment"] = rel_parts["fragment"]`
			`return unparse_url(parts)`


			`def remove_dot_segments(path: str):`
			`"""Remove dot segments in an URL path."""`
			`output = ""`
			`while path:`
			`if path.startswith("../"):`
			`path = path[3:]`
			`elif path.startswith("./") or path.startswith("/./"):`
			`path = path[2:] # Either strip "./" or leave a single "/".`
			`elif path == "/.":`
			`path = "/"`
			`elif path.startswith("/../"):`
			`path = "/" + path[4:]`
			`output = remove_last_segment(output)`
			`elif path == "/..":`
			`path = "/"`
			`output = remove_last_segment(output)`
			`elif path in (".", ".."):`
			`path = ""`
			`else:`
			`first_segment, path = pop_first_segment(path)`
			`output += first_segment`
			`return output`


			`def remove_last_segment(path: str):`
			`"""Remove last path segment, including preceding "/" if any."""`
			`return path[:path.rfind("/")]`


			`def pop_first_segment(path: str):`
			`"""Return first segment and the rest.`

			`Return the first segment including the initial "/" if any, and the rest of`
			`the path up to, but not including, the next "/" or the end of the string.`
			`"""`
			`next_slash = path[1:].find("/")`
			`if next_slash == -1:`
			`return path, ""`
			`next_slash += 1`
			`return path[:next_slash], path[next_slash:]`


			`def set_parameter(url: str, user_input: str) -> str:`
add module docstrings 2021-03-11 19:16:15 +01:00			`"""Return a new URL with the escaped user input appended."""`
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`parts = parse_url(url)`
			`parts["query"] = quote(user_input)`
			`return unparse_url(parts)`
navigation: add function to go to parent URL 2021-03-14 02:05:42 +01:00

navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`def get_parent_path(path: str) -> str:`
			`"""Return the parent path."""`
navigation: add function to go to parent URL 2021-03-14 02:05:42 +01:00			`last_slash = path.rstrip("/").rfind("/")`
			`if last_slash > -1:`
			`path = path[:last_slash + 1]`
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`return path`


			`def get_parent_url(url: str) -> str:`
			`"""Return the parent URL (one level up)."""`
			`parts = parse_url(url)`
			`parts["path"] = get_parent_path(parts["path"]) # type: ignore`
			`clear_post_path(parts)`
			`return unparse_url(parts)`
navigation: add a "go to root" function 2021-03-16 19:38:11 +01:00

			`def get_root_url(url: str) -> str:`
			`"""Return the root URL (basically discards path)."""`
navigation: rework entirely URL management oh and fix the whole history mess a bit 2021-05-08 22:41:42 +02:00			`parts = parse_url(url)`
			`parts["path"] = "/"`
			`clear_post_path(parts)`
			`return unparse_url(parts)`