This repository has been archived on 2024-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
Bebop/bebop/gemtext.py

123 lines
3 KiB
Python
Raw Permalink Normal View History

2021-03-11 19:16:15 +01:00
"""Gemtext parser.
To allow a flexible rendering of the content, the parser produces a list of
"elements", each being an instance of one of the dataclasses defined in this
module. A renderer can then completely abstract the original document.
"""
2021-02-12 19:01:42 +01:00
import re
from collections import namedtuple
2021-02-12 19:01:42 +01:00
from dataclasses import dataclass
2021-03-13 20:38:19 +01:00
from typing import List
2021-02-12 19:01:42 +01:00
from bebop.links import Links
2021-02-12 19:01:42 +01:00
@dataclass
class Paragraph:
    """A plain text line.

    The parser emits this for any line that matches none of the other
    element types defined in this module.
    """

    text: str
@dataclass
class Title:
    """A heading line: one to three ``#`` characters followed by text."""

    level: int  # heading depth; the parser uses the number of leading "#"
    text: str  # heading text, i.e. what follows the hashes and whitespace

    # Group 1: the run of 1 to 3 hashes. Group 2: the heading text, which must
    # be separated from the hashes by at least one whitespace character.
    # Unannotated on purpose so that it is a class attribute, not a field.
    RE = re.compile(r"(#{1,3})\s+(.+)")
@dataclass
class Link:
    """A link line: ``=>`` followed by a URL and an optional label."""

    url: str
    text: str  # label following the URL on the link line
    ident: int = 0  # identifier of the link; the parser numbers links from 1

    # Named groups: "url" is the first run of non-whitespace after "=>";
    # "text" is the optional remainder of the line (None when absent).
    # Unannotated on purpose so that it is a class attribute, not a field.
    RE = re.compile(r"=>\s*(?P<url>\S+)(\s+(?P<text>.+))?")
@dataclass
class Preformatted:
    """A preformatted block: the lines found between two fence lines."""

    lines: List[str]

    # Prefix that toggles preformatted mode when found at the start of a line.
    # Unannotated on purpose so that it is a class attribute, not a field.
    FENCE = "```"
@dataclass
class Blockquote:
    """A quote line: ``>`` followed by the quoted text."""

    text: str

    # Group 1: the quoted text with leading whitespace removed; it may be
    # empty when the line holds nothing but the ">" marker.
    # Unannotated on purpose so that it is a class attribute, not a field.
    RE = re.compile(r">\s*(.*)")
2021-02-16 20:23:44 +01:00
@dataclass
class ListItem:
    """An unordered list item: ``*``, one whitespace character, then text."""

    text: str

    # Group 1: everything after the "*" marker and exactly one whitespace
    # character; any additional leading whitespace is kept in the text.
    # Unannotated on purpose so that it is a class attribute, not a field.
    RE = re.compile(r"\*\s(.*)")
# Result of parse_gemtext(): the parsed elements, the collected links and the
# document title, in that order.
ParsedGemtext = namedtuple(
    "ParsedGemtext",
    ["elements", "links", "title"],
)
def parse_gemtext(text: str, dumb: bool = False) -> ParsedGemtext:
    """Parse a string of Gemtext into elements, links and a title.

    Args:
        text: the Gemtext document to parse.
        dumb: if True, keep empty lines (as empty paragraphs) and empty
            blockquotes instead of discarding them.

    Returns:
        A ParsedGemtext tuple made of:
        - elements: the list of parsed elements, in document order;
        - links: a Links object filled by item assignment, mapping each link
          ID (numbered from 1) to its URL;
        - title: the text of the first level-1 heading, or an empty string
          when the document has none.
    """
    elements = []
    links = Links()
    last_link_id = 0
    title = ""
    # Preformatted block currently being filled; None outside of a block.
    preformatted = None

    for line in text.splitlines():
        # Trailing whitespace is dropped from every line, including the lines
        # stored in preformatted blocks.
        line = line.rstrip()

        # Empty lines:
        # - in standard mode, discard them, except for preformatted blocks.
        # - in dumb mode, keep them.
        if not line and not dumb and preformatted is None:
            continue

        # A fence line toggles preformatted mode; the fence itself (and
        # whatever follows it on the same line) is not kept.
        if line.startswith(Preformatted.FENCE):
            if preformatted is None:
                preformatted = Preformatted([])
            else:
                elements.append(preformatted)
                preformatted = None
            continue

        # Inside a preformatted block, lines are stored verbatim.
        if preformatted is not None:
            preformatted.lines.append(line)
            continue

        match = Title.RE.match(line)
        if match:
            hashtags, heading = match.groups()
            level = len(hashtags)
            elements.append(Title(level, heading))
            # The document title is the first level-1 heading encountered.
            if not title and level == 1:
                title = heading
            continue

        match = Link.RE.match(line)
        if match:
            url = match.group("url")
            # The "text" group is None, not missing, when the link has no
            # label: normalise it to an empty string to honour Link.text: str.
            label = match.group("text") or ""
            last_link_id += 1
            links[last_link_id] = url
            elements.append(Link(url, label, last_link_id))
            continue

        match = Blockquote.RE.match(line)
        if match:
            quote = match.group(1)
            # Empty quote lines are only kept in dumb mode.
            if quote or dumb:
                elements.append(Blockquote(quote))
            continue

        match = ListItem.RE.match(line)
        if match:
            elements.append(ListItem(match.group(1)))
            continue

        # Anything else is a regular paragraph.
        elements.append(Paragraph(line))

    # If a preformatted block is not closed before the file ends, consider it
    # closed anyway; the spec does not seem to talk about that case.
    if preformatted is not None:
        elements.append(preformatted)

    return ParsedGemtext(elements, links, title)