2021-03-11 19:16:15 +01:00
|
|
|
"""Gemtext parser.
|
|
|
|
|
|
|
|
To allow a flexible rendering of the content, the parser produces a list of
|
|
|
|
"elements", each being an instance of one of the dataclasses defined in this
|
|
|
|
module. A renderer can then completely abstract the original document.
|
|
|
|
"""
|
|
|
|
|
2021-02-12 19:01:42 +01:00
|
|
|
import re
|
2021-03-28 18:55:52 +02:00
|
|
|
from collections import namedtuple
|
2021-02-12 19:01:42 +01:00
|
|
|
from dataclasses import dataclass
|
2021-03-13 20:38:19 +01:00
|
|
|
from typing import List
|
2021-02-12 19:01:42 +01:00
|
|
|
|
2021-03-28 18:55:52 +02:00
|
|
|
from bebop.links import Links
|
|
|
|
|
2021-02-12 19:01:42 +01:00
|
|
|
|
|
|
|
@dataclass
class Paragraph:
    """A plain line of text that matched no other Gemtext line type."""

    # Content of the line.
    text: str
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Title:
    """A heading line: one to three "#", some whitespace, then the text."""

    # Number of "#" characters introducing the heading.
    level: int
    # Heading text, without the leading "#" characters and whitespace.
    text: str

    # Group 1 captures the "#" characters, group 2 the heading text.
    RE = re.compile(r"(#{1,3})\s+(.+)")
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Link:
    """A link line: "=>", a URL and an optional label."""

    # Target of the link, as written in the document.
    url: str
    # Label following the URL on the link line.
    text: str
    # Number identifying the link; parse_gemtext numbers links from 1.
    ident: int = 0

    # Named groups: "url" is the first run of non-whitespace characters after
    # "=>"; "text" is the optional remainder of the line.
    RE = re.compile(r"=>\s*(?P<url>\S+)(\s+(?P<text>.+))?")
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Preformatted:
    """A block of lines found between two fence lines, kept as they are."""

    # Lines of the block, without the surrounding fences.
    lines: List[str]

    # Prefix of the lines that open or close a preformatted block.
    FENCE = "```"
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Blockquote:
    """A quote line: ">" followed by the quoted text."""

    # Quoted text, without the ">" marker and the whitespace after it.
    text: str

    # Group 1 captures whatever follows ">" and optional whitespace; it is an
    # empty string for a bare ">" line.
    RE = re.compile(r">\s*(.*)")
|
|
|
|
|
|
|
|
|
2021-02-16 20:23:44 +01:00
|
|
|
@dataclass
class ListItem:
    """A list item line: "*", one whitespace character, then the text."""

    # Text of the item, without the "* " marker.
    text: str

    # Group 1 captures everything after the "*" and the whitespace character
    # that must follow it.
    RE = re.compile(r"\*\s(.*)")
|
|
|
|
|
|
|
|
|
2021-03-28 18:55:52 +02:00
|
|
|
# Result of parse_gemtext: the parsed elements, the links found in the
# document and the document title.
ParsedGemtext = namedtuple(
    "ParsedGemtext",
    ["elements", "links", "title"],
)
|
|
|
|
|
|
|
|
|
2021-05-28 13:34:08 +02:00
|
|
|
def parse_gemtext(text: str, dumb: bool = False) -> ParsedGemtext:
    """Parse a string of Gemtext into a list of elements.

    Each line is stripped of its trailing whitespace, then classified as a
    preformatted fence, title, link, blockquote or list item, in that order;
    anything else becomes a paragraph.

    Args:
        text: the Gemtext document to parse.
        dumb: if True, keep empty lines (as empty paragraphs) and empty
            blockquote lines instead of discarding them.

    Returns:
        A ParsedGemtext tuple containing:
        - elements: the parsed elements, in document order;
        - links: a Links object where each link ID (starting at 1) is
          associated to the URL of the corresponding link;
        - title: the text of the first level 1 title, or an empty string if
          the document has none.
    """
    elements = []
    links = Links()
    last_link_id = 0
    title = ""
    # Preformatted block currently being filled, or None outside of a block.
    preformatted = None

    for line in text.splitlines():
        line = line.rstrip()

        # Empty lines:
        # - in standard mode, discard them, except for preformatted blocks.
        # - in dumb mode, keep them.
        if not line and not dumb and preformatted is None:
            continue

        # A fence line opens a block or closes the current one; the fence
        # line itself is never stored.
        if line.startswith(Preformatted.FENCE):
            if preformatted is None:
                preformatted = Preformatted([])
            else:
                elements.append(preformatted)
                preformatted = None
            continue

        # Inside a block, lines are stored without any interpretation.
        if preformatted is not None:
            preformatted.lines.append(line)
            continue

        match = Title.RE.match(line)
        if match:
            hashtags, title_text = match.groups()
            level = len(hashtags)
            elements.append(Title(level, title_text))
            # Only the first level 1 title is used as the document title.
            if not title and level == 1:
                title = title_text
            continue

        match = Link.RE.match(line)
        if match:
            url = match.group("url")
            # The optional "text" group is None when the link has no label;
            # use an empty string so that Link.text is always a str.
            link_text = match.group("text") or ""
            last_link_id += 1
            links[last_link_id] = url
            elements.append(Link(url, link_text, last_link_id))
            continue

        match = Blockquote.RE.match(line)
        if match:
            quote_text = match.group(1)
            # Empty quote lines are only kept in dumb mode.
            if quote_text or dumb:
                elements.append(Blockquote(quote_text))
            continue

        match = ListItem.RE.match(line)
        if match:
            elements.append(ListItem(match.group(1)))
            continue

        elements.append(Paragraph(line))

    # If a preformatted block is not closed before the file ends, consider it
    # closed anyway; the spec does not seem to talk about that case.
    if preformatted is not None:
        elements.append(preformatted)

    return ParsedGemtext(elements, links, title)
|