"""Build paged feed URLs and split a post into its ``+++ ... +++`` items."""

import re
from dataclasses import replace
from html.parser import HTMLParser
from io import StringIO
from typing import *

from . import Post

# Base URL of the Blogger posts feed; paging parameters are appended by
# ``feed_page``.
FEED_URL = "https://www.blogger.com/feeds/746298260979647434/posts/default/-/Newsticker"


class TextonlyParser(HTMLParser):
    """HTML parser that collects only the text nodes of the fed markup.

    Tags are dropped; character references are decoded because the parser
    runs with ``convert_charrefs=True``.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset
        # is needed here.
        super().__init__(convert_charrefs=True)
        self._text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text data to the internal buffer."""
        self._text.write(d)

    @property
    def text(self):
        """All text collected so far, as a single string."""
        return self._text.getvalue()


def strip_tags(html: str) -> str:
    """Return *html* with all markup removed, keeping only its text.

    Args:
        html: The HTML fragment or document to strip.

    Returns:
        The concatenated text data of *html*.
    """
    parser = TextonlyParser()
    parser.feed(html)
    # feed() may hold back trailing data (e.g. text ending in something
    # that looks like an unfinished character reference such as "AT&T");
    # close() forces the parser to process everything that is buffered.
    parser.close()
    return parser.text


# Iterator factory over every "+++ <text> +++" span in a string; group 1 is
# the (non-greedy) text between the markers.
find_tags = re.compile(r"\+\+\+ (.*?) \+\+\+").finditer


def feed_page(page: int = 1, per_page: int = 25) -> str:
    """Return the feed URL for the given 1-based *page*.

    Args:
        page: Page number; page 1 starts at feed index 1.
        per_page: Number of results requested per page.

    Returns:
        ``FEED_URL`` with ``start-index`` and ``max-results`` query
        parameters appended.
    """
    start = 1 + (page - 1) * per_page
    return f"{FEED_URL}?start-index={start}&max-results={per_page}"


def split_post(post: Post) -> Iterable[Post]:
    """Yield one copy of *post* per ``+++ ... +++`` item in its content.

    The post's ``content`` is stripped of HTML first; each yielded copy has
    its ``content`` replaced by the text between one pair of markers. All
    other fields are carried over via ``dataclasses.replace``, so *post*
    must be a dataclass instance. Nothing is yielded when no marker pair
    is found.
    """
    for match in find_tags(strip_tags(post.content)):
        yield replace(post, content=match[1])