dump current state (wip-ish)

ducklet 2020-11-01 16:31:37 +01:00
parent 0124c35472
commit 51fb1c9f26
46 changed files with 3749 additions and 0 deletions

postillon/__init__.py (new file)

@@ -0,0 +1,4 @@
from feeder import Feed, Post, all_posts
from .postbox import FEED_URL, split_post
from .store import Store
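
Feed and Post come from feeder, which is not part of this diff. Judging from the call sites below (dataclasses.replace in postbox, the Post(id, content, date, link=source) construction in store), the assumed shape is roughly this hypothetical sketch:

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

# Hypothetical reconstruction of feeder's Post; the real definition lives in
# the feeder module, not in this commit.
@dataclass
class Post:
    id: Optional[int]
    content: str
    date: Optional[datetime] = None
    link: Optional[str] = None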

postillon/__main__.py (new file)

@@ -0,0 +1,51 @@
import argparse
import asyncio
import logging
import os
from typing import AsyncIterable

import postillon

log = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    level=os.getenv("LOGLEVEL", "INFO"),
)


async def all_posts(feed_url, throttle: int = 10) -> AsyncIterable[postillon.Post]:
    """We can't use feeder's all_posts because Blogger creates broken next URLs."""
    feed = postillon.Feed(feed_url, url="", next_url=feed_url)
    while feed := feed.load_next():
        log.debug(f"New feed page: {feed}")
        if feed.next_url:
            # Blogger duplicates the /-/Newsticker label in next-page URLs;
            # collapse it so the next request doesn't break.
            feed.next_url = feed.next_url.replace(
                f"{postillon.FEED_URL}/-/Newsticker", postillon.FEED_URL
            )
        for post in feed.posts:
            yield post
        log.debug(f"Waiting for {throttle} seconds ...")
        await asyncio.sleep(throttle)


async def dump_all(dbpath: str, feed_url: str):
    store = postillon.Store(dbpath)
    store.connect()
    async for post in all_posts(feed_url):
        store.add(postillon.split_post(post))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database", "--db", "-d", required=True, help="Path to database.sqlite"
    )
    parser.add_argument("--feed", "-f", default=postillon.FEED_URL, help="Feed URL")
    args = parser.parse_args()
    asyncio.run(dump_all(args.database, args.feed))


if __name__ == "__main__":
    main()
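
With this entry point, a dump run would look like: python -m postillon --db posts.sqlite (the path is a made-up example). The database is created on first connect, and the loop keeps paging through the feed with a 10-second throttle until load_next returns nothing.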

postillon/postbox.py (new file)

@@ -0,0 +1,44 @@
import re
from dataclasses import replace
from html.parser import HTMLParser
from io import StringIO
from typing import Iterable

from . import Post

FEED_URL = "https://www.blogger.com/feeds/746298260979647434/posts/default/-/Newsticker"


class TextonlyParser(HTMLParser):
    """Collects only the text content of an HTML document, dropping all tags."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self._text = StringIO()

    def handle_data(self, d):
        self._text.write(d)

    @property
    def text(self):
        return self._text.getvalue()


def strip_tags(html):
    s = TextonlyParser()
    s.feed(html)
    return s.text


# Newsticker posts wrap each item in "+++ ... +++" markers.
find_tags = re.compile(r"\+\+\+ (.*?) \+\+\+").finditer


def feed_page(page: int = 1, per_page: int = 25) -> str:
    """Build the URL for one feed page (Blogger's start-index is 1-based)."""
    start = 1 + (page - 1) * per_page
    return f"{FEED_URL}?start-index={start}&max-results={per_page}"


def split_post(post: Post) -> Iterable[Post]:
    """Split one Newsticker post into one Post per "+++ item +++"."""
    for match in find_tags(strip_tags(post.content)):
        yield replace(post, content=match[1])
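
As a sanity check, a quick round trip through these helpers (the sample markup below is invented, not real feed content):

from postillon.postbox import find_tags, strip_tags

# strip_tags drops the <p> tags; find_tags then pulls out each marked item.
text = strip_tags("<p>+++ First item +++ +++ Second item +++</p>")
print([m[1] for m in find_tags(text)])  # ['First item', 'Second item']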

postillon/store.py (new file)

@@ -0,0 +1,78 @@
import sqlite3
from datetime import datetime, timezone
from typing import Iterable, Optional, Sequence

from . import Post


class Store:
    def __init__(self, dbpath: Optional[str] = None):
        self.dbpath = dbpath
        self.connection: Optional[sqlite3.Connection] = None

    def connect(self, path: Optional[str] = None) -> sqlite3.Connection:
        if path:
            self.dbpath = path
        if self.connection is not None:
            return self.connection
        self.connection = sqlite3.connect(
            self.dbpath, isolation_level=None
        )  # auto commit
        self.init()
        return self.connection

    def disconnect(self):
        if self.connection:
            self.connection.close()
            self.connection = None

    def init(self):
        conn = self.connection
        conn.execute(
            """
            create table if not exists post (
                id integer primary key,
                content text unique not null,
                source text,  -- link to the source of this post
                date integer not null
            )
            """
        )
        conn.execute(
            """
            create index if not exists post_date
            on post(date)
            """
        )

    def add(self, posts: Iterable[Post]):
        # "on conflict do nothing" silently skips posts whose content is
        # already stored (content is unique).
        sql = """
            insert into post(content, source, date)
            values (?, ?, ?)
            on conflict do nothing
        """
        self.connection.executemany(
            sql,
            (
                (p.content, p.link, int(p.date.timestamp()) if p.date else None)
                for p in posts
            ),
        )

    def _select(self, condition: str = "", params: Sequence = ()) -> Iterable[Post]:
        sql = f"select id, content, date, source from post {condition}"
        for row in self.connection.execute(sql, params):
            id, content, date, source = row
            if date is not None:
                date = datetime.fromtimestamp(date, tz=timezone.utc)
            yield Post(id, content, date, link=source)

    def random_post(self) -> Optional[Post]:
        cond = "where id in (select id from post order by random() limit 1)"
        for post in self._select(cond):
            return post

    def search(self, term, skip: int = 0) -> Iterable[Post]:
        # term is used verbatim as the like pattern, so callers add the
        # % wildcards. "limit -1" means no limit; SQLite requires a limit
        # clause before offset.
        cond = "where content like ? order by date desc limit -1 offset ?"
        yield from self._select(cond, (term, skip))
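
A minimal usage sketch for the store (the path and search term are made up; note that search takes a raw like pattern, so the caller supplies the wildcards):

from postillon.store import Store

store = Store()
store.connect("posts.sqlite")  # hypothetical path; created on demand
for post in store.search("%newsticker%"):  # like pattern with % wildcards
    print(post.date, post.content)
print(store.random_post())  # None while the table is empty
store.disconnect()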