dump current state (wip-ish)

ducklet 2020-11-01 16:31:37 +01:00
parent 0124c35472
commit 51fb1c9f26
46 changed files with 3749 additions and 0 deletions

postillon/__init__.py (new file)

@@ -0,0 +1,4 @@
from feeder import Feed, Post, all_posts
from .postbox import FEED_URL, split_post
from .store import Store
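
Feed and Post come from feeder, which is not part of this diff. Judging from the call sites below (dataclasses.replace in postbox, the Post(id, content, date, link=source) construction in store), the assumed shape is roughly this hypothetical sketch:

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

# Hypothetical reconstruction of feeder's Post; the real definition lives in
# the feeder module, not in this commit.
@dataclass
class Post:
    id: Optional[int]
    content: str
    date: Optional[datetime] = None
    link: Optional[str] = None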

postillon/__main__.py (new file)

@@ -0,0 +1,51 @@
import argparse
import asyncio
import logging
import os
from typing import AsyncIterable

import postillon

log = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    level=os.getenv("LOGLEVEL", "INFO"),
)


async def all_posts(feed_url, throttle: int = 10) -> AsyncIterable[postillon.Post]:
    """We can't use feeder's all_posts because Blogger creates broken next URLs."""
    feed = postillon.Feed(feed_url, url="", next_url=feed_url)
    while feed := feed.load_next():
        log.debug(f"New feed page: {feed}")
        if feed.next_url:
            # Blogger duplicates the /-/Newsticker label in next-page URLs;
            # collapse it so the next request doesn't break.
            feed.next_url = feed.next_url.replace(
                f"{postillon.FEED_URL}/-/Newsticker", postillon.FEED_URL
            )
        for post in feed.posts:
            yield post
        log.debug(f"Waiting for {throttle} seconds ...")
        await asyncio.sleep(throttle)


async def dump_all(dbpath: str, feed_url: str):
    store = postillon.Store(dbpath)
    store.connect()
    async for post in all_posts(feed_url):
        store.add(postillon.split_post(post))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database", "--db", "-d", required=True, help="Path to database.sqlite"
    )
    parser.add_argument("--feed", "-f", default=postillon.FEED_URL, help="Feed URL")
    args = parser.parse_args()
    asyncio.run(dump_all(args.database, args.feed))


if __name__ == "__main__":
    main()
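
With this entry point, a dump run would look like: python -m postillon --db posts.sqlite (the path is a made-up example). The database is created on first connect, and the loop keeps paging through the feed with a 10-second throttle until load_next returns nothing.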

postillon/postbox.py (new file)

@@ -0,0 +1,44 @@
import re
from dataclasses import replace
from html.parser import HTMLParser
from io import StringIO
from typing import Iterable

from . import Post

FEED_URL = "https://www.blogger.com/feeds/746298260979647434/posts/default/-/Newsticker"


class TextonlyParser(HTMLParser):
    """Collects only the text content of an HTML document, dropping all tags."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self._text = StringIO()

    def handle_data(self, d):
        self._text.write(d)

    @property
    def text(self):
        return self._text.getvalue()


def strip_tags(html):
    s = TextonlyParser()
    s.feed(html)
    return s.text


# Newsticker posts wrap each item in "+++ ... +++" markers.
find_tags = re.compile(r"\+\+\+ (.*?) \+\+\+").finditer


def feed_page(page: int = 1, per_page: int = 25) -> str:
    """Build the URL for one feed page (Blogger's start-index is 1-based)."""
    start = 1 + (page - 1) * per_page
    return f"{FEED_URL}?start-index={start}&max-results={per_page}"


def split_post(post: Post) -> Iterable[Post]:
    """Split one Newsticker post into one Post per "+++ item +++"."""
    for match in find_tags(strip_tags(post.content)):
        yield replace(post, content=match[1])
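
As a sanity check, a quick round trip through these helpers (the sample markup below is invented, not real feed content):

from postillon.postbox import find_tags, strip_tags

# strip_tags drops the <p> tags; find_tags then pulls out each marked item.
text = strip_tags("<p>+++ First item +++ +++ Second item +++</p>")
print([m[1] for m in find_tags(text)])  # ['First item', 'Second item']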

postillon/store.py (new file)

@@ -0,0 +1,78 @@
import sqlite3
from datetime import datetime, timezone
from typing import Iterable, Optional, Sequence

from . import Post


class Store:
    def __init__(self, dbpath: Optional[str] = None):
        self.dbpath = dbpath
        self.connection: Optional[sqlite3.Connection] = None

    def connect(self, path: Optional[str] = None) -> sqlite3.Connection:
        if path:
            self.dbpath = path
        if self.connection is not None:
            return self.connection
        self.connection = sqlite3.connect(
            self.dbpath, isolation_level=None
        )  # auto commit
        self.init()
        return self.connection

    def disconnect(self):
        if self.connection:
            self.connection.close()
            self.connection = None

    def init(self):
        conn = self.connection
        conn.execute(
            """
            create table if not exists post (
                id integer primary key,
                content text unique not null,
                source text,  -- link to the source of this post
                date integer not null
            )
            """
        )
        conn.execute(
            """
            create index if not exists post_date
            on post(date)
            """
        )

    def add(self, posts: Iterable[Post]):
        # "on conflict do nothing" silently skips posts whose content is
        # already stored (content is unique).
        sql = """
            insert into post(content, source, date)
            values (?, ?, ?)
            on conflict do nothing
        """
        self.connection.executemany(
            sql,
            (
                (p.content, p.link, int(p.date.timestamp()) if p.date else None)
                for p in posts
            ),
        )

    def _select(self, condition: str = "", params: Sequence = ()) -> Iterable[Post]:
        sql = f"select id, content, date, source from post {condition}"
        for row in self.connection.execute(sql, params):
            id, content, date, source = row
            if date is not None:
                date = datetime.fromtimestamp(date, tz=timezone.utc)
            yield Post(id, content, date, link=source)

    def random_post(self) -> Optional[Post]:
        cond = "where id in (select id from post order by random() limit 1)"
        for post in self._select(cond):
            return post

    def search(self, term, skip: int = 0) -> Iterable[Post]:
        # term is used verbatim as the like pattern, so callers add the
        # % wildcards. "limit -1" means no limit; SQLite requires a limit
        # clause before offset.
        cond = "where content like ? order by date desc limit -1 offset ?"
        yield from self._select(cond, (term, skip))
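
A minimal usage sketch for the store (the path and search term are made up; note that search takes a raw like pattern, so the caller supplies the wildcards):

from postillon.store import Store

store = Store()
store.connect("posts.sqlite")  # hypothetical path; created on demand
for post in store.search("%newsticker%"):  # like pattern with % wildcards
    print(post.date, post.content)
print(store.random_post())  # None while the table is empty
store.disconnect()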