dump current state (wip-ish)
parent 0124c35472
commit 51fb1c9f26
46 changed files with 3749 additions and 0 deletions
postillon/__init__.py (new file, 4 additions)
@@ -0,0 +1,4 @@
from feeder import Feed, Post, all_posts

from .postbox import FEED_URL, split_post
from .store import Store
postillon/__main__.py (new file, 51 additions)
@@ -0,0 +1,51 @@
import argparse
import asyncio
import logging
import os
from typing import *

import postillon

log = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    level=os.getenv("LOGLEVEL", "INFO"),
)


async def all_posts(feed_url, throttle: int = 10) -> AsyncIterable[postillon.Post]:
    """We can't use feed's all_posts because blogger creates broken next URLs."""
    feed = postillon.Feed(feed_url, url="", next_url=feed_url)
    while (feed := feed.load_next()):
        log.debug(f"New feed page: {feed}")
        if feed.next_url:
            feed.next_url = feed.next_url.replace(
                f"{postillon.FEED_URL}/-/Newsticker", postillon.FEED_URL
            )
        for post in feed.posts:
            yield post
        log.debug(f"Waiting for {throttle} seconds ...")
        await asyncio.sleep(throttle)


async def dump_all(dbpath: str, feed_url: str):
    store = postillon.Store(dbpath)
    store.connect()
    async for post in all_posts(feed_url):
        store.add(postillon.split_post(post))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database", "--db", "-d", required=True, help="Path to database.sqlite"
    )
    parser.add_argument("--feed", "-f", default=postillon.FEED_URL, help="Feed URL")
    args = parser.parse_args()

    asyncio.run(dump_all(args.database, args.feed))


if __name__ == "__main__":
    main()
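For context (not part of the commit): with the package on the path, this module is run as python -m postillon --database posts.sqlite. A minimal programmatic equivalent, using "posts.sqlite" as an example path, would look like this:

# Sketch only, not part of the commit; "posts.sqlite" is an example path.
import asyncio

import postillon
from postillon.__main__ import dump_all

asyncio.run(dump_all("posts.sqlite", postillon.FEED_URL))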
postillon/postbox.py (new file, 44 additions)
@@ -0,0 +1,44 @@
import re
from dataclasses import replace
from html.parser import HTMLParser
from io import StringIO
from typing import *

from . import Post

FEED_URL = "https://www.blogger.com/feeds/746298260979647434/posts/default/-/Newsticker"


class TextonlyParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self._text = StringIO()

    def handle_data(self, d):
        self._text.write(d)

    @property
    def text(self):
        return self._text.getvalue()


def strip_tags(html):
    s = TextonlyParser()
    s.feed(html)
    return s.text


find_tags = re.compile(r"\+\+\+ (.*?) \+\+\+").finditer


def feed_page(page: int = 1, per_page: int = 25) -> str:
    start = 1 + (page - 1) * per_page
    return f"{FEED_URL}?start-index={start}&max-results={per_page}"


def split_post(post: Post) -> Iterable[Post]:
    for match in find_tags(strip_tags(post.content)):
        yield replace(post, content=match[1])
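For illustration (not part of the commit): split_post strips the HTML from a post body, finds every "+++ ... +++" ticker via find_tags, and yields one copy of the post per headline. A small sketch, assuming the Post dataclass takes (id, content, date, link=...) as _select in store.py suggests:

# Sketch only, not part of the commit; the Post field order is assumed from _select.
from postillon import Post, split_post

html = "<p>+++ First headline +++ +++ Second headline +++</p>"
post = Post(None, html, None, link="https://example.org/newsticker")
for part in split_post(post):
    print(part.content)  # "First headline", then "Second headline"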
postillon/store.py (new file, 78 additions)
@@ -0,0 +1,78 @@
import sqlite3
from datetime import datetime, timezone
from typing import *

from . import Post


class Store:
    def __init__(self, dbpath: Optional[str] = None):
        self.dbpath = dbpath
        self.connection: Optional[sqlite3.Connection] = None

    def connect(self, path: Optional[str] = None):
        if path:
            self.dbpath = path
        if self.connection is not None:
            return self.connection
        self.connection = sqlite3.connect(
            self.dbpath, isolation_level=None
        )  # auto commit
        self.init()

    def disconnect(self):
        conn = self.connection
        if conn:
            conn.close()

    def init(self):
        conn = self.connection
        conn.execute(
            """
            create table if not exists post (
                id integer primary key,
                content text unique not null,
                source text, -- link to the source of this post
                date integer not null
            )
            """
        )
        conn.execute(
            """
            create index if not exists post_date
            on post(date)
            """
        )

    def add(self, posts: Iterable[Post]):
        sql = """
            insert into post(content, source, date)
            values (?, ?, ?)
            on conflict do nothing
        """
        self.connection.executemany(
            sql,
            (
                (p.content, p.link, int(p.date.timestamp()) if p.date else None)
                for p in posts
            ),
        )

    def _select(self, condition="", params=[]) -> Iterable[Post]:
        sql = f"select id, content, date, source from post {condition}"
        for row in self.connection.execute(sql, params):
            id, content, date, source = row
            if date is not None:
                date = datetime.fromtimestamp(date, tz=timezone.utc)
            post = Post(id, content, date, link=source)
            yield post

    def random_post(self) -> Optional[Post]:
        cond = "where id in (select id from post order by random() limit 1)"
        for post in self._select(cond):
            return post

    def search(self, term, skip: int = 0) -> Iterable[Post]:
        cond = "where content like ? order by date desc limit -1 offset ?"
        for post in self._select(cond, (term, skip)):
            yield post
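And a sketch of the Store round trip (also not part of the commit), using an in-memory database and an example headline; Post construction again assumes the field order from _select:

# Sketch only, not part of the commit; ":memory:" and the headline are example values.
from datetime import datetime, timezone

from postillon import Post, Store

store = Store(":memory:")
store.connect()  # creates the post table and the post_date index on first use

post = Post(None, "Example headline", datetime.now(timezone.utc), link=None)
store.add([post])  # inserts (content, source, date); duplicates are ignored

print(store.random_post())  # a random Post, or None if the table is empty
for hit in store.search("%headline%"):  # SQL LIKE pattern, newest first
    print(hit.content)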