dump current state (wip-ish)

ducklet 2020-11-01 16:31:37 +01:00
parent 0124c35472
commit 51fb1c9f26
46 changed files with 3749 additions and 0 deletions

feeder/__init__.py Normal file

@@ -0,0 +1,3 @@
from .feeder import Feeder, all_posts
from .models import Feed, Post
from .store import Store

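As a minimal usage sketch of the package as a whole (the database path and feed URL below are made up for illustration): a Store is connected, handed to a Feeder together with one Feed, and update_all() is awaited to get the ids of newly seen posts per feed.

import asyncio
from feeder import Feed, Feeder, Store

# Hypothetical database path and feed URL, for illustration only.
store = Store("feeds.db")
store.connect()
feeder = Feeder(store, [Feed(id="example", url="https://example.com/feed.xml")])

# Fetch every feed once and report the post ids that were not seen before.
news = asyncio.run(feeder.update_all())
for feed_id, post_ids in news.items():
    print(feed_id, sorted(post_ids))
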
feeder/feeder.py Normal file

@@ -0,0 +1,57 @@
import asyncio
import logging
from typing import *

from .models import Feed, FeedId, Post, PostId
from .store import Store

log = logging.getLogger(__name__)


class Feeder:
    def __init__(self, store: Store, feeds: Optional[Iterable[Feed]] = None):
        self.feeds: Dict[str, Feed] = {}
        self.store: Store = store
        self.news: Mapping[FeedId, Set[PostId]] = {}
        if feeds:
            self.add_feeds(feeds)

    def add_feeds(self, feeds: Iterable[Feed]):
        self.feeds.update({f.id: f for f in feeds})
        self.store.sync_feeds(self.feeds)

    async def update_all(self) -> Mapping[FeedId, Set[PostId]]:
        # Feed.load() is blocking, so the gather() below effectively runs the
        # updates one after another rather than concurrently.
        new_post_ids = self.news = dict(
            zip(
                self.feeds,
                await asyncio.gather(*(self.update(id) for id in self.feeds)),
            )
        )
        self.store.sync_feeds(self.feeds)
        return new_post_ids

    async def update(self, feed_id: FeedId) -> Set[PostId]:
        feed = self.feeds[feed_id]
        post_ids = feed.post_ids
        feed.load()
        return feed.post_ids - post_ids

    def posts(self, feed_id: FeedId, post_ids: Sequence[PostId]) -> Sequence[Post]:
        return self.store.posts(feed_id, post_ids)


async def all_posts(feed_url: str, throttle: int = 10) -> AsyncIterable[Post]:
    """Yield all posts from the given feed URL and all following pages.

    A feed can be split into multiple pages. The Feed's normal load function
    ignores them; this function follows them and yields the Posts from every
    page.
    """
    feed = Feed(id=feed_url, url="", next_url=feed_url)
    while (feed := feed.load_next()):
        log.debug(f"New feed page: {feed}")
        for post in feed.posts:
            yield post
        log.debug(f"Waiting for {throttle} seconds ...")
        await asyncio.sleep(throttle)

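all_posts() is an async generator; a short sketch of driving it (the feed URL is a placeholder, and throttle just controls the pause between page loads):

import asyncio
from feeder import all_posts

async def dump(url: str) -> None:
    # Walks every page of the paginated feed, sleeping between page loads.
    async for post in all_posts(url, throttle=5):
        print(post.date, post.title, post.link)

# Placeholder URL, for illustration only.
asyncio.run(dump("https://example.com/feed.xml"))
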
feeder/models.py Normal file

@@ -0,0 +1,109 @@
import asyncio
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from hashlib import md5
from typing import *

import feedparser

USER_AGENT = "curl/7.64.1"

log = logging.getLogger(__name__)

FeedId = str
PostId = str


@dataclass
class Feed:
    id: FeedId
    url: str
    title: Optional[str] = None
    posts: List["Post"] = field(default_factory=list)
    etag: Optional[str] = None
    modified: Optional[str] = None
    active: bool = True
    next_url: Optional[str] = None

    @property
    def post_ids(self) -> Set[PostId]:
        return {p.id for p in self.posts}

    def load(self) -> None:
        """Load all posts from the current feed URL."""
        log.debug(f"Loading {self.url} ...")
        r = feedparser.parse(
            self.url, agent=USER_AGENT, etag=self.etag, modified=self.modified
        )
        log.debug(f"Loaded {self.url}: {r.get('status')} {r.headers}")
        if r.get("status") is None:
            log.error(f"Feed could not be loaded: {self.id}: {self.url}")
            return
        elif r.get("status") == 301:
            log.warning(f"Feed URL changed: {self.id}: {r.href}")
            self.url = r.href
        elif r.get("status") == 410:
            log.error(f"Feed is gone: {self.id}")
            self.active = False
            return
        if "etag" in r:
            self.etag = r.etag
        if "modified" in r:
            self.modified = r.modified
        if "title" in r.feed:
            self.title = r.feed.title
        posts = [Post.from_entry(e) for e in r.entries]
        # Fall back to the feed-level date for posts without their own date;
        # posts that still have no date sort last.
        for post in posts:
            if post.date is None:
                post.date = pubdate(r.feed)
        posts.sort(
            key=lambda e: e.date or datetime.min.replace(tzinfo=timezone.utc),
            reverse=True,
        )
        self.posts = posts
        # Remember the "next" page link, if the feed is paginated.
        for link in r.feed.get("links", []):
            if link.get("rel") == "next":
                self.next_url = link.get("href")
                break
        else:
            self.next_url = None

    def load_next(self) -> Optional["Feed"]:
        if not self.next_url:
            return None
        feed = Feed(self.id, self.next_url)
        feed.load()
        return feed


@dataclass
class Post:
    id: PostId
    content: Optional[str] = None
    date: Optional[datetime] = None
    link: Optional[str] = None
    title: Optional[str] = None

    @classmethod
    def from_entry(cls, entry):
        content = entry.get("summary", "")
        title = entry.get("title", "")
        return cls(
            id=(
                entry.get("id")
                or entry.get("link")
                or md5(f"{title}|{content}".encode()).hexdigest()
            ),
            date=pubdate(entry),
            content=content,
            title=title,
            link=entry.get("link"),
        )


def pubdate(entry) -> Optional[datetime]:
    date = entry.get("published_parsed") or entry.get("updated_parsed")
    if date is None:
        return None
    return datetime(*date[:6], tzinfo=timezone.utc)

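Post.from_entry() only relies on dict-style .get() access, so its id fallback chain (entry id, then link, then an md5 of title and summary) can be sketched without touching the network; the entries below are hand-made stand-ins for feedparser entries:

import time
from feeder.models import Post

entry = {
    "title": "Hello",
    "summary": "First post",
    "link": "https://example.com/1",
    "published_parsed": time.gmtime(0),
}
post = Post.from_entry(entry)
print(post.id)    # the link, since the entry carries no explicit id
print(post.date)  # 1970-01-01 00:00:00+00:00, via pubdate()

# Without an id and a link, the id falls back to an md5 over title and summary.
anonymous = Post.from_entry({"title": "Hello", "summary": "First post"})
print(anonymous.id)
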
feeder/store.py Normal file

@@ -0,0 +1,123 @@
import logging
import sqlite3
from datetime import datetime, timezone
from typing import *

from .models import Feed, Post

log = logging.getLogger(__name__)


class Store:
    def __init__(self, dbpath: Optional[str] = None):
        self.dbpath = dbpath
        self.connection: Optional[sqlite3.Connection] = None

    def connect(self, path: Optional[str] = None) -> None:
        if path:
            self.dbpath = path
        if self.connection is not None:
            return
        log.debug("Connecting to %s", self.dbpath)
        self.connection = sqlite3.connect(
            self.dbpath, isolation_level=None
        )  # auto commit
        self.init()

    def disconnect(self) -> None:
        conn = self.connection
        if conn:
            conn.close()
        self.connection = None

    def init(self) -> None:
        conn = self.connection
        conn.execute(
            """
            create table if not exists feed (
                id text primary key not null,
                url text unique not null,
                active integer not null,
                etag text,
                modified text
            )
            """
        )
        conn.execute(
            """
            create table if not exists post (
                id text primary key not null,
                feed_id text not null references feed(id) on delete cascade,
                content text,
                date text,
                link text,
                title text
            )
            """
        )

    def sync_feeds(self, feeds: Dict[str, Feed]) -> None:
        """Write the current state of feeds to the store, and load existing info back."""
        conn = self.connection
        conn.executemany(
            """
            insert into feed(id, url, active)
            values(?, ?, 1)
            on conflict(id) do update set url=?, active=?, etag=?, modified=?
            """,
            (
                (f.id, f.url, f.url, 1 if f.active else 0, f.etag, f.modified)
                for f in feeds.values()
            ),
        )
        conn.executemany(
            """
            insert into post(id, feed_id, content, date, link, title)
            values(?, ?, ?, ?, ?, ?)
            on conflict do nothing
            """,
            (
                (p.id, f.id, p.content, p.date, p.link, p.title)
                for f in feeds.values()
                for p in f.posts
            ),
        )
        # Pull feeds back out of the database: pick up feeds that only exist
        # there, follow URL changes, and drop feeds marked inactive.
        sql = "select id, url, active from feed"
        for row in conn.execute(sql):
            id, url, active = row
            if id not in feeds:
                feeds[id] = Feed(id, url)
            elif active:
                if feeds[id].url != url:
                    log.warning(f"Feed URL changed: {id}: {url}")
                    feeds[id].url = url
            else:
                log.warning(f"Feed is marked inactive: {id}")
                del feeds[id]
        # Attach the ids of already-stored posts to the in-memory feeds.
        post_ids = {f.id: f.post_ids for f in feeds.values()}
        sql = """
            select post.id, feed_id from post
            join feed on feed.id=feed_id
            where feed.active=1
        """
        for row in conn.execute(sql):
            post_id, feed_id = row
            if post_id not in post_ids[feed_id]:
                post_ids[feed_id].add(post_id)
                feeds[feed_id].posts.append(Post(post_id))

    def posts(self, feed_id, post_ids) -> Sequence[Post]:
        # An empty id list would produce invalid SQL ("in ()"), so short-circuit.
        if not post_ids:
            return []
        qs = ",".join(["?"] * len(post_ids))
        sql = f"""
            select id, content, date, link, title from post
            where feed_id=? and id in ({qs})
        """
        conn = self.connection
        posts = [Post(*row) for row in conn.execute(sql, (feed_id, *post_ids))]
        for post in posts:
            if post.date is not None:
                post.date = datetime.fromisoformat(post.date)
        return posts
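
And a round-trip sketch of the Store on its own, using an in-memory SQLite database (the feed and post data are made up): sync_feeds() writes the in-memory state and pulls stored info back, and posts() reads selected posts by id.

from feeder.models import Feed, Post
from feeder.store import Store

# ":memory:" keeps the example self-contained; a file path works the same way.
store = Store(":memory:")
store.connect()

feeds = {
    "example": Feed(
        id="example",
        url="https://example.com/feed.xml",
        posts=[Post(id="p1", title="Hello", content="First post")],
    )
}
store.sync_feeds(feeds)

# Read the post back by id.
for post in store.posts("example", ["p1"]):
    print(post.id, post.title, post.content)

store.disconnect()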