dump current state (wip-ish)

parent 0124c35472
commit 51fb1c9f26

46 changed files with 3749 additions and 0 deletions
3  feeder/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .feeder import Feeder, all_posts
from .models import Feed, Post
from .store import Store
57  feeder/feeder.py  Normal file
@@ -0,0 +1,57 @@
import asyncio
import logging
from typing import AsyncIterable, Dict, Iterable, Mapping, Optional, Sequence, Set

from .models import Feed, FeedId, Post, PostId
from .store import Store

log = logging.getLogger(__name__)


class Feeder:
    def __init__(self, store: Store, feeds: Optional[Iterable[Feed]] = None):
        self.feeds: Dict[FeedId, Feed] = {}
        self.store: Store = store
        self.news: Mapping[FeedId, Set[PostId]] = {}

        if feeds:
            self.add_feeds(feeds)

    def add_feeds(self, feeds: Iterable[Feed]) -> None:
        self.feeds.update({f.id: f for f in feeds})
        self.store.sync_feeds(self.feeds)

    async def update_all(self) -> Mapping[FeedId, Set[PostId]]:
        # Update all feeds concurrently; zip() pairs each feed id with its
        # result, relying on dicts and gather() preserving order.
        new_post_ids = self.news = dict(
            zip(
                self.feeds,
                await asyncio.gather(*(self.update(id) for id in self.feeds)),
            )
        )
        self.store.sync_feeds(self.feeds)
        return new_post_ids

    async def update(self, feed_id: FeedId) -> Set[PostId]:
        # Diff the post ids before and after loading to find the new posts.
        feed = self.feeds[feed_id]
        post_ids = feed.post_ids
        feed.load()
        return feed.post_ids - post_ids

    def posts(self, feed_id: FeedId, post_ids: Sequence[PostId]) -> Sequence[Post]:
        return self.store.posts(feed_id, post_ids)
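
# A hedged sketch of driving Feeder end to end; the database path and the
# feed URL are placeholders, not part of this commit:
#
#   store = Store("feeds.db")
#   store.connect()
#   feeder = Feeder(store, feeds=[Feed(id="x", url="https://example.com/feed.xml")])
#   new = asyncio.run(feeder.update_all())   # {feed_id: {new post ids}}
#   for feed_id, post_ids in new.items():
#       for post in feeder.posts(feed_id, sorted(post_ids)):
#           print(feed_id, post.title)
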

async def all_posts(feed_url: str, throttle: int = 10) -> AsyncIterable[Post]:
    """Yield all posts from the given feed URL and all following pages.

    A feed can be split into multiple pages. Feed's normal load function
    ignores them; this function follows the pagination links and yields
    the Posts from every page.
    """
    feed = Feed(id=feed_url, url="", next_url=feed_url)
    while feed := feed.load_next():
        log.debug(f"New feed page: {feed}")
        for post in feed.posts:
            yield post
        log.debug(f"Waiting for {throttle} seconds ...")
        await asyncio.sleep(throttle)
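Since all_posts is an async generator it has to be consumed with async for. A minimal sketch, with a placeholder URL:

import asyncio

from feeder import all_posts


async def dump(url: str) -> None:
    async for post in all_posts(url, throttle=5):
        print(post.date, post.title)


asyncio.run(dump("https://example.com/feed.xml"))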
109  feeder/models.py  Normal file
@@ -0,0 +1,109 @@
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from hashlib import md5
from typing import List, Optional, Set

import feedparser

USER_AGENT = "curl/7.64.1"

log = logging.getLogger(__name__)

FeedId = str
PostId = str


@dataclass
class Feed:
    id: FeedId
    url: str
    title: Optional[str] = None
    posts: List["Post"] = field(default_factory=list)
    etag: Optional[str] = None
    modified: Optional[str] = None
    active: bool = True
    next_url: Optional[str] = None

    @property
    def post_ids(self) -> Set[PostId]:
        return {p.id for p in self.posts}

    def load(self) -> None:
        """Load all posts from the current feed URL."""
        log.debug(f"Loading {self.url} ...")
        # Pass etag/modified back to the server so it can answer 304 Not Modified.
        r = feedparser.parse(
            self.url, agent=USER_AGENT, etag=self.etag, modified=self.modified
        )
        log.debug(f"Loaded {self.url}: {r.get('status')} {r.get('headers')}")
        if r.get("status") is None:
            log.error(f"Feed could not be loaded: {self.id}: {self.url}")
            return
        elif r.get("status") == 301:
            log.warning(f"Feed URL changed: {self.id}: {r.href}")
            self.url = r.href
        elif r.get("status") == 410:
            log.error(f"Feed is gone: {self.id}")
            self.active = False
            return

        if "etag" in r:
            self.etag = r.etag
        if "modified" in r:
            self.modified = r.modified
        if "title" in r.feed:
            self.title = r.feed.title

        posts = [Post.from_entry(e) for e in r.entries]
        for post in posts:
            if post.date is None:
                post.date = pubdate(r.feed)
        # A post may still have no date if the feed itself has none;
        # sort those last rather than crashing on a None comparison.
        epoch = datetime.min.replace(tzinfo=timezone.utc)
        posts.sort(key=lambda e: e.date or epoch, reverse=True)
        self.posts = posts

        # Feed paging: remember the rel="next" link, if any.
        for link in r.feed.get("links", []):
            if link.get("rel") == "next":
                self.next_url = link.get("href")
                break
        else:
            self.next_url = None

    def load_next(self) -> Optional["Feed"]:
        if not self.next_url:
            return None
        feed = Feed(self.id, self.next_url)
        feed.load()
        return feed
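
# The etag/modified round trip makes reloading an unchanged feed cheap:
# feedparser sends If-None-Match / If-Modified-Since, and the server may
# answer 304 with no entries. A hedged sketch (the URL is a placeholder):
#
#   feed = Feed(id="x", url="https://example.com/feed.xml")
#   feed.load()   # full fetch; stores the server's ETag / Last-Modified
#   feed.load()   # conditional fetch; note a 304 reply carries no entries,
#                 # so with this code it also resets feed.posts to []
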
@dataclass
class Post:
    id: PostId
    content: Optional[str] = None
    date: Optional[datetime] = None
    link: Optional[str] = None
    title: Optional[str] = None

    @classmethod
    def from_entry(cls, entry) -> "Post":
        content = entry.get("summary", "")
        title = entry.get("title", "")
        return cls(
            # Prefer the entry's own id, then its link; as a last resort,
            # derive a stable id by hashing title and content.
            id=(
                entry.get("id")
                or entry.get("link")
                or md5(f"{title}|{content}".encode()).hexdigest()
            ),
            date=pubdate(entry),
            content=content,
            title=title,
            link=entry.get("link"),
        )


def pubdate(entry) -> Optional[datetime]:
    """Convert feedparser's published/updated struct_time to an aware datetime."""
    date = entry.get("published_parsed") or entry.get("updated_parsed")
    if date is None:
        return None
    return datetime(*date[:6], tzinfo=timezone.utc)
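A worked example of the id fallback and the struct_time conversion; a plain dict stands in for a feedparser entry here, since both support .get:

import time

entry = {"title": "Hello", "summary": "World", "published_parsed": time.gmtime(0)}
post = Post.from_entry(entry)
post.id    # md5 of "Hello|World", since the entry has neither id nor link
post.date  # datetime(1970, 1, 1, 0, 0, tzinfo=timezone.utc)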
123  feeder/store.py  Normal file
@@ -0,0 +1,123 @@
import logging
import sqlite3
from datetime import datetime
from typing import Dict, Optional, Sequence

from .models import Feed, FeedId, Post, PostId

log = logging.getLogger(__name__)


class Store:
    def __init__(self, dbpath: Optional[str] = None):
        self.dbpath = dbpath
        self.connection: Optional[sqlite3.Connection] = None

    def connect(self, path: Optional[str] = None) -> None:
        if path:
            self.dbpath = path
        if self.connection is not None:
            return
        log.debug("Connecting to %s", self.dbpath)
        self.connection = sqlite3.connect(
            self.dbpath, isolation_level=None
        )  # auto commit
        # SQLite keeps foreign keys off by default; enable them so the
        # "on delete cascade" on post.feed_id below actually applies.
        self.connection.execute("pragma foreign_keys = on")
        self.init()

    def disconnect(self) -> None:
        conn = self.connection
        if conn:
            conn.close()
            self.connection = None

    def init(self) -> None:
        conn = self.connection
        conn.execute(
            """
            create table if not exists feed (
                id text primary key not null,
                url text unique not null,
                active integer not null,
                etag text,
                modified text
            )
            """
        )
        conn.execute(
            """
            create table if not exists post (
                id text primary key not null,
                feed_id text not null references feed(id) on delete cascade,
                content text,
                date text,
                link text,
                title text
            )
            """
        )
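
    # A hedged sketch of standalone use; ":memory:" is SQLite's built-in
    # throwaway database, handy for trying out the schema:
    #
    #   store = Store(":memory:")
    #   store.connect()
    #   store.connection.execute(
    #       "select name from sqlite_master where type='table'"
    #   ).fetchall()   # -> [('feed',), ('post',)]
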
    def sync_feeds(self, feeds: Dict[FeedId, Feed]) -> None:
        """Write the current state of feeds to the store, and load existing info back."""
        conn = self.connection
        # Upsert every feed; on conflict the row is refreshed with the
        # current url, active flag, etag and modified values.
        conn.executemany(
            """
            insert into feed(id, url, active)
            values(?, ?, 1)
            on conflict(id) do update set url=?, active=?, etag=?, modified=?
            """,
            (
                (f.id, f.url, f.url, 1 if f.active else 0, f.etag, f.modified)
                for f in feeds.values()
            ),
        )

        # Posts are insert-only: an already known post id is left untouched.
        conn.executemany(
            """
            insert into post(id, feed_id, content, date, link, title)
            values(?, ?, ?, ?, ?, ?)
            on conflict do nothing
            """,
            (
                (p.id, f.id, p.content, p.date, p.link, p.title)
                for f in feeds.values()
                for p in f.posts
            ),
        )

        # Read back: pick up feeds only known to the database, adopt URL
        # changes, and drop feeds that were marked inactive.
        sql = "select id, url, active from feed"
        for row in conn.execute(sql):
            id, url, active = row
            if id not in feeds:
                feeds[id] = Feed(id, url)
            elif active:
                if feeds[id].url != url:
                    log.warning(f"Feed URL changed: {id}: {url}")
                    feeds[id].url = url
            else:
                log.warning(f"Feed is marked inactive: {id}")
                del feeds[id]

        # Attach stub Posts for stored post ids the in-memory feeds lack.
        post_ids = {f.id: f.post_ids for f in feeds.values()}
        sql = """
            select post.id, feed_id from post
            join feed on feed.id=feed_id
            where feed.active=1
        """
        for row in conn.execute(sql):
            post_id, feed_id = row
            if post_id not in post_ids[feed_id]:
                post_ids[feed_id].add(post_id)
                feeds[feed_id].posts.append(Post(post_id))
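
    # Because the read-back path above creates Feed objects for rows it
    # finds, syncing an empty dict restores all known active feeds.
    # Continuing the in-memory sketch (the URL is a placeholder):
    #
    #   feeds = {"f1": Feed("f1", "https://example.com/feed.xml")}
    #   store.sync_feeds(feeds)      # writes the feed row
    #   restored = {}
    #   store.sync_feeds(restored)   # reads it back
    #   restored["f1"].url           # -> "https://example.com/feed.xml"
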
    def posts(self, feed_id: FeedId, post_ids: Sequence[PostId]) -> Sequence[Post]:
        # Build one ? placeholder per requested id for the "in (...)" clause,
        # so the values stay parameterized.
        qs = ",".join(["?"] * len(post_ids))
        sql = f"""
            select id, content, date, link, title from post
            where feed_id=? and id in ({qs})
        """
        conn = self.connection
        posts = [Post(*row) for row in conn.execute(sql, (feed_id, *post_ids))]
        for post in posts:
            if post.date is not None:
                # Dates are stored as ISO 8601 text; parse them back.
                post.date = datetime.fromisoformat(post.date)
        return posts
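Direct use of the query helper, continuing the in-memory sketch above; the ids are placeholders:

store.posts("f1", ["post-1", "post-2"])   # -> [] until matching posts are stored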