diff --git a/unwind/imdb.py b/unwind/imdb.py
index 6044d97..477ec64 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -6,7 +6,7 @@ from urllib.parse import urljoin
 
 from . import db
 from .models import Movie, Rating, User
-from .request import cache_path, session, soup_from_url
+from .request import asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -35,7 +35,7 @@ log = logging.getLogger(__name__)
 
 
 async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True):
-    with session() as s:
+    async with asession() as s:
         s.headers["Accept-Language"] = "en-US, en;q=0.5"
 
         for user in await db.get_all(User):
@@ -152,7 +152,7 @@ ForgedRequest = namedtuple("ForgedRequest", "url headers")
 async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     ratings = []
 
-    soup = soup_from_url(url)
+    soup = await asoup_from_url(url)
 
     meta = soup.find("meta", property="pageId")
     headline = soup.h1
diff --git a/unwind/request.py b/unwind/request.py
index a4d1778..4579313 100644
--- a/unwind/request.py
+++ b/unwind/request.py
@@ -202,6 +202,56 @@ def _http_get(s: _Session_T, url: str, *args, **kwds) -> _Response_T:
     return resp
 
 
+@_throttle(1, 1, random)
+async def _ahttp_get(s: _ASession_T, url: str, *args, **kwds) -> _Response_T:
+    req = s.build_request(method="GET", url=url, *args, **kwds)
+
+    cachefile = cache_path(req) if config.debug else None
+
+    if cachefile:
+        if cachefile.exists():
+            log.debug(
+                "💾 loading %s (%a) from cache %s ...", req.url, req.headers, cachefile
+            )
+            with cachefile.open() as fp:
+                resp = _CachedResponse(**json.load(fp))
+            if 300 <= resp.status_code <= 399:
+                raise _RedirectError(
+                    from_url=resp.url, to_url=resp.headers["location"], is_cached=True
+                )
+            return cast(_Response_T, resp)
+
+    log.debug("⚡️ loading %s (%a) ...", req.url, req.headers)
+    resp = await s.send(req, follow_redirects=False, stream=True)
+    resp.raise_for_status()
+
+    await resp.aread()  # Download the response stream to allow `resp.text` access.
+
+    if cachefile:
+        log.debug(
+            "💾 writing response to cache: %s (%a) -> %s",
+            req.url,
+            req.headers,
+            cachefile,
+        )
+        with cachefile.open("w") as fp:
+            json.dump(
+                {
+                    "status_code": resp.status_code,
+                    "text": resp.text,
+                    "url": str(resp.url),
+                    "headers": dict(resp.headers),
+                },
+                fp,
+            )
+
+    if resp.is_redirect:
+        # Redirects could mean trouble, we need to stay on top of that!
+        raise _RedirectError(from_url=str(resp.url), to_url=resp.headers["location"])
+
+    return resp
+
+
 def soup_from_url(url):
     """Return a BeautifulSoup instance from the contents for the given URL."""
     with session() as s:
@@ -211,6 +261,15 @@ def soup_from_url(url):
     return soup
 
 
+async def asoup_from_url(url):
+    """Return a BeautifulSoup instance from the contents for the given URL."""
+    async with asession() as s:
+        r = await _ahttp_get(s, url)
+
+    soup = bs4.BeautifulSoup(r.text, "html5lib")
+    return soup
+
+
 def _last_modified_from_response(resp: _Response_T) -> float | None:
     if last_mod := resp.headers.get("last-modified"):
         try:
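
Usage note (not part of the patch): a minimal sketch of how the new async helpers compose, assuming `asession()` yields an httpx-style `AsyncClient`, as the `build_request`/`send`/`aread` calls above suggest. The `main` driver and the ratings URL are hypothetical placeholders, not code from this repo.

    import asyncio

    from unwind.request import asoup_from_url


    async def main() -> None:
        # Fetch one page through the throttled, optionally cached async GET
        # path and inspect its <h1>, much as parse_page() does.
        # Hypothetical placeholder URL:
        soup = await asoup_from_url("https://www.imdb.com/user/ur0000000/ratings")
        print(soup.h1)


    asyncio.run(main())

Note that `_ahttp_get` calls `await resp.aread()` before returning, so the response body is fully buffered while the session is still open; that is what lets `asoup_from_url` safely read `r.text` after its `async with` block has exited.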