use async requests to refresh user ratings

This commit is contained in:
ducklet 2023-02-04 17:55:22 +01:00
parent 60d38e9b49
commit 099770c80c
2 changed files with 62 additions and 3 deletions

View file

@@ -6,7 +6,7 @@ from urllib.parse import urljoin
from . import db
from .models import Movie, Rating, User
from .request import cache_path, session, soup_from_url
from .request import asession, asoup_from_url, cache_path
log = logging.getLogger(__name__)
@@ -35,7 +35,7 @@ log = logging.getLogger(__name__)
async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True):
with session() as s:
async with asession() as s:
s.headers["Accept-Language"] = "en-US, en;q=0.5"
for user in await db.get_all(User):
@@ -152,7 +152,7 @@ ForgedRequest = namedtuple("ForgedRequest", "url headers")
async def parse_page(url: str) -> tuple[list[Rating], str | None]:
ratings = []
soup = soup_from_url(url)
soup = await asoup_from_url(url)
meta = soup.find("meta", property="pageId")
headline = soup.h1

View file

@@ -202,6 +202,56 @@ def _http_get(s: _Session_T, url: str, *args, **kwds) -> _Response_T:
return resp
@_throttle(1, 1, random)
async def _ahttp_get(s: _ASession_T, url: str, *args, **kwds) -> _Response_T:
    """Perform a throttled async GET of *url* through session *s*.

    In debug mode (``config.debug``) responses are persisted to / served from
    an on-disk JSON cache keyed by the built request.  Any 3xx response —
    cached or live — raises ``_RedirectError`` instead of being followed,
    and the body is fully read before returning so ``resp.text`` is usable.

    Raises:
        _RedirectError: on any redirect response (``is_cached=True`` when it
            came from the debug cache).
        httpx-style HTTP error: via ``resp.raise_for_status()`` on 4xx/5xx.
    """
    # NOTE(review): keyword arguments before *args is fragile — any positional
    # in *args would collide with method/url; presumably callers only ever
    # pass keywords here. TODO confirm.
    req = s.build_request(method="GET", url=url, *args, **kwds)
    # Debug-only response cache; None disables both read and write below.
    cachefile = cache_path(req) if config.debug else None
    if cachefile:
        if cachefile.exists():
            log.debug(
                "💾 loading %s (%a) from cache %s ...", req.url, req.headers, cachefile
            )
            with cachefile.open() as fp:
                resp = _CachedResponse(**json.load(fp))
            # A cached redirect must surface exactly like a live one.
            if 300 <= resp.status_code <= 399:
                raise _RedirectError(
                    from_url=resp.url, to_url=resp.headers["location"], is_cached=True
                )
            return cast(_Response_T, resp)
    log.debug("⚡️ loading %s (%a) ...", req.url, req.headers)
    # Redirects are not followed so they can be detected and raised below.
    resp = await s.send(req, follow_redirects=False, stream=True)
    resp.raise_for_status()
    await resp.aread()  # Download the response stream to allow `resp.text` access.
    if cachefile:
        log.debug(
            "💾 writing response to cache: %s (%a) -> %s",
            req.url,
            req.headers,
            cachefile,
        )
        # Persist only the fields _CachedResponse is reconstructed from above.
        with cachefile.open("w") as fp:
            json.dump(
                {
                    "status_code": resp.status_code,
                    "text": resp.text,
                    "url": str(resp.url),
                    "headers": dict(resp.headers),
                },
                fp,
            )
    if resp.is_redirect:
        # Redirects could mean trouble, we need to stay on top of that!
        raise _RedirectError(from_url=str(resp.url), to_url=resp.headers["location"])
    return resp
def soup_from_url(url):
"""Return a BeautifulSoup instance from the contents for the given URL."""
with session() as s:
@@ -211,6 +261,15 @@ def soup_from_url(url):
return soup
async def asoup_from_url(url):
    """Asynchronously fetch *url* and parse its body into a BeautifulSoup tree."""
    async with asession() as client:
        response = await _ahttp_get(client, url)
        return bs4.BeautifulSoup(response.text, "html5lib")
def _last_modified_from_response(resp: _Response_T) -> float | None:
if last_mod := resp.headers.get("last-modified"):
try: