implemented fetching of assets

Hazel Noack 2025-06-10 14:11:56 +02:00
parent bc05ee49ea
commit 57f2dd2f32
2 changed files with 86 additions and 1 deletion

View File

@@ -0,0 +1 @@
+__name__ = "scribble_to_epub"

View File

@@ -7,10 +7,16 @@ import logging
 import cloudscraper
 import arrow
 import ftfy
-from typing import Iterable, List
+from typing import List, Optional, Dict
 import re
 import mimetypes
 import math
+from codecs import encode
+from hashlib import sha1
+from pathlib import Path
+import requests
+from . import __name__
 
 try:
     import http.client as http_client
@@ -38,6 +44,69 @@ CHAPTER_MATCH = re.compile(
 STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
 DATE_MATCH = re.compile("Last updated: .*")
 
+temp_path = Path("/tmp", __name__)
+temp_path.mkdir(exist_ok=True)
+
+
+class Asset:
+    """
+    - `content`: the `bytes` content of the image
+    - `relpath`: "static/{fname}{ext}"
+    - `fname`: a SHA-1 hash of the URL
+    - `ext`: a mimetypes guessed extension
+    - `mimetype`: mimetype of the asset
+    - `uid`: `fname`
+    """
+
+    success: bool = False
+
+    url: str  # indexes by url
+    content: bytes  # content of asset
+
+    @cached_property
+    def mimetype(self) -> str:
+        mimetype, _ = mimetypes.guess_type(self.url)
+        return mimetype
+
+    @cached_property
+    def filename(self) -> str:
+        """
+        "{fname}{ext}"
+        - `fname`: a SHA-1 hash of the URL
+        - `ext`: a mimetypes guessed extension
+        """
+        fname = sha1(encode(self.url, "utf-8")).hexdigest()
+        ext = mimetypes.guess_extension(self.mimetype)
+        return f"{fname}{ext}"
+
+    @cached_property
+    def relpath(self) -> str:
+        return f"static/{self.filename}"
+
+    def __init__(self, url: str, session: Optional[requests.Session] = None):
+        self.url = url
+        self.session = session or requests.Session()
+        self.fetch()
+
+    def fetch(self):
+        # reuse the copy cached under /tmp (keyed by the URL hash) if this asset was fetched before
+        temp = Path(temp_path, self.filename)
+        if temp.exists():
+            self.content = temp.read_bytes()
+            self.success = True
+            return
+
+        try:
+            r = self.session.get(self.url, headers=headers)
+            r.raise_for_status()  # without this, 4xx/5xx responses never raise HTTPError
+            self.content = r.content
+            temp.write_bytes(r.content)
+            self.success = True
+        except requests.HTTPError as e:
+            log.warning(
+                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
+            )
+
+
 class ScribbleChapter:
     parent: ScribbleBook
@@ -191,6 +260,7 @@ class ScribbleBook:
     def __init__(self, url: str):
         self.source_url = url
+        self.assets: Dict[str, Asset] = {}
 
         self.languages = []
         self.genres = []
@@ -208,6 +278,18 @@ class ScribbleBook:
             c.load()
             print(c.text)
 
+    def add_asset(self, url: str):
+        if url is None:
+            return
+        if url.strip() == "":
+            return
+
+        a = Asset(url, self.session)
+        if a.success:
+            self.assets[a.url] = a
+        else:
+            log.warning(f"couldn't fetch asset {url}")
+
     def load_metadata(self) -> None:
         """
         Load the metadata for this object
@@ -237,6 +319,8 @@ class ScribbleBook:
         print(f"Book Title: {self.title}")
 
         self.cover_url = soup.find(property="og:image")["content"] or ""
+        self.add_asset(self.cover_url)
+
         self.date = arrow.get(
             soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
        )
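For reference, a minimal usage sketch of the asset-fetching API added in this commit (not part of the diff). The import path assumes the module layout shown above, and the URL is made up for illustration:

from scribble_to_epub.scribblehub import Asset

# Constructing an Asset fetches the URL immediately (or reuses the copy cached
# under /tmp/scribble_to_epub, whose file name is the SHA-1 of the URL).
cover = Asset("https://example.com/images/cover.jpg")  # hypothetical URL

if cover.success:
    print(cover.mimetype)  # guessed from the URL, e.g. "image/jpeg"
    print(cover.filename)  # "<sha1-of-url><ext>", also the cache file name under /tmp
    print(cover.relpath)   # "static/<sha1-of-url><ext>", presumably the path used when packaging the book
else:
    print("fetch failed; Asset.fetch() already logged a warning")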