implemented fetching of asset

This commit is contained in:
Hazel Noack 2025-06-10 14:11:56 +02:00
parent bc05ee49ea
commit 57f2dd2f32
2 changed files with 86 additions and 1 deletions

View File

@ -0,0 +1 @@
__name__ = "scribble_to_epub"

View File

@ -7,10 +7,16 @@ import logging
import cloudscraper
import arrow
import ftfy
from typing import Iterable, List
from typing import List, Optional, Dict
import re
import mimetypes
import math
from codecs import encode
from hashlib import sha1
from pathlib import Path
import requests
from . import __name__
try:
import http.client as http_client
@ -38,6 +44,69 @@ CHAPTER_MATCH = re.compile(
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
DATE_MATCH = re.compile("Last updated: .*")
temp_path = Path("/tmp", __name__)
temp_path.mkdir(exist_ok=True)
class Asset:
"""
- `content`: the `bytes` content of the image
- `relpath`: "static/{fname}{ext}"
- `fname`: a SHA-1 hash of the URL
- `ext`: a mimetypes guessed extension
- `mimetype`: mimetype of the asset
- `uid`: `fname`
"""
success: bool = False
url: str # indexes by url
content: bytes # content of asset
@cached_property
def mimetype(self) -> str:
mimetype, _ = mimetypes.guess_type(self.url)
return mimetype
@cached_property
def filename(self) -> str:
"""
"{fname}{ext}"
- fname`: a SHA-1 hash of the URL
- `ext`: a mimetypes guessed extension
"""
fname = sha1(encode(self.url, "utf-8")).hexdigest()
ext = mimetypes.guess_extension(self.mimetype)
return f"{fname}{ext}"
@cached_property
def relpath(self) -> str:
return f"static/{self.filename}"
def __init__(self, url: str, session: Optional[requests.Session] = None):
self.url = url
self.session = session or requests.Session()
self.fetch()
def fetch(self):
temp = Path(temp_path, self.filename)
if temp.exists():
self.content = temp.read_bytes()
self.success = True
return
try:
r = self.session.get(self.url, headers=headers)
self.content = r.content
temp.write_bytes(r.content)
self.success = True
except requests.HTTPError as e:
log.warning(
f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
)
class ScribbleChapter:
parent: ScribbleBook
@ -191,6 +260,7 @@ class ScribbleBook:
def __init__(self, url: str):
self.source_url = url
self.assets: Dict[str, Asset] = {}
self.languages = []
self.genres = []
@ -208,6 +278,18 @@ class ScribbleBook:
c.load()
print(c.text)
def add_asset(self, url: str):
if url is None:
return
if url.strip() == "":
return
a = Asset(url, self.session)
if a.success:
self.assets[a.url] = a
else:
log.warning(f"couldn't fetch asset {url}")
def load_metadata(self) -> None:
"""
Load the metadata for this object
@ -237,6 +319,8 @@ class ScribbleBook:
print(f"Book Title: {self.title}")
self.cover_url = soup.find(property="og:image")["content"] or ""
self.add_asset(self.cover_url)
self.date = arrow.get(
soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
)