implemented fetching of assets

Hazel Noack 2025-06-10 14:11:56 +02:00
parent bc05ee49ea
commit 57f2dd2f32
2 changed files with 86 additions and 1 deletion

View File

@@ -0,0 +1 @@
+__name__ = "scribble_to_epub"

View File

@@ -7,10 +7,16 @@ import logging
 import cloudscraper
 import arrow
 import ftfy
-from typing import Iterable, List
+from typing import List, Optional, Dict
 import re
 import mimetypes
 import math
+from codecs import encode
+from hashlib import sha1
+from pathlib import Path
+import requests
+from . import __name__
 
 try:
     import http.client as http_client
@@ -38,6 +44,69 @@ CHAPTER_MATCH = re.compile(
 STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
 DATE_MATCH = re.compile("Last updated: .*")
 
+temp_path = Path("/tmp", __name__)
+temp_path.mkdir(exist_ok=True)
+
+
+class Asset:
+    """
+    - `content`: the `bytes` content of the image
+    - `relpath`: "static/{fname}{ext}"
+    - `fname`: a SHA-1 hash of the URL
+    - `ext`: a mimetypes guessed extension
+    - `mimetype`: mimetype of the asset
+    - `uid`: `fname`
+    """
+
+    success: bool = False
+
+    url: str  # indexes by url
+    content: bytes  # content of asset
+
+    @cached_property
+    def mimetype(self) -> str:
+        mimetype, _ = mimetypes.guess_type(self.url)
+        return mimetype
+
+    @cached_property
+    def filename(self) -> str:
+        """
+        "{fname}{ext}"
+        - `fname`: a SHA-1 hash of the URL
+        - `ext`: a mimetypes guessed extension
+        """
+        fname = sha1(encode(self.url, "utf-8")).hexdigest()
+        ext = mimetypes.guess_extension(self.mimetype)
+        return f"{fname}{ext}"
+
+    @cached_property
+    def relpath(self) -> str:
+        return f"static/{self.filename}"
+
+    def __init__(self, url: str, session: Optional[requests.Session] = None):
+        self.url = url
+        self.session = session or requests.Session()
+        self.fetch()
+
+    def fetch(self):
+        # reuse the copy cached under /tmp (keyed by the URL hash) if this asset was fetched before
+        temp = Path(temp_path, self.filename)
+        if temp.exists():
+            self.content = temp.read_bytes()
+            self.success = True
+            return
+
+        try:
+            r = self.session.get(self.url, headers=headers)
+            r.raise_for_status()  # without this, 4xx/5xx responses never raise HTTPError
+            self.content = r.content
+            temp.write_bytes(r.content)
+            self.success = True
+        except requests.HTTPError as e:
+            log.warning(
+                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
+            )
+
+
 class ScribbleChapter:
     parent: ScribbleBook
@@ -191,6 +260,7 @@ class ScribbleBook:
     def __init__(self, url: str):
         self.source_url = url
+        self.assets: Dict[str, Asset] = {}
 
         self.languages = []
         self.genres = []
@@ -208,6 +278,18 @@ class ScribbleBook:
             c.load()
             print(c.text)
 
+    def add_asset(self, url: str):
+        if url is None:
+            return
+        if url.strip() == "":
+            return
+
+        a = Asset(url, self.session)
+        if a.success:
+            self.assets[a.url] = a
+        else:
+            log.warning(f"couldn't fetch asset {url}")
+
     def load_metadata(self) -> None:
         """
         Load the metadata for this object
@@ -237,6 +319,8 @@ class ScribbleBook:
         print(f"Book Title: {self.title}")
 
         self.cover_url = soup.find(property="og:image")["content"] or ""
+        self.add_asset(self.cover_url)
+
         self.date = arrow.get(
             soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
        )
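For reference, a minimal usage sketch of the asset-fetching API added in this commit (not part of the diff). The import path assumes the module layout shown above, and the URL is made up for illustration:

from scribble_to_epub.scribblehub import Asset

# Constructing an Asset fetches the URL immediately (or reuses the copy cached
# under /tmp/scribble_to_epub, whose file name is the SHA-1 of the URL).
cover = Asset("https://example.com/images/cover.jpg")  # hypothetical URL

if cover.success:
    print(cover.mimetype)  # guessed from the URL, e.g. "image/jpeg"
    print(cover.filename)  # "<sha1-of-url><ext>", also the cache file name under /tmp
    print(cover.relpath)   # "static/<sha1-of-url><ext>", presumably the path used when packaging the book
else:
    print("fetch failed; Asset.fetch() already logged a warning")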