generated from Hazel/python-project
implemented fetching of asset
This commit is contained in:
parent
bc05ee49ea
commit
57f2dd2f32
@ -0,0 +1 @@
|
|||||||
|
# Package name.
# NOTE(review): this rebinds the module's own `__name__` — confirm that is intended.
__name__ = "scribble_to_epub"
|
@ -7,10 +7,16 @@ import logging
|
|||||||
import cloudscraper
|
import cloudscraper
|
||||||
import arrow
|
import arrow
|
||||||
import ftfy
|
import ftfy
|
||||||
from typing import Iterable, List
|
from typing import List, Optional, Dict
|
||||||
import re
|
import re
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import math
|
import math
|
||||||
|
from codecs import encode
|
||||||
|
from hashlib import sha1
|
||||||
|
from pathlib import Path
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from . import __name__
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import http.client as http_client
|
import http.client as http_client
|
||||||
@ -38,6 +44,69 @@ CHAPTER_MATCH = re.compile(
|
|||||||
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
|
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
|
||||||
DATE_MATCH = re.compile("Last updated: .*")
|
DATE_MATCH = re.compile("Last updated: .*")
|
||||||
|
|
||||||
|
# Directory used as an on-disk cache for fetched assets.
temp_path = Path("/tmp", __name__)
# parents=True so the cache directory can still be created when "/tmp" itself
# does not exist yet (e.g. on platforms without a pre-existing /tmp).
temp_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Asset:
    """
    A remote asset that is fetched once and cached as a file in `temp_path`.

    - `url`: the URL the asset is fetched from
    - `content`: the `bytes` content of the asset
    - `success`: `True` once `content` has been loaded
    - `mimetype`: mimetype guessed from the URL (`None` if it can't be guessed)
    - `filename`: "{fname}{ext}"
    - `relpath`: "static/{fname}{ext}"
    - `fname`: a SHA-1 hash of the URL
    - `ext`: a mimetypes guessed extension (empty if none can be guessed)
    """

    success: bool = False
    url: str  # indexes by url
    content: bytes  # content of asset

    @cached_property
    def mimetype(self) -> Optional[str]:
        """Mimetype guessed from the URL, or `None` when it can't be guessed."""
        mimetype, _ = mimetypes.guess_type(self.url)
        return mimetype

    @cached_property
    def filename(self) -> str:
        """
        "{fname}{ext}"
        - `fname`: a SHA-1 hash of the URL
        - `ext`: a mimetypes guessed extension (empty if none can be guessed)
        """
        fname = sha1(self.url.encode("utf-8")).hexdigest()
        # guess_extension() fails on a `None` mimetype and may itself return
        # `None`; in both cases fall back to a filename without extension.
        ext = mimetypes.guess_extension(self.mimetype) if self.mimetype else None
        return f"{fname}{ext or ''}"

    @cached_property
    def relpath(self) -> str:
        """Path of the asset relative to the book root: "static/{filename}"."""
        return f"static/{self.filename}"

    def __init__(self, url: str, session: Optional[requests.Session] = None):
        """
        Create the asset and immediately try to fetch it.

        - `url`: where to fetch the asset from
        - `session`: session used for the request, a new one is created if omitted
        """
        self.url = url
        self.session = session or requests.Session()

        self.fetch()

    def fetch(self):
        """
        Load `content` from the cache file if it exists, else from `url`.

        Sets `success` to `True` when the content was loaded. An HTTP error
        response is logged as a warning and leaves `success` at `False`.
        """
        temp = Path(temp_path, self.filename)

        if temp.exists():
            self.content = temp.read_bytes()
            self.success = True
            return

        try:
            # `headers` is not defined in this class; it is expected to be a
            # module-level name.
            r = self.session.get(self.url, headers=headers)
            # Without this a 4xx/5xx response never raises, so the error page
            # would be cached and reported as a successfully fetched asset.
            r.raise_for_status()
        except requests.HTTPError as e:
            log.warning(
                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
            )
            return

        self.content = r.content
        temp.write_bytes(r.content)
        self.success = True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ScribbleChapter:
|
class ScribbleChapter:
|
||||||
parent: ScribbleBook
|
parent: ScribbleBook
|
||||||
|
|
||||||
@ -191,6 +260,7 @@ class ScribbleBook:
|
|||||||
|
|
||||||
def __init__(self, url: str):
|
def __init__(self, url: str):
|
||||||
self.source_url = url
|
self.source_url = url
|
||||||
|
self.assets: Dict[str, Asset] = {}
|
||||||
|
|
||||||
self.languages = []
|
self.languages = []
|
||||||
self.genres = []
|
self.genres = []
|
||||||
@ -208,6 +278,18 @@ class ScribbleBook:
|
|||||||
c.load()
|
c.load()
|
||||||
print(c.text)
|
print(c.text)
|
||||||
|
|
||||||
|
def add_asset(self, url: str):
    """
    Fetch the asset at `url` and register it in `self.assets`, keyed by its URL.

    Does nothing when `url` is `None`, empty or only whitespace, or when an
    asset for this URL is already registered. An asset whose fetch did not
    succeed is not registered; a warning is logged instead.
    """
    if url is None or url.strip() == "":
        return

    # Already registered: building another Asset would only repeat the fetch.
    if url in self.assets:
        return

    a = Asset(url, self.session)
    if a.success:
        self.assets[a.url] = a
    else:
        log.warning(f"couldn't fetch asset {url}")
|
||||||
|
|
||||||
def load_metadata(self) -> None:
|
def load_metadata(self) -> None:
|
||||||
"""
|
"""
|
||||||
Load the metadata for this object
|
Load the metadata for this object
|
||||||
@ -237,6 +319,8 @@ class ScribbleBook:
|
|||||||
print(f"Book Title: {self.title}")
|
print(f"Book Title: {self.title}")
|
||||||
|
|
||||||
self.cover_url = soup.find(property="og:image")["content"] or ""
|
self.cover_url = soup.find(property="og:image")["content"] or ""
|
||||||
|
self.add_asset(self.cover_url)
|
||||||
|
|
||||||
self.date = arrow.get(
|
self.date = arrow.get(
|
||||||
soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
|
soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user