from __future__ import annotations

import logging
import math
import mimetypes
import re
from codecs import encode
from functools import cached_property
from hashlib import sha1
from pathlib import Path
from typing import Dict, List, Optional

import arrow
import cloudscraper
import ftfy
import requests
from bs4 import BeautifulSoup
from ebooklib import epub

# Reuse the package's name for the logger and the on-disk cache directory.
from . import __name__

"""
|
|
try:
|
|
import http.client as http_client
|
|
except ImportError:
|
|
# Python 2
|
|
import httplib as http_client
|
|
http_client.HTTPConnection.debuglevel = 1
|
|
|
|
# You must initialize logging, otherwise you'll not see debug output.
|
|
logging.basicConfig()
|
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
requests_log = logging.getLogger("requests.packages.urllib3")
|
|
requests_log.setLevel(logging.DEBUG)
|
|
requests_log.propagate = True
|
|
"""
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
headers = {"User-Agent": "node"}

CHAPTER_MATCH = re.compile(
    r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
)
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
DATE_MATCH = re.compile("Last updated: .*")

# On-disk cache directory for downloaded assets (see Asset.fetch below).
temp_path = Path("/tmp", __name__)
temp_path.mkdir(exist_ok=True)

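# A minimal sketch of what the URL patterns above capture (the URL is
# hypothetical, for illustration only; nothing runs on import):
def _demo_url_parsing() -> None:
    m = STORY_MATCH.match("https://www.scribblehub.com/series/123456/some-slug")
    assert m is not None
    assert m["story_id"] == "123456"  # becomes ScribbleBook.identifier
    assert m["slug"] == "some-slug"  # becomes ScribbleBook.slug
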
class Asset:
    """
    - `content`: the `bytes` content of the image
    - `relpath`: "static/{fname}{ext}"
    - `fname`: a SHA-1 hash of the URL
    - `ext`: a mimetypes guessed extension
    - `mimetype`: mimetype of the asset
    - `uid`: `fname`
    """

    success: bool = False
    url: str  # indexed by URL
    content: bytes  # content of the asset

    @cached_property
    def mimetype(self) -> Optional[str]:
        mimetype, _ = mimetypes.guess_type(self.url)
        return mimetype

    @cached_property
    def filename(self) -> str:
        """
        "{fname}{ext}"

        - `fname`: a SHA-1 hash of the URL
        - `ext`: a mimetypes guessed extension
        """
        fname = sha1(encode(self.url, "utf-8")).hexdigest()
        # Fall back to no extension when the mimetype can't be guessed.
        ext = mimetypes.guess_extension(self.mimetype) if self.mimetype else None
        return f"{fname}{ext or ''}"

    @cached_property
    def relpath(self) -> str:
        return f"static/{self.filename}"

    def __init__(self, url: str, session: Optional[requests.Session] = None):
        self.url = url
        self.session = session or requests.Session()

        self.fetch()

    def fetch(self):
        temp = Path(temp_path, self.filename)

        # Serve from the on-disk cache if this URL has been fetched before.
        if temp.exists():
            self.content = temp.read_bytes()
            self.success = True
            return

        try:
            r = self.session.get(self.url, headers=headers)
            # requests only raises HTTPError on demand, so ask for it.
            r.raise_for_status()
            self.content = r.content
            temp.write_bytes(r.content)
            self.success = True
        except requests.HTTPError as e:
            log.warning(
                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
            )

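# A quick sketch of the naming convention above (hypothetical URL; the values
# asserted are what the stdlib mimetypes tables produce):
def _demo_asset_naming() -> None:
    url = "https://example.com/images/cover.png"
    fname = sha1(encode(url, "utf-8")).hexdigest()
    assert len(fname) == 40  # SHA-1 hex digest
    assert mimetypes.guess_type(url) == ("image/png", None)
    assert mimetypes.guess_extension("image/png") == ".png"
    # Asset(url).relpath would therefore be f"static/{fname}.png"
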
class ScribbleChapter:
    parent: ScribbleBook

    index: int
    title: str
    text: str  # HTML content of chapter
    date: arrow.Arrow

    def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
        self.parent = parent
        self.source_url = url

        self.session = session
        self.add_asset = self.parent.add_asset

    def __str__(self):
        return (
            f"ScribbleChapter(\n"
            f" Index: {self.index}\n"
            f" Title: {self.title}\n"
            f" Date: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
            f" Url: {self.source_url}\n"
            f")"
        )

    def load(self):
        resp = self.session.get(self.source_url, headers=headers)
        soup = BeautifulSoup(resp.text, "lxml")

        # Record any languages declared on the page that we haven't seen yet.
        for tag in soup.find_all(lambda x: x.has_attr("lang")):
            if tag["lang"] not in self.parent.languages:
                log.debug(f'Found language {tag["lang"]}')
                self.parent.languages.append(tag["lang"])

        self.title = soup.find(class_="chapter-title").text
        log.info(f"{self.parent.title} Chapter {self.index}: {self.title}")

        if not mimetypes.inited:
            mimetypes.init(None)

        # Queue every embedded image for download alongside the text.
        for asset in soup.select("#chp_contents img[src]"):
            self.add_asset(asset["src"])

        header_tag = soup.new_tag("h2")
        header_tag.string = self.title
        chap_text = soup.find(class_="chp_raw").extract()
        chap_text.insert(0, header_tag)
        self.text = ftfy.fix_text(chap_text.prettify())
        self.fix_footnotes()

    def fix_footnotes(self):
        """
        Iterate through any footnotes and refactor them to ePub format
        """
        soup = BeautifulSoup(self.text, "lxml")
        footnotes = []
        for tag in soup.select(".modern-footnotes-footnote"):
            mfn = tag["data-mfn"]  # the attribute value is already a string
            log.debug(f"Found footnote {mfn}")
            anchors = tag.find_all("a")
            # Quote the attribute value: a bare number is not a valid CSS
            # identifier.
            content_matches = soup.select(
                f'.modern-footnotes-footnote__note[data-mfn="{mfn}"]'
            )
            if not anchors or not content_matches:
                return
            anchor = anchors[-1]
            content_tag = content_matches[0]
            anchor["id"] = f"noteanchor-{mfn}"
            anchor["href"] = f"#note-{mfn}"
            anchor["epub:type"] = "noteref"

            content_tag.name = "aside"
            content_tag["id"] = f"note-{mfn}"
            content_tag["epub:type"] = "footnote"
            footnote_anchor = soup.new_tag("a", href=f"#noteanchor-{mfn}")
            footnote_anchor.string = f"{mfn}."
            content_tag.insert(0, footnote_anchor)
            footnotes.append(content_tag)
        if footnotes:
            # Move the collected notes into a "Footnotes" section after the
            # last paragraph.
            tag = soup.find_all("p")[-1]
            footnote_header = soup.new_tag("h2", id="footnotes")
            footnote_header.string = "Footnotes"
            tag.append(footnote_header)
            tag.extend(footnotes)

        soup.smooth()
        self.text = ftfy.fix_text(soup.prettify())

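# A tiny sketch of the footnote rewrite performed above, using hypothetical
# markup in the shape the modern-footnotes plugin emits:
def _demo_footnote_markup() -> None:
    html = (
        '<p>text<span class="modern-footnotes-footnote" data-mfn="1">'
        '<a>1</a></span>'
        '<span class="modern-footnotes-footnote__note" data-mfn="1">note</span></p>'
    )
    soup = BeautifulSoup(html, "lxml")
    note = soup.select('.modern-footnotes-footnote__note[data-mfn="1"]')[0]
    note.name = "aside"
    note["epub:type"] = "footnote"  # ePub 3 semantic footnote
    assert soup.find("aside") is not None
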
class ScribbleBook:
    slug: str
    title: str
    languages: List[str]  # Dublin Core language codes
    cover_url: str
    date: arrow.Arrow
    description: str
    author: str
    publisher: str
    identifier: str  # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
    genres: List[str]
    tags: List[str]

    chapter_count: int

    @cached_property
    def rights(self) -> str:
        return f"© {self.date.year} {self.author}"

    def __str__(self):
        return (
            f"BookMetadata(\n"
            f" Title: {self.title}\n"
            f" Author: {self.author}\n"
            f" Identifier: {self.identifier}\n"
            f" Languages: {', '.join(self.languages)}\n"
            f" Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
            f" Publisher: {self.publisher}\n"
            f" Genres: {', '.join(self.genres)}\n"
            f" Tags: {', '.join(self.tags)}\n"
            f" Rights: {self.rights}\n"
            f" Cover URL: {self.cover_url}\n"
            f" Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
            f")"
        )

    def __init__(self, url: str):
        self.source_url = url
        self.assets: Dict[str, Asset] = {}

        self.languages = []
        self.genres = []
        self.tags = []

        self.chapters: List[ScribbleChapter] = []
        self.session = cloudscraper.create_scraper()

    def add_asset(self, url: str):
        if url is None:
            return
        if url.strip() == "":
            return

        a = Asset(url, self.session)
        if a.success:
            self.assets[a.url] = a
        else:
            log.warning(f"couldn't fetch asset {url}")

    def load(self, limit_chapters: Optional[int] = None):
        self.load_metadata()
        print(str(self))

        self.fetch_chapters(limit=limit_chapters)
        if limit_chapters is not None:
            self.chapters = self.chapters[:limit_chapters]

        for chapter in self.chapters:
            print(str(chapter))
            chapter.load()

    def load_metadata(self) -> None:
        """
        Load the metadata for this object.

        Will make web requests.
        """

        # Parse the slug and story ID from the source URL
        # (".../series/{story_id}/{slug}").
        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
        self.slug = _parts[-1]
        self.identifier = _parts[-2]

        html = self.session.get(self.source_url, headers=headers)
        soup = BeautifulSoup(html.text, "lxml")

        for tag in soup.find_all(lambda x: x.has_attr("lang")):
            log.debug(f'Found language {tag["lang"]}')
            self.languages.append(tag["lang"])

        url = soup.find(property="og:url")["content"]
        if self.source_url != url:
            log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")

        self.title = soup.find(property="og:title")["content"]
        print(f"Book Title: {self.title}")

        self.cover_url = soup.find(property="og:image")["content"] or ""
        self.add_asset(self.cover_url)

        # The "Last updated" span keeps the full timestamp in its title
        # attribute; skip the 14-character "Last updated: " prefix.
        self.date = arrow.get(
            soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
        )
        description = soup.find(class_="wi_fic_desc")
        self.intro = ftfy.fix_text(description.prettify())
        self.description = ftfy.fix_text(description.text)
        self.author = soup.find(attrs={"name": "twitter:creator"})["content"]
        self.publisher = soup.find(property="og:site_name")["content"]

        self.genres = [a.string for a in soup.find_all(class_="fic_genre")]
        self.tags = [a.string for a in soup.find_all(class_="stag")]
        self.chapter_count = int(soup.find(class_="cnt_toc").text)

        # A per-book rights statement, when present, overrides the default
        # cached property above.
        imgs = soup.find(class_="sb_content copyright").find_all("img")
        for img in imgs:
            if "copy" not in img["class"]:
                continue
            self.rights = ftfy.fix_text(img.next.string)

    def fetch_chapters(self, limit: Optional[int] = None) -> None:
        """
        Fetch the chapters for the work, based on the TOC API
        """
        # The TOC endpoint returns 15 chapters per page.
        page_count = math.ceil(self.chapter_count / 15)
        log.debug(
            f"Expecting {self.chapter_count} chapters, page_count={page_count}"
        )

        if limit is not None:
            # `limit` counts chapters, so convert it to a page count.
            page_count = min(page_count, math.ceil(limit / 15))

        for page in range(1, page_count + 1):
            chapter_resp = self.session.post(
                "https://www.scribblehub.com/wp-admin/admin-ajax.php",
                {
                    "action": "wi_getreleases_pagination",
                    "pagenum": page,
                    "mypostid": self.identifier,
                },
                headers=headers,
            )

            chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
            for chapter_tag in chapter_soup.find_all(class_="toc_w"):
                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
                chapter.index = int(chapter_tag["order"])
                chapter.title = chapter_tag.a.text
                chapter.date = arrow.get(
                    chapter_tag.span["title"], "MMM D, YYYY hh:mm A"
                )
                self.chapters.append(chapter)

        self.chapters.sort(key=lambda x: x.index)

    def build(self):
        pass

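# A minimal usage sketch, assuming a ScribbleHub series URL (the URL below is
# hypothetical; network access is required, and build() is still a stub, so
# this only scrapes and prints):
def _demo_scrape_book() -> None:
    book = ScribbleBook("https://www.scribblehub.com/series/123456/some-slug/")
    book.load(limit_chapters=2)  # metadata plus the first two chapters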