from __future__ import annotations

import logging
import math
import mimetypes
import re
from codecs import encode
from functools import cached_property
from hashlib import sha1
from pathlib import Path
from typing import Dict, List, Optional

import arrow
import cloudscraper
import ftfy
import requests
from bs4 import BeautifulSoup
from ebooklib import epub

# Reuse the package's name for the logger and the on-disk cache directory.
from . import __name__

"""
|
|
try:
|
|
import http.client as http_client
|
|
except ImportError:
|
|
# Python 2
|
|
import httplib as http_client
|
|
http_client.HTTPConnection.debuglevel = 1
|
|
|
|
# You must initialize logging, otherwise you'll not see debug output.
|
|
logging.basicConfig()
|
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
requests_log = logging.getLogger("requests.packages.urllib3")
|
|
requests_log.setLevel(logging.DEBUG)
|
|
requests_log.propagate = True
|
|
"""
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
headers = {"User-Agent": "node"}

CHAPTER_MATCH = re.compile(
    r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
)
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
DATE_MATCH = re.compile("Last updated: .*")

# On-disk cache directory for downloaded assets (see Asset.fetch below).
temp_path = Path("/tmp", __name__)
temp_path.mkdir(exist_ok=True)

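# A minimal sketch of what the URL patterns above capture (the URL is
# hypothetical, for illustration only; nothing runs on import):
def _demo_url_parsing() -> None:
    m = STORY_MATCH.match("https://www.scribblehub.com/series/123456/some-slug")
    assert m is not None
    assert m["story_id"] == "123456"  # becomes ScribbleBook.identifier
    assert m["slug"] == "some-slug"  # becomes ScribbleBook.slug
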
class Asset:
    """
    - `content`: the `bytes` content of the image
    - `relpath`: "static/{fname}{ext}"
    - `fname`: a SHA-1 hash of the URL
    - `ext`: a mimetypes guessed extension
    - `mimetype`: mimetype of the asset
    - `uid`: `fname`
    """

    success: bool = False
    url: str  # indexed by URL
    content: bytes  # content of the asset

    @cached_property
    def mimetype(self) -> Optional[str]:
        mimetype, _ = mimetypes.guess_type(self.url)
        return mimetype

    @cached_property
    def filename(self) -> str:
        """
        "{fname}{ext}"

        - `fname`: a SHA-1 hash of the URL
        - `ext`: a mimetypes guessed extension
        """
        fname = sha1(encode(self.url, "utf-8")).hexdigest()
        # Fall back to no extension when the mimetype can't be guessed.
        ext = mimetypes.guess_extension(self.mimetype) if self.mimetype else None
        return f"{fname}{ext or ''}"

    @cached_property
    def relpath(self) -> str:
        return f"static/{self.filename}"

    def __init__(self, url: str, session: Optional[requests.Session] = None):
        self.url = url
        self.session = session or requests.Session()

        self.fetch()

    def fetch(self):
        temp = Path(temp_path, self.filename)

        # Serve from the on-disk cache if this URL has been fetched before.
        if temp.exists():
            self.content = temp.read_bytes()
            self.success = True
            return

        try:
            r = self.session.get(self.url, headers=headers)
            # requests only raises HTTPError on demand, so ask for it.
            r.raise_for_status()
            self.content = r.content
            temp.write_bytes(r.content)
            self.success = True
        except requests.HTTPError as e:
            log.warning(
                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
            )

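# A quick sketch of the naming convention above (hypothetical URL; the values
# asserted are what the stdlib mimetypes tables produce):
def _demo_asset_naming() -> None:
    url = "https://example.com/images/cover.png"
    fname = sha1(encode(url, "utf-8")).hexdigest()
    assert len(fname) == 40  # SHA-1 hex digest
    assert mimetypes.guess_type(url) == ("image/png", None)
    assert mimetypes.guess_extension("image/png") == ".png"
    # Asset(url).relpath would therefore be f"static/{fname}.png"
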
class ScribbleChapter:
    parent: ScribbleBook

    index: int
    title: str
    text: str  # HTML content of chapter
    date: arrow.Arrow

    def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
        self.parent = parent
        self.source_url = url

        self.session = session
        self.add_asset = self.parent.add_asset

    def __str__(self):
        return (
            f"ScribbleChapter(\n"
            f" Index: {self.index}\n"
            f" Title: {self.title}\n"
            f" Date: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
            f" Url: {self.source_url}\n"
            f")"
        )

    def load(self):
        resp = self.session.get(self.source_url, headers=headers)
        soup = BeautifulSoup(resp.text, "lxml")

        # Record any languages declared on the page that we haven't seen yet.
        for tag in soup.find_all(lambda x: x.has_attr("lang")):
            if tag["lang"] not in self.parent.languages:
                log.debug(f'Found language {tag["lang"]}')
                self.parent.languages.append(tag["lang"])

        self.title = soup.find(class_="chapter-title").text
        log.info(f"{self.parent.title} Chapter {self.index}: {self.title}")

        if not mimetypes.inited:
            mimetypes.init(None)

        # Queue every embedded image for download alongside the text.
        for asset in soup.select("#chp_contents img[src]"):
            self.add_asset(asset["src"])

        header_tag = soup.new_tag("h2")
        header_tag.string = self.title
        chap_text = soup.find(class_="chp_raw").extract()
        chap_text.insert(0, header_tag)
        self.text = ftfy.fix_text(chap_text.prettify())
        self.fix_footnotes()

    def fix_footnotes(self):
        """
        Iterate through any footnotes and refactor them to ePub format
        """
        soup = BeautifulSoup(self.text, "lxml")
        footnotes = []
        for tag in soup.select(".modern-footnotes-footnote"):
            mfn = tag["data-mfn"]  # the attribute value is already a string
            log.debug(f"Found footnote {mfn}")
            anchors = tag.find_all("a")
            # Quote the attribute value: a bare number is not a valid CSS
            # identifier.
            content_matches = soup.select(
                f'.modern-footnotes-footnote__note[data-mfn="{mfn}"]'
            )
            if not anchors or not content_matches:
                return
            anchor = anchors[-1]
            content_tag = content_matches[0]
            anchor["id"] = f"noteanchor-{mfn}"
            anchor["href"] = f"#note-{mfn}"
            anchor["epub:type"] = "noteref"

            content_tag.name = "aside"
            content_tag["id"] = f"note-{mfn}"
            content_tag["epub:type"] = "footnote"
            footnote_anchor = soup.new_tag("a", href=f"#noteanchor-{mfn}")
            footnote_anchor.string = f"{mfn}."
            content_tag.insert(0, footnote_anchor)
            footnotes.append(content_tag)
        if footnotes:
            # Move the collected notes into a "Footnotes" section after the
            # last paragraph.
            tag = soup.find_all("p")[-1]
            footnote_header = soup.new_tag("h2", id="footnotes")
            footnote_header.string = "Footnotes"
            tag.append(footnote_header)
            tag.extend(footnotes)

        soup.smooth()
        self.text = ftfy.fix_text(soup.prettify())

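# A tiny sketch of the footnote rewrite performed above, using hypothetical
# markup in the shape the modern-footnotes plugin emits:
def _demo_footnote_markup() -> None:
    html = (
        '<p>text<span class="modern-footnotes-footnote" data-mfn="1">'
        '<a>1</a></span>'
        '<span class="modern-footnotes-footnote__note" data-mfn="1">note</span></p>'
    )
    soup = BeautifulSoup(html, "lxml")
    note = soup.select('.modern-footnotes-footnote__note[data-mfn="1"]')[0]
    note.name = "aside"
    note["epub:type"] = "footnote"  # ePub 3 semantic footnote
    assert soup.find("aside") is not None
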
class ScribbleBook:
    slug: str
    title: str
    languages: List[str]  # Dublin Core language codes
    cover_url: str
    date: arrow.Arrow
    description: str
    author: str
    publisher: str
    identifier: str  # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
    genres: List[str]
    tags: List[str]

    chapter_count: int

    @cached_property
    def rights(self) -> str:
        return f"© {self.date.year} {self.author}"

    def __str__(self):
        return (
            f"BookMetadata(\n"
            f" Title: {self.title}\n"
            f" Author: {self.author}\n"
            f" Identifier: {self.identifier}\n"
            f" Languages: {', '.join(self.languages)}\n"
            f" Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
            f" Publisher: {self.publisher}\n"
            f" Genres: {', '.join(self.genres)}\n"
            f" Tags: {', '.join(self.tags)}\n"
            f" Rights: {self.rights}\n"
            f" Cover URL: {self.cover_url}\n"
            f" Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
            f")"
        )

    def __init__(self, url: str):
        self.source_url = url
        self.assets: Dict[str, Asset] = {}

        self.languages = []
        self.genres = []
        self.tags = []

        self.chapters: List[ScribbleChapter] = []
        self.session = cloudscraper.create_scraper()

    def add_asset(self, url: str):
        if url is None:
            return
        if url.strip() == "":
            return

        a = Asset(url, self.session)
        if a.success:
            self.assets[a.url] = a
        else:
            log.warning(f"couldn't fetch asset {url}")

    def load(self, limit_chapters: Optional[int] = None):
        self.load_metadata()
        print(str(self))

        self.fetch_chapters(limit=limit_chapters)
        if limit_chapters is not None:
            self.chapters = self.chapters[:limit_chapters]

        for chapter in self.chapters:
            print(str(chapter))
            chapter.load()

    def load_metadata(self) -> None:
        """
        Load the metadata for this object.

        Will make web requests.
        """

        # Parse the slug and story ID from the source URL
        # (".../series/{story_id}/{slug}").
        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
        self.slug = _parts[-1]
        self.identifier = _parts[-2]

        html = self.session.get(self.source_url, headers=headers)
        soup = BeautifulSoup(html.text, "lxml")

        for tag in soup.find_all(lambda x: x.has_attr("lang")):
            log.debug(f'Found language {tag["lang"]}')
            self.languages.append(tag["lang"])

        url = soup.find(property="og:url")["content"]
        if self.source_url != url:
            log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")

        self.title = soup.find(property="og:title")["content"]
        print(f"Book Title: {self.title}")

        self.cover_url = soup.find(property="og:image")["content"] or ""
        self.add_asset(self.cover_url)

        # The "Last updated" span keeps the full timestamp in its title
        # attribute; skip the 14-character "Last updated: " prefix.
        self.date = arrow.get(
            soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
        )
        description = soup.find(class_="wi_fic_desc")
        self.intro = ftfy.fix_text(description.prettify())
        self.description = ftfy.fix_text(description.text)
        self.author = soup.find(attrs={"name": "twitter:creator"})["content"]
        self.publisher = soup.find(property="og:site_name")["content"]

        self.genres = [a.string for a in soup.find_all(class_="fic_genre")]
        self.tags = [a.string for a in soup.find_all(class_="stag")]
        self.chapter_count = int(soup.find(class_="cnt_toc").text)

        # A per-book rights statement, when present, overrides the default
        # cached property above.
        imgs = soup.find(class_="sb_content copyright").find_all("img")
        for img in imgs:
            if "copy" not in img["class"]:
                continue
            self.rights = ftfy.fix_text(img.next.string)

    def fetch_chapters(self, limit: Optional[int] = None) -> None:
        """
        Fetch the chapters for the work, based on the TOC API
        """
        # The TOC endpoint returns 15 chapters per page.
        page_count = math.ceil(self.chapter_count / 15)
        log.debug(
            f"Expecting {self.chapter_count} chapters, page_count={page_count}"
        )

        if limit is not None:
            # `limit` counts chapters, so convert it to a page count.
            page_count = min(page_count, math.ceil(limit / 15))

        for page in range(1, page_count + 1):
            chapter_resp = self.session.post(
                "https://www.scribblehub.com/wp-admin/admin-ajax.php",
                {
                    "action": "wi_getreleases_pagination",
                    "pagenum": page,
                    "mypostid": self.identifier,
                },
                headers=headers,
            )

            chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
            for chapter_tag in chapter_soup.find_all(class_="toc_w"):
                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
                chapter.index = int(chapter_tag["order"])
                chapter.title = chapter_tag.a.text
                chapter.date = arrow.get(
                    chapter_tag.span["title"], "MMM D, YYYY hh:mm A"
                )
                self.chapters.append(chapter)

        self.chapters.sort(key=lambda x: x.index)

    def build(self):
        pass

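# A minimal usage sketch, assuming a ScribbleHub series URL (the URL below is
# hypothetical; network access is required, and build() is still a stub, so
# this only scrapes and prints):
def _demo_scrape_book() -> None:
    book = ScribbleBook("https://www.scribblehub.com/series/123456/some-slug/")
    book.load(limit_chapters=2)  # metadata plus the first two chapters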