removed book metadata

2025-06-10 13:46:32 +02:00 · 2025-06-10 13:46:32 +02:00 · bc05ee49ea
commit bc05ee49ea
parent 52f64d824a
1 changed files with 97 additions and 47 deletions
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@ -9,6 +9,7 @@ import arrow
 import ftfy
 from typing import Iterable, List
 import re
+import mimetypes
 import math

 try:
@ -37,51 +38,6 @@ CHAPTER_MATCH = re.compile(
 STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
 DATE_MATCH = re.compile("Last updated: .*")

-
-class BookMetadata:
-    """
-    Represents the metadata for the book
-    """
-
-    slug: str
-    title: str
-    languages: Iterable[str]    # Dublin-core language codes
-    cover_url: str
-    date: arrow.Arrow
-    description: str
-    author: str
-    publisher: str
-    identifier: str # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
-    genres: Iterable[str]
-    tags: Iterable[str]
-    
-    @cached_property
-    def rights(self) -> str:
-        return f"© {self.date.year} {self.author}"
-
-    def __init__(self):
-        self.languages = []
-        self.genres = []
-        self.tags = []
-
-    def __str__(self):
-        return (
-            f"BookMetadata(\n"
-            f"  Title: {self.title}\n"
-            f"  Author: {self.author}\n"
-            f"  Identifier: {self.identifier}\n"
-            f"  Languages: {', '.join(self.languages)}\n"
-            f"  Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
-            f"  Publisher: {self.publisher}\n"
-            f"  Genres: {', '.join(self.genres)}\n"
-            f"  Tags: {', '.join(self.tags)}\n"
-            f"  Rights: {self.rights}\n"
-            f"  Cover URL: {self.cover_url}\n"
-            f"  Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
-            f")"
-        )
-
-
 class ScribbleChapter:
    parent: ScribbleBook

@ -90,10 +46,12 @@ class ScribbleChapter:
    text: str   # HTML content of chapter
    date: arrow.Arrow

-    def __init__(self, parent: ScribbleBook, url: str):
+    def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
        self.parent = parent
        self.source_url = url

+        self.session = session
+
    def __str__(self):
        return (
            f"ScribbleChapter(\n"
@ -104,6 +62,95 @@ class ScribbleChapter:
            f")"
        )
    
+    def load(self):
+        """
+        Download the chapter page and populate ``title`` and ``text``.
+
+        Fetches ``source_url`` through the session given to the constructor,
+        appends any ``lang`` attribute values found in the page to
+        ``parent.languages``, prepends the chapter title as an ``<h2>`` to the
+        ``chp_raw`` element, stores the resulting HTML in ``text`` and finally
+        runs ``fix_footnotes`` on it.
+
+        Raises:
+            ValueError: if the page has no ``chp_raw`` element, or it has no
+                ``chapter-title`` element and no title was set beforehand.
+            Whatever ``resp.raise_for_status()`` raises for an HTTP error
+            response.
+        """
+        resp = self.session.get(self.source_url, headers=headers)
+        # Fail on HTTP errors here instead of parsing an error page below.
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "lxml")
+
+        # Collect every language declared in the page on the parent book.
+        for tag in soup.find_all(lambda x: x.has_attr("lang")):
+            if tag["lang"] not in self.parent.languages:
+                log.debug(f'Found language {tag["lang"]}')
+                self.parent.languages.append(tag["lang"])
+
+        title_tag = soup.find(class_="chapter-title")
+        if title_tag is not None:
+            self.title = title_tag.text
+        elif getattr(self, "title", None) is None:
+            # No title in the page and none assigned earlier: nothing to use
+            # for the chapter heading.
+            raise ValueError(f"No chapter title found at {self.source_url}")
+        log.info(f"{self.parent.title} Chapter {self.index}: {self.title}")
+
+        # Only needed by the disabled asset handling below.
+        if not mimetypes.inited:
+            mimetypes.init(None)
+
+        # NOTE: asset downloading is currently disabled. The string literal
+        # below is an inert expression statement kept as a draft; it is never
+        # executed.
+        """
+        for asset in soup.select("#chp_contents img[src]"):
+            if asset["src"] not in self.assets:
+                log.debug(f'Found asset at {asset["src"]}')
+                try:
+                    asset_resp = session.get(asset["src"], headers=headers)
+                except HTTPError as e:
+                    # just remove the asset from HTML if we have fetch issues
+                    log.warning(
+                        f'Issue fetching asset {asset["src"]} because "{e.response.status_code}: {e.response.reason}"'
+                    )
+                    asset.extract()
+                    continue
+                fname = sha1(encode(asset["src"], "utf-8")).hexdigest()
+                mimetype, _ = mimetypes.guess_type(asset["src"])
+                log.debug(f"Asset is {mimetype}")
+                ext = mimetypes.guess_extension(mimetype)
+                relpath = f"static/{fname}{ext}"
+                self.assets[asset["src"]] = {
+                    "content": asset_resp.content,
+                    "relpath": relpath,
+                    "mimetype": mimetype,
+                    "uid": fname,
+                }
+            else:
+                relpath = self.assets[asset["src"]]["relpath"]
+            log.debug(f"Updating asset to {relpath} from {asset['src']}")
+            asset["src"] = relpath
+        """
+
+        chap_text = soup.find(class_="chp_raw")
+        if chap_text is None:
+            raise ValueError(f"No chapter content found at {self.source_url}")
+        chap_text = chap_text.extract()
+
+        # Put the chapter title at the top of the extracted content.
+        header_tag = soup.new_tag("h2")
+        header_tag.string = self.title
+        chap_text.insert(0, header_tag)
+        self.text = ftfy.fix_text(chap_text.prettify())
+        self.fix_footnotes()
+
+    def fix_footnotes(self):
+        """
+        Iterate through any footnotes and refactor them to ePub format.
+
+        Re-parses ``self.text`` and, for every ``.modern-footnotes-footnote``
+        element, turns its last ``<a>`` into an ``epub:type="noteref"`` link
+        and the matching ``.modern-footnotes-footnote__note`` element (same
+        ``data-mfn`` value) into an ``<aside epub:type="footnote">`` that
+        starts with a back-link to the reference. The converted notes are
+        moved under a "Footnotes" heading appended to the last ``<p>`` of the
+        chapter. ``self.text`` is always rewritten with the resulting markup.
+
+        Footnotes that lack a ``data-mfn`` value, an anchor, or a matching
+        note element are skipped; the remaining ones are still processed.
+        """
+        soup = BeautifulSoup(self.text, "lxml")
+        footnotes = []
+        for tag in soup.select(".modern-footnotes-footnote"):
+            # Attribute access yields a plain string, not a tag.
+            mfn = tag.get("data-mfn")
+            anchors = tag.find_all("a")
+            if mfn is None or not anchors:
+                continue
+            log.debug(f"Found footnote {mfn}")
+            anchor = anchors[-1]
+            # Look the note up by attribute value rather than via a CSS
+            # selector, so numeric ids need no quoting or escaping.
+            content_tag = soup.find(
+                class_="modern-footnotes-footnote__note",
+                attrs={"data-mfn": mfn},
+            )
+            if content_tag is None:
+                continue
+            anchor["id"] = f"noteanchor-{mfn}"
+            anchor["href"] = f"#note-{mfn}"
+            anchor["epub:type"] = "noteref"
+
+            content_tag.name = "aside"
+            content_tag["id"] = f"note-{mfn}"
+            content_tag["epub:type"] = "footnote"
+            footnote_anchor = soup.new_tag("a", href=f"#noteanchor-{mfn}")
+            footnote_anchor.string = f"{mfn}."
+            # The back-link goes inside the note itself, and the note (not
+            # the search result list) is what gets moved to the footer.
+            content_tag.insert(0, footnote_anchor)
+            footnotes.append(content_tag)
+        if footnotes:
+            paragraphs = soup.find_all("p")
+            # Fall back to the document body (or root) when the chapter
+            # contains no paragraph to attach the footnotes to.
+            target = paragraphs[-1] if paragraphs else (soup.body or soup)
+            footnote_header = soup.new_tag("h2", id="footnotes")
+            footnote_header.string = "Footnotes"
+            target.append(footnote_header)
+            target.extend(footnotes)
+
+        soup.smooth()
+        self.text = ftfy.fix_text(soup.prettify())
+    


 class ScribbleBook:
@ -157,6 +204,9 @@ class ScribbleBook:
        print(str(self))

        self.get_chapters()
+        c = self.chapters[0]
+        c.load()
+        print(c.text)

    def load_metadata(self) -> None:
        """
@ -229,7 +279,7 @@ class ScribbleBook:

            chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
            for chapter_tag in chapter_soup.find_all(class_="toc_w"):
-                chapter = ScribbleChapter(self, chapter_tag.a["href"])
+                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
                chapter.index = int(chapter_tag["order"])
                chapter.title = chapter_tag.a.text
                chapter.date = arrow.get(