From 52f64d824a26f710d954d25520626445ef5b237e Mon Sep 17 00:00:00 2001
From: Hazel Noack <hazel@webcontact.de>
Date: Tue, 10 Jun 2025 13:02:35 +0200
Subject: [PATCH] fetching chapter overview

---
 scribble_to_epub/scribblehub.py | 143 +++++++++++++++++++++++++++-----
 1 file changed, 120 insertions(+), 23 deletions(-)

diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py
index 2b68fc5..329cae0 100644
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from functools import cached_property
 from bs4 import BeautifulSoup
 from ebooklib import epub
@@ -5,8 +7,9 @@ import logging
 import cloudscraper
 import arrow
 import ftfy
-from typing import Iterable
+from typing import Iterable, List
 import re
+import math
 
 try:
     import http.client as http_client
@@ -45,7 +48,6 @@ class BookMetadata:
     languages: Iterable[str]    # Dublin-core language codes
     cover_url: str
     date: arrow.Arrow
-
     description: str
     author: str
     publisher: str
@@ -80,23 +82,81 @@ class BookMetadata:
         )
 
 
+class ScribbleChapter:
+    parent: ScribbleBook
 
-class ScribbleBook:
-    def __init__(self, url: str):
-        self.metadata = BookMetadata()
-        
+    index: int
+    title: str
+    text: str   # HTML content of chapter
+    date: arrow.Arrow
+
+    def __init__(self, parent: ScribbleBook, url: str):
+        self.parent = parent
         self.source_url = url
 
-        print(f"scraping {url})")
+    def __str__(self):
+        return (
+            f"ScribbleChapter(\n"
+            f"  Index: {self.index}\n"
+            f"  Title: {self.title}\n"
+            f"  Date: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
+            f"  Url: {self.source_url}\n"
+            f")"
+        )
+    
 
-        self.chapters = []
+
+class ScribbleBook:
+    slug: str
+    title: str
+    languages: List[str]    # Dublin-core language codes
+    cover_url: str
+    date: arrow.Arrow
+    description: str
+    author: str
+    publisher: str
+    identifier: str # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
+    genres: List[str]
+    tags: List[str]
+
+    chapter_count: int
+    
+    @cached_property
+    def rights(self) -> str:
+        return f"© {self.date.year} {self.author}"
+
+    def __str__(self):
+        return (
+            f"BookMetadata(\n"
+            f"  Title: {self.title}\n"
+            f"  Author: {self.author}\n"
+            f"  Identifier: {self.identifier}\n"
+            f"  Languages: {', '.join(self.languages)}\n"
+            f"  Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
+            f"  Publisher: {self.publisher}\n"
+            f"  Genres: {', '.join(self.genres)}\n"
+            f"  Tags: {', '.join(self.tags)}\n"
+            f"  Rights: {self.rights}\n"
+            f"  Cover URL: {self.cover_url}\n"
+            f"  Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
+            f")"
+        )
+
+    def __init__(self, url: str):
+        self.source_url = url
+        
         self.languages = []
         self.genres = []
         self.tags = []
 
+        self.chapters: List[ScribbleChapter] = []
+
+        # fetching metadata
         self.session = cloudscraper.create_scraper()
         self.load_metadata()
-        print(str(self.metadata))
+        print(str(self))
+
+        self.get_chapters()
 
     def load_metadata(self) -> None:
         """
@@ -106,10 +166,10 @@ class ScribbleBook:
 
         # parse info from the source url
         _parts = [p for p in self.source_url.split("/") if len(p.strip())]
-        self.metadata.slug = _parts[-1]
-        self.metadata.identifier = _parts[-2]
+        self.slug = _parts[-1]
+        self.identifier = _parts[-2]
 
-        html = self.session.get(self.source_url)
+        html = self.session.get(self.source_url, headers=headers)
         print(html)
 
         html = self.session.get(self.source_url)
@@ -123,24 +183,61 @@ class ScribbleBook:
         if self.source_url != url:
             log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")
 
-        self.metadata.title = soup.find(property="og:title")["content"]
-        print(f"Book Title: {self.metadata.title}")
+        self.title = soup.find(property="og:title")["content"]
+        print(f"Book Title: {self.title}")
 
-        self.metadata.cover_url = soup.find(property="og:image")["content"] or ""
-        self.metadata.date = arrow.get(
+        self.cover_url = soup.find(property="og:image")["content"] or ""
+        self.date = arrow.get(
             soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
         )
         description = soup.find(class_="wi_fic_desc")
-        self.metadata.intro = ftfy.fix_text(description.prettify())
-        self.metadata.description = ftfy.fix_text(description.text)
-        self.metadata.author = soup.find(attrs={"name": "twitter:creator"})["content"]
-        self.metadata.publisher = soup.find(property="og:site_name")["content"]
+        self.intro = ftfy.fix_text(description.prettify())
+        self.description = ftfy.fix_text(description.text)
+        self.author = soup.find(attrs={"name": "twitter:creator"})["content"]
+        self.publisher = soup.find(property="og:site_name")["content"]
         
-        self.metadata.genres = [a.string for a in soup.find_all(class_="fic_genre")]
-        self.metadata.tags = [a.string for a in soup.find_all(class_="stag")]
+        self.genres = [a.string for a in soup.find_all(class_="fic_genre")]
+        self.tags = [a.string for a in soup.find_all(class_="stag")]
+        self.chapter_count = int(soup.find(class_="cnt_toc").text)
+
 
         imgs = soup.find(class_="sb_content copyright").find_all("img")
         for img in imgs:
             if "copy" not in img["class"]:
                 continue
-            self.metadata.rights = ftfy.fix_text(img.next.string)
+            self.rights = ftfy.fix_text(img.next.string)
+
+    def get_chapters(self) -> None:
+        """
+        Fetch the chapters for the work, based on the TOC API
+        """
+        page_count = math.ceil(self.chapter_count / 15)
+        log.debug(
+            f"Expecting {self.chapter_count} chapters, page_count={page_count}"
+        )
+
+        for page in range(1, page_count + 1):
+            chapter_resp = self.session.post(
+                "https://www.scribblehub.com/wp-admin/admin-ajax.php",
+                {
+                    "action": "wi_getreleases_pagination",
+                    "pagenum": page,
+                    "mypostid": self.identifier,
+                },
+                headers=headers,
+            )
+
+            chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
+            for chapter_tag in chapter_soup.find_all(class_="toc_w"):
+                chapter = ScribbleChapter(self, chapter_tag.a["href"])
+                chapter.index = int(chapter_tag["order"])
+                chapter.title = chapter_tag.a.text
+                chapter.date = arrow.get(
+                    chapter_tag.span["title"], "MMM D, YYYY hh:mm A"
+                )
+                self.chapters.append(chapter)
+
+        self.chapters.sort(key=lambda x: x.index)
+
+        for c in self.chapters:
+            print(str(c))