From 52f64d824a26f710d954d25520626445ef5b237e Mon Sep 17 00:00:00 2001 From: Hazel Noack Date: Tue, 10 Jun 2025 13:02:35 +0200 Subject: [PATCH] fetching chapter overview --- scribble_to_epub/scribblehub.py | 143 +++++++++++++++++++++++++++----- 1 file changed, 120 insertions(+), 23 deletions(-) diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py index 2b68fc5..329cae0 100644 --- a/scribble_to_epub/scribblehub.py +++ b/scribble_to_epub/scribblehub.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from functools import cached_property from bs4 import BeautifulSoup from ebooklib import epub @@ -5,8 +7,9 @@ import logging import cloudscraper import arrow import ftfy -from typing import Iterable +from typing import Iterable, List import re +import math try: import http.client as http_client @@ -45,7 +48,6 @@ class BookMetadata: languages: Iterable[str] # Dublin-core language codes cover_url: str date: arrow.Arrow - description: str author: str publisher: str @@ -80,23 +82,81 @@ class BookMetadata: ) +class ScribbleChapter: + parent: ScribbleBook -class ScribbleBook: - def __init__(self, url: str): - self.metadata = BookMetadata() - + index: int + title: str + text: str # HTML content of chapter + date: arrow.Arrow + + def __init__(self, parent: ScribbleBook, url: str): + self.parent = parent self.source_url = url - print(f"scraping {url})") + def __str__(self): + return ( + f"ScribbleChapter(\n" + f" Index: {self.index}\n" + f" Title: {self.title}\n" + f" Date: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n" + f" Url: {self.source_url}\n" + f")" + ) + - self.chapters = [] + +class ScribbleBook: + slug: str + title: str + languages: List[str] # Dublin-core language codes + cover_url: str + date: arrow.Arrow + description: str + author: str + publisher: str + identifier: str # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.) + genres: List[str] + tags: List[str] + + chapter_count: int + + @cached_property + def rights(self) -> str: + return f"© {self.date.year} {self.author}" + + def __str__(self): + return ( + f"BookMetadata(\n" + f" Title: {self.title}\n" + f" Author: {self.author}\n" + f" Identifier: {self.identifier}\n" + f" Languages: {', '.join(self.languages)}\n" + f" Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n" + f" Publisher: {self.publisher}\n" + f" Genres: {', '.join(self.genres)}\n" + f" Tags: {', '.join(self.tags)}\n" + f" Rights: {self.rights}\n" + f" Cover URL: {self.cover_url}\n" + f" Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n" + f")" + ) + + def __init__(self, url: str): + self.source_url = url + self.languages = [] self.genres = [] self.tags = [] + self.chapters: List[ScribbleChapter] = [] + + # fetching metadata self.session = cloudscraper.create_scraper() self.load_metadata() - print(str(self.metadata)) + print(str(self)) + + self.get_chapters() def load_metadata(self) -> None: """ @@ -106,10 +166,10 @@ class ScribbleBook: # parse info from the source url _parts = [p for p in self.source_url.split("/") if len(p.strip())] - self.metadata.slug = _parts[-1] - self.metadata.identifier = _parts[-2] + self.slug = _parts[-1] + self.identifier = _parts[-2] - html = self.session.get(self.source_url) + html = self.session.get(self.source_url, headers=headers) print(html) html = self.session.get(self.source_url) @@ -123,24 +183,61 @@ class ScribbleBook: if self.source_url != url: log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}") - self.metadata.title = soup.find(property="og:title")["content"] - print(f"Book Title: {self.metadata.title}") + self.title = soup.find(property="og:title")["content"] + print(f"Book Title: {self.title}") - self.metadata.cover_url = soup.find(property="og:image")["content"] or "" - self.metadata.date = arrow.get( + self.cover_url = soup.find(property="og:image")["content"] or "" + self.date = arrow.get( soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A" ) description = soup.find(class_="wi_fic_desc") - self.metadata.intro = ftfy.fix_text(description.prettify()) - self.metadata.description = ftfy.fix_text(description.text) - self.metadata.author = soup.find(attrs={"name": "twitter:creator"})["content"] - self.metadata.publisher = soup.find(property="og:site_name")["content"] + self.intro = ftfy.fix_text(description.prettify()) + self.description = ftfy.fix_text(description.text) + self.author = soup.find(attrs={"name": "twitter:creator"})["content"] + self.publisher = soup.find(property="og:site_name")["content"] - self.metadata.genres = [a.string for a in soup.find_all(class_="fic_genre")] - self.metadata.tags = [a.string for a in soup.find_all(class_="stag")] + self.genres = [a.string for a in soup.find_all(class_="fic_genre")] + self.tags = [a.string for a in soup.find_all(class_="stag")] + self.chapter_count = int(soup.find(class_="cnt_toc").text) + imgs = soup.find(class_="sb_content copyright").find_all("img") for img in imgs: if "copy" not in img["class"]: continue - self.metadata.rights = ftfy.fix_text(img.next.string) + self.rights = ftfy.fix_text(img.next.string) + + def get_chapters(self) -> None: + """ + Fetch the chapters for the work, based on the TOC API + """ + page_count = math.ceil(self.chapter_count / 15) + log.debug( + f"Expecting {self.chapter_count} chapters, page_count={page_count}" + ) + + for page in range(1, page_count + 1): + chapter_resp = self.session.post( + "https://www.scribblehub.com/wp-admin/admin-ajax.php", + { + "action": "wi_getreleases_pagination", + "pagenum": page, + "mypostid": self.identifier, + }, + headers=headers, + ) + + chapter_soup = BeautifulSoup(chapter_resp.text, "lxml") + for chapter_tag in chapter_soup.find_all(class_="toc_w"): + chapter = ScribbleChapter(self, chapter_tag.a["href"]) + chapter.index = int(chapter_tag["order"]) + chapter.title = chapter_tag.a.text + chapter.date = arrow.get( + chapter_tag.span["title"], "MMM D, YYYY hh:mm A" + ) + self.chapters.append(chapter) + + self.chapters.sort(key=lambda x: x.index) + + for c in self.chapters: + print(str(c))