fetching chapter overview

This commit is contained in:
Hazel Noack 2025-06-10 13:02:35 +02:00
parent dd658da548
commit 52f64d824a

View File

@ -1,3 +1,5 @@
from __future__ import annotations
from functools import cached_property from functools import cached_property
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ebooklib import epub from ebooklib import epub
@ -5,8 +7,9 @@ import logging
import cloudscraper import cloudscraper
import arrow import arrow
import ftfy import ftfy
from typing import Iterable from typing import Iterable, List
import re import re
import math
try: try:
import http.client as http_client import http.client as http_client
@ -45,7 +48,6 @@ class BookMetadata:
languages: Iterable[str] # Dublin-core language codes languages: Iterable[str] # Dublin-core language codes
cover_url: str cover_url: str
date: arrow.Arrow date: arrow.Arrow
description: str description: str
author: str author: str
publisher: str publisher: str
@ -80,23 +82,81 @@ class BookMetadata:
) )
class ScribbleChapter:
parent: ScribbleBook
class ScribbleBook: index: int
def __init__(self, url: str): title: str
self.metadata = BookMetadata() text: str # HTML content of chapter
date: arrow.Arrow
def __init__(self, parent: ScribbleBook, url: str):
self.parent = parent
self.source_url = url self.source_url = url
print(f"scraping {url})") def __str__(self):
return (
f"ScribbleChapter(\n"
f" Index: {self.index}\n"
f" Title: {self.title}\n"
f" Date: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
f" Url: {self.source_url}\n"
f")"
)
class ScribbleBook:
slug: str
title: str
languages: List[str] # Dublin-core language codes
cover_url: str
date: arrow.Arrow
description: str
author: str
publisher: str
identifier: str # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
genres: List[str]
tags: List[str]
chapter_count: int
@cached_property
def rights(self) -> str:
return f"© {self.date.year} {self.author}"
def __str__(self):
return (
f"BookMetadata(\n"
f" Title: {self.title}\n"
f" Author: {self.author}\n"
f" Identifier: {self.identifier}\n"
f" Languages: {', '.join(self.languages)}\n"
f" Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
f" Publisher: {self.publisher}\n"
f" Genres: {', '.join(self.genres)}\n"
f" Tags: {', '.join(self.tags)}\n"
f" Rights: {self.rights}\n"
f" Cover URL: {self.cover_url}\n"
f" Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
f")"
)
def __init__(self, url: str):
self.source_url = url
self.chapters = []
self.languages = [] self.languages = []
self.genres = [] self.genres = []
self.tags = [] self.tags = []
self.chapters: List[ScribbleChapter] = []
# fetching metadata
self.session = cloudscraper.create_scraper() self.session = cloudscraper.create_scraper()
self.load_metadata() self.load_metadata()
print(str(self.metadata)) print(str(self))
self.get_chapters()
def load_metadata(self) -> None: def load_metadata(self) -> None:
""" """
@ -106,10 +166,10 @@ class ScribbleBook:
# parse info from the source url # parse info from the source url
_parts = [p for p in self.source_url.split("/") if len(p.strip())] _parts = [p for p in self.source_url.split("/") if len(p.strip())]
self.metadata.slug = _parts[-1] self.slug = _parts[-1]
self.metadata.identifier = _parts[-2] self.identifier = _parts[-2]
html = self.session.get(self.source_url) html = self.session.get(self.source_url, headers=headers)
print(html) print(html)
html = self.session.get(self.source_url) html = self.session.get(self.source_url)
@ -123,24 +183,61 @@ class ScribbleBook:
if self.source_url != url: if self.source_url != url:
log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}") log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")
self.metadata.title = soup.find(property="og:title")["content"] self.title = soup.find(property="og:title")["content"]
print(f"Book Title: {self.metadata.title}") print(f"Book Title: {self.title}")
self.metadata.cover_url = soup.find(property="og:image")["content"] or "" self.cover_url = soup.find(property="og:image")["content"] or ""
self.metadata.date = arrow.get( self.date = arrow.get(
soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A" soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
) )
description = soup.find(class_="wi_fic_desc") description = soup.find(class_="wi_fic_desc")
self.metadata.intro = ftfy.fix_text(description.prettify()) self.intro = ftfy.fix_text(description.prettify())
self.metadata.description = ftfy.fix_text(description.text) self.description = ftfy.fix_text(description.text)
self.metadata.author = soup.find(attrs={"name": "twitter:creator"})["content"] self.author = soup.find(attrs={"name": "twitter:creator"})["content"]
self.metadata.publisher = soup.find(property="og:site_name")["content"] self.publisher = soup.find(property="og:site_name")["content"]
self.genres = [a.string for a in soup.find_all(class_="fic_genre")]
self.tags = [a.string for a in soup.find_all(class_="stag")]
self.chapter_count = int(soup.find(class_="cnt_toc").text)
self.metadata.genres = [a.string for a in soup.find_all(class_="fic_genre")]
self.metadata.tags = [a.string for a in soup.find_all(class_="stag")]
imgs = soup.find(class_="sb_content copyright").find_all("img") imgs = soup.find(class_="sb_content copyright").find_all("img")
for img in imgs: for img in imgs:
if "copy" not in img["class"]: if "copy" not in img["class"]:
continue continue
self.metadata.rights = ftfy.fix_text(img.next.string) self.rights = ftfy.fix_text(img.next.string)
def get_chapters(self) -> None:
"""
Fetch the chapters for the work, based on the TOC API
"""
page_count = math.ceil(self.chapter_count / 15)
log.debug(
f"Expecting {self.chapter_count} chapters, page_count={page_count}"
)
for page in range(1, page_count + 1):
chapter_resp = self.session.post(
"https://www.scribblehub.com/wp-admin/admin-ajax.php",
{
"action": "wi_getreleases_pagination",
"pagenum": page,
"mypostid": self.identifier,
},
headers=headers,
)
chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
for chapter_tag in chapter_soup.find_all(class_="toc_w"):
chapter = ScribbleChapter(self, chapter_tag.a["href"])
chapter.index = int(chapter_tag["order"])
chapter.title = chapter_tag.a.text
chapter.date = arrow.get(
chapter_tag.span["title"], "MMM D, YYYY hh:mm A"
)
self.chapters.append(chapter)
self.chapters.sort(key=lambda x: x.index)
for c in self.chapters:
print(str(c))