diff --git a/README.md b/README.md index ca92027..1ac15af 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ # Scribble_to_epub -This scrapes books from https://www.scribblehub.com/ and creates epub from them \ No newline at end of file +This scrapes books from https://www.scribblehub.com/ and creates epub from them. + +Inspired by [py-scribblehub-to-epub](https://github.com/agmlego/py-scribblehub-to-epub). Thanks. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d92d00e..fe7367a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,8 @@ dependencies = [ 'beautifulsoup4', 'requests', 'EbookLib', + 'arrow', + 'ftfy', ] authors = [] description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them" diff --git a/scribble_to_epub/connection.py b/scribble_to_epub/connection.py deleted file mode 100644 index 3629729..0000000 --- a/scribble_to_epub/connection.py +++ /dev/null @@ -1,36 +0,0 @@ -import cloudscraper -import logging - -try: - import http.client as http_client -except ImportError: - # Python 2 - import httplib as http_client -http_client.HTTPConnection.debuglevel = 1 - -# You must initialize logging, otherwise you'll not see debug output. 
-logging.basicConfig() -logging.getLogger().setLevel(logging.DEBUG) -requests_log = logging.getLogger("requests.packages.urllib3") -requests_log.setLevel(logging.DEBUG) -requests_log.propagate = True - - - -def get_session() -> cloudscraper.CloudScraper: - """ - session = requests.Session() - session.headers = { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, br, zstd", - "Accept-Language": "en-US,en;q=0.5", - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0", - "Referer": "https://www.scribblehub.com/" - } - - return session - """ - - return cloudscraper.create_scraper() \ No newline at end of file diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py index 49fc892..2b68fc5 100644 --- a/scribble_to_epub/scribblehub.py +++ b/scribble_to_epub/scribblehub.py @@ -1,29 +1,146 @@ from functools import cached_property +from bs4 import BeautifulSoup +from ebooklib import epub +import logging +import cloudscraper +import arrow +import ftfy +from typing import Iterable +import re + +try: + import http.client as http_client +except ImportError: + # Python 2 + import httplib as http_client +http_client.HTTPConnection.debuglevel = 1 + +# You must initialize logging, otherwise you'll not see debug output. 
# Turn on verbose HTTP logging so every request/response is visible while
# scraping.  NOTE(review): this configures the *root* logger at DEBUG, which
# is very noisy for a library module — consider gating it behind a CLI flag.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True


log = logging.getLogger(__name__)

headers = {"User-Agent": "node"}

# URL shapes used by scribblehub.com.  The named groups were garbled in the
# patch text (the "<name>" parts of "(?P<name>...)" were stripped, leaving
# invalid "(?P" sequences that make re.compile raise at import time) — the
# group names are reconstructed here from the URL structure.
CHAPTER_MATCH = re.compile(
    r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
)
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
DATE_MATCH = re.compile("Last updated: .*")


class BookMetadata:
    """
    Represents the metadata for the book.

    Populated field-by-field by ScribbleBook.load_metadata(); only the
    list-valued attributes are initialised in __init__.
    """

    slug: str
    title: str
    languages: Iterable[str]  # Dublin-core language codes
    cover_url: str
    date: arrow.Arrow

    description: str
    author: str
    publisher: str
    identifier: str  # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
    genres: Iterable[str]
    tags: Iterable[str]

    @cached_property
    def rights(self) -> str:
        # Copyright line derived from publication year and author.
        # "©" appeared as the mojibake "Â©" in the patch — repaired here.
        return f"© {self.date.year} {self.author}"

    def __init__(self):
        self.languages = []
        self.genres = []
        self.tags = []

    def __str__(self):
        # Human-readable dump used for console feedback after scraping.
        return (
            f"BookMetadata(\n"
            f"  Title: {self.title}\n"
            f"  Author: {self.author}\n"
            f"  Identifier: {self.identifier}\n"
            f"  Languages: {', '.join(self.languages)}\n"
            f"  Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
            f"  Publisher: {self.publisher}\n"
            f"  Genres: {', '.join(self.genres)}\n"
            f"  Tags: {', '.join(self.tags)}\n"
            f"  Rights: {self.rights}\n"
            f"  Cover URL: {self.cover_url}\n"
            f"  Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
            f")"
        )


class ScribbleBook:
    """Scrapes one scribblehub.com series page and collects its metadata."""

    def __init__(self, url: str):
        """
        :param url: the scribblehub.com series URL to scrape
        """
        self.metadata = BookMetadata()
        self.source_url = url
        # fixed: the patch had a stray ")" inside the f-string literal
        print(f"scraping {url}")

        self.chapters = []
        self.languages = []
        self.genres = []
        self.tags = []

        # cloudscraper session gets us past Cloudflare's browser check
        self.session = cloudscraper.create_scraper()
        self.load_metadata()
        print(str(self.metadata))

    def load_metadata(self) -> None:
        """
        Load the metadata for this object
        will make web requests
        """
        # parse info from the source url
        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
        self.metadata.slug = _parts[-1]
        self.metadata.identifier = _parts[-2]

        # single fetch — the patch requested the page twice and printed the
        # raw response object; the duplicate debug request was dropped
        html = self.session.get(self.source_url)
        soup = BeautifulSoup(html.text, "lxml")

        for tag in soup.find_all(lambda x: x.has_attr("lang")):
            log.debug(f'Found language {tag["lang"]}')
            # NOTE(review): languages accumulate on the book itself, not on
            # self.metadata.languages — confirm this is intended
            self.languages.append(tag["lang"])

        url = soup.find(property="og:url")["content"]
        if self.source_url != url:
            log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")

        self.metadata.title = soup.find(property="og:title")["content"]
        print(f"Book Title: {self.metadata.title}")

        self.metadata.cover_url = soup.find(property="og:image")["content"] or ""
        # the span's title attribute reads "Last updated: <date>"; skip the
        # 14-character prefix before handing the date text to arrow
        self.metadata.date = arrow.get(
            soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
        )
        description = soup.find(class_="wi_fic_desc")
        # ftfy repairs any mojibake in the scraped HTML/text
        self.metadata.intro = ftfy.fix_text(description.prettify())
        self.metadata.description = ftfy.fix_text(description.text)
        self.metadata.author = soup.find(attrs={"name": "twitter:creator"})["content"]
        self.metadata.publisher = soup.find(property="og:site_name")["content"]

        self.metadata.genres = [a.string for a in soup.find_all(class_="fic_genre")]
        self.metadata.tags = [a.string for a in soup.find_all(class_="stag")]

        # the copyright notice is rendered as an image; keep only <img> tags
        # whose class contains "copy" (overrides the cached rights property)
        imgs = soup.find(class_="sb_content copyright").find_all("img")
        for img in imgs:
            if "copy" not in img["class"]:
                continue
            self.metadata.rights = ftfy.fix_text(img.next.string)