fetching of metadata

2025-06-10 12:43:12 +02:00 · 2025-06-10 12:43:12 +02:00 · dd658da548
commit dd658da548
parent 307f86aeee
4 changed files with 131 additions and 46 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
 # Scribble_to_epub

-This scrapes books from https://www.scribblehub.com/ and  creates epub from them
+This scrapes books from https://www.scribblehub.com/ and  creates epub from them.
+
+Inspired by [py-scribblehub-to-epub](https://github.com/agmlego/py-scribblehub-to-epub). Thanks. 
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,8 @@ dependencies = [
    'beautifulsoup4',
    'requests',
    'EbookLib',
+    'arrow',
+    'ftfy',
 ]
 authors = []
 description = "This scrapes books from https://www.scribblehub.com/ and  creates epub from them"
--- a/scribble_to_epub/connection.py
+++ b/scribble_to_epub/connection.py
@@ -1,36 +0,0 @@
-import cloudscraper
-import logging
-
-try:
-    import http.client as http_client
-except ImportError:
-    # Python 2
-    import httplib as http_client
-http_client.HTTPConnection.debuglevel = 1
-
-# You must initialize logging, otherwise you'll not see debug output.
-logging.basicConfig()
-logging.getLogger().setLevel(logging.DEBUG)
-requests_log = logging.getLogger("requests.packages.urllib3")
-requests_log.setLevel(logging.DEBUG)
-requests_log.propagate = True
-
-
-
-def get_session() -> cloudscraper.CloudScraper:
-    """
-    session = requests.Session()
-    session.headers = {
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-        "Accept-Encoding": "gzip, deflate, br, zstd",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
-        "Referer": "https://www.scribblehub.com/"
-    }
-
-    return session
-    """
-
-    return cloudscraper.create_scraper()
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@@ -1,29 +1,146 @@
 from functools import cached_property
+from bs4 import BeautifulSoup
+from ebooklib import epub
+import logging
+import cloudscraper
+import arrow
+import ftfy
+from typing import Iterable
+import re
+
+try:
+    import http.client as http_client
+except ImportError:
+    # Python 2
+    import httplib as http_client
+http_client.HTTPConnection.debuglevel = 1
+
+# You must initialize logging, otherwise you'll not see debug output.
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+requests_log = logging.getLogger("requests.packages.urllib3")
+requests_log.setLevel(logging.DEBUG)
+requests_log.propagate = True
+
+
+
+log = logging.getLogger(__name__)
+
+headers = {"User-Agent": "node"}
+
+CHAPTER_MATCH = re.compile(
+    r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
+)
+STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
+DATE_MATCH = re.compile("Last updated: .*")
+
+
+class BookMetadata:
+    """
+    Represents the metadata for the book
+    """
+
+    slug: str
+    title: str
+    languages: Iterable[str]    # Dublin-core language codes
+    cover_url: str
+    date: arrow.Arrow
+
+    description: str
+    author: str
+    publisher: str
+    identifier: str # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
+    genres: Iterable[str]
+    tags: Iterable[str]
+    
+    @cached_property
+    def rights(self) -> str:
+        return f"© {self.date.year} {self.author}"
+
+    def __init__(self):
+        self.languages = []
+        self.genres = []
+        self.tags = []
+
+    def __str__(self):
+        return (
+            f"BookMetadata(\n"
+            f"  Title: {self.title}\n"
+            f"  Author: {self.author}\n"
+            f"  Identifier: {self.identifier}\n"
+            f"  Languages: {', '.join(self.languages)}\n"
+            f"  Published: {self.date.format('YYYY-MM-DD') if self.date else 'Unknown'}\n"
+            f"  Publisher: {self.publisher}\n"
+            f"  Genres: {', '.join(self.genres)}\n"
+            f"  Tags: {', '.join(self.tags)}\n"
+            f"  Rights: {self.rights}\n"
+            f"  Cover URL: {self.cover_url}\n"
+            f"  Description: {self.description[:75]}{'...' if len(self.description) > 75 else ''}\n"
+            f")"
+        )

-from .connection import get_session


 class ScribbleBook:
    def __init__(self, url: str):
-        self.session = get_session()
-
+        self.metadata = BookMetadata()
+        
        self.source_url = url
-        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
-        self.slug = _parts[-1]
-        self.identifier = _parts[-2]

-        print(f"scraping {self.slug} ({self.identifier})")
+        print(f"scraping {url})")

        self.chapters = []
        self.languages = []
        self.genres = []
        self.tags = []

-        self.load()
+        self.session = cloudscraper.create_scraper()
+        self.load_metadata()
+        print(str(self.metadata))

-    def load(self) -> None:
+    def load_metadata(self) -> None:
        """
        Load the metadata for this object
+        will make web requests
        """
+
+        # parse info from the source url
+        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
+        self.metadata.slug = _parts[-1]
+        self.metadata.identifier = _parts[-2]
+
        html = self.session.get(self.source_url)
        print(html)
+
+        html = self.session.get(self.source_url)
+        soup = BeautifulSoup(html.text, "lxml")
+
+        for tag in soup.find_all(lambda x: x.has_attr("lang")):
+            log.debug(f'Found language {tag["lang"]}')
+            self.languages.append(tag["lang"])
+
+        url = soup.find(property="og:url")["content"]
+        if self.source_url != url:
+            log.warning(f"Metadata URL mismatch!\n\t{self.source_url}\n\t{url}")
+
+        self.metadata.title = soup.find(property="og:title")["content"]
+        print(f"Book Title: {self.metadata.title}")
+
+        self.metadata.cover_url = soup.find(property="og:image")["content"] or ""
+        self.metadata.date = arrow.get(
+            soup.find("span", title=DATE_MATCH)["title"][14:], "MMM D, YYYY hh:mm A"
+        )
+        description = soup.find(class_="wi_fic_desc")
+        self.metadata.intro = ftfy.fix_text(description.prettify())
+        self.metadata.description = ftfy.fix_text(description.text)
+        self.metadata.author = soup.find(attrs={"name": "twitter:creator"})["content"]
+        self.metadata.publisher = soup.find(property="og:site_name")["content"]
+        
+        self.metadata.genres = [a.string for a in soup.find_all(class_="fic_genre")]
+        self.metadata.tags = [a.string for a in soup.find_all(class_="stag")]
+
+        imgs = soup.find(class_="sb_content copyright").find_all("img")
+        for img in imgs:
+            if "copy" not in img["class"]:
+                continue
+            self.metadata.rights = ftfy.fix_text(img.next.string)