generated from Hazel/python-project
Bypass Cloudflare by switching the HTTP session to cloudscraper
This commit is contained in:
parent
c9efcce693
commit
307f86aeee
@ -1,6 +1,10 @@
|
||||
[project]
|
||||
name = "scribble_to_epub"
|
||||
dependencies = []
|
||||
dependencies = [
|
||||
'beautifulsoup4',
|
||||
'requests',
|
||||
'EbookLib',
|
||||
]
|
||||
authors = []
|
||||
description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them"
|
||||
readme = "README.md"
|
||||
|
@ -1,5 +1,7 @@
|
||||
import argparse
|
||||
|
||||
from .scribblehub import ScribbleBook
|
||||
|
||||
|
||||
def cli():
|
||||
parser = argparse.ArgumentParser(
|
||||
@ -15,7 +17,8 @@ def cli():
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Running scribble_to_epub for URL: {args.url}")
|
||||
# You would call your main scraping and EPUB creation logic here
|
||||
|
||||
ScribbleBook(args.url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
36
scribble_to_epub/connection.py
Normal file
36
scribble_to_epub/connection.py
Normal file
@ -0,0 +1,36 @@
|
||||
# HTTP session helpers for scribble_to_epub.
# Uses cloudscraper (a requests.Session subclass) to get past Cloudflare's
# browser-verification challenge on scribblehub.com.
import cloudscraper
import logging

# Fall back to the Python 2 module name so the wire-level debug hook below
# works on either interpreter.
# NOTE(review): the Python 2 branch is dead on any modern interpreter;
# presumably kept from a copied debugging recipe — confirm before removing.
try:
    import http.client as http_client
except ImportError:
    # Python 2
    import httplib as http_client
# HACK(review): import-time side effect — dumps every HTTP request/response
# to stdout. Development/debugging residue; should be gated behind a flag.
http_client.HTTPConnection.debuglevel = 1

# You must initialize logging, otherwise you'll not see debug output.
# NOTE(review): this configures the ROOT logger to DEBUG at import time,
# which affects every module in the process — confirm this is intentional.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
|
||||
|
||||
|
||||
|
||||
def get_session() -> cloudscraper.CloudScraper:
    """
    Create an HTTP session able to pass Cloudflare's browser checks.

    Returns:
        cloudscraper.CloudScraper: a drop-in ``requests.Session`` subclass,
        so callers use the familiar ``session.get(url)`` API.
    """
    # A plain requests.Session with hand-rolled browser headers (the previous
    # implementation) could not get past Cloudflare; cloudscraper solves the
    # JavaScript challenge pages automatically.
    return cloudscraper.create_scraper()
|
@ -0,0 +1,29 @@
|
||||
from functools import cached_property
|
||||
|
||||
from .connection import get_session
|
||||
|
||||
|
||||
class ScribbleBook:
    """A single Scribble Hub book, identified by its story URL.

    On construction it derives the story slug and numeric identifier from
    the URL path, prepares empty metadata containers, and immediately
    fetches the story's landing page via :meth:`load`.
    """

    def __init__(self, url: str):
        """Initialise the book from *url* and kick off metadata loading.

        The last two non-empty path segments of the URL are taken as the
        slug and the identifier, e.g.
        ``https://www.scribblehub.com/series/<identifier>/<slug>/``.
        """
        self.session = get_session()

        self.source_url = url
        # Keep only the non-blank path segments of the URL.
        segments = []
        for segment in self.source_url.split("/"):
            if segment.strip():
                segments.append(segment)
        self.slug = segments[-1]
        self.identifier = segments[-2]

        print(f"scraping {self.slug} ({self.identifier})")

        # Metadata containers, to be populated by later scraping steps.
        self.chapters = []
        self.languages = []
        self.genres = []
        self.tags = []

        self.load()

    def load(self) -> None:
        """
        Load the metadata for this object
        """
        # NOTE(review): the name is misleading — this holds a Response
        # object, not HTML text; nothing is parsed or stored yet.
        html = self.session.get(self.source_url)
        print(html)
|
Loading…
x
Reference in New Issue
Block a user