bypass cloudflare

Hazel Noack 2025-06-10 12:13:02 +02:00
parent c9efcce693
commit 307f86aeee
4 changed files with 74 additions and 2 deletions

pyproject.toml

@@ -1,6 +1,10 @@
 [project]
 name = "scribble_to_epub"
-dependencies = []
+dependencies = [
+    'beautifulsoup4',
+    'requests',
+    'EbookLib',
+]
 authors = []
 description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them"
 readme = "README.md"
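The three dependencies added here map onto the pipeline named in the project description: requests fetches pages, beautifulsoup4 parses the HTML, and EbookLib assembles the epub. A minimal sketch of how they combine (the URL and chapter content are illustrative, not taken from this commit):

```python
import requests
from bs4 import BeautifulSoup
from ebooklib import epub

# Fetch and parse a page (hypothetical URL; a plain requests call like this
# is what the cloudscraper session below replaces).
resp = requests.get("https://www.scribblehub.com/series/123456/some-story/")
soup = BeautifulSoup(resp.text, "html.parser")

# Assemble a one-chapter epub from the parsed content.
book = epub.EpubBook()
book.set_title(soup.title.string if soup.title else "Untitled")
chapter = epub.EpubHtml(title="Chapter 1", file_name="chap_1.xhtml", lang="en")
chapter.content = f"<h1>Chapter 1</h1><p>{soup.get_text()[:200]}</p>"
book.add_item(chapter)
book.toc = (chapter,)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
book.spine = ["nav", chapter]
epub.write_epub("example.epub", book)
```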

CLI module

@@ -1,5 +1,7 @@
 import argparse

+from .scribblehub import ScribbleBook
+

 def cli():
     parser = argparse.ArgumentParser(
@@ -15,7 +17,8 @@ def cli():
     args = parser.parse_args()

     print(f"Running scribble_to_epub for URL: {args.url}")
-    # You would call your main scraping and EPUB creation logic here
+
+    ScribbleBook(args.url)

 if __name__ == "__main__":
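For context, the CLI module after this change plausibly reads as follows. The argparse setup between the two hunks is not shown in the diff, so the positional `url` argument defined here is an assumption inferred from the use of `args.url`:

```python
import argparse

from .scribblehub import ScribbleBook


def cli():
    parser = argparse.ArgumentParser(
        description="Scrape a book from scribblehub.com and build an epub"
    )
    # Assumed: the diff does not show the argument definitions; a positional
    # `url` argument is inferred from `args.url` below.
    parser.add_argument("url", help="URL of the ScribbleHub series to scrape")
    args = parser.parse_args()

    print(f"Running scribble_to_epub for URL: {args.url}")

    ScribbleBook(args.url)


if __name__ == "__main__":
    cli()
```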

scribble_to_epub/connection.py

@@ -0,0 +1,36 @@
+import cloudscraper
+import logging
+
+try:
+    import http.client as http_client
+except ImportError:
+    # Python 2
+    import httplib as http_client
+http_client.HTTPConnection.debuglevel = 1
+
+# You must initialize logging, otherwise you'll not see debug output.
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+requests_log = logging.getLogger("requests.packages.urllib3")
+requests_log.setLevel(logging.DEBUG)
+requests_log.propagate = True
+
+
+def get_session() -> cloudscraper.CloudScraper:
+    """
+    session = requests.Session()
+    session.headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
+        "Referer": "https://www.scribblehub.com/"
+    }
+    return session
+    """
+    return cloudscraper.create_scraper()
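cloudscraper's CloudScraper subclasses requests.Session and solves Cloudflare's JavaScript challenge before returning a response, which is why the old hand-rolled requests.Session setup could be retired into the docstring. Note that cloudscraper itself does not appear in the dependency list added to pyproject.toml above, so it presumably needs to be installed separately. A minimal usage sketch (the import path is assumed from the package layout):

```python
from scribble_to_epub.connection import get_session

session = get_session()
# Behaves like a requests.Session: same get/post API, and the Cloudflare
# clearance cookie persists across requests made on this session.
resp = session.get("https://www.scribblehub.com/")
print(resp.status_code)
```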

scribble_to_epub/scribblehub.py

@@ -0,0 +1,29 @@
+from functools import cached_property
+
+from .connection import get_session
+
+
+class ScribbleBook:
+    def __init__(self, url: str):
+        self.session = get_session()
+        self.source_url = url
+
+        _parts = [p for p in self.source_url.split("/") if len(p.strip())]
+        self.slug = _parts[-1]
+        self.identifier = _parts[-2]
+
+        print(f"scraping {self.slug} ({self.identifier})")
+
+        self.chapters = []
+        self.languages = []
+        self.genres = []
+        self.tags = []
+
+        self.load()
+
+    def load(self) -> None:
+        """
+        Load the metadata for this object
+        """
+        html = self.session.get(self.source_url)
+        print(html)
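ScribbleBook splits the series URL into its non-empty path segments to recover the numeric identifier and the slug. Note that `session.get()` returns a requests Response object, so `print(html)` prints something like `<Response [200]>` rather than markup; the page body lives in `html.text`. A usage sketch with an illustrative URL:

```python
from scribble_to_epub.scribblehub import ScribbleBook

# Illustrative ScribbleHub URL shape: .../series/<identifier>/<slug>/
book = ScribbleBook("https://www.scribblehub.com/series/123456/some-story/")
# -> prints "scraping some-story (123456)", then the fetch result,
#    e.g. "<Response [200]>"
```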