generated from Hazel/python-project
Bypass Cloudflare by switching the HTTP session to cloudscraper
This commit is contained in:
parent
c9efcce693
commit
307f86aeee
@ -1,6 +1,10 @@
|
||||
[project]
|
||||
name = "scribble_to_epub"
|
||||
dependencies = []
|
||||
dependencies = [
|
||||
'beautifulsoup4',
|
||||
'requests',
|
||||
'EbookLib',
|
||||
]
|
||||
authors = []
|
||||
description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them"
|
||||
readme = "README.md"
|
||||
|
@ -1,5 +1,7 @@
|
||||
import argparse
|
||||
|
||||
from .scribblehub import ScribbleBook
|
||||
|
||||
|
||||
def cli():
|
||||
parser = argparse.ArgumentParser(
|
||||
@ -15,7 +17,8 @@ def cli():
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Running scribble_to_epub for URL: {args.url}")
|
||||
# You would call your main scraping and EPUB creation logic here
|
||||
|
||||
ScribbleBook(args.url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
36
scribble_to_epub/connection.py
Normal file
36
scribble_to_epub/connection.py
Normal file
@ -0,0 +1,36 @@
|
||||
# HTTP session helpers for scribble_to_epub.
# Uses cloudscraper (a requests.Session subclass) to get past Cloudflare's
# browser-verification challenge on scribblehub.com.
import cloudscraper
import logging

# Fall back to the Python 2 module name so the wire-level debug hook below
# works on either interpreter.
# NOTE(review): the Python 2 branch is dead on any modern interpreter;
# presumably kept from a copied debugging recipe — confirm before removing.
try:
    import http.client as http_client
except ImportError:
    # Python 2
    import httplib as http_client
# HACK(review): import-time side effect — dumps every HTTP request/response
# to stdout. Development/debugging residue; should be gated behind a flag.
http_client.HTTPConnection.debuglevel = 1

# You must initialize logging, otherwise you'll not see debug output.
# NOTE(review): this configures the ROOT logger to DEBUG at import time,
# which affects every module in the process — confirm this is intentional.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
|
||||
|
||||
|
||||
|
||||
def get_session() -> cloudscraper.CloudScraper:
    """
    Create an HTTP session able to pass Cloudflare's browser checks.

    Returns:
        cloudscraper.CloudScraper: a drop-in ``requests.Session`` subclass,
        so callers use the familiar ``session.get(url)`` API.
    """
    # A plain requests.Session with hand-rolled browser headers (the previous
    # implementation) could not get past Cloudflare; cloudscraper solves the
    # JavaScript challenge pages automatically.
    return cloudscraper.create_scraper()
|
@ -0,0 +1,29 @@
|
||||
from functools import cached_property
|
||||
|
||||
from .connection import get_session
|
||||
|
||||
|
||||
class ScribbleBook:
    """A single Scribble Hub book, identified by its story URL.

    On construction it derives the story slug and numeric identifier from
    the URL path, prepares empty metadata containers, and immediately
    fetches the story's landing page via :meth:`load`.
    """

    def __init__(self, url: str):
        """Initialise the book from *url* and kick off metadata loading.

        The last two non-empty path segments of the URL are taken as the
        slug and the identifier, e.g.
        ``https://www.scribblehub.com/series/<identifier>/<slug>/``.
        """
        self.session = get_session()

        self.source_url = url
        # Keep only the non-blank path segments of the URL.
        segments = []
        for segment in self.source_url.split("/"):
            if segment.strip():
                segments.append(segment)
        self.slug = segments[-1]
        self.identifier = segments[-2]

        print(f"scraping {self.slug} ({self.identifier})")

        # Metadata containers, to be populated by later scraping steps.
        self.chapters = []
        self.languages = []
        self.genres = []
        self.tags = []

        self.load()

    def load(self) -> None:
        """
        Load the metadata for this object
        """
        # NOTE(review): the name is misleading — this holds a Response
        # object, not HTML text; nothing is parsed or stored yet.
        html = self.session.get(self.source_url)
        print(html)
|
Loading…
x
Reference in New Issue
Block a user