generated from Hazel/python-project
bypass cloudflare
This commit is contained in:
parent
c9efcce693
commit
307f86aeee
@ -1,6 +1,10 @@
|
|||||||
[project]
name = "scribble_to_epub"
dependencies = [
    'beautifulsoup4',
    'requests',
    'EbookLib',
]
authors = []
description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them"
readme = "README.md"
|
@ -1,5 +1,7 @@
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
from .scribblehub import ScribbleBook
|
||||||
|
|
||||||
|
|
||||||
def cli():
|
def cli():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -15,7 +17,8 @@ def cli():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
print(f"Running scribble_to_epub for URL: {args.url}")
|
print(f"Running scribble_to_epub for URL: {args.url}")
|
||||||
# You would call your main scraping and EPUB creation logic here
|
|
||||||
|
ScribbleBook(args.url)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
36
scribble_to_epub/connection.py
Normal file
36
scribble_to_epub/connection.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import cloudscraper
|
||||||
|
import logging
|
||||||
|
|
||||||
|
try:
|
||||||
|
import http.client as http_client
|
||||||
|
except ImportError:
|
||||||
|
# Python 2
|
||||||
|
import httplib as http_client
|
||||||
|
http_client.HTTPConnection.debuglevel = 1
|
||||||
|
|
||||||
|
# You must initialize logging, otherwise you'll not see debug output.
|
||||||
|
logging.basicConfig()
|
||||||
|
logging.getLogger().setLevel(logging.DEBUG)
|
||||||
|
requests_log = logging.getLogger("requests.packages.urllib3")
|
||||||
|
requests_log.setLevel(logging.DEBUG)
|
||||||
|
requests_log.propagate = True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_session() -> cloudscraper.CloudScraper:
|
||||||
|
"""
|
||||||
|
session = requests.Session()
|
||||||
|
session.headers = {
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||||
|
"Accept-Language": "en-US,en;q=0.5",
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
|
||||||
|
"Referer": "https://www.scribblehub.com/"
|
||||||
|
}
|
||||||
|
|
||||||
|
return session
|
||||||
|
"""
|
||||||
|
|
||||||
|
return cloudscraper.create_scraper()
|
@ -0,0 +1,29 @@
|
|||||||
|
from functools import cached_property
|
||||||
|
|
||||||
|
from .connection import get_session
|
||||||
|
|
||||||
|
|
||||||
|
class ScribbleBook:
|
||||||
|
def __init__(self, url: str):
|
||||||
|
self.session = get_session()
|
||||||
|
|
||||||
|
self.source_url = url
|
||||||
|
_parts = [p for p in self.source_url.split("/") if len(p.strip())]
|
||||||
|
self.slug = _parts[-1]
|
||||||
|
self.identifier = _parts[-2]
|
||||||
|
|
||||||
|
print(f"scraping {self.slug} ({self.identifier})")
|
||||||
|
|
||||||
|
self.chapters = []
|
||||||
|
self.languages = []
|
||||||
|
self.genres = []
|
||||||
|
self.tags = []
|
||||||
|
|
||||||
|
self.load()
|
||||||
|
|
||||||
|
def load(self) -> None:
|
||||||
|
"""
|
||||||
|
Load the metadata for this object
|
||||||
|
"""
|
||||||
|
html = self.session.get(self.source_url)
|
||||||
|
print(html)
|
Loading…
x
Reference in New Issue
Block a user