diff --git a/pyproject.toml b/pyproject.toml
index e7259bd..d92d00e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,11 @@
 [project]
 name = "scribble_to_epub"
-dependencies = []
+dependencies = [
+    'beautifulsoup4',
+    'requests',
+    'EbookLib',
+    'cloudscraper',  # imported by scribble_to_epub/connection.py
+]
 authors = []
 description = "This scrapes books from https://www.scribblehub.com/ and creates epub from them"
 readme = "README.md"
diff --git a/scribble_to_epub/__main__.py b/scribble_to_epub/__main__.py
index 21c2698..2683fc6 100644
--- a/scribble_to_epub/__main__.py
+++ b/scribble_to_epub/__main__.py
@@ -1,5 +1,7 @@
 import argparse
 
+from .scribblehub import ScribbleBook
+
 
 def cli():
     parser = argparse.ArgumentParser(
@@ -15,7 +17,8 @@ def cli():
 
     args = parser.parse_args()
     print(f"Running scribble_to_epub for URL: {args.url}")
-    # You would call your main scraping and EPUB creation logic here
+
+    ScribbleBook(args.url)
 
 
 if __name__ == "__main__":
diff --git a/scribble_to_epub/connection.py b/scribble_to_epub/connection.py
new file mode 100644
index 0000000..3629729
--- /dev/null
+++ b/scribble_to_epub/connection.py
@@ -0,0 +1,36 @@
+import cloudscraper
+import logging
+
+try:
+    import http.client as http_client
+except ImportError:
+    # Python 2
+    import httplib as http_client
+http_client.HTTPConnection.debuglevel = 1
+
+# You must initialize logging, otherwise you'll not see debug output.
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+requests_log = logging.getLogger("requests.packages.urllib3")
+requests_log.setLevel(logging.DEBUG)
+requests_log.propagate = True
+
+
+
+def get_session() -> cloudscraper.CloudScraper:
+    """
+    session = requests.Session()
+    session.headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
+        "Referer": "https://www.scribblehub.com/"
+    }
+
+    return session
+    """
+
+    return cloudscraper.create_scraper()
\ No newline at end of file
diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py
index e69de29..49fc892 100644
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@@ -0,0 +1,50 @@
+from functools import cached_property
+
+from .connection import get_session
+
+
+class ScribbleBook:
+    """
+    A book on ScribbleHub, addressed by its URL.
+    """
+
+    def __init__(self, url: str):
+        """
+        Derive slug and identifier from ``url``, then call ``load()``.
+
+        Raises ValueError when ``url`` has fewer than two non-empty
+        "/"-separated segments.
+        """
+        self.session = get_session()
+
+        self.source_url = url
+        # Empty segments (the "//" after the scheme, a trailing "/") are
+        # dropped before the last two entries are read as identifier and slug.
+        _parts = [p for p in self.source_url.split("/") if p.strip()]
+        if len(_parts) < 2:
+            raise ValueError(
+                f"cannot derive slug and identifier from url: {url!r}"
+            )
+        self.slug = _parts[-1]
+        self.identifier = _parts[-2]
+
+        print(f"scraping {self.slug} ({self.identifier})")
+
+        self.chapters = []
+        self.languages = []
+        self.genres = []
+        self.tags = []
+
+        self.load()
+
+    def load(self) -> None:
+        """
+        Load the metadata for this object
+
+        For now this only fetches ``source_url`` and prints the response.
+        Raises an HTTP error when the server answers with a 4xx/5xx status.
+        """
+        response = self.session.get(self.source_url)
+        print(response)
+        # Surface blocked or missing pages instead of silently continuing.
+        response.raise_for_status()