From 98c901b86c7b30b85e671bbedfc6fbc270cff15d Mon Sep 17 00:00:00 2001
From: Hazel Noack
Date: Wed, 11 Jun 2025 15:22:24 +0200
Subject: [PATCH 1/2] added chapter identifier

---
 scribble_to_epub/scribblehub.py | 79 +++++++++------------------------
 1 file changed, 22 insertions(+), 57 deletions(-)

diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py
index 47e9d8d..61dc9b9 100644
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@@ -17,6 +17,7 @@ from pathlib import Path
 import requests
 import uuid
 import time
+from python_requests import Connection, set_cache_directory
 
 from . import __name__
 
@@ -36,10 +37,9 @@ requests_log.setLevel(logging.DEBUG)
 requests_log.propagate = True
 """
 
-
+set_cache_directory(Path("/tmp", __name__))
 log = logging.getLogger(__name__)
 
-headers = {"User-Agent": "node"}
 
 CHAPTER_MATCH = re.compile(
     r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
@@ -47,33 +47,9 @@ CHAPTER_MATCH = re.compile(
 )
 STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
 DATE_MATCH = re.compile("Last updated: .*")
 
-temp_path = Path("/tmp", __name__)
-temp_path.mkdir(exist_ok=True)
-
 __assets__ = str(Path(Path(__file__).parent, "assets"))
 
-REQUEST_DELAY = 3 # in seconds
-ADDITIONAL_DELAY_PER_TRY = 1
-last_request = 0
-def get_request(session: requests.Session, url: str, attempt: int = 0) -> requests.Response:
-    global last_request, REQUEST_DELAY, ADDITIONAL_DELAY_PER_TRY
-
-    current_delay = REQUEST_DELAY + (ADDITIONAL_DELAY_PER_TRY * attempt)
-    elapsed_time = time.time() - last_request
-    to_wait = current_delay - elapsed_time
-
-    if to_wait > 0:
-        log.info(f"waiting {to_wait} at attempt {attempt}: {url}")
-        time.sleep(to_wait)
-
-    last_request = time.time()
-    resp = session.get(url, headers=headers)
-
-    if resp.status_code == 429:
-        return get_request(session, url, attempt=attempt + 1)
-
-    return resp
 
 
 class Asset:
@@ -113,30 +89,13 @@ class Asset:
     def relpath(self) -> str:
         return f"static/{self.filename}"
 
-    def __init__(self, url: str, session: Optional[requests.Session] = None):
+    def __init__(self, url: str, connection: Optional[Connection] = None):
         self.url = url
-        self.session = session or requests.Session()
-
-        self.fetch()
-
-    def fetch(self):
-        temp = Path(temp_path, self.filename)
-
-        if temp.exists():
-            self.content = temp.read_bytes()
-            self.success = True
-            return
-
-        try:
-            r = get_request(self.session, self.url)
-            self.content = r.content
-            temp.write_bytes(r.content)
-            self.success = True
-        except requests.HTTPError as e:
-            log.warning(
-                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
-            )
+        self.connection = connection or Connection()
+
+        resp = self.connection.get(self.url)
+        self.content = resp.content
+        self.success = True
 
 
 class ScribbleChapter:
@@ -147,11 +106,11 @@ class ScribbleChapter:
     text: str # HTML content of chapter
    date: arrow.Arrow
 
-    def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
+    def __init__(self, parent: ScribbleBook, url: str, connection: Connection):
         self.parent = parent
         self.source_url = url
 
-        self.session = session
+        self.connection = connection
         self.add_asset = self.parent.add_asset
 
     def __str__(self):
@@ -165,7 +124,7 @@
         )
 
     def load(self):
-        resp = get_request(self.session, self.source_url)
+        resp = self.connection.get(self.source_url)
         soup = BeautifulSoup(resp.text, "lxml")
 
         if self.parent.disable_author_quotes:
@@ -284,7 +243,13 @@ class ScribbleBook:
         self.tags = []
 
         self.chapters: List[ScribbleChapter] = []
-        self.session = cloudscraper.create_scraper()
+
+        self.connection = Connection(
+            session=cloudscraper.create_scraper(),
+            request_delay=3,
+            additional_delay_per_try=1,
+            max_retries=10,
+        )
 
         if file_name is not None:
             self.file_name = file_name
@@ -295,7 +260,7 @@
         if url.strip() == "":
             return
 
-        a = Asset(url, self.session)
+        a = Asset(url, self.connection)
         if a.success:
             self.assets[a.url] = a
             return a
@@ -325,7 +290,7 @@ class ScribbleBook:
         self.slug = _parts[-1]
         self.identifier = _parts[-2]
 
-        resp = get_request(self.session, self.source_url)
+        resp = self.connection.get(self.source_url)
         soup = BeautifulSoup(resp.text, "lxml")
 
         for tag in soup.find_all(lambda x: x.has_attr("lang")):
@@ -375,19 +340,19 @@ class ScribbleBook:
         page_count = min(page_count, limit)
 
         for page in range(1, page_count + 1):
-            chapter_resp = self.session.post(
+            chapter_resp = self.connection.post(
                 "https://www.scribblehub.com/wp-admin/admin-ajax.php",
                 {
                     "action": "wi_getreleases_pagination",
                     "pagenum": page,
                     "mypostid": self.identifier,
                 },
-                headers=headers,
+                cache_identifier=f"pagenum{page}mypostid{self.identifier}",
             )
             chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
 
             for chapter_tag in chapter_soup.find_all(class_="toc_w"):
-                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
+                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.connection)
                 chapter.index = int(chapter_tag["order"])
                 chapter.title = chapter_tag.a.text
                 chapter.date = arrow.get(

From 576b587044bea8c86c84d2b0d4ce03722c981ecf Mon Sep 17 00:00:00 2001
From: Hazel Noack
Date: Wed, 11 Jun 2025 15:39:53 +0200
Subject: [PATCH 2/2] updated dependencies

---
 pyproject.toml                  | 2 +-
 scribble_to_epub/scribblehub.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6312608..2a9609d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "scribble_to_epub"
 dependencies = [
     'beautifulsoup4~=4.13.4',
-    'requests',
+    'easy-requests~=0.0.0',
     'EbookLib~=0.19',
     'arrow~=1.3.0',
     'ftfy~=6.3.1',
diff --git a/scribble_to_epub/scribblehub.py b/scribble_to_epub/scribblehub.py
index 61dc9b9..ffdb499 100644
--- a/scribble_to_epub/scribblehub.py
+++ b/scribble_to_epub/scribblehub.py
@@ -14,9 +14,7 @@ import math
 from codecs import encode
 from hashlib import sha1
 from pathlib import Path
-import requests
 import uuid
-import time
 from python_requests import Connection, set_cache_directory
 
 from . import __name__