added chapter identifier #1

Merged
Hazel merged 2 commits from custom_request_library into main 2025-06-11 13:41:18 +00:00
2 changed files with 23 additions and 60 deletions

View File

@@ -2,7 +2,7 @@
 name = "scribble_to_epub"
 dependencies = [
     'beautifulsoup4~=4.13.4',
-    'requests',
+    'easy-requests~=0.0.0',
     'EbookLib~=0.19',
     'arrow~=1.3.0',
     'ftfy~=6.3.1',

View File

@@ -14,9 +14,8 @@ import math
 from codecs import encode
 from hashlib import sha1
 from pathlib import Path
-import requests
 import uuid
-import time
+from python_requests import Connection, set_cache_directory

 from . import __name__
@@ -36,10 +35,9 @@ requests_log.setLevel(logging.DEBUG)
 requests_log.propagate = True
 """

+set_cache_directory(Path("/tmp", __name__))
+
 log = logging.getLogger(__name__)
-headers = {"User-Agent": "node"}

 CHAPTER_MATCH = re.compile(
     r"(?P<url_root>.*)/read/(?P<story_id>\d*)-(?P<slug>.*?)/chapter/(?P<chapter_id>\d*)"
@@ -47,33 +45,9 @@ CHAPTER_MATCH = re.compile(
 STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
 DATE_MATCH = re.compile("Last updated: .*")

-temp_path = Path("/tmp", __name__)
-temp_path.mkdir(exist_ok=True)
-
 __assets__ = str(Path(Path(__file__).parent, "assets"))

-REQUEST_DELAY = 3  # in seconds
-ADDITIONAL_DELAY_PER_TRY = 1
-last_request = 0
-
-
-def get_request(session: requests.Session, url: str, attempt: int = 0) -> requests.Response:
-    global last_request, REQUEST_DELAY, ADDITIONAL_DELAY_PER_TRY
-    current_delay = REQUEST_DELAY + (ADDITIONAL_DELAY_PER_TRY * attempt)
-    elapsed_time = time.time() - last_request
-    to_wait = current_delay - elapsed_time
-    if to_wait > 0:
-        log.info(f"waiting {to_wait} at attempt {attempt}: {url}")
-        time.sleep(to_wait)
-    last_request = time.time()
-    resp = session.get(url, headers=headers)
-    if resp.status_code == 429:
-        return get_request(session, url, attempt=attempt + 1)
-    return resp


 class Asset:
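Note: the removed get_request helper enforced a three-second gap between requests, added one extra second per retry, and recursed without bound on HTTP 429. That policy now lives in the easy-requests Connection, configured once in ScribbleBook (see the hunk further down). A minimal standalone sketch, assuming the keyword arguments behave as their names in this diff suggest:

    from pathlib import Path
    from python_requests import Connection, set_cache_directory

    # On-disk response cache replaces the hand-rolled temp_path logic.
    set_cache_directory(Path("/tmp", "scribble_to_epub"))

    connection = Connection(
        request_delay=3,             # base gap between requests, as REQUEST_DELAY was
        additional_delay_per_try=1,  # extra backoff per retry, as ADDITIONAL_DELAY_PER_TRY was
        max_retries=10,              # bounded retries instead of unbounded recursion on 429
    )
    resp = connection.get("https://www.scribblehub.com/")  # throttling and retries happen inside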
@@ -113,30 +87,13 @@ class Asset:
     def relpath(self) -> str:
         return f"static/{self.filename}"

-    def __init__(self, url: str, session: Optional[requests.Session] = None):
+    def __init__(self, url: str, connection: Optional[Connection] = None):
         self.url = url
-        self.session = session or requests.Session()
+        self.connection = connection or Connection()
         self.fetch()

     def fetch(self):
-        temp = Path(temp_path, self.filename)
-        if temp.exists():
-            self.content = temp.read_bytes()
-            self.success = True
-            return
-
-        try:
-            r = get_request(self.session, self.url)
-            self.content = r.content
-            temp.write_bytes(r.content)
-            self.success = True
-        except requests.HTTPError as e:
-            log.warning(
-                f'Issue fetching asset {self.url} because "{e.response.status_code}: {e.response.reason}"'
-            )
+        resp = self.connection.get(self.url)
+        self.content = resp.content
+        self.success = True


 class ScribbleChapter:
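Note: Asset.fetch no longer probes a temp file or catches requests.HTTPError, and success is now set unconditionally, so a failed download presumably surfaces as an exception from the library rather than a logged warning. With the cache directory set at import time, repeat fetches of the same URL should come from disk. A sketch under that assumption (the URL is hypothetical):

    a1 = Asset("https://example.com/cover.jpg")  # first fetch: rate-limited network request
    a2 = Asset("https://example.com/cover.jpg")  # same URL again: served from the on-disk cache
    assert a1.content == a2.content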
@@ -147,11 +104,11 @@ class ScribbleChapter:
     text: str  # HTML content of chapter
     date: arrow.Arrow

-    def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
+    def __init__(self, parent: ScribbleBook, url: str, connection: Connection):
         self.parent = parent
         self.source_url = url
-        self.session = session
+        self.connection = connection
         self.add_asset = self.parent.add_asset

     def __str__(self):
@@ -165,7 +122,7 @@ class ScribbleChapter:
         )

     def load(self):
-        resp = get_request(self.session, self.source_url)
+        resp = self.connection.get(self.source_url)
         soup = BeautifulSoup(resp.text, "lxml")

         if self.parent.disable_author_quotes:
@@ -284,7 +241,13 @@ class ScribbleBook:
         self.tags = []
         self.chapters: List[ScribbleChapter] = []

-        self.session = cloudscraper.create_scraper()
+        self.connection = Connection(
+            session=cloudscraper.create_scraper(),
+            request_delay=3,
+            additional_delay_per_try=1,
+            max_retries=10,
+        )

         if file_name is not None:
             self.file_name = file_name
@@ -295,7 +258,7 @@
         if url.strip() == "":
             return

-        a = Asset(url, self.session)
+        a = Asset(url, self.connection)
         if a.success:
             self.assets[a.url] = a
         return a
@@ -325,7 +288,7 @@
         self.slug = _parts[-1]
         self.identifier = _parts[-2]

-        resp = get_request(self.session, self.source_url)
+        resp = self.connection.get(self.source_url)
         soup = BeautifulSoup(resp.text, "lxml")

         for tag in soup.find_all(lambda x: x.has_attr("lang")):
@@ -375,19 +338,19 @@
         page_count = min(page_count, limit)

         for page in range(1, page_count + 1):
-            chapter_resp = self.session.post(
+            chapter_resp = self.connection.post(
                 "https://www.scribblehub.com/wp-admin/admin-ajax.php",
                 {
                     "action": "wi_getreleases_pagination",
                     "pagenum": page,
                     "mypostid": self.identifier,
                 },
-                headers=headers,
+                cache_identifier=f"pagenum{page}mypostid{self.identifier}",
             )
             chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")

             for chapter_tag in chapter_soup.find_all(class_="toc_w"):
-                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
+                chapter = ScribbleChapter(self, chapter_tag.a["href"], self.connection)
                 chapter.index = int(chapter_tag["order"])
                 chapter.title = chapter_tag.a.text
                 chapter.date = arrow.get(
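Note: every page of the chapter list is POSTed to the same admin-ajax.php URL, so the URL alone cannot key the response cache; the new cache_identifier argument appears to supply that missing uniqueness. A sketch of the pattern, with story_id standing in for self.identifier and the form data passed positionally as in the hunk above:

    for page in range(1, page_count + 1):
        resp = connection.post(
            "https://www.scribblehub.com/wp-admin/admin-ajax.php",
            {
                "action": "wi_getreleases_pagination",
                "pagenum": page,
                "mypostid": story_id,
            },
            # Fold the varying form fields into the cache key: one entry per TOC page.
            cache_identifier=f"pagenum{page}mypostid{story_id}",
        )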