generated from Hazel/python-project
removed book metadata
This commit is contained in:
parent
52f64d824a
commit
bc05ee49ea
@ -9,6 +9,7 @@ import arrow
|
|||||||
import ftfy
|
import ftfy
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
import re
|
import re
|
||||||
|
import mimetypes
|
||||||
import math
|
import math
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -37,51 +38,6 @@ CHAPTER_MATCH = re.compile(
|
|||||||
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
|
STORY_MATCH = re.compile(r"(?P<url_root>.*)/series/(?P<story_id>\d*)/(?P<slug>[a-z-]*)")
|
||||||
DATE_MATCH = re.compile("Last updated: .*")
|
DATE_MATCH = re.compile("Last updated: .*")
|
||||||
|
|
||||||
|
|
||||||
class BookMetadata:
    """
    Represents the metadata for the book.

    Every attribute is given a default in ``__init__`` so that a freshly
    constructed, not-yet-populated instance can still be printed or queried
    for ``rights`` without raising ``AttributeError``.
    """

    slug: str
    title: str
    languages: Iterable[str]  # Dublin-core language codes
    cover_url: str
    # Quoted so the annotation is never evaluated; ``None`` until a date is known.
    date: "arrow.Arrow | None"
    description: str
    author: str
    publisher: str
    identifier: str  # unique identifier (e.g. UUID, hosting site book ID, ISBN, etc.)
    genres: Iterable[str]
    tags: Iterable[str]

    @property
    def rights(self) -> str:
        """
        Copyright line built from the publication year and the author.

        Computed on every access (rather than cached) so it always reflects
        the current ``date`` and ``author``. The year is omitted while no
        date is known, mirroring the "Unknown" handling in ``__str__``.
        """
        if not self.date:
            return f"© {self.author}"
        return f"© {self.date.year} {self.author}"

    def __init__(self):
        # Text fields start empty and the date starts unknown.
        self.slug = ""
        self.title = ""
        self.cover_url = ""
        self.date = None
        self.description = ""
        self.author = ""
        self.publisher = ""
        self.identifier = ""
        # Fresh list per instance: these are appended to while scraping.
        self.languages = []
        self.genres = []
        self.tags = []

    def __str__(self):
        """Return a multi-line, human-readable summary of the metadata."""
        # Tolerate a description that was explicitly set to None.
        description = self.description or ""
        ellipsis = "..." if len(description) > 75 else ""
        published = self.date.format("YYYY-MM-DD") if self.date else "Unknown"
        return (
            f"BookMetadata(\n"
            f" Title: {self.title}\n"
            f" Author: {self.author}\n"
            f" Identifier: {self.identifier}\n"
            f" Languages: {', '.join(self.languages)}\n"
            f" Published: {published}\n"
            f" Publisher: {self.publisher}\n"
            f" Genres: {', '.join(self.genres)}\n"
            f" Tags: {', '.join(self.tags)}\n"
            f" Rights: {self.rights}\n"
            f" Cover URL: {self.cover_url}\n"
            f" Description: {description[:75]}{ellipsis}\n"
            f")"
        )
|
|
||||||
|
|
||||||
|
|
||||||
class ScribbleChapter:
|
class ScribbleChapter:
|
||||||
parent: ScribbleBook
|
parent: ScribbleBook
|
||||||
|
|
||||||
@ -90,10 +46,12 @@ class ScribbleChapter:
|
|||||||
text: str # HTML content of chapter
|
text: str # HTML content of chapter
|
||||||
date: arrow.Arrow
|
date: arrow.Arrow
|
||||||
|
|
||||||
def __init__(self, parent: ScribbleBook, url: str):
|
def __init__(self, parent: ScribbleBook, url: str, session: cloudscraper.CloudScraper):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.source_url = url
|
self.source_url = url
|
||||||
|
|
||||||
|
self.session = session
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return (
|
return (
|
||||||
f"ScribbleChapter(\n"
|
f"ScribbleChapter(\n"
|
||||||
@ -104,6 +62,95 @@ class ScribbleChapter:
|
|||||||
f")"
|
f")"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def load(self):
    """
    Fetch the chapter page and populate ``self.title`` and ``self.text``.

    The page at ``self.source_url`` is downloaded through ``self.session``.
    Every ``lang`` attribute value found in it is recorded in
    ``self.parent.languages``. The ``.chp_raw`` element is then extracted,
    prefixed with an ``<h2>`` holding the chapter title, serialised into
    ``self.text`` and passed through ``fix_footnotes``.

    Raises:
        ValueError: if the page has no ``.chp_raw`` element, or has no
            ``.chapter-title`` element and no title was set beforehand.
        Whatever ``self.session.get`` / ``raise_for_status`` raise on a
        failed request (presumably ``requests.HTTPError`` - confirm against
        the session type).
    """
    # NOTE(review): `headers` is not defined in this method; presumably a
    # module-level dict of request headers - confirm.
    resp = self.session.get(self.source_url, headers=headers)
    # Fail on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    for tag in soup.find_all(lambda x: x.has_attr("lang")):
        lang = tag["lang"]
        if lang not in self.parent.languages:
            log.debug(f"Found language {lang}")
            self.parent.languages.append(lang)

    title_tag = soup.find(class_="chapter-title")
    if title_tag is not None:
        self.title = title_tag.text
    elif not getattr(self, "title", None):
        # No title on the page and none assigned earlier: nothing to fall back on.
        raise ValueError(f"No chapter title found at {self.source_url}")
    log.info(f"{self.parent.title} Chapter {self.index}: {self.title}")

    if not mimetypes.inited:
        mimetypes.init(None)

    # TODO: asset (image) handling is disabled. The draft below is kept for
    # reference only and is not runnable as written (e.g. it uses a bare
    # `session` rather than `self.session`).
    #
    # for asset in soup.select("#chp_contents img[src]"):
    #     if asset["src"] not in self.assets:
    #         log.debug(f'Found asset at {asset["src"]}')
    #         try:
    #             asset_resp = session.get(asset["src"], headers=headers)
    #         except HTTPError as e:
    #             # just remove the asset from HTML if we have fetch issues
    #             log.warning(
    #                 f'Issue fetching asset {asset["src"]} because "{e.response.status_code}: {e.response.reason}"'
    #             )
    #             asset.extract()
    #             continue
    #         fname = sha1(encode(asset["src"], "utf-8")).hexdigest()
    #         mimetype, _ = mimetypes.guess_type(asset["src"])
    #         log.debug(f"Asset is {mimetype}")
    #         ext = mimetypes.guess_extension(mimetype)
    #         relpath = f"static/{fname}{ext}"
    #         self.assets[asset["src"]] = {
    #             "content": asset_resp.content,
    #             "relpath": relpath,
    #             "mimetype": mimetype,
    #             "uid": fname,
    #         }
    #     else:
    #         relpath = self.assets[asset["src"]]["relpath"]
    #     log.debug(f"Updating asset to {relpath} from {asset['src']}")
    #     asset["src"] = relpath

    chap_text = soup.find(class_="chp_raw")
    if chap_text is None:
        raise ValueError(f"No chapter content (.chp_raw) found at {self.source_url}")
    chap_text.extract()

    header_tag = soup.new_tag("h2")
    header_tag.string = self.title
    chap_text.insert(0, header_tag)
    self.text = ftfy.fix_text(chap_text.prettify())
    self.fix_footnotes()
|
||||||
|
|
||||||
|
def fix_footnotes(self):
    """
    Iterate through any footnotes and refactor them to ePub format.

    Each ``.modern-footnotes-footnote`` reference in ``self.text`` is paired
    with the ``.modern-footnotes-footnote__note`` element that carries the
    same ``data-mfn`` value. The last ``<a>`` of the reference becomes an
    ``epub:type="noteref"`` link. The note becomes an
    ``<aside epub:type="footnote">`` that starts with a link back to the
    reference. All converted notes are moved under a "Footnotes" heading at
    the end of the document.

    A reference without a ``data-mfn`` value, an anchor or a matching note
    is logged and left as it is; the remaining footnotes are still
    processed. ``self.text`` is always replaced with the re-serialised
    markup.
    """
    soup = BeautifulSoup(self.text, "lxml")
    footnotes = []
    for ref in soup.select(".modern-footnotes-footnote"):
        # Attribute values are plain strings, so there is no `.text` to read.
        mfn = ref.get("data-mfn")
        if not mfn:
            log.warning("Skipping footnote reference without a data-mfn value")
            continue
        log.debug(f"Found footnote {mfn}")

        anchors = ref.find_all("a")
        # Attribute lookup instead of a CSS selector: a numeric id such as
        # `1` is not a valid unquoted CSS attribute value.
        note = soup.find(
            class_="modern-footnotes-footnote__note", attrs={"data-mfn": mfn}
        )
        if not anchors or note is None:
            # Skip only this footnote; the others can still be converted.
            log.warning(f"Skipping footnote {mfn}: missing anchor or note content")
            continue

        anchor = anchors[-1]
        anchor["id"] = f"noteanchor-{mfn}"
        anchor["href"] = f"#note-{mfn}"
        anchor["epub:type"] = "noteref"

        note.name = "aside"
        note["id"] = f"note-{mfn}"
        note["epub:type"] = "footnote"
        back_link = soup.new_tag("a", href=f"#noteanchor-{mfn}")
        back_link.string = f"{mfn}."
        # The back-link goes inside the note element itself.
        note.insert(0, back_link)
        footnotes.append(note)

    if footnotes:
        # Append at the end of the body rather than inside the last <p>:
        # a heading or aside nested in a paragraph is invalid markup, and
        # the document is not guaranteed to contain a <p> at all.
        container = soup.body if soup.body is not None else soup
        footnote_header = soup.new_tag("h2", id="footnotes")
        footnote_header.string = "Footnotes"
        container.append(footnote_header)
        for note in footnotes:
            # Appending a tag that already has a parent moves it.
            container.append(note)

    soup.smooth()
    self.text = ftfy.fix_text(soup.prettify())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ScribbleBook:
|
class ScribbleBook:
|
||||||
@ -157,6 +204,9 @@ class ScribbleBook:
|
|||||||
print(str(self))
|
print(str(self))
|
||||||
|
|
||||||
self.get_chapters()
|
self.get_chapters()
|
||||||
|
c = self.chapters[0]
|
||||||
|
c.load()
|
||||||
|
print(c.text)
|
||||||
|
|
||||||
def load_metadata(self) -> None:
|
def load_metadata(self) -> None:
|
||||||
"""
|
"""
|
||||||
@ -229,7 +279,7 @@ class ScribbleBook:
|
|||||||
|
|
||||||
chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
|
chapter_soup = BeautifulSoup(chapter_resp.text, "lxml")
|
||||||
for chapter_tag in chapter_soup.find_all(class_="toc_w"):
|
for chapter_tag in chapter_soup.find_all(class_="toc_w"):
|
||||||
chapter = ScribbleChapter(self, chapter_tag.a["href"])
|
chapter = ScribbleChapter(self, chapter_tag.a["href"], self.session)
|
||||||
chapter.index = int(chapter_tag["order"])
|
chapter.index = int(chapter_tag["order"])
|
||||||
chapter.title = chapter_tag.a.text
|
chapter.title = chapter_tag.a.text
|
||||||
chapter.date = arrow.get(
|
chapter.date = arrow.get(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user