STSG/stsg/build.py
2025-05-22 16:08:36 +02:00

465 lines
15 KiB
Python

from __future__ import annotations
import logging
import shutil
from pathlib import Path
import os
from markdown2 import markdown
from typing import Optional, Union, Dict, Generator, List, DefaultDict, Any, TypedDict, Set
from bs4 import BeautifulSoup
from collections import defaultdict, UserList
import toml
from datetime import datetime
import jinja2
from functools import cached_property
from .definitions import *
from . import config
def shorten_text_and_clean(html_string, max_length=config.formatting.preview_length):
soup = BeautifulSoup(html_string, 'html.parser')
# Keep track of total characters added
total_chars = 0
finished = False
# Function to recursively trim and clean text
def process_element(element):
nonlocal total_chars, finished
for child in list(element.children):
if finished:
child.extract()
continue
if isinstance(child, str):
remaining = max_length - total_chars
if remaining <= 0:
child.extract()
finished = True
elif len(child) > remaining:
child.replace_with(child[:remaining] + '...')
total_chars = max_length
finished = True
else:
total_chars += len(child)
elif hasattr(child, 'children'):
process_element(child)
# Remove empty tags
if not child.text.strip():
child.decompose()
process_element(soup)
return str(soup)
def shift_headings(html_string, header_shift=config.formatting.preview_header_shift):
soup = BeautifulSoup(html_string, 'html.parser')
for level in range(6, 0, -1): # Start from h6 to h1 to avoid overwriting
old_tag = f'h{level}'
for tag in soup.find_all(old_tag):
new_level = min(level + header_shift, 6) # Cap at h6
new_tag = f'h{new_level}'
tag.name = new_tag
return str(soup)
def get_preview_text(html_string: str):
return shift_headings(shorten_text_and_clean(html_string))
class TemplateDict(dict):
def __init__(self, folder: Path):
self.folder = folder
super().__init__()
def __missing__(self, name: str) -> jinja2.Template:
f = self.folder / (name + ".html")
if not f.exists():
logger.error("no template with the name %s exists", name)
exit(1)
t = jinja2.Template(f.read_text())
self[name] = t
return t
TEMPLATE: Dict[str, jinja2.Template] = TemplateDict(Path(config.setup.source_directory, "templates"))
class LanguageDict(dict):
def __missing__(self, key: str):
if key not in config.languages:
raise KeyError(key)
lang_dict = config.languages[key]
lang_dict["priority"] = lang_dict.get("priority", 0)
elements = key.split("_")
if len(elements) > 1:
elements[-1] = elements[-1].upper()
lang_dict["code"] = "-".join(elements)
return lang_dict
LANGUAGES = LanguageDict()
def add_html_link(c):
name = c["name"]
url = c["url"]
c["link"] = f'<a href="{url}">{name}</a>'
def get_translated_articles(articles: List[Article], language_code: str = None) -> List[Union[ArticleTranslation, Article]]:
result = {}
for a in articles:
if a.slug in result:
continue
if language_code is None:
result[a.slug] = a
continue
if not config.fall_back_to_overview_in_translation and language_code not in a.article_translations_map:
continue
result[a.slug] = a.article_translations_map.get(language_code, a)
class ArticleList(UserList):
def __init__(self, iterable):
super().__init__(item for item in iterable)
self.used_slugs = set()
def append(self, a: Union[Article, str]):
if isinstance(a, str):
a = ARTICLE_LAKE[a]
if a.slug in self.used_slugs:
return
self.used_slugs.add(a.slug)
self.data.append(a)
def extend(self, other):
for a in other:
self.append(a)
def get_translated(self, language_code: str) -> ArticleList[Union[ArticleTranslation, Article]]:
res = ArticleList([])
for a in self:
if not config.fall_back_to_overview_in_translation and language_code not in a.article_translations_map:
continue
res.append(a.article_translations_map.get(language_code, a))
return res
@property
def context(self) -> List[Union[ArticleContext, ArticleTranslationContext]]:
return [a.context for a in self]
class ArticleTranslation:
article: Article
slug: str = property(fget=lambda self: self.article.slug)
file: Path
@cached_property
def html_content(self) -> str:
html_content = self.file.read_text()
if self.file.suffix == ".md":
return markdown(html_content, extras=config.formatting.markdown_extras)
return html_content
@cached_property
def language_code(self) -> str:
language_code = self.file.stem.lower().replace("-", "_")
if language_code in config.languages:
return language_code
language_code = language_code.split("_")[0]
if language_code in config.languages:
return language_code
logger.error("Didn't recognize %s as a valid language code, add it to the config, or fix your structure.", stem)
exit(1)
@cached_property
def priority(self) -> int:
return LANGUAGES[self.language_code]["priority"]
@cached_property
def slug_path(self) -> List[str]:
return [self.language_code, *self.article.slug_path]
@cached_property
def url(self) -> str:
return "/" + "/".join(self.slug_path)
@cached_property
def dist_path(self) -> Path:
return Path(config.setup.dist_directory, *self.slug_path)
context: ArticleTranslationContext
cross_article_context: Dict[str, Any]
def __init__(self, file: Path, article: Article):
self.article = article
self.file = file
self.context = TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code][self.article.slug] = {}
@cached_property
def name(self) -> str:
soup = BeautifulSoup(self.html_content, 'html.parser')
for level in range(1, 7):
header = soup.find(f'h{level}')
if header:
return header.get_text(strip=True)
return self.article.name
def __init_context__(self):
self.context["slug"] = self.article.slug
self.context["name"] = self.name
self.context["url"] = self.url
add_html_link(self.context)
self.context["date"] = self.article.modified_at.strftime(config.formatting.datetime_format)
self.context["iso_date"] = self.article.modified_at.isoformat()
self.context["author"] = self.article.author
self.context["language"] = LANGUAGES[self.language_code]
self.context["article_url"] = self.article.url
# get children
self.context["children"] = self.article.child_articles.get_translated(self.language_code).context
self.context["breadcrumbs"] = ArticleList(self.article.article_path).get_translated(self.language_code).context
def __init_content_context__(self):
template = jinja2.Template(self.html_content)
template.environment.accessed_keys = []
template.environment.context_class = ContextDict
self.html_content = template.render({
**CROSS_ARTICLE_CONTEXT,
**TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code],
})
template.environment.context_class = jinja2.runtime.Context
accessed_keys = template.environment.accessed_keys
for key in accessed_keys:
self.article.linked_articles.append(key)
self.context["content"] = self.html_content
self.context["preview"] = get_preview_text(html_string=self.html_content)
self.context["linked"] = self.article.linked_articles.get_translated(self.language_code).context
self.context["related"] = self.article.related_articles.get_translated(self.language_code).context
def build(self):
self.dist_path.mkdir(parents=True, exist_ok=True)
with Path(self.dist_path, "index.html").open("w") as f:
f.write(TEMPLATE["article_translation"].render(self.context))
class Article:
directory: Path
@cached_property
def config(self) -> ArticleConfig:
config_file = self.directory / "index.toml"
return toml.load(config_file) if config_file.exists() else {}
@cached_property
def slug(self) -> str:
slug = self.config.get("name", self.directory.name)
if slug in ARTICLE_LAKE:
logger.error("two articles have the same name at %s and %r", ARTICLE_LAKE[slug].directory, self.directory)
exit(1)
return slug
@cached_property
def name(self) -> str:
return self.config.get("name", self.slug)
article_path: List[Article]
@cached_property
def slug_path(self) -> List[str]:
return [a.slug for a in self.article_path[1:]]
@cached_property
def url(self) -> str:
return "/" + "/".join(self.slug_path)
@cached_property
def dist_path(self) -> Path:
return Path(config.setup.dist_directory, *self.slug_path)
context: ArticleContext
child_articles: ArticleList[Article]
article_translations_list: List[ArticleTranslation]
article_translations_map: Dict[str, ArticleTranslation]
linked_articles: ArticleList[Article]
@cached_property
def related_articles(self) -> ArticleList[Article]:
res = ArticleList(self.child_articles)
res.extend(self.linked_articles)
return res
def __init__(self, directory: Path, article_path: Optional[List[str]] = None, is_root: bool = False, parent: Optional[Article] = None):
self.directory = directory
self.article_path: List[Article] = article_path or []
self.article_path.append(self)
self.context = CROSS_ARTICLE_CONTEXT[self.slug] = {}
ARTICLE_LAKE[self.slug] = self
self.linked_articles = ArticleList([])
# build the tree
self.child_articles = ArticleList([])
self.article_translations_list = []
self.article_translations_map = {}
for c in self.directory.iterdir():
if c.name == "index.toml":
continue
if c.is_file():
at = ArticleTranslation(c, self)
self.article_translations_list.append(at)
self.article_translations_map[at.language_code] = at
elif c.is_dir():
self.child_articles.append(Article(
directory=c,
article_path=self.article_path.copy(),
parent=self,
))
self.article_translations_list.sort(key=lambda a: a.priority, reverse=True)
logger.info("found %s at %s with the translations %s", self.slug, ".".join(list(self.slug_path)), ",".join(self.article_translations_map.keys()))
@cached_property
def modified_at(self) -> datetime:
if "iso_date" in self.config:
return datetime.fromisoformat(self.config["iso_date"])
"""
TODO
scann every article file and use the youngest article file
"""
return datetime.fromtimestamp(self.directory.stat().st_mtime)
@cached_property
def author(self) -> str:
return self.config.get("author", config.default_author)
def __init_context__(self):
self.context["slug"] = self.slug
self.context["name"] = self.name
self.context["url"] = self.url
add_html_link(self.context)
self.context["date"] = self.modified_at.strftime(config.formatting.datetime_format)
self.context["iso_date"] = self.modified_at.isoformat()
self.context["author"] = self.author
# recursive context structures
self.context["translations"] = [c.context for c in self.article_translations_list]
self.context["children"] = self.child_articles.context
self.context["breadcrumbs"] = [b.context for b in self.article_path]
for lang, article in self.article_translations_map.items():
self.context[lang] = article.context
for at in self.article_translations_list:
at.__init_context__()
for a in self.child_articles:
a.__init_context__()
def __init_content_context__(self):
for at in self.article_translations_list:
at.__init_content_context__()
self.context["linked"] = self.linked_articles.context
self.context["related"] = self.related_articles.context
for a in self.child_articles:
a.__init_content_context__()
def build(self):
self.dist_path.mkdir(parents=True, exist_ok=True)
with Path(self.dist_path, "index.html").open("w") as f:
f.write(TEMPLATE["article"].render(self.context))
for at in self.article_translations_list:
at.build()
for ac in self.child_articles:
ac.build()
class ContextDict(jinja2.runtime.Context):
def resolve_or_missing(self, key: str) -> Any:
self.environment.accessed_keys.append(key)
return super().resolve_or_missing(key)
# GLOBALS
logger = logging.getLogger("stsg.build")
ARTICLE_LAKE: Dict[str, Article] = {}
CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Any]] = {}
TRANSLATED_CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
ARTICLE_REFERENCE_VALUES: DefaultDict[str, Dict[str, str]] = defaultdict(dict)
def build():
logger.info("starting build process...")
logger.info("copying static folder...")
shutil.copytree(Path(config.setup.source_directory, "static"), Path(config.setup.dist_directory, "static"), dirs_exist_ok=True)
logger.info("building page tree...")
tree = Article(directory=Path(config.setup.source_directory, "articles"), is_root=True)
logger.info("compiling tree context...")
tree.__init_context__()
tree.__init_content_context__()
"""
import json
with Path("context.json").open("w") as f:
json.dump(tree.context, f, indent=4)
with Path("cross_article_context.json").open("w") as f:
json.dump(CROSS_ARTICLE_CONTEXT, f, indent=4)
with Path("t_cross_article_context.json").open("w") as f:
json.dump(TRANSLATED_CROSS_ARTICLE_CONTEXT, f, indent=4)
"""
logger.info("dumping page tree...")
tree.build()