from __future__ import annotations import logging import shutil from pathlib import Path import os import markdown from typing import Optional, Union, Dict, Generator, List, DefaultDict, Any, TypedDict, Set from bs4 import BeautifulSoup from collections import defaultdict import toml from datetime import datetime import jinja2 from . import config def get_first_header_content(content, fallback: str = ""): soup = BeautifulSoup(content, 'html.parser') for level in range(1, 7): header = soup.find(f'h{level}') if header: return header.get_text(strip=True) return fallback def shorten_text_and_clean(html_string, max_length=config.formatting.preview_length): soup = BeautifulSoup(html_string, 'html.parser') # Keep track of total characters added total_chars = 0 finished = False # Function to recursively trim and clean text def process_element(element): nonlocal total_chars, finished for child in list(element.children): if finished: child.extract() continue if isinstance(child, str): remaining = max_length - total_chars if remaining <= 0: child.extract() finished = True elif len(child) > remaining: child.replace_with(child[:remaining] + '...') total_chars = max_length finished = True else: total_chars += len(child) elif hasattr(child, 'children'): process_element(child) # Remove empty tags if not child.text.strip(): child.decompose() process_element(soup) return str(soup) def shift_headings(html_string, header_shift=config.formatting.preview_header_shift): soup = BeautifulSoup(html_string, 'html.parser') for level in range(6, 0, -1): # Start from h6 to h1 to avoid overwriting old_tag = f'h{level}' for tag in soup.find_all(old_tag): new_level = min(level + header_shift, 6) # Cap at h6 new_tag = f'h{new_level}' tag.name = new_tag return str(soup) def get_preview_text(html_string: str): return shift_headings(shorten_text_and_clean(html_string)) def stem_to_language_code(stem: str) -> str: language_code = stem.lower().replace("-", "_") if language_code in config.languages: return language_code language_code = language_code.split("_")[0] if language_code in config.languages: return language_code logger.error("Didn't recognize %s as a valid language code, add it to the config, or fix your structure.", stem) exit(1) class TemplateDict(dict): def __init__(self, folder: Path): self.folder = folder super().__init__() def __missing__(self, name: str) -> jinja2.Template: f = self.folder / (name + ".html") if not f.exists(): logger.error("no template with the name %s exists", name) exit(1) t = jinja2.Template(f.read_text()) self[name] = t return t TEMPLATE: Dict[str, jinja2.Template] = TemplateDict(Path(config.setup.source_directory, "templates")) class LanguageDict(dict): def __missing__(self, key: str): if key not in config.languages: raise KeyError(key) lang_dict = config.languages[key] lang_dict["priority"] = lang_dict.get("priority", 0) elements = key.split("_") if len(elements) > 1: elements[-1] = elements[-1].upper() lang_dict["code"] = "-".join(elements) return lang_dict LANGUAGES = LanguageDict() def compile_cross_article_context(cross_article_context): title = cross_article_context["title"] url = cross_article_context["url"] cross_article_context["link"] = f'{title}' class ArticleTranslation: def __init__(self, file: Path, article: Article): self.file = file self.article = article self.context: Dict[str, Any] = {} # initializing the location of the article translation self.language_code = stem_to_language_code(self.file.stem) self.location_in_tree = [self.language_code, *self.article.location_in_tree] self.url = "/" + "/".join(self.location_in_tree) self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree) self.cross_article_context = TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code][self.article.slug] = {} self.priority = LANGUAGES[self.language_code]["priority"] self.real_language_code = LANGUAGES[self.language_code]["code"] self.html_content = self.file.read_text() if self.file.suffix == ".md": self.html_content = markdown.markdown(self.html_content) def __init_context__(self): self.context["meta"] = self.article.context_shared self.context["url"] = self.url self.context["language"] = LANGUAGES[self.language_code] self.context["article_url"] = self.article.url self.context["title"] = get_first_header_content(self.html_content, fallback=LANGUAGES[self.language_code]["native_name"]) self.cross_article_context.update(self.article.context_shared) self.cross_article_context["title"] = self.context["title"] self.cross_article_context["article_url"] = self.article.url self.cross_article_context["url"] = self.url compile_cross_article_context(self.cross_article_context) # get children self.context["children"] = [ c.article_translations_map[self.language_code].context for c in self.article.child_articles if self.language_code in c.article_translations_map ] self.linked_context = self.context["linked"] = [] def __init_content_context__(self): template = jinja2.Template(self.html_content) template.environment.accessed_keys = [] template.environment.context_class = ContextDict self.html_content = template.render({ **CROSS_ARTICLE_CONTEXT, **TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code], }) template.environment.context_class = jinja2.runtime.Context accessed_keys = template.environment.accessed_keys for key in accessed_keys: a = ARTICLE_LAKE[key] if self.language_code in a.article_translations_map: self.linked_context.append(a.article_translations_map[self.language_code].context) self.context["content"] = self.html_content self.context["preview"] = get_preview_text(html_string=self.html_content) def build(self): self.dist_path.mkdir(parents=True, exist_ok=True) with Path(self.dist_path, "index.html").open("w") as f: f.write(TEMPLATE["article_translation"].render(self.context)) class Article: def __init__(self, directory: Path, location_in_tree: Optional[List[str]] = None, is_root: bool = False, parent: Optional[Article] = None): self.directory = directory self.context: Dict[str, Any] = {} self.context_shared: Dict[str, Any] = {} if parent is not None: self.context["parent"] = parent.context_shared # initializing the config values of the article config_file = self.directory / "index.toml" self.config = toml.load(config_file) if config_file.exists() else {} # initializing the location and slug of the article self.slug = self.config.get("name", self.directory.name) if self.slug in ARTICLE_LAKE: logger.error("two articles have the same name at %s and %r", ARTICLE_LAKE[self.slug].directory, self.directory) exit(1) self.cross_article_context = CROSS_ARTICLE_CONTEXT[self.slug] = {} ARTICLE_LAKE[self.slug] = self self.location_in_tree: List[str] = location_in_tree or [] if not is_root: self.location_in_tree.append(self.slug) self.url = "/" + "/".join(self.location_in_tree) self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree) # build the tree self.child_articles: List[Article] = [] self.article_translations_list: List[ArticleTranslation] = [] self.article_translations_map: Dict[str, ArticleTranslation] = {} for c in self.directory.iterdir(): if c.name == "index.toml": continue if c.is_file(): at = ArticleTranslation(c, self) self.article_translations_list.append(at) self.article_translations_map[at.language_code] = at elif c.is_dir(): self.child_articles.append(Article( directory=c, location_in_tree=self.location_in_tree.copy(), parent=self, )) self.article_translations_list.sort(key=lambda a: a.priority, reverse=True) logger.info("found %s at %s with the translations %s", self.slug, ".".join(list(self.location_in_tree)), ",".join(self.article_translations_map.keys())) def __init_context__(self): self.context_shared["url"] = self.url self.context_shared["slug"] = self.slug modified_at = datetime.fromisoformat(self.config["datetime"]) if "datetime" in self.config else datetime.fromtimestamp(self.directory.stat().st_mtime) self.context_shared["date"] = modified_at.strftime(config.formatting.datetime_format) self.context_shared["iso_date"] = modified_at.isoformat() self.context.update(self.context_shared) self.cross_article_context.update(self.context_shared) self.cross_article_context["title"] = self.context_shared["slug"] self.cross_article_context["article_url"] = self.context_shared["url"] compile_cross_article_context(self.cross_article_context) # recursive context structures translation_list = self.context["translations"] = [] child_article_list = self.context["children"] = [] for article_translation in self.article_translations_list: self.context[article_translation.real_language_code] = article_translation.context translation_list.append(article_translation.context) for child_article in self.child_articles: child_article_list.append(child_article.context) # recursively build context for at in self.article_translations_list: at.__init_context__() for a in self.child_articles: a.__init_context__() def __init_content_context__(self): for at in self.article_translations_list: at.__init_content_context__() for a in self.child_articles: a.__init_content_context__() def build(self): self.dist_path.mkdir(parents=True, exist_ok=True) with Path(self.dist_path, "index.html").open("w") as f: f.write(TEMPLATE["article"].render(self.context)) for at in self.article_translations_list: at.build() for ac in self.child_articles: ac.build() class ContextDict(jinja2.runtime.Context): def resolve_or_missing(self, key: str) -> Any: self.environment.accessed_keys.append(key) return super().resolve_or_missing(key) # GLOBALS logger = logging.getLogger("stsg.build") CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Any]] = {} TRANSLATED_CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict) ARTICLE_LAKE: Dict[str, Article] = {} ARTICLE_REFERENCE_VALUES: DefaultDict[str, Dict[str, str]] = defaultdict(dict) def build(): logger.info("starting build process...") logger.info("copying static folder...") shutil.copytree(Path(config.setup.source_directory, "static"), Path(config.setup.dist_directory, "static"), dirs_exist_ok=True) logger.info("building page tree...") tree = Article(directory=Path(config.setup.source_directory, "articles"), is_root=True) logger.info("compiling tree context...") tree.__init_context__() tree.__init_content_context__() import json with Path("context.json").open("w") as f: json.dump(tree.context, f, indent=4) with Path("cross_article_context.json").open("w") as f: json.dump(CROSS_ARTICLE_CONTEXT, f, indent=4) with Path("t_cross_article_context.json").open("w") as f: json.dump(TRANSLATED_CROSS_ARTICLE_CONTEXT, f, indent=4) logger.info("dumping page tree...") tree.build()