from __future__ import annotations import logging import shutil from pathlib import Path import os from markdown2 import markdown from typing import Optional, Union, Dict, Generator, List, DefaultDict, Any, TypedDict, Set from bs4 import BeautifulSoup from collections import defaultdict, UserList import toml from datetime import datetime import jinja2 from functools import cached_property from .definitions import * from . import config def shorten_text_and_clean(html_string, max_length=config.formatting.preview_length): soup = BeautifulSoup(html_string, 'html.parser') # Keep track of total characters added total_chars = 0 finished = False # Function to recursively trim and clean text def process_element(element): nonlocal total_chars, finished for child in list(element.children): if finished: child.extract() continue if isinstance(child, str): remaining = max_length - total_chars if remaining <= 0: child.extract() finished = True elif len(child) > remaining: child.replace_with(child[:remaining] + '...') total_chars = max_length finished = True else: total_chars += len(child) elif hasattr(child, 'children'): process_element(child) # Remove empty tags if not child.text.strip(): child.decompose() process_element(soup) return str(soup) def shift_headings(html_string, header_shift=config.formatting.preview_header_shift): soup = BeautifulSoup(html_string, 'html.parser') for level in range(6, 0, -1): # Start from h6 to h1 to avoid overwriting old_tag = f'h{level}' for tag in soup.find_all(old_tag): new_level = min(level + header_shift, 6) # Cap at h6 new_tag = f'h{new_level}' tag.name = new_tag return str(soup) def get_preview_text(html_string: str): return shift_headings(shorten_text_and_clean(html_string)) class TemplateDict(dict): def __init__(self, folder: Path): self.folder = folder super().__init__() def __missing__(self, name: str) -> jinja2.Template: f = self.folder / (name + ".html") if not f.exists(): logger.error("no template with the name %s exists", name) exit(1) t = jinja2.Template(f.read_text()) self[name] = t return t TEMPLATE: Dict[str, jinja2.Template] = TemplateDict(Path(config.setup.source_directory, "templates")) class LanguageDict(dict): def __missing__(self, key: str): if key not in config.languages: raise KeyError(key) lang_dict = config.languages[key] lang_dict["priority"] = lang_dict.get("priority", 0) elements = key.split("_") if len(elements) > 1: elements[-1] = elements[-1].upper() lang_dict["code"] = "-".join(elements) return lang_dict LANGUAGES = LanguageDict() def add_html_link(c): name = c["name"] url = c["url"] c["link"] = f'{name}' def get_translated_articles(articles: List[Article], language_code: str = None) -> List[Union[ArticleTranslation, Article]]: result = {} for a in articles: if a.slug in result: continue if language_code is None: result[a.slug] = a continue if not config.fall_back_to_overview_in_translation and language_code not in a.article_translations_map: continue result[a.slug] = a.article_translations_map.get(language_code, a) class ArticleList(UserList): def __init__(self, iterable): super().__init__(item for item in iterable) self.used_slugs = set() def append(self, a: Union[Article, str]): if isinstance(a, str): a = ARTICLE_LAKE[a] if a.slug in self.used_slugs: return self.used_slugs.add(a.slug) self.data.append(a) def extend(self, other): for a in other: self.append(a) def get_translated(self, language_code: str) -> ArticleList[Union[ArticleTranslation, Article]]: res = ArticleList([]) for a in self: if not config.fall_back_to_overview_in_translation and language_code not in a.article_translations_map: continue res.append(a.article_translations_map.get(language_code, a)) return res @property def context(self) -> List[Union[ArticleContext, ArticleTranslationContext]]: return [a.context for a in self] class ArticleTranslation: article: Article slug: str = property(fget=lambda self: self.article.slug) file: Path @cached_property def html_content(self) -> str: html_content = self.file.read_text() if self.file.suffix == ".md": return markdown(html_content, extras=config.formatting.markdown_extras) return html_content @cached_property def language_code(self) -> str: language_code = self.file.stem.lower().replace("-", "_") if language_code in config.languages: return language_code language_code = language_code.split("_")[0] if language_code in config.languages: return language_code logger.error("Didn't recognize %s as a valid language code, add it to the config, or fix your structure.", stem) exit(1) @cached_property def priority(self) -> int: return LANGUAGES[self.language_code]["priority"] @cached_property def slug_path(self) -> List[str]: return [self.language_code, *self.article.slug_path] @cached_property def url(self) -> str: return "/" + "/".join(self.slug_path) @cached_property def dist_path(self) -> Path: return Path(config.setup.dist_directory, *self.slug_path) context: ArticleTranslationContext cross_article_context: Dict[str, Any] def __init__(self, file: Path, article: Article): self.article = article self.file = file self.context = TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code][self.article.slug] = {} @cached_property def name(self) -> str: soup = BeautifulSoup(self.html_content, 'html.parser') for level in range(1, 7): header = soup.find(f'h{level}') if header: return header.get_text(strip=True) return self.article.name def __init_context__(self): self.context["slug"] = self.article.slug self.context["name"] = self.name self.context["url"] = self.url add_html_link(self.context) self.context["date"] = self.article.modified_at.strftime(config.formatting.datetime_format) self.context["iso_date"] = self.article.modified_at.isoformat() self.context["author"] = self.article.author self.context["language"] = LANGUAGES[self.language_code] self.context["article_url"] = self.article.url # get children self.context["children"] = self.article.child_articles.get_translated(self.language_code).context self.context["breadcrumbs"] = ArticleList(self.article.article_path).get_translated(self.language_code).context def __init_content_context__(self): template = jinja2.Template(self.html_content) template.environment.accessed_keys = [] template.environment.context_class = ContextDict self.html_content = template.render({ **CROSS_ARTICLE_CONTEXT, **TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code], }) template.environment.context_class = jinja2.runtime.Context accessed_keys = template.environment.accessed_keys for key in accessed_keys: self.article.linked_articles.append(key) self.context["content"] = self.html_content self.context["preview"] = get_preview_text(html_string=self.html_content) self.context["linked"] = self.article.linked_articles.get_translated(self.language_code).context self.context["related"] = self.article.related_articles.get_translated(self.language_code).context def build(self): self.dist_path.mkdir(parents=True, exist_ok=True) with Path(self.dist_path, "index.html").open("w") as f: f.write(TEMPLATE["article_translation"].render(self.context)) class Article: directory: Path @cached_property def config(self) -> ArticleConfig: config_file = self.directory / "index.toml" return toml.load(config_file) if config_file.exists() else {} @cached_property def slug(self) -> str: slug = self.config.get("name", self.directory.name) if slug in ARTICLE_LAKE: logger.error("two articles have the same name at %s and %r", ARTICLE_LAKE[slug].directory, self.directory) exit(1) return slug @cached_property def name(self) -> str: return self.config.get("name", self.slug) article_path: List[Article] @cached_property def slug_path(self) -> List[str]: return [a.slug for a in self.article_path[1:]] @cached_property def url(self) -> str: return "/" + "/".join(self.slug_path) @cached_property def dist_path(self) -> Path: return Path(config.setup.dist_directory, *self.slug_path) context: ArticleContext child_articles: ArticleList[Article] article_translations_list: List[ArticleTranslation] article_translations_map: Dict[str, ArticleTranslation] linked_articles: ArticleList[Article] @cached_property def related_articles(self) -> ArticleList[Article]: res = ArticleList(self.child_articles) res.extend(self.linked_articles) return res def __init__(self, directory: Path, article_path: Optional[List[str]] = None, is_root: bool = False, parent: Optional[Article] = None): self.directory = directory self.article_path: List[Article] = article_path or [] self.article_path.append(self) self.context = CROSS_ARTICLE_CONTEXT[self.slug] = {} ARTICLE_LAKE[self.slug] = self self.linked_articles = ArticleList([]) # build the tree self.child_articles = ArticleList([]) self.article_translations_list = [] self.article_translations_map = {} for c in self.directory.iterdir(): if c.name == "index.toml": continue if c.is_file(): at = ArticleTranslation(c, self) self.article_translations_list.append(at) self.article_translations_map[at.language_code] = at elif c.is_dir(): self.child_articles.append(Article( directory=c, article_path=self.article_path.copy(), parent=self, )) self.article_translations_list.sort(key=lambda a: a.priority, reverse=True) logger.info("found %s at %s with the translations %s", self.slug, ".".join(list(self.slug_path)), ",".join(self.article_translations_map.keys())) @cached_property def modified_at(self) -> datetime: if "iso_date" in self.config: return datetime.fromisoformat(self.config["iso_date"]) """ TODO scann every article file and use the youngest article file """ return datetime.fromtimestamp(self.directory.stat().st_mtime) @cached_property def author(self) -> str: return self.config.get("author", config.default_author) def __init_context__(self): self.context["slug"] = self.slug self.context["name"] = self.name self.context["url"] = self.url add_html_link(self.context) self.context["date"] = self.modified_at.strftime(config.formatting.datetime_format) self.context["iso_date"] = self.modified_at.isoformat() self.context["author"] = self.author # recursive context structures self.context["translations"] = [c.context for c in self.article_translations_list] self.context["children"] = self.child_articles.context self.context["breadcrumbs"] = [b.context for b in self.article_path] for lang, article in self.article_translations_map.items(): self.context[lang] = article.context for at in self.article_translations_list: at.__init_context__() for a in self.child_articles: a.__init_context__() def __init_content_context__(self): for at in self.article_translations_list: at.__init_content_context__() self.context["linked"] = self.linked_articles.context self.context["related"] = self.related_articles.context for a in self.child_articles: a.__init_content_context__() def build(self): self.dist_path.mkdir(parents=True, exist_ok=True) with Path(self.dist_path, "index.html").open("w") as f: f.write(TEMPLATE["article"].render(self.context)) for at in self.article_translations_list: at.build() for ac in self.child_articles: ac.build() class ContextDict(jinja2.runtime.Context): def resolve_or_missing(self, key: str) -> Any: self.environment.accessed_keys.append(key) return super().resolve_or_missing(key) # GLOBALS logger = logging.getLogger("stsg.build") ARTICLE_LAKE: Dict[str, Article] = {} CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Any]] = {} TRANSLATED_CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict) ARTICLE_REFERENCE_VALUES: DefaultDict[str, Dict[str, str]] = defaultdict(dict) def build(): logger.info("starting build process...") logger.info("copying static folder...") shutil.copytree(Path(config.setup.source_directory, "static"), Path(config.setup.dist_directory, "static"), dirs_exist_ok=True) logger.info("building page tree...") tree = Article(directory=Path(config.setup.source_directory, "articles"), is_root=True) logger.info("compiling tree context...") tree.__init_context__() tree.__init_content_context__() """ import json with Path("context.json").open("w") as f: json.dump(tree.context, f, indent=4) with Path("cross_article_context.json").open("w") as f: json.dump(CROSS_ARTICLE_CONTEXT, f, indent=4) with Path("t_cross_article_context.json").open("w") as f: json.dump(TRANSLATED_CROSS_ARTICLE_CONTEXT, f, indent=4) """ logger.info("dumping page tree...") tree.build()