STSG/stsg/build.py

327 lines
11 KiB
Python

from __future__ import annotations
import logging
import shutil
from pathlib import Path
import os
import markdown
from typing import Optional, Union, Dict, Generator, List, DefaultDict, Any, TypedDict, Set
from bs4 import BeautifulSoup
from collections import defaultdict
import toml
from datetime import datetime
import jinja2
from . import config
def get_first_header_content(content, fallback: str = ""):
soup = BeautifulSoup(content, 'html.parser')
for level in range(1, 7):
header = soup.find(f'h{level}')
if header:
return header.get_text(strip=True)
return fallback
def shorten_text_and_clean(html_string, max_length=config.formatting.preview_length):
soup = BeautifulSoup(html_string, 'html.parser')
# Keep track of total characters added
total_chars = 0
finished = False
# Function to recursively trim and clean text
def process_element(element):
nonlocal total_chars, finished
for child in list(element.children):
if finished:
child.extract()
continue
if isinstance(child, str):
remaining = max_length - total_chars
if remaining <= 0:
child.extract()
finished = True
elif len(child) > remaining:
child.replace_with(child[:remaining] + '...')
total_chars = max_length
finished = True
else:
total_chars += len(child)
elif hasattr(child, 'children'):
process_element(child)
# Remove empty tags
if not child.text.strip():
child.decompose()
process_element(soup)
return str(soup)
def shift_headings(html_string, header_shift=config.formatting.preview_header_shift):
soup = BeautifulSoup(html_string, 'html.parser')
for level in range(6, 0, -1): # Start from h6 to h1 to avoid overwriting
old_tag = f'h{level}'
for tag in soup.find_all(old_tag):
new_level = min(level + header_shift, 6) # Cap at h6
new_tag = f'h{new_level}'
tag.name = new_tag
return str(soup)
def get_preview_text(html_string: str):
return shift_headings(shorten_text_and_clean(html_string))
def stem_to_language_code(stem: str) -> str:
language_code = stem.lower().replace("-", "_")
if language_code in config.languages:
return language_code
language_code = language_code.split("_")[0]
if language_code in config.languages:
return language_code
logger.error("Didn't recognize %s as a valid language code, add it to the config, or fix your structure.", stem)
exit(1)
class TemplateDict(dict):
def __init__(self, folder: Path):
self.folder = folder
super().__init__()
def __missing__(self, name: str) -> jinja2.Template:
f = self.folder / (name + ".html")
if not f.exists():
logger.error("no template with the name %s exists", name)
exit(1)
t = jinja2.Template(f.read_text())
self[name] = t
return t
TEMPLATE: Dict[str, jinja2.Template] = TemplateDict(Path(config.setup.source_directory, "templates"))
class LanguageDict(dict):
def __missing__(self, key: str):
if key not in config.languages:
raise KeyError(key)
lang_dict = config.languages[key]
lang_dict["priority"] = lang_dict.get("priority", 0)
elements = key.split("_")
if len(elements) > 1:
elements[-1] = elements[-1].upper()
lang_dict["code"] = "-".join(elements)
return lang_dict
LANGUAGES = LanguageDict()
class CrossArticleContext(TypedDict):
slug: str
title: str # from article is the same as slug
url: str
article_url: str # from article it is the same as url, from article_taanslation it links to the article
date: str
iso_date: str
class ArticleTranslation:
def __init__(self, file: Path, article: Article):
self.file = file
self.article = article
self.context: Dict[str, Any] = {}
# initializing the location of the article translation
self.language_code = stem_to_language_code(self.file.stem)
self.location_in_tree = [self.language_code, *self.article.location_in_tree]
self.url = "/" + "/".join(self.location_in_tree)
self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree)
self.cross_article_context = TRANSLATED_CROSS_ARTICLE_CONTEXT[self.language_code][self.article.slug] = {}
self.priority = LANGUAGES[self.language_code]["priority"]
self.real_language_code = LANGUAGES[self.language_code]["code"]
self.html_content = self.file.read_text()
if self.file.suffix == ".md":
self.html_content = markdown.markdown(self.html_content)
def __init_context__(self):
self.context["meta"] = self.article.context_shared
self.context["url"] = self.url
self.context["language"] = LANGUAGES[self.language_code]
self.context["article_url"] = self.article.url
self.context["title"] = get_first_header_content(self.html_content, fallback=LANGUAGES[self.language_code]["native_name"])
self.cross_article_context.update(self.article.context_shared)
self.cross_article_context["title"] = self.context["title"]
self.cross_article_context["article_url"] = self.article.url
self.cross_article_context["url"] = self.url
# get children
self.context["children"] = [
c.article_translations_map[self.language_code].context for c in self.article.child_articles
if self.language_code in c.article_translations_map
]
def __init_content_context__(self):
self.context["content"] = self.html_content
self.context["preview"] = get_preview_text(html_string=self.html_content)
def build(self):
self.dist_path.mkdir(parents=True, exist_ok=True)
with Path(self.dist_path, "index.html").open("w") as f:
f.write(TEMPLATE["article_translation"].render(self.context))
class Article:
def __init__(self, directory: Path, location_in_tree: Optional[List[str]] = None, is_root: bool = False, parent: Optional[Article] = None):
self.directory = directory
self.context: Dict[str, Any] = {}
self.context_shared: Dict[str, Any] = {}
if parent is not None:
self.context["parent"] = parent.context_shared
# initializing the config values of the article
config_file = self.directory / "index.toml"
self.config = toml.load(config_file) if config_file.exists() else {}
# initializing the location and slug of the article
self.slug = self.config.get("name", self.directory.name)
if self.slug in ARTICLE_LAKE:
logger.error("two articles have the same name at %s and %r", ARTICLE_LAKE[self.slug].directory, self.directory)
exit(1)
self.cross_article_context = CROSS_ARTICLE_CONTEXT[self.slug] = {}
ARTICLE_LAKE[self.slug] = self
self.location_in_tree: List[str] = location_in_tree or []
if not is_root:
self.location_in_tree.append(self.slug)
self.url = "/" + "/".join(self.location_in_tree)
self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree)
# build the tree
self.child_articles: List[Article] = []
self.article_translations_list: List[ArticleTranslation] = []
self.article_translations_map: Dict[str, ArticleTranslation] = {}
for c in self.directory.iterdir():
if c.name == "index.toml":
continue
if c.is_file():
at = ArticleTranslation(c, self)
self.article_translations_list.append(at)
self.article_translations_map[at.language_code] = at
elif c.is_dir():
self.child_articles.append(Article(
directory=c,
location_in_tree=self.location_in_tree.copy(),
parent=self,
))
self.article_translations_list.sort(key=lambda a: a.priority, reverse=True)
logger.info("found %s at %s with the translations %s", self.slug, ".".join(list(self.location_in_tree)), ",".join(self.article_translations_map.keys()))
def __init_context__(self):
self.context_shared["url"] = self.url
self.context_shared["slug"] = self.slug
modified_at = datetime.fromisoformat(self.config["datetime"]) if "datetime" in self.config else datetime.fromtimestamp(self.directory.stat().st_mtime)
self.context_shared["date"] = modified_at.strftime(config.formatting.datetime_format)
self.context_shared["iso_date"] = modified_at.isoformat()
self.context.update(self.context_shared)
self.cross_article_context.update(self.context_shared)
self.cross_article_context["title"] = self.context_shared["slug"]
self.cross_article_context["article_url"] = self.context_shared["url"]
# recursive context structures
translation_list = self.context["translations"] = []
child_article_list = self.context["children"] = []
for article_translation in self.article_translations_list:
self.context[article_translation.real_language_code] = article_translation.context
translation_list.append(article_translation.context)
for child_article in self.child_articles:
child_article_list.append(child_article.context)
# recursively build context
for at in self.article_translations_list:
at.__init_context__()
for a in self.child_articles:
a.__init_context__()
def __init_content_context__(self):
for at in self.article_translations_list:
at.__init_content_context__()
for a in self.child_articles:
a.__init_content_context__()
def build(self):
self.dist_path.mkdir(parents=True, exist_ok=True)
with Path(self.dist_path, "index.html").open("w") as f:
f.write(TEMPLATE["article"].render(self.context))
for at in self.article_translations_list:
at.build()
for ac in self.child_articles:
ac.build()
# GLOBALS
logger = logging.getLogger("stsg.build")
CROSS_ARTICLE_CONTEXT: Dict[str, CrossArticleContext] = {}
TRANSLATED_CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, CrossArticleContext]] = defaultdict(dict)
ARTICLE_LAKE: Dict[str, Article] = {}
ARTICLE_REFERENCE_VALUES: DefaultDict[str, Dict[str, str]] = defaultdict(dict)
def build():
logger.info("starting build process...")
logger.info("copying static folder...")
shutil.copytree(Path(config.setup.source_directory, "static"), Path(config.setup.dist_directory, "static"), dirs_exist_ok=True)
logger.info("building page tree...")
tree = Article(directory=Path(config.setup.source_directory, "articles"), is_root=True)
logger.info("compiling tree context...")
tree.__init_context__()
tree.__init_content_context__()
import json
with Path("context.json").open("w") as f:
json.dump(tree.context, f, indent=4)
with Path("cross_article_context.json").open("w") as f:
json.dump(CROSS_ARTICLE_CONTEXT, f, indent=4)
with Path("t_cross_article_context.json").open("w") as f:
json.dump(TRANSLATED_CROSS_ARTICLE_CONTEXT, f, indent=4)
logger.info("dumping page tree...")
tree.build()