STSG/stsg/build.py

from __future__ import annotations
import logging
import shutil
from pathlib import Path
import os
import markdown
from typing import Optional, Union, Dict, Generator, List, DefaultDict, Any, TypedDict, Set
from bs4 import BeautifulSoup
from collections import defaultdict
import toml
from datetime import datetime
import jinja2

from . import config


def get_first_header_content(content, fallback: str = ""):
    """Return the text of the most prominent heading found in *content*.

    Heading levels are probed from ``h1`` down to ``h6``; the first level
    that occurs in the document wins, and the text of its first occurrence
    is returned with surrounding whitespace stripped.

    :param content: HTML markup to search.
    :param fallback: value returned when the markup contains no heading.
    """
    document = BeautifulSoup(content, 'html.parser')

    # Lazy generator: lookups stop at the first heading level that exists.
    probes = (document.find(f'h{depth}') for depth in range(1, 7))
    heading = next((tag for tag in probes if tag), None)

    if heading is None:
        return fallback
    return heading.get_text(strip=True)


def shorten_text_and_clean(html_string, max_length=config.formatting.preview_length):
    """Truncate the text of an HTML fragment while keeping its markup.

    The parsed tree is walked in document order. Text nodes are kept until
    *max_length* characters have been consumed; the node that crosses the
    limit is cut and suffixed with ``...``; everything after it is removed.
    Tags that end up without any non-whitespace text are dropped.

    :param html_string: HTML markup to shorten.
    :param max_length: number of text characters to keep.
    :return: the shortened markup serialized back to a string.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    budget = max_length   # characters that may still be emitted
    exhausted = False     # flips once the limit has been hit

    def prune(node):
        nonlocal budget, exhausted

        # Iterate over a snapshot, since children are removed while walking.
        for item in list(node.children):
            if exhausted:
                item.extract()
            elif isinstance(item, str):
                if budget <= 0:
                    item.extract()
                    exhausted = True
                elif len(item) <= budget:
                    budget -= len(item)
                else:
                    item.replace_with(item[:budget] + '...')
                    budget = 0
                    exhausted = True
            elif hasattr(item, 'children'):
                prune(item)
                # Drop tags that carry no visible text after pruning.
                if not item.text.strip():
                    item.decompose()

    prune(soup)

    return str(soup)


def shift_headings(html_string, header_shift=config.formatting.preview_header_shift):
    """Move every heading in *html_string* by *header_shift* levels.

    All headings are collected first and renamed in a single pass, so a tag
    is shifted exactly once regardless of the sign of *header_shift*. The
    resulting level is clamped to the valid range ``h1``..``h6``.

    :param html_string: HTML markup whose headings should be shifted.
    :param header_shift: number of levels to add (negative values promote).
    :return: the markup with renamed heading tags, serialized to a string.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    heading_names = [f'h{level}' for level in range(1, 7)]

    # find_all() returns the complete result list up front, so renaming
    # tags while looping cannot cause a heading to be visited twice.
    for tag in soup.find_all(heading_names):
        level = int(tag.name[1])
        new_level = max(1, min(level + header_shift, 6))
        tag.name = f'h{new_level}'

    return str(soup)


def get_preview_text(html_string: str):
    """Build the preview markup: shorten the content, then shift its headings."""
    shortened = shorten_text_and_clean(html_string)
    return shift_headings(shortened)


def stem_to_language_code(stem: str) -> str:
    """Map a file stem to a key of ``config.languages``.

    The stem is lower-cased and ``-`` is replaced by ``_``. The full result
    is tried first; if it is not configured, only the part before the first
    ``_`` is tried.

    :param stem: file name without suffix, e.g. ``en-US``.
    :return: the matching key of ``config.languages``.
    :raises SystemExit: with code 1 when neither candidate is configured.
    """
    normalized = stem.lower().replace("-", "_")

    for candidate in (normalized, normalized.split("_")[0]):
        if candidate in config.languages:
            return candidate

    logger.error("Didn't recognize %s as a valid language code, add it to the config, or fix your structure.", stem)
    # The exit() builtin is injected by the site module and may be missing;
    # raising SystemExit directly has the same effect and is always available.
    raise SystemExit(1)


class TemplateDict(dict):
    """Dictionary that loads jinja2 templates from a folder on first access.

    Looking up a missing key ``name`` reads ``<folder>/<name>.html``, wraps
    it in a :class:`jinja2.Template`, stores it under ``name`` and returns it.
    """

    def __init__(self, folder: Path):
        # Directory that contains the ``*.html`` template files.
        self.folder = folder
        super().__init__()

    def __missing__(self, name: str) -> jinja2.Template:
        """Load, cache and return the template called *name*.

        :raises SystemExit: with code 1 when the template file does not exist.
        """
        f = self.folder / (name + ".html")
        if not f.exists():
            logger.error("no template with the name %s exists", name)
            # The exit() builtin is injected by the site module and may be
            # missing; SystemExit is equivalent and always available.
            raise SystemExit(1)

        t = jinja2.Template(f.read_text())
        self[name] = t
        return t


# Template cache for "<source_directory>/templates"; entries are loaded from
# disk the first time their name is looked up.
TEMPLATE: Dict[str, jinja2.Template] = TemplateDict(Path(config.setup.source_directory, "templates"))


class LanguageDict(dict):
    """Mapping that resolves language settings from ``config.languages``.

    A lookup of a missing key returns the matching entry of
    ``config.languages`` after filling in two derived fields on it:
    ``priority`` (kept if present, otherwise ``0``) and ``code`` (the key
    with ``_`` turned into ``-`` and its last segment upper-cased when the
    key has more than one segment).
    """

    def __missing__(self, key: str):
        configured = config.languages
        if key not in configured:
            raise KeyError(key)

        entry = configured[key]
        entry["priority"] = entry.get("priority", 0)

        # Split off the last "_"-separated segment; only that one is upper-cased.
        prefix, separator, last = key.rpartition("_")
        if separator:
            entry["code"] = prefix.replace("_", "-") + "-" + last.upper()
        else:
            entry["code"] = key

        return entry


# Shared lookup of language settings, keyed by the keys of config.languages.
LANGUAGES = LanguageDict()


class CrossArticleContext(TypedDict):
    """Summary of an article (or one of its translations) for cross-referencing."""

    slug: str
    title: str  # for an Article this equals the slug; for a translation it is the heading text
    url: str
    article_url: str    # for an Article this equals url; for a translation it points to the owning article
    date: str
    iso_date: str


class ArticleTranslation:
    """A single language version of an :class:`Article`, backed by one file.

    The language is derived from the file stem. Markdown files (``.md``) are
    converted to HTML; any other file is used verbatim as HTML.
    """

    def __init__(self, file: Path, article: Article):
        self.file = file
        self.article = article

        self.context: Dict[str, Any] = {}

        # Language and output location are derived from the file name and
        # from the position of the owning article in the tree.
        code = stem_to_language_code(self.file.stem)
        self.language_code = code
        self.location_in_tree = [code, *self.article.location_in_tree]
        self.url = "/" + "/".join(self.location_in_tree)
        self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree)

        # Register an (initially empty) entry in the per-language registry.
        registry_entry: Dict[str, Any] = {}
        self.cross_article_context = registry_entry
        TRANSLATED_CROSS_ARTICLE_CONTEXT[code][self.article.slug] = registry_entry

        language = LANGUAGES[code]
        self.priority = language["priority"]
        self.real_language_code = language["code"]

        source_text = self.file.read_text()
        is_markdown = self.file.suffix == ".md"
        self.html_content = markdown.markdown(source_text) if is_markdown else source_text

    def __init_context__(self):
        """Fill the template context and the cross-article registry entry."""
        language = LANGUAGES[self.language_code]
        title = get_first_header_content(self.html_content, fallback=language["native_name"])

        self.context.update({
            "meta": self.article.context_shared,
            "url": self.url,
            "language": language,
            "article_url": self.article.url,
            "title": title,
        })

        self.cross_article_context.update(self.article.context_shared)
        self.cross_article_context.update(
            title=title,
            article_url=self.article.url,
            url=self.url,
        )

        # Contexts of the child articles that exist in this language.
        children = []
        for child in self.article.child_articles:
            sibling = child.article_translations_map.get(self.language_code)
            if sibling is not None:
                children.append(sibling.context)
        self.context["children"] = children

    def __init_content_context__(self):
        """Add the rendered content and its preview to the context."""
        html = self.html_content
        self.context["content"] = html
        self.context["preview"] = get_preview_text(html_string=html)

    def build(self):
        """Render this translation to ``<dist_path>/index.html``."""
        self.dist_path.mkdir(parents=True, exist_ok=True)

        target = self.dist_path / "index.html"
        with target.open("w") as handle:
            handle.write(TEMPLATE["article_translation"].render(self.context))


class Article:
    """A node of the article tree, backed by one directory.

    Every file in the directory except ``index.toml`` becomes an
    :class:`ArticleTranslation`; every sub-directory becomes a child
    ``Article``. ``index.toml`` may override the slug (``name``) and the
    publication date (``datetime``).
    """

    def __init__(self, directory: Path, location_in_tree: Optional[List[str]] = None, is_root: bool = False, parent: Optional[Article] = None):
        """Scan *directory* and recursively build the subtree below it.

        :param directory: directory holding the article's files.
        :param location_in_tree: slugs of the ancestors, outermost first.
        :param is_root: when true, the own slug is not added to the location.
        :param parent: the parent article, if any.
        :raises SystemExit: with code 1 when the slug is already in use.
        """
        self.directory = directory

        self.context: Dict[str, Any] = {}
        self.context_shared: Dict[str, Any] = {}
        if parent is not None:
            self.context["parent"] = parent.context_shared

        # initializing the config values of the article
        config_file = self.directory / "index.toml"
        self.config = toml.load(config_file) if config_file.exists() else {}

        # initializing the location and slug of the article
        self.slug = self.config.get("name", self.directory.name)
        if self.slug in ARTICLE_LAKE:
            logger.error("two articles have the same name at %s and %r", ARTICLE_LAKE[self.slug].directory, self.directory)
            # exit() is injected by the site module and may be missing;
            # SystemExit is equivalent and always available.
            raise SystemExit(1)
        self.cross_article_context = CROSS_ARTICLE_CONTEXT[self.slug] = {}
        ARTICLE_LAKE[self.slug] = self

        # Copy, so appending the slug never mutates the caller's list.
        self.location_in_tree: List[str] = list(location_in_tree or [])
        if not is_root:
            self.location_in_tree.append(self.slug)
        self.url = "/" + "/".join(self.location_in_tree)
        self.dist_path = Path(config.setup.dist_directory, *self.location_in_tree)

        # build the tree
        self.child_articles: List[Article] = []
        self.article_translations_list: List[ArticleTranslation] = []
        self.article_translations_map: Dict[str, ArticleTranslation] = {}

        # iterdir() yields entries in arbitrary order; sorting keeps the
        # order of children (and thus the generated output) reproducible.
        for c in sorted(self.directory.iterdir()):
            if c.name == "index.toml":
                continue

            if c.is_file():
                at = ArticleTranslation(c, self)
                self.article_translations_list.append(at)
                self.article_translations_map[at.language_code] = at
            elif c.is_dir():
                self.child_articles.append(Article(
                    directory=c,
                    location_in_tree=self.location_in_tree.copy(),
                    parent=self,
                ))

        # Highest priority first.
        self.article_translations_list.sort(key=lambda a: a.priority, reverse=True)

        logger.info("found %s at %s with the translations %s", self.slug, ".".join(self.location_in_tree), ",".join(self.article_translations_map))

    def _resolve_modified_at(self) -> datetime:
        """Return the article date from the config, or the directory mtime."""
        if "datetime" not in self.config:
            return datetime.fromtimestamp(self.directory.stat().st_mtime)

        raw = self.config["datetime"]
        # An unquoted TOML date-time is already parsed into a datetime object;
        # passing that to fromisoformat() would raise a TypeError.
        if isinstance(raw, datetime):
            return raw
        return datetime.fromisoformat(str(raw))

    def __init_context__(self):
        """Fill the template contexts of this article and of its subtree."""
        self.context_shared["url"] = self.url
        self.context_shared["slug"] = self.slug

        modified_at = self._resolve_modified_at()
        self.context_shared["date"] = modified_at.strftime(config.formatting.datetime_format)
        self.context_shared["iso_date"] = modified_at.isoformat()

        self.context.update(self.context_shared)

        self.cross_article_context.update(self.context_shared)
        self.cross_article_context["title"] = self.context_shared["slug"]
        self.cross_article_context["article_url"] = self.context_shared["url"]

        # recursive context structures
        translation_list = self.context["translations"] = []
        child_article_list = self.context["children"] = []

        # The context dicts are linked now and filled further below, so the
        # references stay valid once the recursion has populated them.
        for article_translation in self.article_translations_list:
            self.context[article_translation.real_language_code] = article_translation.context
            translation_list.append(article_translation.context)

        for child_article in self.child_articles:
            child_article_list.append(child_article.context)

        # recursively build context
        for at in self.article_translations_list:
            at.__init_context__()
        for a in self.child_articles:
            a.__init_context__()

    def __init_content_context__(self):
        """Add content and preview to every translation in the subtree."""
        for at in self.article_translations_list:
            at.__init_content_context__()
        for a in self.child_articles:
            a.__init_content_context__()

    def build(self):
        """Render this article, its translations and its children to disk."""
        self.dist_path.mkdir(parents=True, exist_ok=True)

        with Path(self.dist_path, "index.html").open("w") as f:
            f.write(TEMPLATE["article"].render(self.context))

        for at in self.article_translations_list:
            at.build()

        for ac in self.child_articles:
            ac.build()


# GLOBALS
# Module logger. The definitions above only resolve this name when they are
# called, so defining it after them works.
logger = logging.getLogger("stsg.build")
# slug -> cross-article context; each Article registers its entry on construction.
CROSS_ARTICLE_CONTEXT: Dict[str, CrossArticleContext] = {}
# language code -> slug -> cross-article context; filled by ArticleTranslation.
TRANSLATED_CROSS_ARTICLE_CONTEXT: Dict[str, Dict[str, CrossArticleContext]] = defaultdict(dict)
# slug -> Article; used to detect two articles with the same slug.
ARTICLE_LAKE: Dict[str, Article] = {}
# NOTE(review): not referenced in the visible part of this file — purpose to be confirmed.
ARTICLE_REFERENCE_VALUES: DefaultDict[str, Dict[str, str]] = defaultdict(dict)


def build():
    """Build the whole site into the configured dist directory.

    Steps: copy the ``static`` folder, scan the ``articles`` folder into an
    :class:`Article` tree, compute all template contexts, dump the contexts
    as JSON files into the current working directory, and finally render
    every page.
    """
    logger.info("starting build process...")

    # The registries are filled by the Article/ArticleTranslation
    # constructors. Reset them in place so that calling build() again in the
    # same process does not abort with a false "same name" error, and drop
    # cached templates so edits on disk are picked up.
    ARTICLE_LAKE.clear()
    CROSS_ARTICLE_CONTEXT.clear()
    TRANSLATED_CROSS_ARTICLE_CONTEXT.clear()
    TEMPLATE.clear()

    logger.info("copying static folder...")
    shutil.copytree(Path(config.setup.source_directory, "static"), Path(config.setup.dist_directory, "static"), dirs_exist_ok=True)

    logger.info("building page tree...")
    tree = Article(directory=Path(config.setup.source_directory, "articles"), is_root=True)

    logger.info("compiling tree context...")
    tree.__init_context__()
    tree.__init_content_context__()

    # NOTE(review): these dumps land in the current working directory and
    # look like debugging aids — confirm whether they should stay.
    import json
    dumps = (
        ("context.json", tree.context),
        ("cross_article_context.json", CROSS_ARTICLE_CONTEXT),
        ("t_cross_article_context.json", TRANSLATED_CROSS_ARTICLE_CONTEXT),
    )
    for file_name, payload in dumps:
        with Path(file_name).open("w") as f:
            json.dump(payload, f, indent=4)

    logger.info("dumping page tree...")
    tree.build()