428 lines
16 KiB
Python
428 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parent.parent
ASSETS_DIR = ROOT / "assets"
IMAGES_DIR = ASSETS_DIR / "images"
# Full snapshot of assets/ refreshed before any destructive optimization.
BACKUP_DIR = ROOT / "assets_backup"
# File types scanned (and rewritten) when updating image references.
TEXT_EXTENSIONS = {".html", ".css", ".js"}
# Raster formats eligible for optimization and webp conversion.
SOURCE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif"}
MANAGED_IMAGE_EXTENSIONS = SOURCE_EXTENSIONS | {".webp"}
# An image counts as "large" above this byte size or pixel dimension.
MAX_FILE_SIZE = 300_000
MAX_DIMENSION = 2560
# Acceptance thresholds for keeping a conversion: files under
# SMALL_FILE_LIMIT must save an absolute byte amount; larger files must
# save at least a fraction of their size (see should_keep_conversion).
SMALL_FILE_LIMIT = 150 * 1024
SMALL_MIN_SAVINGS = 8 * 1024
LARGE_MIN_RATIO = 0.05
|
|
|
|
|
|
@dataclass(frozen=True)
class ImageInfo:
    """Immutable snapshot of an image's geometry and palette size.

    Populated from `magick identify` output by ``image_info``.
    """

    width: int  # pixel width (%w)
    height: int  # pixel height (%h)
    colors: int  # number of unique colors (%k)
|
|
|
|
|
|
def run(cmd: list[str], capture: bool = False) -> str:
    """Execute *cmd*, raising CalledProcessError on a non-zero exit status.

    Returns the stripped stdout when *capture* is true, otherwise "".
    """
    completed = subprocess.run(cmd, check=True, text=True, capture_output=capture)
    if not capture:
        return ""
    return completed.stdout.strip()
|
|
|
|
|
|
def should_keep_conversion(source_size: int, candidate_size: int) -> bool:
    """Decide whether a re-encoded file saves enough bytes to be worth keeping.

    Sources under SMALL_FILE_LIMIT must shed an absolute byte amount
    (SMALL_MIN_SAVINGS); larger sources must shed at least LARGE_MIN_RATIO
    of their original size.
    """
    saved = source_size - candidate_size
    if saved <= 0:
        return False
    is_small = source_size < SMALL_FILE_LIMIT
    return saved >= SMALL_MIN_SAVINGS if is_small else saved / source_size >= LARGE_MIN_RATIO
|
|
|
|
|
|
def classify_large_image(source_size: int, width: int, height: int) -> bool:
    """Return True when the image exceeds the byte-size or pixel-dimension budget."""
    if source_size > MAX_FILE_SIZE:
        return True
    return max(width, height) > MAX_DIMENSION
|
|
|
|
|
|
def image_info(path: Path) -> ImageInfo:
    """Probe *path* with `magick identify` and return its dimensions and color count."""
    fields = run(
        ["magick", "identify", "-ping", "-format", "%w %h %k", str(path)],
        capture=True,
    ).split()
    width, height, colors = (int(field) for field in fields)
    return ImageInfo(width, height, colors)
|
|
|
|
|
|
def text_files() -> list[Path]:
    """Collect the site's html/css/js files, skipping backups, git, docs and tests."""
    excluded_dirs = ("assets_backup", ".git", "docs", "tests")
    found = [
        candidate
        for candidate in ROOT.rglob("*")
        if candidate.is_file()
        and candidate.suffix.lower() in TEXT_EXTENSIONS
        and not any(name in candidate.parts for name in excluded_dirs)
    ]
    return sorted(found)
|
|
|
|
|
|
def source_images() -> list[Path]:
    """List raster originals (png/jpg/jpeg/gif) under assets/images.

    AppleDouble sidecar files ("._*") are ignored.
    """
    found: list[Path] = []
    for candidate in IMAGES_DIR.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.name.startswith("._"):
            continue
        if candidate.suffix.lower() in SOURCE_EXTENSIONS:
            found.append(candidate)
    return sorted(found)
|
|
|
|
|
|
def colliding_stems(paths: list[Path]) -> set[str]:
    """Return ROOT-relative stems shared by more than one source file.

    E.g. `a.png` and `a.jpg` in the same directory collide on stem "a".
    """
    tally: Counter[str] = Counter()
    for path in paths:
        tally[path.with_suffix("").relative_to(ROOT).as_posix()] += 1
    return {stem for stem, occurrences in tally.items() if occurrences > 1}
|
|
|
|
|
|
def build_webp_path(source: Path, duplicate_stems: set[str]) -> Path:
    """Return the webp target path for *source*.

    When the source's stem collides with another source file (e.g. `a.png`
    and `a.jpg`), the full original name is kept and ".webp" appended so
    both targets stay distinct; otherwise the extension is simply swapped.

    Fix: the original assigned ``stem_key`` unconditionally before the
    if/else that overwrote it in both branches — the dead assignment is
    removed.
    """
    if source.is_absolute():
        stem_key = source.with_suffix("").relative_to(ROOT).as_posix()
    else:
        stem_key = source.with_suffix("").as_posix()
    if stem_key in duplicate_stems:
        return source.with_name(f"{source.name}.webp")
    return source.with_suffix(".webp")
|
|
|
|
|
|
def managed_webps() -> list[Path]:
    """List existing webp files under assets/images, skipping AppleDouble files."""
    matches = (
        candidate
        for candidate in IMAGES_DIR.rglob("*.webp")
        if candidate.is_file() and not candidate.name.startswith("._")
    )
    return sorted(matches)
|
|
|
|
|
|
def render_candidate(source: Path, operations: list[str], suffix: str) -> Path:
    """Render *source* through ImageMagick *operations* into a sibling temp file.

    The temp file keeps the source's extension and carries *suffix* in its stem.
    """
    output = source.with_name(f"{source.stem}.{suffix}{source.suffix}")
    run(["magick", str(source), *operations, str(output)])
    return output
|
|
|
|
|
|
def render_webp_candidate(source: Path, operations: list[str], suffix: str) -> Path:
    """Render *source* through ImageMagick *operations* into a sibling .webp temp file."""
    output = source.with_name(f"{source.stem}.{suffix}.webp")
    run(["magick", str(source), *operations, str(output)])
    return output
|
|
|
|
|
|
def choose_smaller_original(source: Path, info: ImageInfo) -> tuple[int, int]:
    """Re-encode *source* in place and keep only the smallest result.

    Renders one or more candidate encodings next to the source (profile set
    depends on the format and on whether the image is "large"), replaces the
    source with the winning candidate only when it is strictly smaller, and
    always deletes leftover temp files.

    Returns ``(original_size_bytes, best_size_bytes)``.
    """
    original_size = source.stat().st_size
    extension = source.suffix.lower()
    candidates: list[Path] = []
    try:
        if extension in {".jpg", ".jpeg"}:
            # Baseline re-encode; more aggressive profiles are added below
            # only for images over the size/dimension budget.
            jpeg_profiles = [
                ["-strip", "-sampling-factor", "4:2:0", "-quality", "86", "-interlace", "Plane"],
            ]
            if classify_large_image(original_size, info.width, info.height):
                jpeg_profiles.extend(
                    [
                        ["-strip", "-sampling-factor", "4:2:0", "-quality", "82", "-interlace", "Plane"],
                        ["-strip", "-sampling-factor", "4:2:0", "-quality", "78", "-interlace", "Plane"],
                    ]
                )
            if max(info.width, info.height) > MAX_DIMENSION:
                # The trailing ">" in the resize geometry means "only shrink,
                # never enlarge".
                jpeg_profiles.extend(
                    [
                        [
                            "-strip",
                            "-resize",
                            f"{MAX_DIMENSION}x{MAX_DIMENSION}>",
                            "-sampling-factor",
                            "4:2:0",
                            "-quality",
                            "84",
                            "-interlace",
                            "Plane",
                        ],
                        [
                            "-strip",
                            "-resize",
                            f"{MAX_DIMENSION}x{MAX_DIMENSION}>",
                            "-sampling-factor",
                            "4:2:0",
                            "-quality",
                            "80",
                            "-interlace",
                            "Plane",
                        ],
                    ]
                )
            for index, profile in enumerate(jpeg_profiles):
                candidates.append(render_candidate(source, profile, f"jpg-opt-{index}"))
        elif extension == ".png":
            # PNG: lossless recompression only (no quality loss).
            candidates.append(
                render_candidate(
                    source,
                    ["-strip", "-define", "png:compression-level=9", "-define", "png:compression-filter=5"],
                    "png-opt",
                )
            )
        elif extension == ".gif":
            # GIF: just strip metadata.
            candidates.append(render_candidate(source, ["-strip"], "gif-opt"))

        # The source itself competes; if no candidate beats it, nothing changes.
        best = min([source, *candidates], key=lambda candidate: candidate.stat().st_size)
        best_size = best.stat().st_size
        if best != source:
            # Path.replace moves the winner over the source, so the cleanup
            # in the finally block sees that candidate as already gone.
            best.replace(source)
        return original_size, best_size
    finally:
        # Delete every remaining temp candidate, even on error.
        for candidate in candidates:
            if candidate.exists():
                candidate.unlink()
|
|
|
|
|
|
def choose_webp(source: Path, info: ImageInfo, target_path: Path) -> tuple[bool, int]:
    """Render webp candidates for *source* and keep the best one at *target_path*.

    Profile selection by source format:
    - JPEG: lossy, with a lower-quality profile for "large" images and
      downscaling profiles when a dimension exceeds MAX_DIMENSION.
    - PNG: lossless when small and paletted (<=256 colors), otherwise lossy
      with tuned alpha quality (plus downscaling profiles when oversized).
    - Anything else (e.g. GIF): a single lossless profile.

    The smallest candidate is accepted only when it clears the savings
    thresholds (should_keep_conversion); on rejection any stale target from
    a previous run is deleted. Temp candidates are always cleaned up.

    Returns ``(accepted, best_candidate_size_bytes)``.
    """
    source_size = source.stat().st_size
    extension = source.suffix.lower()
    candidates: list[Path] = []
    try:
        if extension in {".jpg", ".jpeg"}:
            profiles = [
                ["-strip", "-quality", "84", "-define", "webp:method=6"],
                ["-strip", "-quality", "80", "-define", "webp:method=6"],
            ]
            if classify_large_image(source_size, info.width, info.height):
                profiles.append(["-strip", "-quality", "76", "-define", "webp:method=6"])
            if max(info.width, info.height) > MAX_DIMENSION:
                # ">" resize geometry: shrink only, never enlarge.
                profiles.extend(
                    [
                        ["-strip", "-resize", f"{MAX_DIMENSION}x{MAX_DIMENSION}>", "-quality", "82", "-define", "webp:method=6"],
                        ["-strip", "-resize", f"{MAX_DIMENSION}x{MAX_DIMENSION}>", "-quality", "78", "-define", "webp:method=6"],
                    ]
                )
        elif extension == ".png":
            if info.colors <= 256 and source_size < SMALL_FILE_LIMIT:
                # Small paletted PNGs compress well losslessly.
                profiles = [
                    ["-strip", "-define", "webp:lossless=true", "-define", "webp:method=6"],
                ]
            else:
                profiles = [
                    ["-strip", "-quality", "92", "-define", "webp:alpha-quality=95", "-define", "webp:method=6"],
                    ["-strip", "-quality", "88", "-define", "webp:alpha-quality=90", "-define", "webp:method=6"],
                ]
                if max(info.width, info.height) > MAX_DIMENSION:
                    profiles.extend(
                        [
                            [
                                "-strip",
                                "-resize",
                                f"{MAX_DIMENSION}x{MAX_DIMENSION}>",
                                "-quality",
                                "90",
                                "-define",
                                "webp:alpha-quality=92",
                                "-define",
                                "webp:method=6",
                            ],
                            [
                                "-strip",
                                "-resize",
                                f"{MAX_DIMENSION}x{MAX_DIMENSION}>",
                                "-quality",
                                "86",
                                "-define",
                                "webp:alpha-quality=88",
                                "-define",
                                "webp:method=6",
                            ],
                        ]
                    )
        else:
            # Fallback for any other managed format: single lossless pass.
            profiles = [["-strip", "-define", "webp:lossless=true", "-define", "webp:method=6"]]

        for index, profile in enumerate(profiles):
            candidates.append(render_webp_candidate(source, profile, f"webp-opt-{index}"))

        best = min(candidates, key=lambda candidate: candidate.stat().st_size)
        best_size = best.stat().st_size
        if should_keep_conversion(source_size, best_size):
            # Path.replace moves the winner into place, so the finally-block
            # cleanup sees that candidate as already gone.
            best.replace(target_path)
            accepted = True
        else:
            # Drop a stale target from an earlier run so rewritten references
            # never point at an outdated conversion.
            if target_path.exists():
                target_path.unlink()
            accepted = False
        return accepted, best_size
    finally:
        # Delete every remaining temp candidate, even on error.
        for candidate in candidates:
            if candidate.exists():
                candidate.unlink()
|
|
|
|
|
|
def build_replacements(accepted_sources: list[Path], duplicate_stems: set[str]) -> dict[str, str]:
    """Map every textual spelling of an original image path to its webp target.

    For each accepted source, three spellings are registered: the
    ROOT-relative path ("assets/images/..."), the stylesheet-relative form
    ("../images/..."), and the assets-stripped form ("images/...").
    """
    replacements: dict[str, str] = {}
    for source in accepted_sources:
        relative = source.relative_to(ROOT).as_posix()
        target = build_webp_path(source, duplicate_stems).relative_to(ROOT).as_posix()
        replacements[relative] = target
        if relative.startswith("assets/images/"):
            replacements[relative.replace("assets/images/", "../images/", 1)] = target.replace(
                "assets/images/", "../images/", 1
            )
            # NOTE(review): indentation reconstructed — this assets-stripped
            # variant is assumed to belong inside the startswith() guard.
            # Harmless either way: managed sources all live under assets/images/.
            replacements[relative.replace("assets/", "", 1)] = target.replace("assets/", "", 1)
    return replacements
|
|
|
|
|
|
def referenced_asset_paths() -> set[str]:
    """Scan text files for image references; return ROOT-relative paths that exist.

    Each matched reference is resolved both against ROOT and against the
    referencing file's directory; the first resolution that names an existing
    file wins.

    Fix: the "../images" alternative previously had unescaped dots, so ".."
    matched ANY two characters (e.g. "my/images/x.png" was mis-captured
    whole instead of via the plain "images/" branch). The dots are now
    escaped so only a literal "../images" matches.
    """
    references: set[str] = set()
    pattern = re.compile(
        r"(assets/images|\.\./images|images)/[^\"')\s>]+\.(?:png|jpg|jpeg|gif|webp|svg|ico)",
        re.IGNORECASE,
    )
    for path in text_files():
        content = path.read_text(encoding="utf-8")
        for match in pattern.finditer(content):
            ref = match.group(0)
            for candidate in (ROOT / ref, path.parent / ref):
                if candidate.exists() and candidate.is_file():
                    references.add(candidate.resolve().relative_to(ROOT).as_posix())
                    break
    return references
|
|
|
|
|
|
def should_prune_original(source: Path, referenced_paths: set[str], accepted_webp_paths: set[str]) -> bool:
    """Return True when *source* is unreferenced and a matching accepted webp exists.

    Both webp naming schemes are considered: suffix swap ("a.webp") and
    full-name append ("a.png.webp", used for colliding stems).
    """

    def as_key(path: Path) -> str:
        # Keys are ROOT-relative POSIX strings; relative inputs are used as-is.
        return path.relative_to(ROOT).as_posix() if path.is_absolute() else path.as_posix()

    if as_key(source) in referenced_paths:
        return False
    possible_webps = {
        as_key(source.with_suffix(".webp")),
        as_key(source.with_name(f"{source.name}.webp")),
    }
    return bool(possible_webps & accepted_webp_paths)
|
|
|
|
|
|
def update_references(accepted_sources: list[Path], duplicate_stems: set[str]) -> int:
    """Rewrite text files to point at accepted webp targets.

    Returns the number of files actually modified.
    """
    replacements = build_replacements(accepted_sources, duplicate_stems)
    if not replacements:
        return 0
    # Longest keys first so overlapping spellings resolve to the full match.
    ordered_keys = sorted((re.escape(key) for key in replacements), key=len, reverse=True)
    pattern = re.compile("|".join(ordered_keys))
    touched = 0
    for text_path in text_files():
        before = text_path.read_text(encoding="utf-8")
        after = pattern.sub(lambda m: replacements[m.group(0)], before)
        if after != before:
            text_path.write_text(after, encoding="utf-8")
            touched += 1
    return touched
|
|
|
|
|
|
def prune_replaced_originals(images: list[Path], duplicate_stems: set[str]) -> int:
    """Delete originals that are unreferenced and already have an accepted webp.

    Returns the number of files deleted.

    Fix: the target path is computed once per source — the original called
    build_webp_path twice per image (once in the comprehension value, once
    in its filter).
    """
    references = referenced_asset_paths()
    accepted_webp_paths: set[str] = set()
    for source in images:
        target = build_webp_path(source, duplicate_stems)
        if target.exists():
            accepted_webp_paths.add(target.relative_to(ROOT).as_posix())
    pruned = 0
    for source in images:
        if should_prune_original(source, references, accepted_webp_paths):
            source.unlink()
            pruned += 1
    return pruned
|
|
|
|
|
|
def check_rewritten_webps() -> int:
    """Verify every webp reference in the text files resolves to an existing file.

    Prints each missing reference and returns 1 on failure, 0 when all
    references resolve.

    Fix: the "../images" alternative previously had unescaped dots, so ".."
    matched any two characters and could swallow path fragments that should
    have matched the plain "images/" branch; the dots are now escaped.
    """
    missing: list[str] = []
    pattern = re.compile(r"(assets/images|\.\./images|images)/[^\"')\s>]+\.webp", re.IGNORECASE)
    for path in text_files():
        content = path.read_text(encoding="utf-8")
        for match in pattern.finditer(content):
            ref = match.group(0)
            # A reference may be relative to ROOT or to the referencing file.
            candidates = [
                ROOT / ref,
                path.parent / ref,
            ]
            if not any(candidate.exists() and candidate.is_file() for candidate in candidates):
                missing.append(f"{path.relative_to(ROOT)}: {ref}")
    if missing:
        print("\n".join(missing))
        return 1
    print("No missing rewritten webp references.")
    return 0
|
|
|
|
|
|
def ensure_backup_copy() -> None:
    """Replace BACKUP_DIR with a fresh copy of everything under ASSETS_DIR."""
    if BACKUP_DIR.exists():
        shutil.rmtree(BACKUP_DIR)
    BACKUP_DIR.mkdir(parents=True, exist_ok=True)
    for entry in ASSETS_DIR.iterdir():
        target = BACKUP_DIR / entry.name
        # Directories are copied recursively; files keep their metadata.
        copier = shutil.copytree if entry.is_dir() else shutil.copy2
        copier(entry, target)
|
|
|
|
|
|
def optimize_assets() -> int:
    """Full optimization pass over the site's raster assets.

    Steps: back up assets/, shrink each original in place, attempt a webp
    conversion per image, rewrite text references to accepted conversions,
    and print a summary. Returns a process exit code (always 0 on success).
    """
    # ImageMagick 7's `magick` binary is required for every conversion step.
    if shutil.which("magick") is None:
        raise SystemExit("ImageMagick `magick` 未安装,无法继续处理图片。")

    # Refresh the safety copy before any in-place modification.
    ensure_backup_copy()

    images = source_images()
    duplicate_stems = colliding_stems(images)

    total_before = 0
    total_after_original = 0
    accepted_sources: list[Path] = []
    accepted_count = 0
    skipped_count = 0

    for source in images:
        info = image_info(source)
        before_size, after_original_size = choose_smaller_original(source, info)
        total_before += before_size
        total_after_original += after_original_size
        # Re-probe the image: choose_smaller_original may have replaced the
        # file with a re-encoded (possibly resized) version.
        accepted, _ = choose_webp(source, image_info(source), build_webp_path(source, duplicate_stems))
        if accepted:
            accepted_sources.append(source)
            accepted_count += 1
        else:
            skipped_count += 1

    changed_files = update_references(accepted_sources, duplicate_stems)
    print(f"Backup refreshed at: {BACKUP_DIR.relative_to(ROOT)}")
    print(f"Raster size before optimization: {total_before / 1024 / 1024:.2f} MB")
    print(f"Raster size after original optimization: {total_after_original / 1024 / 1024:.2f} MB")
    print(f"Accepted webp conversions: {accepted_count}")
    print(f"Rejected webp conversions: {skipped_count}")
    print(f"Updated text files: {changed_files}")
    return 0
|
|
|
|
|
|
def main() -> int:
    """CLI entry point.

    --check validates rewritten webp references, --prune-replaced deletes
    superseded originals; with no flags the full optimization pass runs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--check", action="store_true", help="validate rewritten webp references")
    parser.add_argument(
        "--prune-replaced",
        action="store_true",
        help="delete originals already replaced by accepted webp files",
    )
    args = parser.parse_args()
    if args.check:
        return check_rewritten_webps()
    if not args.prune_replaced:
        return optimize_assets()
    images = source_images()
    duplicate_stems = colliding_stems(images)
    pruned = prune_replaced_originals(images, duplicate_stems)
    print(f"Pruned replaced originals: {pruned}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|