kaiser-natron/scripts/emit_de_locale.py

"""Emit src/i18n/products/de.js + property-code helpers from scraped JSON.

Runs after scrape_kaiser_natron.py. Property labels (German UI text)
get converted into stable slug-style codes that messages.js translates
back to display strings — so DE and EN stay parallel without inline
translation in the data fixture.
"""

from __future__ import annotations

import json
import re
import unicodedata
from pathlib import Path

# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Scraped product JSON that emit() reads as its only input.
SCRAPED = ROOT / "scripts" / "output" / "products-content.json"
# Generated German locale module; overwritten by emit() on every run.
OUT_DE = ROOT / "src" / "i18n" / "products" / "de.js"
# Markdown helper with snippets to merge by hand; also overwritten by emit().
OUT_HELP = ROOT / "scripts" / "output" / "manual-merge.md"

# Maps the German property phrase from the source site to a stable
# code we'll use in src/api/products.js + messages.js. New entries
# get added here whenever the source surfaces a new label; labels that
# are missing from this table are listed by emit() under "Unknown labels"
# in the manual-merge helper. Every code used as a value here also needs
# an English label in EN_LABELS.
PROP_CODES: dict[str, str] = {
    "Enthärtet":                                       "enthaertet",
    "Frei von Mikroplastik":                           "frei-von-mikroplastik",
    "Frei von tierischen Substanzen":                  "frei-von-tierischen-substanzen",
    "Für Böden und Oberflächen":                       "fuer-boeden-und-oberflaechen",
    "Für Küche und Haushalt":                          "fuer-kueche-und-haushalt",
    "Glutenfrei":                                      "glutenfrei",
    "Hautverträglich – dermatologisch bestätigt":      "hautvertraeglich-dermatologisch-bestaetigt",
    "Hygienische Sauberkeit":                          "hygienische-sauberkeit",
    "Laktosefrei":                                     "laktosefrei",
    "Mild-alkalisch":                                  "mild-alkalisch",
    "Mit Pampelmusenduft":                             "mit-pampelmusenduft",
    "Mit dem Duft der Latschenkiefer":                 "mit-dem-duft-der-latschenkiefer",
    "Mit entspannendem Apfelsinenduft":                "mit-entspannendem-apfelsinenduft",
    "Mit natürlicher Seife":                           "mit-natuerlicher-seife",
    "Mit pflegendem Lanolin":                          "mit-pflegendem-lanolin",
    "Nachhaltig durch Tenside nachwachsenden Ursprungs": "nachhaltig-durch-tenside-nachwachsenden-ursprungs",
    "Neutralisiert Säuren":                            "neutralisiert-saeuren",
    "Ohne Konservierungsstoffe":                       "ohne-konservierungsstoffe",
    "Ohne Mineralöl":                                  "ohne-mineraloel",
    "Vegan":                                           "vegan",
    "Wohltuend und erfrischend":                       "wohltuend-und-erfrischend",
    # Carry-overs from messages.js v1 (not yet seen in source).
    "In Österreich abgefüllt":                         "made-in-austria",
    "Bio":                                             "bio",
}

# English display labels, keyed by the property codes that PROP_CODES
# maps to. Translations land here (a single source of truth so we don't
# drift between locales); emit() copies them into the EN block of the
# manual-merge helper. Keep the key set in step with PROP_CODES' values.
EN_LABELS: dict[str, str] = {
    "enthaertet":                                "Softens water",
    "frei-von-mikroplastik":                     "Microplastic-free",
    "frei-von-tierischen-substanzen":            "No animal substances",
    "fuer-boeden-und-oberflaechen":              "For floors & surfaces",
    "fuer-kueche-und-haushalt":                  "For kitchen & home",
    "glutenfrei":                                "Gluten-free",
    "hautvertraeglich-dermatologisch-bestaetigt": "Skin-friendly — dermatologically tested",
    "hygienische-sauberkeit":                    "Hygienic clean",
    "laktosefrei":                               "Lactose-free",
    "mild-alkalisch":                            "Mildly alkaline",
    "mit-pampelmusenduft":                       "Pomelo scent",
    "mit-dem-duft-der-latschenkiefer":           "Mountain pine scent",
    "mit-entspannendem-apfelsinenduft":          "Sweet orange scent",
    "mit-natuerlicher-seife":                    "With natural soap",
    "mit-pflegendem-lanolin":                    "With nourishing lanolin",
    "nachhaltig-durch-tenside-nachwachsenden-ursprungs": "Sustainable plant-based surfactants",
    "neutralisiert-saeuren":                     "Neutralises acids",
    "ohne-konservierungsstoffe":                 "Preservative-free",
    "ohne-mineraloel":                           "Mineral-oil-free",
    "vegan":                                     "Vegan",
    "wohltuend-und-erfrischend":                 "Soothing & refreshing",
    "made-in-austria":                           "Bottled in Austria",
    "bio":                                       "Organic",
}


def js_str(s: str | None) -> str:
    """Render *s* as a single-quoted, single-line JavaScript string literal.

    ``None`` becomes the bare JS literal ``null``. Backslashes and single
    quotes are backslash-escaped. Each line break (CRLF, lone CR or lone
    LF) is flattened to one space, because a raw line terminator inside a
    quoted JS string is a syntax error. U+2028 and U+2029 are written as
    JS unicode escapes so that parsers predating ES2019 accept the output.
    Backticks, ``$`` and double quotes are left alone: they have no special
    meaning inside a single-quoted literal.
    """
    if s is None:
        return "null"
    # Backslashes first, so the escapes added below are not doubled.
    s = s.replace("\\", "\\\\").replace("'", "\\'")
    # CRLF before the single characters so a Windows line end is one space.
    s = s.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
    s = s.replace("\u2028", "\\u2028").replace("\u2029", "\\u2029")
    return f"'{s}'"


def normalise(text: str | None) -> str | None:
    """Tidy a scraped text fragment, mapping empty results to ``None``.

    Falsy input (``None`` or an empty string) yields ``None``. Otherwise
    the spellings "1/2 Tl" and "1/2 TL" are rewritten to "½ TL", every run
    of whitespace is squeezed to a single space and the ends are trimmed.
    If nothing is left after that, ``None`` is returned as well.
    """
    if not text:
        return None

    # Unify the half-teaspoon notation before touching whitespace.
    for spelling in ("1/2 Tl", "1/2 TL"):
        text = text.replace(spelling, "½ TL")

    squeezed = re.sub(r"\s+", " ", text)
    trimmed = squeezed.strip()
    return trimmed if trimmed else None


def _collect_content(data: dict[str, dict]) -> tuple[dict[str, dict], list[str]]:
    """Split scraped products into normalised copy and slugs without any copy."""
    populated: dict[str, dict] = {}
    skipped: list[str] = []

    for slug, product in data.items():
        tagline = normalise(product.get("tagline"))
        lead = normalise(product.get("lead"))
        long_desc = normalise(product.get("descriptionLong"))
        apps = []
        # `or []` also covers an explicit JSON null.
        for app in product.get("applications") or []:
            title = normalise(app.get("title"))
            body = normalise(app.get("body"))
            if title or body:
                apps.append({"title": title, "body": body})

        if not (tagline or lead or long_desc or apps):
            skipped.append(slug)
            continue
        populated[slug] = {
            "tagline": tagline,
            "lead": lead,
            "descriptionLong": long_desc,
            "applications": apps,
        }
    return populated, skipped


def _render_de_js(populated: dict[str, dict]) -> str:
    """Render the collected copy as the text of the de.js ES module."""
    lines = [
        "// German product long-form copy. Source of truth for all product pages.",
        "// Keyed by product id (matches src/api/products.js). UI chrome strings",
        "// live in src/i18n/messages.js — only product-specific narrative belongs here.",
        "//",
        "// Generated from scripts/output/products-content.json by",
        "// scripts/emit_de_locale.py — re-run after scraping to refresh.",
        "",
        "export default {",
    ]
    for slug, content in populated.items():
        lines.append(f"  {js_str(slug)}: {{")
        # Empty fields are omitted rather than emitted as null.
        for field in ("tagline", "lead", "descriptionLong"):
            if content[field]:
                lines.append(f"    {field}: {js_str(content[field])},")
        if content["applications"]:
            lines.append("    applications: [")
            for app in content["applications"]:
                lines.append("      {")
                if app["title"]:
                    lines.append(f"        title: {js_str(app['title'])},")
                if app["body"]:
                    lines.append(f"        body: {js_str(app['body'])},")
                lines.append("      },")
            lines.append("    ],")
        lines.append("  },")
    lines.append("}")
    lines.append("")
    return "\n".join(lines)


def _render_merge_helper(data: dict[str, dict]) -> tuple[str, set[str]]:
    """Render the manual-merge Markdown and collect labels missing from PROP_CODES."""
    unknown_props: set[str] = set()
    helper_lines: list[str] = ["# Manual merge — copy these into the right files\n"]

    # 1. Property arrays per slug
    # The source page sometimes serves NFD-normalised umlauts and the
    # PROP_CODES dict is hand-typed in NFC — normalise both before lookup
    # so Python's `==` doesn't trip over decomposed combining marks.
    nfc_codes = {unicodedata.normalize("NFC", k): v for k, v in PROP_CODES.items()}

    helper_lines.append("## src/api/products.js — properties\n")
    for slug, product in data.items():
        codes = []
        # `or []` tolerates "properties": null, matching the applications loop.
        for label in product.get("properties") or []:
            code = nfc_codes.get(unicodedata.normalize("NFC", label))
            if code is None:
                unknown_props.add(label)
                continue
            codes.append(code)
        if codes:
            helper_lines.append(f"- `{slug}` → `properties: {json.dumps(codes)}`")
    helper_lines.append("")

    # 2. messages.js DE + EN keys. Values go through js_str so a label that
    # contains an apostrophe or backslash still yields a valid JS literal.
    helper_lines.append("## src/i18n/messages.js — DE block\n```js")
    for label, code in PROP_CODES.items():
        helper_lines.append(f"  'product.prop.{code}': {js_str(label)},")
    helper_lines.append("```\n")
    helper_lines.append("## src/i18n/messages.js — EN block\n```js")
    for code, en in EN_LABELS.items():
        helper_lines.append(f"  'product.prop.{code}': {js_str(en)},")
    helper_lines.append("```\n")

    if unknown_props:
        helper_lines.append("## Unknown labels (add to PROP_CODES in emit_de_locale.py)\n")
        for label in sorted(unknown_props):
            helper_lines.append(f"- {label!r}")

    return "\n".join(helper_lines), unknown_props


def emit() -> None:
    """Generate de.js and the manual-merge helper from the scraped JSON.

    Reads SCRAPED, normalises each product's tagline, lead, long
    description and applications, and writes the products that have any
    copy to OUT_DE as an ES module. Products without copy are listed on
    stdout. It then writes OUT_HELP: per-product property-code arrays, the
    DE and EN ``product.prop.*`` message entries, and any source labels
    that have no entry in PROP_CODES. Both output files are overwritten and
    their parent directories are created when missing.

    Raises:
        OSError: if SCRAPED cannot be read or an output cannot be written.
        json.JSONDecodeError: if SCRAPED does not contain valid JSON.
    """
    data = json.loads(SCRAPED.read_text(encoding="utf-8"))

    populated, skipped = _collect_content(data)
    OUT_DE.parent.mkdir(parents=True, exist_ok=True)
    OUT_DE.write_text(_render_de_js(populated), encoding="utf-8")
    print(f"wrote {OUT_DE.relative_to(ROOT)} ({len(populated)} entries)")
    if skipped:
        print(f"skipped (no source content): {len(skipped)}")
        for slug in skipped:
            print(f"  - {slug}")

    # Manual-merge helper — code arrays per product + i18n keys to splice into
    # src/api/products.js and src/i18n/messages.js.
    helper_text, unknown_props = _render_merge_helper(data)
    OUT_HELP.parent.mkdir(parents=True, exist_ok=True)
    OUT_HELP.write_text(helper_text, encoding="utf-8")
    print(f"wrote {OUT_HELP.relative_to(ROOT)}")
    if unknown_props:
        # Unknown labels are dropped from the property arrays, so say so here
        # instead of leaving the only trace inside the helper file.
        print(f"unknown property labels: {len(unknown_props)} (listed in {OUT_HELP.name})")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    emit()