Files
kaiser-natron/scripts/emit_de_locale.py

207 lines
9.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Emit src/i18n/products/de.js + property-code helpers from scraped JSON.
Runs after scrape_kaiser_natron.py. Property labels (German UI text)
get converted into stable slug-style codes that messages.js translates
back to display strings — so DE and EN stay parallel without inline
translation in the data fixture.
"""
from __future__ import annotations
import json
import re
import unicodedata
from pathlib import Path
# Repository root: this script lives one directory below it (in scripts/).
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent

# Input: scraped product content (read by emit()).
SCRAPED = ROOT.joinpath("scripts", "output", "products-content.json")

# Outputs: the generated German locale module and the manual-merge notes.
OUT_DE = ROOT.joinpath("src", "i18n", "products", "de.js")
OUT_HELP = ROOT.joinpath("scripts", "output", "manual-merge.md")
# Maps each German property phrase from the source site to the stable
# code we use in src/api/products.js + messages.js. Add a new entry
# here whenever the source surfaces a new label; emit() lists labels
# it could not map under "Unknown labels" in the manual-merge helper.
#
# Keys are matched after NFC normalisation on both sides, so they can be
# typed here with ordinary precomposed umlauts. Entry order matters: the
# DE block of the manual-merge helper is written in this order.
PROP_CODES: dict[str, str] = {
    "Enthärtet": "enthaertet",
    "Frei von Mikroplastik": "frei-von-mikroplastik",
    "Frei von tierischen Substanzen": "frei-von-tierischen-substanzen",
    "Für Böden und Oberflächen": "fuer-boeden-und-oberflaechen",
    "Für Küche und Haushalt": "fuer-kueche-und-haushalt",
    "Glutenfrei": "glutenfrei",
    "Hautverträglich dermatologisch bestätigt": "hautvertraeglich-dermatologisch-bestaetigt",
    "Hygienische Sauberkeit": "hygienische-sauberkeit",
    "Laktosefrei": "laktosefrei",
    "Mild-alkalisch": "mild-alkalisch",
    "Mit Pampelmusenduft": "mit-pampelmusenduft",
    "Mit dem Duft der Latschenkiefer": "mit-dem-duft-der-latschenkiefer",
    "Mit entspannendem Apfelsinenduft": "mit-entspannendem-apfelsinenduft",
    "Mit natürlicher Seife": "mit-natuerlicher-seife",
    "Mit pflegendem Lanolin": "mit-pflegendem-lanolin",
    "Nachhaltig durch Tenside nachwachsenden Ursprungs": "nachhaltig-durch-tenside-nachwachsenden-ursprungs",
    "Neutralisiert Säuren": "neutralisiert-saeuren",
    "Ohne Konservierungsstoffe": "ohne-konservierungsstoffe",
    "Ohne Mineralöl": "ohne-mineraloel",
    "Vegan": "vegan",
    "Wohltuend und erfrischend": "wohltuend-und-erfrischend",
    # Carry-overs from messages.js v1 (not yet seen in source).
    "In Österreich abgefüllt": "made-in-austria",
    "Bio": "bio",
}
# English display labels, keyed by the property codes that PROP_CODES
# produces. Translations live here as the single source of truth so the
# locales don't drift apart. emit() writes these entries, in this order,
# into the EN block of the manual-merge helper.
EN_LABELS: dict[str, str] = {
    "enthaertet": "Softens water",
    "frei-von-mikroplastik": "Microplastic-free",
    "frei-von-tierischen-substanzen": "No animal substances",
    "fuer-boeden-und-oberflaechen": "For floors & surfaces",
    "fuer-kueche-und-haushalt": "For kitchen & home",
    "glutenfrei": "Gluten-free",
    "hautvertraeglich-dermatologisch-bestaetigt": "Skin-friendly — dermatologically tested",
    "hygienische-sauberkeit": "Hygienic clean",
    "laktosefrei": "Lactose-free",
    "mild-alkalisch": "Mildly alkaline",
    "mit-pampelmusenduft": "Pomelo scent",
    "mit-dem-duft-der-latschenkiefer": "Mountain pine scent",
    "mit-entspannendem-apfelsinenduft": "Sweet orange scent",
    "mit-natuerlicher-seife": "With natural soap",
    "mit-pflegendem-lanolin": "With nourishing lanolin",
    "nachhaltig-durch-tenside-nachwachsenden-ursprungs": "Sustainable plant-based surfactants",
    "neutralisiert-saeuren": "Neutralises acids",
    "ohne-konservierungsstoffe": "Preservative-free",
    "ohne-mineraloel": "Mineral-oil-free",
    "vegan": "Vegan",
    "wohltuend-und-erfrischend": "Soothing & refreshing",
    "made-in-austria": "Bottled in Austria",
    "bio": "Organic",
}
def js_str(s: str | None) -> str:
    """Render *s* as a single-quoted, single-line JavaScript string literal.

    ``None`` is rendered as the bare JS token ``null``. Backslashes and
    single quotes are backslash-escaped. Backticks and ``$`` are left
    untouched: inside a single-quoted literal (as opposed to a template
    string) they have no special meaning.

    Every line break -- LF, CR, CRLF, and the Unicode separators U+2028 /
    U+2029 -- is replaced by one space, so the emitted literal can never
    span lines and break the generated module.
    """
    if s is None:
        return "null"
    # Escape the backslash first so the escapes added next are not doubled.
    escaped = s.replace("\\", "\\\\").replace("'", "\\'")
    # Treat CRLF as a single break, then flatten every remaining terminator.
    escaped = escaped.replace("\r\n", "\n")
    for line_break in ("\r", "\n", "\u2028", "\u2029"):
        escaped = escaped.replace(line_break, " ")
    return f"'{escaped}'"
def normalise(text: str | None) -> str | None:
    """Tidy a scraped text fragment, or return ``None`` if nothing is left.

    Falsy input yields ``None``. Otherwise the "1/2 Tl" / "1/2 TL"
    spellings are rewritten to "½ TL", every run of whitespace is squeezed
    to a single space, and the ends are trimmed. A result that ends up
    empty is reported as ``None`` as well.
    """
    if not text:
        return None
    for half_teaspoon in ("1/2 Tl", "1/2 TL"):
        text = text.replace(half_teaspoon, "½ TL")
    squeezed = re.sub(r"\s+", " ", text)
    trimmed = squeezed.strip()
    if trimmed:
        return trimmed
    return None
def _collect_content(data: dict) -> tuple[dict[str, dict], list[str]]:
    """Split scraped products into those with narrative copy and those without.

    Returns ``(populated, skipped)``: ``populated`` maps each slug to its
    normalised tagline / lead / descriptionLong / applications, and
    ``skipped`` lists the slugs for which all four were empty.
    """
    populated: dict[str, dict] = {}
    skipped: list[str] = []
    for slug, product in data.items():
        tagline = normalise(product.get("tagline"))
        lead = normalise(product.get("lead"))
        long_desc = normalise(product.get("descriptionLong"))
        apps = []
        # `or []` also covers an explicit null in the scraped JSON.
        for app in product.get("applications") or []:
            title = normalise(app.get("title"))
            body = normalise(app.get("body"))
            if title or body:
                apps.append({"title": title, "body": body})
        if not any([tagline, lead, long_desc, apps]):
            skipped.append(slug)
            continue
        populated[slug] = {
            "tagline": tagline,
            "lead": lead,
            "descriptionLong": long_desc,
            "applications": apps,
        }
    return populated, skipped


def _render_de_module(populated: dict[str, dict]) -> str:
    """Render the populated product copy as the text of the de.js module."""
    lines = [
        "// German product long-form copy. Source of truth for all product pages.",
        "// Keyed by product id (matches src/api/products.js). UI chrome strings",
        "// live in src/i18n/messages.js — only product-specific narrative belongs here.",
        "//",
        "// Generated from scripts/output/products-content.json by",
        "// scripts/emit_de_locale.py — re-run after scraping to refresh.",
        "",
        "export default {",
    ]
    for slug, content in populated.items():
        lines.append(f" {js_str(slug)}: {{")
        # Empty fields are omitted from the module rather than emitted as null.
        for field in ("tagline", "lead", "descriptionLong"):
            if content[field]:
                lines.append(f" {field}: {js_str(content[field])},")
        if content["applications"]:
            lines.append(" applications: [")
            for app in content["applications"]:
                lines.append(" {")
                if app["title"]:
                    lines.append(f" title: {js_str(app['title'])},")
                if app["body"]:
                    lines.append(f" body: {js_str(app['body'])},")
                lines.append(" },")
            lines.append(" ],")
        lines.append(" },")
    lines.append("}")
    lines.append("")
    return "\n".join(lines)


def _render_merge_helper(data: dict) -> str:
    """Render the manual-merge Markdown: property codes per slug + i18n keys."""
    helper_lines: list[str] = ["# Manual merge — copy these into the right files\n"]
    unknown_props: set[str] = set()

    # 1. Property arrays per slug
    # The source page sometimes serves NFD-normalised umlauts and the
    # PROP_CODES dict is hand-typed in NFC — normalise both before lookup
    # so Python's `==` doesn't trip over decomposed combining marks.
    nfc_codes = {unicodedata.normalize("NFC", k): v for k, v in PROP_CODES.items()}
    helper_lines.append("## src/api/products.js — properties\n")
    for slug, product in data.items():
        codes = []
        # `or []` (not a .get default) so an explicit JSON null is tolerated,
        # matching how "applications" is read.
        for label in product.get("properties") or []:
            code = nfc_codes.get(unicodedata.normalize("NFC", label))
            if code is None:
                unknown_props.add(label)
                continue
            codes.append(code)
        if codes:
            helper_lines.append(f"- `{slug}` → `properties: {json.dumps(codes)}`")
    helper_lines.append("")

    # 2. messages.js DE + EN keys. Labels go through js_str so a quote or
    # backslash in a label cannot produce a broken JS snippet.
    helper_lines.append("## src/i18n/messages.js — DE block\n```js")
    for label, code in PROP_CODES.items():
        helper_lines.append(f" 'product.prop.{code}': {js_str(label)},")
    helper_lines.append("```\n")
    helper_lines.append("## src/i18n/messages.js — EN block\n```js")
    for code, en in EN_LABELS.items():
        helper_lines.append(f" 'product.prop.{code}': {js_str(en)},")
    helper_lines.append("```\n")

    # 3. Labels seen in the scrape that PROP_CODES does not know yet.
    if unknown_props:
        helper_lines.append("## Unknown labels (add to PROP_CODES in emit_de_locale.py)\n")
        for unknown in sorted(unknown_props):
            helper_lines.append(f"- {unknown!r}")
    return "\n".join(helper_lines)


def emit() -> None:
    """Generate de.js and the manual-merge helper from the scraped JSON.

    Reads SCRAPED, writes the German locale module to OUT_DE and the
    Markdown merge notes to OUT_HELP (creating their parent directories
    if needed), and prints a short summary including any product slugs
    that had no narrative content.

    Raises whatever reading or parsing SCRAPED raises (for example
    ``FileNotFoundError`` or ``json.JSONDecodeError``).
    """
    data = json.loads(SCRAPED.read_text(encoding="utf-8"))
    populated, skipped = _collect_content(data)

    # A fresh checkout may not have the locale directory yet.
    OUT_DE.parent.mkdir(parents=True, exist_ok=True)
    OUT_DE.write_text(_render_de_module(populated), encoding="utf-8")
    print(f"wrote {OUT_DE.relative_to(ROOT)} ({len(populated)} entries)")
    if skipped:
        print(f"skipped (no source content): {len(skipped)}")
        for s in skipped:
            print(f" - {s}")

    # Manual-merge helper — code arrays per product + i18n keys to splice into
    # src/api/products.js and src/i18n/messages.js.
    OUT_HELP.parent.mkdir(parents=True, exist_ok=True)
    OUT_HELP.write_text(_render_merge_helper(data), encoding="utf-8")
    print(f"wrote {OUT_HELP.relative_to(ROOT)}")
# Script entry point: generate the outputs only when run directly, so
# importing this module has no side effects beyond its definitions.
if __name__ == "__main__":
    emit()