Files
farolero/tools/generate_word_banks.ps1
2026-05-04 22:23:11 +02:00

217 lines
6.6 KiB
PowerShell
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Script parameters.
#   OutputDir - directory that receives the generated palabras_<lang>.json files.
#   BatchSize - number of Spanish words sent per translation request. It must be
#               at least 1: the batching loop advances its offset by this amount,
#               so zero or a negative value would make that loop run forever.
param(
    [string]$OutputDir = 'assets/words',
    [ValidateRange(1, 2147483647)]
    [int]$BatchSize = 100
)

# Promote non-terminating cmdlet errors to terminating ones so the script
# aborts on the first failure.
$ErrorActionPreference = 'Stop'
# Maps each app locale key (used in the app_<key>.arb and palabras_<key>.json
# file names) to the language code sent to the translation endpoint as "tl".
# Insertion order is preserved because the table is an ordered dictionary.
$langMap = [ordered]@{}

# Locales whose key and translation code are identical.
'ar', 'ca', 'de', 'en', 'es', 'eu', 'fr', 'hi', 'it',
'ja', 'ko', 'nl', 'pl', 'pt', 'ru', 'tr' | ForEach-Object { $langMap[$_] = $_ }

# The two Chinese locales use a region-qualified translation code.
$langMap['zh'] = 'zh-CN'
$langMap['zh_TW'] = 'zh-TW'
# Maps each category id used in the word-bank JSON to the ARB message key
# whose value is stored as that category's "pista" entry.
$categoryKeyMap = [ordered]@{
    'animales'    = 'categoryAnimals'
    'comida'      = 'categoryFood'
    'paises'      = 'categoryCountries'
    'deportes'    = 'categorySports'
    'profesiones' = 'categoryProfessions'
    'objetos'     = 'categoryObjects'
    'lugares'     = 'categoryPlaces'
    'peliculas'   = 'categoryMovies'
    'musica'      = 'categoryMusic'
    'tecnologia'  = 'categoryTechnology'
}
# Maps each category id to the label that is prefixed to every word
# ("<label>: <word>") before the word is sent for translation.
$contextMap = [ordered]@{
    'animales'    = 'animal'
    'comida'      = 'food'
    'paises'      = 'country'
    'deportes'    = 'sport'
    'profesiones' = 'profession'
    'objetos'     = 'object'
    'lugares'     = 'place'
    'peliculas'   = 'movie'
    'musica'      = 'music'
    'tecnologia'  = 'technology'
}
# Decoder used for reading: emits no BOM and throws on invalid byte sequences.
$utf8Strict = New-Object -TypeName System.Text.UTF8Encoding -ArgumentList $false, $true
# Encoder used for writing: emits no BOM.
$utf8NoBom = New-Object -TypeName System.Text.UTF8Encoding -ArgumentList $false
# Reads a JSON file as strict UTF-8 and returns the parsed object.
#   path - path of the file to read; it is taken literally, so characters such
#          as [ ] * ? in a file name are not interpreted as wildcards.
# The bytes are decoded with the script-level $utf8Strict encoding, so invalid
# UTF-8 raises an exception. A leading BOM character is removed before parsing.
function Read-Utf8Json([string]$path) {
    # ProviderPath is the plain file-system path expected by System.IO.File.
    $resolved = (Resolve-Path -LiteralPath $path).ProviderPath
    $bytes = [System.IO.File]::ReadAllBytes($resolved)
    # GetString keeps a BOM as U+FEFF at the start of the text; strip it.
    $text = $utf8Strict.GetString($bytes).TrimStart([char]0xFEFF)
    return $text | ConvertFrom-Json
}
# Serializes an object to JSON and writes it as UTF-8 without BOM, followed by
# a single trailing newline.
#   obj  - object to serialize (ConvertTo-Json with depth 10).
#   path - destination file; may be absolute or relative to the current
#          PowerShell location. Missing parent directories are created.
function Write-Utf8Json([object]$obj, [string]$path) {
    # Resolve against the PowerShell location (which .NET's current directory
    # does not follow) without requiring the file to exist. Unlike joining with
    # Get-Location, this leaves an absolute $path intact.
    $full = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($path)
    $dir = Split-Path -Parent $full
    if ($dir -and -not (Test-Path -LiteralPath $dir)) {
        New-Item -ItemType Directory -Force -Path $dir | Out-Null
    }
    $json = $obj | ConvertTo-Json -Depth 10
    [System.IO.File]::WriteAllText($full, $json + "`n", $utf8NoBom)
}
# Removes a leading "<label>:" prefix from a string and trims the result.
#   value - text that may start with a label of 1 to 30 characters followed by
#           a colon.
# Returns the text after the first such prefix, or the trimmed input when no
# prefix is present. Both the ASCII colon and the full-width colon (U+FF1A)
# are accepted as the separator.
function Strip-Context([string]$value) {
    # \uFF1A is written as an escape so the script contains no look-alike characters.
    $prefixPattern = '^[^:\uFF1A]{1,30}\s*[:\uFF1A]\s*'
    # -replace returns the input unchanged when the pattern does not match.
    return ($value.Trim() -replace $prefixPattern, '').Trim()
}
# Private helper: sends one Spanish text to the translation endpoint and
# returns the translated text (the first element of every returned segment,
# concatenated). Request errors propagate to the caller.
function Invoke-TranslateRequest([string]$text, [string]$target) {
    $query = [uri]::EscapeDataString($text)
    $url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=es&tl=$target&dt=t&q=$query"
    $response = Invoke-RestMethod -Uri $url -TimeoutSec 45
    return (($response[0] | ForEach-Object { $_[0] }) -join '')
}

# Translates a list of Spanish terms into the target language.
#   terms  - texts to translate; an empty list yields no output.
#   target - language code passed to the endpoint as "tl".
# Returns one translated string per term, in input order, each passed through
# Strip-Context.
#
# Strategy: all terms are first sent in one request, each line tagged "[i]".
# The result is used only if every tag comes back exactly once with non-blank
# text. Otherwise (or if that request fails) each term is translated on its
# own, with up to 4 attempts per term; the 4th failure is rethrown.
function Translate-Batch([string[]]$terms, [string]$target) {
    if ($terms.Count -eq 0) { return @() }

    $numbered = New-Object System.Collections.Generic.List[string]
    for ($i = 0; $i -lt $terms.Count; $i++) {
        $numbered.Add("[$i] $($terms[$i])")
    }

    try {
        $translated = Invoke-TranslateRequest ($numbered -join "`n") $target
        # Named $tagged rather than $matches, which is an automatic variable.
        $tagged = [regex]::Matches(
            $translated,
            '(?s)\[(\d+)\]\s*(.*?)(?=\s*\[\d+\]\s*|$)'
        )
        if ($tagged.Count -eq $terms.Count) {
            $out = New-Object string[] $terms.Count
            foreach ($entry in $tagged) {
                $index = [int]$entry.Groups[1].Value
                if ($index -ge 0 -and $index -lt $terms.Count) {
                    $out[$index] = Strip-Context $entry.Groups[2].Value
                }
            }
            # A duplicated or out-of-range tag leaves a slot unset. Check every
            # slot explicitly; counting pipeline output is unreliable when the
            # only offending slot is null.
            $complete = $true
            foreach ($item in $out) {
                if ([string]::IsNullOrWhiteSpace($item)) {
                    $complete = $false
                    break
                }
            }
            if ($complete) { return $out }
        }
    } catch {
        # Best effort: a failed batch request falls through to per-term mode.
        Start-Sleep -Milliseconds 250
    }

    $out = New-Object System.Collections.Generic.List[string]
    foreach ($term in $terms) {
        $translatedOne = $null
        for ($attempt = 1; $attempt -le 4; $attempt++) {
            try {
                $translatedOne = Invoke-TranslateRequest $term $target
                break
            } catch {
                if ($attempt -eq 4) { throw }
                # Back off a little longer after each failed attempt.
                Start-Sleep -Milliseconds (250 * $attempt)
            }
        }
        $out.Add((Strip-Context $translatedOne))
        # Short pause between consecutive single-term requests.
        Start-Sleep -Milliseconds 35
    }
    return $out.ToArray()
}
# Source word banks that are copied as-is (Spanish is also the translation source).
$sourceEs = Read-Utf8Json 'assets/palabras.json'
$sourceEn = Read-Utf8Json 'assets/palabras_en.json'
$sourceFr = Read-Utf8Json 'assets/palabras_fr.json'

# Parsed ARB localization file for every locale key in $langMap.
$arbByLang = @{}
$langMap.Keys | ForEach-Object {
    $arbPath = "lib/l10n/app_{0}.arb" -f $_
    $arbByLang[$_] = Read-Utf8Json $arbPath
}
# Builds the in-memory word bank for one locale.
#   lang - locale key; looked up in $arbByLang for the category labels.
# Returns an ordered dictionary with "version" (2), "idioma" and "categorias".
# Each category holds "pista" (the ARB label for that category) and "palabras":
# the source words for es/en/fr, or an empty list for any other locale.
function New-LanguageBank([string]$lang) {
    # Locales that have a ready-made source bank.
    $sourceByLang = @{
        es = $sourceEs
        en = $sourceEn
        fr = $sourceFr
    }

    $categories = [ordered]@{}
    foreach ($category in $categoryKeyMap.Keys) {
        $labelKey = $categoryKeyMap[$category]

        if ($sourceByLang.ContainsKey($lang)) {
            $rawWords = @($sourceByLang[$lang].categorias.$category)
        } else {
            $rawWords = @()
        }

        $entry = [ordered]@{}
        $entry['pista'] = [string]$arbByLang[$lang].$labelKey
        $entry['palabras'] = @($rawWords | ForEach-Object { [string]$_ })
        $categories[$category] = $entry
    }

    $bank = [ordered]@{}
    $bank['version'] = 2
    $bank['idioma'] = $lang
    $bank['categorias'] = $categories
    return $bank
}
# Locales with a ready-made source bank are written straight to the output
# directory, without translation.
foreach ($lang in 'es', 'en', 'fr') {
    $bank = New-LanguageBank $lang
    $destination = Join-Path $OutputDir "palabras_$lang.json"
    Write-Utf8Json $bank $destination
}
# Every remaining locale is produced by translating the Spanish bank,
# category by category, in batches of $BatchSize words.
$targets = @($langMap.Keys | Where-Object { $_ -notin @('es', 'en', 'fr') })
foreach ($lang in $targets) {
    Write-Host "Generating $lang..."
    $bank = New-LanguageBank $lang
    $targetCode = $langMap[$lang]

    foreach ($category in $categoryKeyMap.Keys) {
        $spanishWords = @($sourceEs.categorias.$category)
        $context = $contextMap[$category]
        $collected = New-Object System.Collections.Generic.List[string]

        $offset = 0
        while ($offset -lt $spanishWords.Count) {
            # Index of the last word in this batch, clamped to the end of the list.
            $last = [Math]::Min($offset + $BatchSize - 1, $spanishWords.Count - 1)

            # Prefix each word with its category label before sending it.
            $terms = @(
                for ($index = $offset; $index -le $last; $index++) {
                    "${context}: $($spanishWords[$index])"
                }
            )

            $translated = @(Translate-Batch $terms $targetCode)
            foreach ($word in $translated) { $collected.Add($word) }

            # Brief pause between batches.
            Start-Sleep -Milliseconds 70
            $offset += $BatchSize
        }

        $bank.categorias[$category].palabras = $collected.ToArray()
    }

    Write-Utf8Json $bank (Join-Path $OutputDir "palabras_$lang.json")
}
# Post-generation checks for every locale's output file:
#   1. the bytes decode as strict UTF-8,
#   2. the file does not start with a UTF-8 BOM,
#   3. every category has as many words as the Spanish source.
Write-Host 'Validating UTF-8 and sample accents...'
foreach ($lang in $langMap.Keys) {
    $file = Join-Path $OutputDir "palabras_$lang.json"
    $bytes = [System.IO.File]::ReadAllBytes((Resolve-Path $file))

    # The strict decoder throws on invalid sequences; the decoded text is discarded.
    [void]$utf8Strict.GetString($bytes)

    # EF BB BF is the UTF-8 byte order mark.
    $startsWithBom = ($bytes.Length -ge 3) -and ($bytes[0] -eq 239) -and ($bytes[1] -eq 187) -and ($bytes[2] -eq 191)
    if ($startsWithBom) {
        throw "Unexpected UTF-8 BOM in $file"
    }

    $bank = Read-Utf8Json $file
    foreach ($category in $categoryKeyMap.Keys) {
        $expected = @($sourceEs.categorias.$category).Count
        $actual = @($bank.categorias.$category.palabras).Count
        if ($actual -eq $expected) { continue }
        throw "Word count mismatch in $file / $category. Expected $expected, got $actual"
    }
}
# Accent check: the generated Spanish bank must still contain the accented
# sample word in the "animales" category.
$esBank = Read-Utf8Json (Join-Path $OutputDir 'palabras_es.json')

# Build the expected word from its code point (U+00F3, o with acute accent)
# instead of a non-ASCII literal, so the comparison does not depend on how
# PowerShell decoded this script file.
$expectedLeon = 'Le' + [char]0x00F3 + 'n'
$leon = @($esBank.categorias.animales.palabras) | Where-Object { $_ -eq $expectedLeon } | Select-Object -First 1
if ($leon -ne $expectedLeon) { throw 'León accent validation failed' }
Write-Host "OK: generated split word banks in $OutputDir"