for line breaks — convert those (and other
// block-ending tags) into real newlines BEFORE stripping tags, otherwise
// the entire body collapses into one long run-on line.
// Heading tags (<h1>…<h6>) carry the song title — drop their content
// entirely so the title never leaks into the lyric list.
$text = preg_replace('/<\s*h[1-6][^>]*>.*?<\s*\/\s*h[1-6]\s*>/isu', "\n", $desc);
$text = preg_replace('/<\s*br\s*\/?>/i', "\n", $text);
$text = preg_replace('/<\s*\/\s*(p|div|li|tr|blockquote)\s*>/i', "\n", $text);
$text = strip_tags($text);
// Decode HTML entities (&nbsp;, &amp;, etc.) so the comparison later isn't fooled.
$text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
$text = preg_replace('/\r\n|\r/u', "\n", $text);
// First pass: clean each line and flag section headers (Verse / Ritornello
// / Bridge / etc.) so they can be dropped — those aren't sung.
$cleaned = [];
foreach (explode("\n", $text) as $line) {
$line = self::cleanLine($line);
if ($line === null) continue;
$cleaned[] = ['text' => $line, 'header' => self::isSectionHeader($line)];
}
// Title detection: if the first non-header line is immediately followed by
// a section header (e.g. "Figlio Mio — Viaggio di Vita" then "Verso 1"),
// that first line is the song title — drop it too.
$firstIdx = null;
foreach ($cleaned as $i => $c) {
if (! $c['header']) { $firstIdx = $i; break; }
}
$dropTitle = false;
if ($firstIdx !== null) {
for ($j = $firstIdx + 1; $j < count($cleaned); $j++) {
if ($cleaned[$j]['header']) { $dropTitle = true; break; }
break; // first thing after is a real lyric line → not a title block
}
}
$out = [];
foreach ($cleaned as $i => $c) {
if ($c['header']) continue;
if ($dropTitle && $i === $firstIdx) continue;
$out[] = $c['text'];
}
// Avoid mistakenly aligning non-lyric descriptions (a credit line, a URL,
// etc.). Require at least a handful of plausible lyric lines.
if (count($out) < self::MIN_LYRIC_LINES) return [];
return $out;
}
/**
 * Tell whether a line is a section marker (Verse / Chorus / Bridge / Outro
 * and their many translations) rather than a sung lyric.
 *
 * The match is anchored to the WHOLE line, so a real lyric that merely
 * contains one of these words is never flagged. A marker may carry a number
 * ("Verse 2"), a roman numeral I–X ("Coro II"), or "final" / "finale" /
 * "reprise"; it may be wrapped in brackets or parentheses ("[Chorus]",
 * "(Bridge)") and may end with a colon.
 *
 * @param string $line A single line of text.
 * @return bool True when the whole line is a section marker.
 */
private static function isSectionHeader(string $line): bool
{
    $t = mb_strtolower(trim($line));
    if ($t === '') {
        return false;
    }

    // The pattern never changes, so assemble it once and reuse it on every call.
    static $pattern = null;
    if ($pattern === null) {
        // Entries are regex fragments (not literals): some contain character classes.
        $roots = [
            'intro', 'outro', 'interlude', 'instrumental',
            'verse', 'verso', 'verset', 'couplet', 'estrofa', 'strofa',
            'chorus', 'ritornello', 'refrain', 'refrão', 'refrao', 'coro', 'estribillo',
            'pre[\s\-]?chorus', 'pre[\s\-]?ritornello', 'pre[\s\-]?coro',
            'pre[\s\-]?refrain', 'pre[\s\-]?refrão', 'pré[\s\-]?refrain',
            'bridge', 'ponte', 'puente', 'pont', 'brücke', 'brucke',
            'hook', 'drop', 'breakdown', 'tag', 'vamp', 'coda', 'reprise',
            // CJK / Thai / Arabic / Korean
            'サビ', 'コーラス', 'バース', 'ブリッジ', 'イントロ', 'アウトロ', 'フック', '間奏',
            'ท่อน', 'คอรัส', 'ฮุก', 'บริดจ์', 'อินโทร', 'เอาท์โทร',
            '前奏', '副歌', '桥段', '主歌', '尾奏',
            'كورس', 'بريدج', 'كوبليه',
            '후렴', '브릿지', '인트로', '아웃트로', '훅',
        ];

        // Optional trailing qualifier: "final/finale/reprise" or a roman
        // numeral from I to X. Arabic digits are already consumed by the
        // separator class that precedes the qualifier.
        $suffix = 'final|finale|reprise|i{1,3}|iv|vi{0,3}|ix|x';

        $pattern = '/^[\[\(]?\s*(?:' . implode('|', $roots) . ')[\s\d:\-—\.]*(?:' . $suffix . ')?\s*[\]\)]?[\s:]*$/iu';
    }

    // preg_match() yields 0 on no match and false on error; both mean "not a header".
    return preg_match($pattern, $t) === 1;
}
/**
 * Normalise one raw line of text into a candidate lyric line.
 *
 * Strips leading bullets / quote markers, markdown emphasis characters,
 * bracketed annotations (【…】, 〔…〕, [[…]]), emoji and invisible joiners,
 * then collapses runs of whitespace into single spaces.
 *
 * @param string $line One raw line.
 * @return string|null The cleaned line, or null when it should be discarded:
 *                     empty after cleaning, containing no Unicode letter,
 *                     shorter than 3 characters, or not valid UTF-8.
 */
private static function cleanLine(string $line): ?string
{
    $line = trim($line);
    if ($line === '') {
        return null;
    }

    // Ordered rewrite rules (pattern => replacement); each runs on the
    // output of the previous one.
    $rules = [
        // Leading list bullets / quote markers / note symbols.
        '/^[\s>\-\*•♪♫·]+/u' => '',
        // Markdown emphasis characters anywhere in the line.
        '/[\*_~`]+/u' => '',
        // Instrument / section annotations inside Japanese-style brackets:
        // 【 箏・尺八・篠笛・優しい歌声 】 — these aren't lyrics.
        '/【[^】]*】/u' => '',
        '/〔[^〕]*〕/u' => '',
        '/\[\[[^\]]*\]\]/u' => '',
        // Emoji / pictographic symbols and the invisible glue that often
        // sticks to them (variation selectors, ZWJ), so nothing is left
        // behind when the visible emoji is removed.
        '/[\x{1F000}-\x{1FFFF}\x{2600}-\x{27BF}\x{2B00}-\x{2BFF}\x{0F3A}-\x{0F3D}\x{FE00}-\x{FE0F}\x{200B}-\x{200F}\x{2060}]/u' => '',
        // Collapse internal whitespace.
        '/\s+/u' => ' ',
    ];

    foreach ($rules as $pattern => $replacement) {
        $line = preg_replace($pattern, $replacement, $line);
        // preg_replace() returns null on failure (e.g. malformed UTF-8 with
        // the /u modifier). Discard the line instead of passing null on.
        if ($line === null) {
            return null;
        }
    }

    $line = trim($line);
    if ($line === '') {
        return null;
    }

    // Must contain at least one letter (Unicode), and at least 3 characters
    // after stripping — discards "🌸 平穏 🌸" (header) and "──" separators.
    if (preg_match('/\p{L}/u', $line) !== 1) {
        return null;
    }
    if (mb_strlen($line, 'UTF-8') < 3) {
        return null;
    }

    return $line;
}
}