for line breaks — convert those (and other // block-ending tags) into real newlines BEFORE stripping tags, otherwise // the entire body collapses into one long run-on line. // Heading tags (

…

) carry the song title — drop their content // entirely so the title never leaks into the lyric list. $text = preg_replace('/<\sh[1-6][^>]>.?<\s\/\sh[1-6]\s>/isu', "\n", $desc); $text = preg_replace('/<\sbr\s\/?>/i', "\n", $text); $text = preg_replace('/<\s\/\s(p|div|li|tr|blockquote)\s*>/i', "\n", $text); $text = strip_tags($text); // Decode HTML entities ( , &, etc.) so the comparison later isn't fooled. $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'); $text = preg_replace('/\r\n|\r/u', "\n", $text); // First pass: clean each line and flag section headers (Verse / Ritornello // / Bridge / etc.) so they can be dropped — those aren't sung. $cleaned = []; foreach (explode("\n", $text) as $line) { $line = self::cleanLine($line); if ($line === null) continue; $cleaned[] = ['text' => $line, 'header' => self::isSectionHeader($line)]; } // Title detection: if the first non-header line is immediately followed by // a section header (e.g. "Figlio Mio — Viaggio di Vita" then "Verso 1"), // that first line is the song title — drop it too. $firstIdx = null; foreach ($cleaned as $i => $c) { if (! $c['header']) { $firstIdx = $i; break; } } $dropTitle = false; if ($firstIdx !== null) { for ($j = $firstIdx + 1; $j < count($cleaned); $j++) { if ($cleaned[$j]['header']) { $dropTitle = true; break; } break; // first thing after is a real lyric line → not a title block } } $out = []; foreach ($cleaned as $i => $c) { if ($c['header']) continue; if ($dropTitle && $i === $firstIdx) continue; $out[] = $c['text']; } // Avoid mistakenly aligning non-lyric descriptions (a credit line, a URL, // etc.). Require at least a handful of plausible lyric lines. if (count($out) < self::MIN_LYRIC_LINES) return []; return $out; } /** * True when a line is a section marker (Verse / Chorus / Bridge / Outro / * their many translations) rather than a sung lyric. Matches the WHOLE line * so a real lyric containing one of these words isn't mistakenly dropped. 
*/
    private static function isSectionHeader(string $line): bool
    {
        $t = mb_strtolower(trim($line));
        if ($t === '') {
            return false;
        }

        // The pattern depends only on the constant root list, so build it once
        // per request instead of on every line.
        static $pattern = null;
        if ($pattern === null) {
            // Entries are regex fragments (some contain character classes),
            // so they are deliberately NOT passed through preg_quote().
            $roots = [
                'intro', 'outro', 'interlude', 'instrumental',
                'verse', 'verso', 'verset', 'couplet', 'estrofa', 'strofa',
                'chorus', 'ritornello', 'refrain', 'refrão', 'refrao', 'coro', 'estribillo',
                'pre[\s\-]?chorus', 'pre[\s\-]?ritornello', 'pre[\s\-]?coro',
                'pre[\s\-]?refrain', 'pre[\s\-]?refrão', 'pré[\s\-]?refrain',
                'bridge', 'ponte', 'puente', 'pont', 'brücke', 'brucke',
                'hook', 'drop', 'breakdown', 'tag', 'vamp', 'coda', 'reprise',
                // CJK / Thai / Arabic / Korean
                'サビ', 'コーラス', 'バース', 'ブリッジ', 'イントロ', 'アウトロ', 'フック', '間奏',
                'ท่อน', 'คอรัส', 'ฮุก', 'บริดจ์', 'อินโทร', 'เอาท์โทร',
                '前奏', '副歌', '桥段', '主歌', '尾奏',
                'كورس', 'بريدج', 'كوبليه',
                '후렴', '브릿지', '인트로', '아웃트로', '훅',
            ];

            // Root, then ANY number of separator/digit characters (so both a
            // bare "verse" and "verso 1:" qualify), then an optional
            // "final/finale/reprise" word, roman numeral or digit. Anchored at
            // both ends so a lyric that merely contains a root is not matched.
            $pattern = '/^(?:' . implode('|', $roots) . ')'
                . '[\s\d:\-—\.]*'
                . '(?:final|finale|reprise|ii|iii|iv|v|vi|2|3|4|5)?\s*$/iu';
        }

        // preg_match() yields false on error (e.g. malformed UTF-8); treat
        // that the same as "not a header".
        return preg_match($pattern, $t) === 1;
    }

    /**
     * Normalises one raw line of description text.
     *
     * Trims the line, removes leading bullets / quote markers, markdown
     * emphasis characters, bracketed annotations, emoji and invisible joiner
     * characters, then collapses runs of whitespace to a single space.
     *
     * @param string $line Raw line, already separated from its neighbours.
     *
     * @return string|null The cleaned line, or null when it should be
     *                     discarded: empty after cleaning, contains no Unicode
     *                     letter, is shorter than 3 characters, or could not
     *                     be processed by PCRE (malformed UTF-8).
     */
    private static function cleanLine(string $line): ?string
    {
        $line = trim($line);
        if ($line === '') {
            return null;
        }

        // Patterns are applied in order; each one sees the previous result.
        $patterns = [
            // Leading list bullets / quote markers ("- ", "* ", "> ", "• ", "♪ ").
            '/^[\s>\-\*•♪♫·]+/u',
            // Markdown emphasis / code markers anywhere in the line.
            '/[\*_~`]+/u',
            // Instrument / section annotations inside Japanese-style brackets,
            // e.g. 【箏・尺八・篠笛・優しい歌声】 — these aren't lyrics. The whole
            // bracketed span is removed, whatever its length.
            '/【[^】]*】/u',
            '/〔[^〕]*〕/u',
            // Spans wrapped in double square brackets: [[...]].
            '/\[\[[^\]]*\]\]/u',
            // Emoji / pictographic symbols plus the invisible glue that often
            // sticks to them (variation selectors U+FE00–U+FE0F, the
            // U+200B–U+200F range which includes ZWJ, and U+2060).
            '/[\x{1F000}-\x{1FFFF}\x{2600}-\x{27BF}\x{2B00}-\x{2BFF}\x{0F3A}-\x{0F3D}\x{FE00}-\x{FE0F}\x{200B}-\x{200F}\x{2060}]/u',
            // Collapse internal whitespace.
            '/\s+/u',
        ];
        $replacements = ['', '', '', '', '', '', ' '];

        $line = preg_replace($patterns, $replacements, $line);
        if ($line === null) {
            // PCRE failed (typically malformed UTF-8 under the /u modifier);
            // such a line cannot be compared reliably, so drop it.
            return null;
        }

        $line = trim($line);
        if ($line === '') {
            return null;
        }

        // Must contain at least one letter (Unicode), and at least 3 characters
        // after stripping — discards "🌸 平穏 🌸" (header) and "──" separators.
        if (preg_match('/\p{L}/u', $line) !== 1) {
            return null;
        }
        if (mb_strlen($line) < 3) {
            return null;
        }

        return $line;
    }
}