takeone-youtube-clone/app/Jobs/GenerateLyricsJob.php

<?php

namespace App\Jobs;

use App\Models\Setting;
use App\Models\Video;
use App\Models\VideoAudioTrack;
use App\Services\NasSyncService;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Process;

/**
 * Generate word-level synced lyrics for one audio track (the video's primary
 * audio when $trackId is null, otherwise a specific extra-language track).
 *
 * Output is a per-track lyrics JSON written through NasSyncService::putLyrics()
 * — source-of-truth, synced to NAS, never under cache/. Runs the GPU pipeline
 * exactly once; playback just loads the file afterwards.
 *
 * Status lifecycle written through putLyrics(): "processing" as soon as the job
 * starts, then either the pipeline output with status "ready" or a small
 * document with status "failed" and an "error" message. Every failure after
 * the "processing" marker is caught, logged and recorded as "failed" so the
 * stored status never stays at "processing" because of an exception; the job
 * itself then completes normally instead of being reported as a failed job.
 */
class GenerateLyricsJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Seconds kept in reserve between the subprocess timeout and the job
     * timeout, so a hung pipeline is stopped by us (and recorded as failed)
     * before the job's own $timeout budget runs out.
     */
    private const TIMEOUT_MARGIN_SECONDS = 60;

    /** Maximum number of trailing stderr bytes kept in the failure message. */
    private const STDERR_TAIL_BYTES = 2000;

    public int $timeout = 3600;
    public int $tries = 1;

    /**
     * @param int      $videoId Id of the video whose lyrics are generated.
     * @param int|null $trackId Id of an extra audio track, or null for the primary audio.
     */
    public function __construct(public int $videoId, public ?int $trackId = null)
    {
        $this->onQueue('video-processing');
    }

    /** Shared progress-file path (written by the pipeline, read by the status endpoint). */
    public static function progressPath(int $videoId, ?int $trackId): string
    {
        return storage_path('app/tmp/lyrics_prog_' . $videoId . '_' . ($trackId ?? 'primary') . '.json');
    }

    /** Index of the GPU with the most free memory, or null if it can't be queried. */
    private function freestGpu(): ?int
    {
        // A disabled exec() is not a warning but a fatal Error, which the @
        // operator below cannot suppress — so check before calling.
        if (! function_exists('exec')) {
            return null;
        }

        $out  = [];
        $code = 1;
        @exec('nvidia-smi --query-gpu=index,memory.free --format=csv,noheader,nounits 2>/dev/null', $out, $code);
        if ($code !== 0 || empty($out)) {
            return null;
        }

        $best     = null;
        $bestFree = -1;
        foreach ($out as $line) {
            $parts = array_map('trim', explode(',', $line));
            // Skip anything that is not an "<index>, <free>" row; a non-numeric
            // index would otherwise be cast to GPU 0.
            if (count($parts) < 2 || ! ctype_digit($parts[0])) {
                continue;
            }
            $free = (int) $parts[1];
            if ($free > $bestFree) {
                $bestFree = $free;
                $best     = (int) $parts[0];
            }
        }

        return $best;
    }

    /**
     * Run the lyrics pipeline for the configured video/track and store the result.
     *
     * Returns silently when the video (or the requested track) no longer exists.
     * Temporary files created along the way are removed in all cases.
     */
    public function handle(NasSyncService $nas): void
    {
        // Used to work out how much of the job's $timeout budget is left when
        // the subprocess is finally started.
        $startedAt = time();

        $video = Video::find($this->videoId);
        if (! $video) {
            return;
        }

        $track = $this->trackId ? VideoAudioTrack::find($this->trackId) : null;
        if ($this->trackId && ! $track) {
            return;
        }

        $language = $track ? $track->language : $video->language;

        // Mark as processing so the UI can show a generating state before the file lands.
        $nas->putLyrics($video, $track, [
            'version'  => 1,
            'status'   => 'processing',
            'language' => $language,
        ]);

        $outTmp        = storage_path('app/tmp/lyrics_' . $this->videoId . '_' . ($this->trackId ?? 'primary') . '.json');
        $progress      = self::progressPath($this->videoId, $this->trackId);
        $userLyrFile   = null;
        $nasDownloaded = null;

        // From here on every failure must end in a "failed" status and a
        // cleanup pass, so the whole remainder runs inside one try block.
        try {
            // Resolve a readable local copy of the audio (downloads from NAS if needed).
            $audioPath = $track ? $nas->ensureLocalTrackCopy($track) : $nas->ensureLocalCopy($video);
            if ($audioPath && str_starts_with($audioPath, storage_path('app/nas_cache/'))) {
                // Only copies under nas_cache/ are ours to delete afterwards.
                $nasDownloaded = $audioPath;
            }

            if (! $audioPath || ! file_exists($audioPath)) {
                Log::error('GenerateLyricsJob: audio file unavailable', [
                    'video_id' => $this->videoId, 'track_id' => $this->trackId,
                ]);
                $this->markFailed($nas, $video, $track, $language, 'audio file unavailable');

                return;
            }

            $python = base_path('ml/venv/bin/python');
            $script = base_path('ml/transcribe.py');
            if (! is_dir(dirname($outTmp))) {
                @mkdir(dirname($outTmp), 0775, true);
            }
            // Drop any output left behind by an earlier run that was killed
            // before its cleanup, so it can never be mistaken for fresh output.
            @unlink($outTmp);
            @file_put_contents($progress, json_encode(['status' => 'processing', 'pct' => 1, 'stage' => 'Queued']));

            // Model/weight downloads land in a www-data-writable cache, not root's $HOME.
            $cacheDir = base_path('ml/cache');
            if (! is_dir($cacheDir)) {
                @mkdir($cacheDir, 0775, true);
            }

            // NOTE: we deliberately do NOT force --language. The stored label is just
            // metadata and is often wrong (e.g. a Tagalog song mislabeled "en"), which
            // made WhisperX transcribe the wrong language. Auto-detecting from the
            // isolated vocals is ground truth; the detected language is saved instead.
            $args = [$python, $script, '--audio', $audioPath, '--out', $outTmp, '--progress', $progress];

            // Pipeline feature toggles (admin → Lyrics Pipeline). Defaults preserve
            // current behavior; admin can disable any sub-step that misbehaves.
            $useDescription = Setting::get('lyrics_use_description',      'true') === 'true';
            $vadEnabled     = Setting::get('lyrics_vad_enabled',          'true') === 'true';
            $vocalGapFill   = Setting::get('lyrics_vocal_region_gapfill', 'true') === 'true';
            $demucsEnabled  = Setting::get('lyrics_demucs_enabled',       'false') === 'true';

            if (! $vadEnabled) {
                $args[] = '--no-vad';
            }
            if (! $vocalGapFill) {
                $args[] = '--no-vocal-gapfill';
            }

            // If the song's description contains the lyrics (typed by the uploader),
            // pass them to the pipeline so it ALIGNS those exact lines to the audio
            // instead of generating noisier text from scratch. Only for the primary
            // track — extra-language tracks have their own audio and aren't paired
            // with the description text.
            if ($useDescription && ! $this->trackId && $video->description) {
                [$descLines, $source] = $this->descriptionLyrics($video);

                if ($descLines) {
                    $userLyrFile = storage_path('app/tmp/userlyr_' . $this->videoId . '.txt');
                    if (file_put_contents($userLyrFile, implode("\n", $descLines)) === false) {
                        throw new \RuntimeException('could not write description lyrics file');
                    }
                    $args[] = '--user-lyrics';
                    $args[] = $userLyrFile;
                    // With description lyrics, Whisper is only providing word-timing
                    // anchors — its actual transcription text is discarded by the
                    // aligner. Vocal isolation (Demucs) helps transcription QUALITY
                    // but is unnecessary for timing, AND the Demucs→Whisper CUDA-
                    // context handoff has caused intermittent 50% futex deadlocks.
                    // So Demucs is always skipped in this mode; the admin Demucs
                    // toggle is only consulted when no description lyrics are used.
                    $args[] = '--no-demucs';
                    Log::info('GenerateLyricsJob: using description lyrics', [
                        'video_id' => $this->videoId, 'lines' => count($descLines),
                        'source'   => $source, 'demucs' => false,
                        'vad'      => $vadEnabled, 'vocal_gapfill' => $vocalGapFill,
                    ]);
                }
            }

            // Honor the admin Demucs toggle for tracks WITHOUT description lyrics
            // (where Whisper's transcription quality actually matters).
            if (! $userLyrFile && ! $demucsEnabled) {
                $args[] = '--no-demucs';
            }

            $gpuUsable = Setting::gpuUsable();
            if ($gpuUsable) {
                // Run on the GPU with the most free VRAM so a busy card never forces an
                // out-of-memory fall back to slow CPU. With two cards this keeps every
                // generation on the GPU and fast.
                $args[] = '--gpu';
                $args[] = (string) ($this->freestGpu() ?? Setting::gpuDevice());
            }

            Log::info('GenerateLyricsJob: starting', [
                'video_id' => $this->videoId, 'track_id' => $this->trackId,
                'language' => $language, 'gpu' => $gpuUsable,
            ]);

            // Give the subprocess only what is left of the job's budget minus a
            // margin. With an identical timeout the job would run out of time
            // first (setup above already consumed part of it) and the failure
            // handling below would never get to run.
            $elapsed        = time() - $startedAt;
            $processTimeout = max(1, $this->timeout - $elapsed - self::TIMEOUT_MARGIN_SECONDS);

            $result = Process::timeout($processTimeout)
                ->env($this->pipelineEnv($cacheDir))
                ->run($args);

            if (! $result->successful() || ! file_exists($outTmp)) {
                throw new \RuntimeException('transcribe.py failed: ' . $this->stderrTail($result->errorOutput()));
            }

            $data = json_decode((string) file_get_contents($outTmp), true);
            // "lines" must be a non-empty array; anything else is treated as no output.
            if (! is_array($data) || ! is_array($data['lines'] ?? null) || $data['lines'] === []) {
                throw new \RuntimeException('transcribe.py produced no lines');
            }

            $data['status']       = 'ready';
            $data['generated_at'] = now()->toIso8601String();
            // Prefer the language reported by the pipeline; fall back to the stored label.
            $data['language'] ??= $language;

            $nas->putLyrics($video, $track, $data);

            // Decoration is independent of the audio pipeline — kick it off as
            // its own job so a flaky LLM call can't delay or fail a successful
            // transcription. Skips itself silently if the decorator is off.
            // A failure to enqueue it is only logged: the lyrics are already
            // stored as "ready" and must not be flipped to "failed" for it.
            try {
                DecorateLyricsJob::dispatch($this->videoId, $this->trackId)
                    ->onConnection('database');
            } catch (\Throwable $e) {
                Log::warning('GenerateLyricsJob: could not dispatch DecorateLyricsJob: ' . $e->getMessage(), [
                    'video_id' => $this->videoId, 'track_id' => $this->trackId,
                ]);
            }

            Log::info('GenerateLyricsJob: done', [
                'video_id' => $this->videoId, 'track_id' => $this->trackId,
                'lines' => count($data['lines']),
            ]);
        } catch (\Throwable $e) {
            Log::error('GenerateLyricsJob failed: ' . $e->getMessage(), [
                'video_id' => $this->videoId, 'track_id' => $this->trackId,
            ]);
            $this->markFailed($nas, $video, $track, $language, $e->getMessage());
        } finally {
            // Best-effort cleanup; any of these files may legitimately be absent.
            @unlink($outTmp);
            @unlink($progress);
            if ($userLyrFile) {
                @unlink($userLyrFile);
            }
            if ($nasDownloaded) {
                @unlink($nasDownloaded);
            }
        }
    }

    /**
     * Extract lyric lines from the video's description.
     *
     * Prefer the deterministic regex parser. It strips emojis line-by-line
     * without touching the underlying words, so it preserves every language a
     * multilingual song contains (e.g. an English+Thai song keeps both halves).
     * The LLM cleaner is only a backup for cases where the regex returns
     * nothing — we've seen the LLM silently drop whole verses that happened to
     * be wrapped in emoji decoration.
     *
     * @return array{0: mixed, 1: string} The extracted lines (empty when none
     *                                    were found) and which extractor
     *                                    produced them: 'regex' or 'llm'.
     */
    private function descriptionLyrics(Video $video): array
    {
        $lines  = \App\Support\LyricsDescriptionParser::extract($video->description);
        $source = 'regex';

        if (empty($lines)) {
            $llm = app(\App\Services\LlmLyricsService::class);
            if ($llm->cleanLyricsEnabled()) {
                try {
                    $lines  = $llm->cleanDescription($video->description);
                    $source = 'llm';
                } catch (\Throwable $e) {
                    // The LLM is optional: fall through with no description lyrics.
                    Log::warning('LLM clean failed: ' . $e->getMessage());
                }
            }
        }

        return [$lines, $source];
    }

    /**
     * Environment variables for the transcription subprocess.
     *
     * @return array<string, string>
     */
    private function pipelineEnv(string $cacheDir): array
    {
        return [
            'HOME'            => $cacheDir,
            'XDG_CACHE_HOME'  => $cacheDir,
            'HF_HOME'         => $cacheDir . '/huggingface',
            'TORCH_HOME'      => $cacheDir . '/torch',
            // Demucs runs as a subprocess BEFORE faster-whisper is imported.
            // If OpenMP gets initialised in the parent before that fork, the
            // post-fork CUDA/ctranslate2 stack can deadlock in futex_wait —
            // we've seen this hang lyrics jobs at 50% indefinitely. Forcing
            // single-threaded OpenMP in the parent eliminates the race
            // (faster-whisper sets its own thread count internally anyway).
            'OMP_NUM_THREADS'  => '1',
            'MKL_NUM_THREADS'  => '1',
            'OPENBLAS_NUM_THREADS' => '1',
        ];
    }

    /**
     * Last STDERR_TAIL_BYTES bytes of the pipeline's stderr, made valid UTF-8.
     *
     * The cut is byte-based and may land inside a multibyte character, and the
     * subprocess output is not guaranteed to be UTF-8 in the first place, so
     * invalid sequences are replaced before the text is logged or stored.
     */
    private function stderrTail(string $stderr): string
    {
        return mb_scrub(substr($stderr, -self::STDERR_TAIL_BYTES), 'UTF-8');
    }

    /**
     * Store a "failed" lyrics document carrying the given error message.
     *
     * @param mixed $language Language label copied into the stored document as-is.
     */
    private function markFailed(NasSyncService $nas, Video $video, ?VideoAudioTrack $track, $language, string $error): void
    {
        $nas->putLyrics($video, $track, [
            'version' => 1, 'status' => 'failed', 'language' => $language,
            // Scrubbed so an odd byte in an exception message cannot make the
            // failure document itself unserialisable.
            'error' => mb_scrub($error, 'UTF-8'),
        ]);
    }
}