#!/usr/bin/env python3
"""
Tesla V100 Lyrics Analyzer v2.0 - Enhanced Monitoring
Обработка 8000+ треков с полным отслеживанием динамики
"""

import sqlite3
import hashlib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import json
import time
from datetime import datetime
import logging
import psutil
import threading
from collections import defaultdict

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class ProgressMonitor:
    """Tracks throughput, ETA, emotion distribution and GPU usage for a run."""

    def __init__(self):
        self.start_time = time.time()    # wall-clock reference for speed/ETA
        self.processed_count = 0         # tracks processed so far
        self.total_count = 0             # set by the caller before processing
        self.batch_times = []            # per-batch wall-clock durations (seconds)
        self.gpu_usage = []              # samples of {'memory': GB, 'util': %}
        self.emotion_stats = defaultdict(int)      # primary emotion -> count
        self.similarity_groups = defaultdict(int)  # reserved for cluster stats
        self.monitoring = True           # flag polled by the background thread

    def start_monitoring(self):
        """Start sampling GPU memory/utilization every 5s in a daemon thread."""
        def monitor():
            while self.monitoring:
                try:
                    # GPU monitoring — only meaningful when CUDA is present.
                    if torch.cuda.is_available():
                        gpu_memory = torch.cuda.memory_allocated() / 1024**3  # GB
                        gpu_util = torch.cuda.utilization()
                        self.gpu_usage.append({'memory': gpu_memory, 'util': gpu_util})

                    time.sleep(5)  # refresh every 5 seconds
                except Exception as e:
                    logger.warning(f"Monitoring error: {e}")

        monitor_thread = threading.Thread(target=monitor, daemon=True)
        monitor_thread.start()

    def update_progress(self, batch_size, batch_time, emotions_batch):
        """Record one finished batch and log progress, speed and ETA.

        Args:
            batch_size: number of tracks in the finished batch.
            batch_time: wall-clock seconds the batch took.
            emotions_batch: list of emotion dicts; each must have a 'primary' key.
        """
        self.processed_count += batch_size
        self.batch_times.append(batch_time)

        # Tally the primary emotion of every track in this batch.
        for emotion_data in emotions_batch:
            self.emotion_stats[emotion_data['primary']] += 1

        # Overall speed and remaining-time estimate (guarded against /0).
        elapsed = time.time() - self.start_time
        speed = self.processed_count / elapsed if elapsed > 0 else 0
        eta = (self.total_count - self.processed_count) / speed if speed > 0 else 0

        # Progress as a percentage of the total work.
        progress = (self.processed_count / self.total_count * 100) if self.total_count > 0 else 0

        # Rolling average over the last 10 batches smooths out spikes.
        avg_batch_time = np.mean(self.batch_times[-10:]) if self.batch_times else 0

        logger.info(f"""
🚀 TESLA V100 PROGRESS 🚀
Progress: {progress:.1f}% ({self.processed_count}/{self.total_count})
Speed: {speed:.1f} tracks/sec | ETA: {eta/60:.1f} min
Batch time: {batch_time:.2f}s (avg: {avg_batch_time:.2f}s)
Emotions: {dict(self.emotion_stats)}
        """)

    def get_gpu_stats(self):
        """Return a short summary string of recent GPU memory/utilization."""
        if not self.gpu_usage:
            return "No GPU data"

        recent_usage = self.gpu_usage[-10:]  # last 10 samples
        avg_memory = np.mean([u['memory'] for u in recent_usage])
        avg_util = np.mean([u['util'] for u in recent_usage])

        return f"GPU: {avg_memory:.1f}GB RAM, {avg_util:.1f}% Util"

    def final_report(self):
        """Stop monitoring and log/return the final summary report.

        Fix: the original raised ValueError (min()/max() of an empty list)
        and ZeroDivisionError when no batches had been processed; both
        cases are now guarded so the report is always produced.
        """
        self.monitoring = False
        total_time = time.time() - self.start_time

        avg_speed = self.processed_count / total_time if total_time > 0 else 0.0
        if self.batch_times:
            fastest = min(self.batch_times)
            slowest = max(self.batch_times)
            average = np.mean(self.batch_times)
        else:
            fastest = slowest = average = 0.0

        report = f"""
🎯 ANALYSIS COMPLETE! 🎯
Total time: {total_time/60:.1f} minutes
Total tracks: {self.processed_count}
Average speed: {avg_speed:.1f} tracks/sec
{self.get_gpu_stats()}

📊 EMOTION DISTRIBUTION:
{json.dumps(dict(self.emotion_stats), indent=2)}

⚡ PERFORMANCE:
Fastest batch: {fastest:.2f}s
Slowest batch: {slowest:.2f}s
Average batch: {average:.2f}s
        """

        logger.info(report)
        return report

class TeslaLyricsAnalyzer:
    """Analyzes track lyrics (emotion, energy, similarity groups) on GPU."""

    def __init__(self, db_path="radio/uploads/suno/db/tracks.sqlite"):
        """
        Args:
            db_path: path to the SQLite database holding the `tracks` table.
        """
        self.db_path = db_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.monitor = ProgressMonitor()

        logger.info(f"🚀 Tesla V100 Analyzer starting...")
        logger.info(f"Device: {self.device}")
        logger.info(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            logger.info(f"GPU: {torch.cuda.get_device_name()}")
            logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

        self.setup_models()

    def setup_models(self):
        """Load the emotion and similarity models onto the GPU (if present)."""
        logger.info("🔄 Loading models on Tesla V100...")
        start_time = time.time()

        # Sentiment/emotion classifier (3-class RoBERTa).
        self.emotion_analyzer = pipeline(
            "text-classification",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            device=0 if torch.cuda.is_available() else -1
        )

        # Sentence-embedding model for semantic similarity.
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.similarity_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.similarity_model.to(self.device)

        load_time = time.time() - start_time
        logger.info(f"✅ Models loaded in {load_time:.1f}s")

    def get_tracks_with_lyrics(self):
        """Fetch tracks that have lyrics and have not yet been processed (v2).

        Returns:
            List of (id, title, lyric, tags, processing_version) tuples.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # Collect quick statistics first, for logging only.
            cursor.execute("SELECT COUNT(*) FROM tracks WHERE lyric IS NOT NULL AND lyric != ''")
            total_with_lyrics = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(*) FROM tracks WHERE processing_version >= 2")
            already_processed = cursor.fetchone()[0]

            logger.info(f"📊 Database stats: {total_with_lyrics} tracks with lyrics, {already_processed} already processed")

            cursor.execute("""
                SELECT id, title, lyric, tags, processing_version
                FROM tracks 
                WHERE lyric IS NOT NULL AND lyric != '' 
                AND (processing_version IS NULL OR processing_version < 2)
                ORDER BY id
            """)

            tracks = cursor.fetchall()
        finally:
            # Fix: the original leaked the connection if any query raised.
            conn.close()

        logger.info(f"🎯 Found {len(tracks)} tracks to process")
        return tracks

    def calculate_lyric_hash(self, lyric):
        """Return an MD5 hex digest of the normalized lyric.

        Used only to group byte-identical texts — not for security.
        """
        clean_lyric = lyric.lower().strip()
        return hashlib.md5(clean_lyric.encode()).hexdigest()

    def analyze_emotions(self, lyric):
        """Classify the lyric's sentiment and derive energy/valence scores.

        Returns:
            Dict with 'primary' (str), 'confidence', 'energy', 'valence'
            (floats in [0, 1]). Falls back to neutral defaults on any error.
        """
        try:
            # The classifier has a limited context window; a prefix suffices.
            text_sample = lyric[:512]
            result = self.emotion_analyzer(text_sample)

            # Normalize both label schemes the model family may emit.
            emotion_map = {
                'LABEL_0': 'negative',
                'LABEL_1': 'neutral', 
                'LABEL_2': 'positive',
                'NEGATIVE': 'negative',
                'NEUTRAL': 'neutral',
                'POSITIVE': 'positive'
            }

            primary_emotion = emotion_map.get(result[0]['label'], result[0]['label'].lower())
            confidence = result[0]['score']

            energy = self.calculate_energy(lyric)
            # NOTE(review): valence for 'neutral' is mapped to 1 - confidence,
            # same as 'negative' — preserved as-is; confirm this is intended.
            valence = confidence if primary_emotion == 'positive' else (1 - confidence)

            return {
                'primary': primary_emotion,
                'confidence': confidence,
                'energy': energy,
                'valence': valence
            }

        except Exception as e:
            # Fix: failures were swallowed silently; log before falling back
            # to neutral defaults so bad inputs are visible in the logs.
            logger.warning(f"Emotion analysis failed: {e}")
            return {
                'primary': 'neutral',
                'confidence': 0.5,
                'energy': 0.5,
                'valence': 0.5
            }

    def calculate_energy(self, lyric):
        """Score text energy in [0, 1] from keyword counts (0.5 = unknown)."""
        high_energy_words = ['dance', 'jump', 'energy', 'fast', 'loud', 'power', 'strong', 'танцуй', 'энергия', 'сильный']
        low_energy_words = ['slow', 'calm', 'quiet', 'sleep', 'rest', 'медленно', 'спокойно', 'тихо']

        # NOTE(review): substring matching can over-match (e.g. 'fast' in
        # 'breakfast'); kept as-is to preserve existing scores.
        lyric_lower = lyric.lower()
        high_count = sum(1 for word in high_energy_words if word in lyric_lower)
        low_count = sum(1 for word in low_energy_words if word in lyric_lower)

        if high_count + low_count == 0:
            return 0.5  # no signal either way

        return high_count / (high_count + low_count)

    def find_similar_lyrics(self, tracks, similarity_threshold=0.8):
        """Cluster near-duplicate lyrics via TF-IDF cosine similarity + DBSCAN.

        Args:
            tracks: sequence of rows where index 2 is the lyric text.
            similarity_threshold: minimum cosine similarity to group texts.

        Returns:
            Array of cluster labels aligned with `tracks` (-1 = unique text).
        """
        logger.info("🔍 Computing lyric similarities...")
        start_time = time.time()

        lyrics = [track[2] for track in tracks]

        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(lyrics)

        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Fix: floating-point error can push cosine similarity slightly above
        # 1.0, producing tiny negative "distances" that DBSCAN's precomputed
        # metric validation rejects — clip distances at zero.
        distance_matrix = np.clip(1 - similarity_matrix, 0, None)
        clustering = DBSCAN(eps=1-similarity_threshold, metric='precomputed')
        clusters = clustering.fit_predict(distance_matrix)

        # Cluster statistics (-1 marks unclustered/unique texts).
        unique_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
        noise_points = list(clusters).count(-1)

        similarity_time = time.time() - start_time
        logger.info(f"✅ Similarity analysis: {unique_clusters} groups, {noise_points} unique texts ({similarity_time:.1f}s)")

        return clusters

    def process_batch(self, tracks_batch, clusters_batch):
        """Analyze one batch of tracks and build their DB update records.

        Args:
            tracks_batch: rows of (id, title, lyric, tags, processing_version).
            clusters_batch: cluster labels aligned with tracks_batch.

        Returns:
            List of result dicts ready for `update_database`.
        """
        batch_start = time.time()
        results = []
        emotions_batch = []

        for i, track in enumerate(tracks_batch):
            track_id, title, lyric, tags, _ = track

            lyric_hash = self.calculate_lyric_hash(lyric)
            emotions = self.analyze_emotions(lyric)
            emotions_batch.append(emotions)

            # Keep the first three tags as the "dominant" set.
            tag_list = tags.split(',') if tags else []
            dominant_tags = ','.join(tag_list[:3])

            result = {
                'id': track_id,
                'lyric_hash': lyric_hash,
                # -1 means DBSCAN noise (unique text) — stored as NULL.
                'lyric_group_id': int(clusters_batch[i]) if clusters_batch[i] != -1 else None,
                'emotion_primary': emotions['primary'],
                'emotion_confidence': emotions['confidence'],
                'mood_energy': emotions['energy'],
                'mood_valence': emotions['valence'],
                # NOTE(review): utcnow() is deprecated in 3.12; kept to
                # preserve the stored timestamp format (no tz suffix).
                'processed_at': datetime.utcnow().isoformat(),
                'processing_version': 2
            }

            results.append(result)

        batch_time = time.time() - batch_start
        self.monitor.update_progress(len(tracks_batch), batch_time, emotions_batch)

        return results

    def update_database(self, results):
        """Persist a batch of analysis results in a single transaction.

        Fix: the original executed row-by-row and leaked the connection on
        error; now uses executemany inside try/finally.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.executemany("""
                UPDATE tracks SET
                    lyric_hash = ?,
                    lyric_group_id = ?,
                    emotion_primary = ?,
                    emotion_confidence = ?,
                    mood_energy = ?,
                    mood_valence = ?,
                    dominant_tags = ?,
                    processed_at = ?,
                    processing_version = ?
                WHERE id = ?
            """, [
                (
                    result['lyric_hash'],
                    result['lyric_group_id'],
                    result['emotion_primary'],
                    result['emotion_confidence'],
                    result['mood_energy'],
                    result['mood_valence'],
                    result['dominant_tags'],
                    result['processed_at'],
                    result['processing_version'],
                    result['id']
                )
                for result in results
            ])
            conn.commit()
        finally:
            conn.close()

    def run_analysis(self, batch_size=32):
        """Run the full pipeline: fetch, cluster, analyze, persist, report.

        Args:
            batch_size: number of tracks processed per batch.
        """
        logger.info("🚀 Starting Tesla V100 analysis...")

        # Start background GPU/resource sampling.
        self.monitor.start_monitoring()

        tracks = self.get_tracks_with_lyrics()
        if not tracks:
            logger.info("No tracks to process")
            return

        self.monitor.total_count = len(tracks)

        # Group near-duplicate lyrics once, up front.
        clusters = self.find_similar_lyrics(tracks)

        # Process in batches, persisting after each one.
        for i in range(0, len(tracks), batch_size):
            batch = tracks[i:i+batch_size]
            batch_clusters = clusters[i:i+batch_size]

            results = self.process_batch(batch, batch_clusters)
            self.update_database(results)

        # Final console report.
        final_report = self.monitor.final_report()

        # Fix: elapsed time was sampled twice (inconsistent values) and the
        # speed division could hit zero on an instant run — compute once, guard.
        elapsed = time.time() - self.monitor.start_time

        # Save a machine-readable report alongside the log output.
        with open(f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", 'w') as f:
            json.dump({
                'timestamp': datetime.utcnow().isoformat(),
                'total_processed': self.monitor.processed_count,
                'emotion_stats': dict(self.monitor.emotion_stats),
                'performance': {
                    'total_time': elapsed,
                    'avg_speed': self.monitor.processed_count / elapsed if elapsed > 0 else 0.0,
                    'batch_times': self.monitor.batch_times
                }
            }, f, indent=2)

if __name__ == "__main__":
    analyzer = TeslaLyricsAnalyzer()
    analyzer.run_analysis()
