#!/usr/bin/env python3
"""
SUNO AI ML MUSIC SEARCH SYSTEM - FINAL WORKING VERSION
User: valkst
Date: 2025-08-02 11:24:00 UTC
Complete working ML demo system for Suno AI music collection
"""

import sqlite3
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import warnings
import re
warnings.filterwarnings('ignore')

class SunoMLSystem:
    def __init__(self):
        self.db_path = "tracks.sqlite"
        print("🎵 SUNO AI ML MUSIC SEARCH SYSTEM")
        print("="*60)
        
        # Ленивая загрузка модели
        self.model = None
        
        # Проверяем статистику базы
        self.show_database_stats()
        
    def show_database_stats(self):
        """Показываем статистику базы данных"""
        conn = sqlite3.connect(self.db_path)
        
        # Основная статистика
        cursor = conn.execute("""
        SELECT 
            COUNT(DISTINCT t.id) as total_tracks,
            COUNT(DISTINCT ml.track_id) as ml_tracks,
            COUNT(DISTINCT emb.track_id) as embedding_tracks
        FROM tracks t
        LEFT JOIN ml_analysis ml ON t.id = ml.track_id
        LEFT JOIN ml_embeddings emb ON t.id = emb.track_id
        WHERE t.lyric != '[Instrumental]'
        """)
        
        total, ml_count, emb_count = cursor.fetchone()
        
        print(f"📊 Всего треков: {total:,}")
        print(f"🧠 ML анализ: {ml_count:,} треков")
        print(f"🔤 Векторы: {emb_count:,} треков")
        print(f"✅ Покрытие: {ml_count/total*100:.1f}%")
        print("="*60)
        
        conn.close()
        
    def load_model_if_needed(self):
        """Загружаем модель только при необходимости"""
        if self.model is None:
            print("🧠 Загружаем SentenceTransformer...")
            self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
            print("✅ Модель готова!")
        
    def semantic_search(self, query, limit=10):
        """Семантический поиск треков"""
        print(f"🔍 Семантический поиск: '{query}'")
        print("-"*60)
        
        self.load_model_if_needed()
        
        # Создаем вектор запроса
        query_vector = self.model.encode([query], normalize_embeddings=True)
        
        conn = sqlite3.connect(self.db_path)
        
        # Получаем треки с векторами
        cursor = conn.execute("""
        SELECT t.id, t.title, t.lyric, t.tags, emb.embedding_vector,
               ml.ml_emotion_dominant, ml.ml_emotion_confidence, 
               t.quality_score, ml.semantic_cluster
        FROM tracks t
        JOIN ml_embeddings emb ON t.id = emb.track_id
        JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE t.lyric != '[Instrumental]'
        ORDER BY t.quality_score DESC
        LIMIT 2000
        """)
        
        results = []
        processed = 0
        
        for row in cursor.fetchall():
            track_id, title, lyric, tags, vector_blob, emotion, emotion_conf, quality, cluster = row
            
            try:
                # Загружаем вектор
                vector = pickle.loads(vector_blob)
                
                # Вычисляем similarity
                similarity = cosine_similarity(query_vector, vector.reshape(1, -1))[0][0]
                
                # Извлекаем артиста из title
                artist = "Unknown Artist"
                clean_title = title
                
                # Различные форматы для извлечения артиста
                if " — " in title:
                    parts = title.split(" — ", 1)
                    artist, clean_title = parts[0].strip(), parts[1].strip()
                elif " - " in title:
                    parts = title.split(" - ", 1)
                    artist, clean_title = parts[0].strip(), parts[1].strip()
                elif title.startswith("["):
                    # Формат [title: название]
                    match = re.search(r'\[title:\s*([^\]]+)\]', title)
                    if match:
                        clean_title = match.group(1).strip()
                
                # Превью лирики
                lyric_preview = str(lyric)[:120] + "..." if len(str(lyric)) > 120 else str(lyric)
                
                results.append({
                    'id': track_id,
                    'title': clean_title,
                    'artist': artist,
                    'similarity': similarity,
                    'emotion': emotion,
                    'emotion_confidence': emotion_conf,
                    'quality': quality,
                    'cluster': cluster,
                    'lyric': lyric_preview,
                    'tags': str(tags)[:50] + "..." if len(str(tags)) > 50 else str(tags)
                })
                
                processed += 1
                
            except Exception as e:
                continue
        
        conn.close()
        
        # Сортируем по similarity
        results.sort(key=lambda x: x['similarity'], reverse=True)
        
        print(f"📊 Найдено {processed} треков, показываем топ-{limit}:")
        print()
        
        for i, result in enumerate(results[:limit], 1):
            emotion_icon = {
                'joy': '😊', 'sadness': '😢', 'anger': '😡', 
                'fear': '😨', 'disgust': '🤢', 'surprise': '😮'
            }.get(result['emotion'], '🎵')
            
            print(f"{i:2d}. 🎵 {result['title']}")
            print(f"    👤 {result['artist']}")
            print(f"    🎯 Similarity: {result['similarity']:.3f}")
            print(f"    {emotion_icon} {result['emotion'].title()} (conf: {result['emotion_confidence']:.2f})")
            print(f"    ⭐ Quality: {result['quality']:.3f} | 🔍 Cluster: {result['cluster']}")
            print(f"    🏷️ {result['tags']}")
            print(f"    📝 {result['lyric']}")
            print()
            
        return results[:limit]
        
    def emotion_filter(self, emotion, limit=15):
        """Фильтр треков по эмоции"""
        emotions_dict = {
            'радость': 'joy', 'грусть': 'sadness', 'гнев': 'anger',
            'страх': 'fear', 'отвращение': 'disgust', 'удивление': 'surprise',
            'joy': 'joy', 'sadness': 'sadness', 'anger': 'anger',
            'fear': 'fear', 'disgust': 'disgust', 'surprise': 'surprise'
        }
        
        emotion_en = emotions_dict.get(emotion.lower(), emotion.lower())
        emotion_icon = {
            'joy': '😊', 'sadness': '😢', 'anger': '😡', 
            'fear': '😨', 'disgust': '🤢', 'surprise': '😮'
        }.get(emotion_en, '🎵')
        
        print(f"{emotion_icon} Треки с эмоцией '{emotion.title()}':")
        print("-"*60)
        
        conn = sqlite3.connect(self.db_path)
        
        cursor = conn.execute("""
        SELECT t.title, t.lyric, t.tags, ml.ml_emotion_confidence, 
               t.quality_score, ml.semantic_cluster
        FROM tracks t
        JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE ml.ml_emotion_dominant = ?
        AND t.lyric != '[Instrumental]'
        ORDER BY ml.ml_emotion_confidence DESC, t.quality_score DESC
        LIMIT ?
        """, (emotion_en, limit))
        
        results = cursor.fetchall()
        conn.close()
        
        if not results:
            print(f"❌ Треки с эмоцией '{emotion}' не найдены")
            return []
        
        for i, (title, lyric, tags, conf, quality, cluster) in enumerate(results, 1):
            # Извлекаем артиста
            artist = "Unknown Artist"
            clean_title = title
            
            if " — " in title:
                parts = title.split(" — ", 1)
                artist, clean_title = parts[0].strip(), parts[1].strip()
            elif " - " in title:
                parts = title.split(" - ", 1)
                artist, clean_title = parts[0].strip(), parts[1].strip()
            
            lyric_preview = str(lyric)[:100] + "..." if len(str(lyric)) > 100 else str(lyric)
            tags_preview = str(tags)[:60] + "..." if len(str(tags)) > 60 else str(tags)
            
            print(f"{i:2d}. 🎵 {clean_title}")
            print(f"    👤 {artist}")
            print(f"    🎯 Emotion confidence: {conf:.3f}")
            print(f"    ⭐ Quality: {quality:.3f} | 🔍 Cluster: {cluster}")
            print(f"    🏷️ {tags_preview}")
            print(f"    📝 {lyric_preview}")
            print()
            
        return results
        
    def cluster_analysis(self):
        """Анализ семантических кластеров"""
        print("🔍 АНАЛИЗ СЕМАНТИЧЕСКИХ КЛАСТЕРОВ:")
        print("="*60)
        
        conn = sqlite3.connect(self.db_path)
        
        # Получаем статистику по кластерам
        cursor = conn.execute("""
        SELECT 
            semantic_cluster,
            COUNT(*) as track_count,
            AVG(cluster_confidence) as avg_confidence,
            ml_emotion_dominant,
            AVG(ml_emotion_confidence) as avg_emotion_conf,
            AVG((SELECT quality_score FROM tracks WHERE id = ml_analysis.track_id)) as avg_quality
        FROM ml_analysis
        WHERE semantic_cluster >= 0
        GROUP BY semantic_cluster, ml_emotion_dominant
        ORDER BY semantic_cluster, track_count DESC
        """)
        
        clusters_data = {}
        for row in cursor.fetchall():
            cluster_id, count, conf, emotion, emo_conf, quality = row
            
            if cluster_id not in clusters_data:
                clusters_data[cluster_id] = {
                    'total_tracks': 0,
                    'avg_confidence': conf,
                    'avg_quality': quality,
                    'emotions': {},
                    'top_emotion': None,
                    'top_emotion_count': 0
                }
            
            clusters_data[cluster_id]['total_tracks'] += count
            clusters_data[cluster_id]['emotions'][emotion] = count
            
            if count > clusters_data[cluster_id]['top_emotion_count']:
                clusters_data[cluster_id]['top_emotion'] = emotion
                clusters_data[cluster_id]['top_emotion_count'] = count
        
        conn.close()
        
        # Сортируем кластеры по количеству треков
        sorted_clusters = sorted(clusters_data.items(), 
                               key=lambda x: x[1]['total_tracks'], reverse=True)
        
        for cluster_id, data in sorted_clusters:
            emotion_icon = {
                'joy': '😊', 'sadness': '😢', 'anger': '😡', 
                'fear': '😨', 'disgust': '🤢', 'surprise': '😮'
            }.get(data['top_emotion'], '🎵')
            
            print(f"🔸 Кластер {cluster_id}: {data['total_tracks']} треков")
            print(f"   {emotion_icon} Главная эмоция: {data['top_emotion']} ({data['top_emotion_count']} треков)")
            print(f"   🎯 Средняя уверенность: {data['avg_confidence']:.3f}")
            print(f"   ⭐ Среднее качество: {data['avg_quality']:.3f}")
            
            # Показываем распределение эмоций
            emotions_str = ", ".join([f"{k}: {v}" for k, v in sorted(data['emotions'].items(), key=lambda x: x[1], reverse=True)])
            print(f"   🎭 Эмоции: {emotions_str}")
            print()
            
    def search_by_artist(self, artist_name, limit=10):
        """Поиск треков по исполнителю"""
        print(f"👤 Поиск треков исполнителя: '{artist_name}'")
        print("-"*60)
        
        conn = sqlite3.connect(self.db_path)
        
        cursor = conn.execute("""
        SELECT t.title, t.lyric, t.tags, ml.ml_emotion_dominant, 
               ml.ml_emotion_confidence, t.quality_score, ml.semantic_cluster
        FROM tracks t
        JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE (t.title LIKE ? OR t.title LIKE ? OR t.title LIKE ?)
        AND t.lyric != '[Instrumental]'
        ORDER BY t.quality_score DESC, ml.ml_emotion_confidence DESC
        LIMIT ?
        """, (f"{artist_name}%", f"%{artist_name} —%", f"%{artist_name} -%", limit))
        
        results = cursor.fetchall()
        conn.close()
        
        if not results:
            print(f"❌ Треки исполнителя '{artist_name}' не найдены")
            return []
        
        print(f"📊 Найдено {len(results)} треков:")
        print()
        
        for i, (title, lyric, tags, emotion, emotion_conf, quality, cluster) in enumerate(results, 1):
            emotion_icon = {
                'joy': '😊', 'sadness': '😢', 'anger': '😡', 
                'fear': '😨', 'disgust': '🤢', 'surprise': '😮'
            }.get(emotion, '🎵')
            
            # Извлекаем чистое название
            clean_title = title
            if " — " in title:
                clean_title = title.split(" — ", 1)[1].strip()
            elif " - " in title:
                clean_title = title.split(" - ", 1)[1].strip()
            
            lyric_preview = str(lyric)[:100] + "..." if len(str(lyric)) > 100 else str(lyric)
            
            print(f"{i:2d}. 🎵 {clean_title}")
            print(f"    {emotion_icon} {emotion.title()} (conf: {emotion_conf:.3f})")
            print(f"    ⭐ Quality: {quality:.3f} | 🔍 Cluster: {cluster}")
            print(f"    📝 {lyric_preview}")
            print()
            
        return results
        
    def statistics_overview(self):
        """Общая статистика системы"""
        print("📈 ОБЩАЯ СТАТИСТИКА ML СИСТЕМЫ:")
        print("="*50)
        
        conn = sqlite3.connect(self.db_path)
        
        # Основная статистика
        cursor = conn.execute("""
        SELECT 
            COUNT(*) as total_tracks,
            AVG(ml_emotion_confidence) as avg_emotion_conf,
            AVG(cluster_confidence) as avg_cluster_conf,
            COUNT(CASE WHEN ml_emotion_confidence > 0.8 THEN 1 END) as high_conf_tracks,
            COUNT(CASE WHEN ml_emotion_confidence > 0.9 THEN 1 END) as very_high_conf_tracks
        FROM ml_analysis ml
        JOIN tracks t ON ml.track_id = t.id
        WHERE t.lyric != '[Instrumental]'
        """)
        
        stats = cursor.fetchone()
        total, avg_emotion, avg_cluster, high_conf, very_high_conf = stats
        
        print(f"🎵 Всего треков в ML системе: {total:,}")
        print(f"🎭 Средняя эмоциональная уверенность: {avg_emotion:.3f}")
        print(f"🔍 Средняя кластерная уверенность: {avg_cluster:.3f}")
        print(f"🏆 Треков с высокой уверенностью (>0.8): {high_conf:,} ({high_conf/total*100:.1f}%)")
        print(f"🌟 Треков с очень высокой уверенностью (>0.9): {very_high_conf:,} ({very_high_conf/total*100:.1f}%)")
        print()
        
        # Распределение по эмоциям
        cursor = conn.execute("""
        SELECT 
            ml_emotion_dominant,
            COUNT(*) as count,
            ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM ml_analysis ml2 JOIN tracks t2 ON ml2.track_id = t2.id WHERE t2.lyric != '[Instrumental]'), 1) as percentage,
            AVG(ml_emotion_confidence) as avg_conf
        FROM ml_analysis ml
        JOIN tracks t ON ml.track_id = t.id
        WHERE t.lyric != '[Instrumental]'
        GROUP BY ml_emotion_dominant
        ORDER BY count DESC
        """)
        
        print("🎭 РАСПРЕДЕЛЕНИЕ ПО ЭМОЦИЯМ:")
        print("-"*40)
        for emotion, count, percentage, avg_conf in cursor.fetchall():
            emotion_icon = {
                'joy': '😊', 'sadness': '😢', 'anger': '😡', 
                'fear': '😨', 'disgust': '🤢', 'surprise': '😮'
            }.get(emotion, '🎵')
            
            print(f"{emotion_icon} {emotion.title()}: {count:,} треков ({percentage}%) - уверенность {avg_conf:.3f}")
        
        print()
        
        # Статистика качества
        cursor = conn.execute("""
        SELECT 
            AVG(t.quality_score) as avg_quality,
            COUNT(CASE WHEN t.quality_score > 0.9 THEN 1 END) as excellent_quality,
            COUNT(CASE WHEN t.quality_score > 0.8 THEN 1 END) as good_quality
        FROM tracks t
        JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE t.lyric != '[Instrumental]'
        """)
        
        quality_stats = cursor.fetchone()
        avg_qual, excellent, good = quality_stats
        
        print("⭐ КАЧЕСТВО ТРЕКОВ:")
        print("-"*25)
        print(f"📊 Среднее качество: {avg_qual:.3f}")
        print(f"🌟 Отличное качество (>0.9): {excellent:,} треков")
        print(f"👍 Хорошее качество (>0.8): {good:,} треков")
        
        conn.close()

def main():
    """Главная функция"""
    system = SunoMLSystem()
    
    print("""
🎯 ДОСТУПНЫЕ КОМАНДЫ:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. system.semantic_search("запрос") - семантический поиск треков
2. system.emotion_filter("эмоция") - фильтр по эмоции  
3. system.search_by_artist("исполнитель") - поиск по исполнителю
4. system.cluster_analysis() - анализ семантических кластеров
5. system.statistics_overview() - общая статистика системы

🎵 ПРИМЕРЫ ИСПОЛЬЗОВАНИЯ:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
system.semantic_search("love ballad romantic song")
system.semantic_search("грустная песня о любви") 
system.emotion_filter("радость")  
system.search_by_artist("БУХОЙ ВОЖАК")
system.cluster_analysis()
    """)
    
    return system

if __name__ == "__main__":
    system = main()
