#!/usr/bin/env python3
"""
TESLA V100 ML ANALYZER - COMPLETE ALL TRACKS
User: valkst
Date: 2025-08-02 02:18:46 UTC
Status: Process ALL remaining tracks (lower quality threshold)
"""

import sqlite3
import logging
import numpy as np
import pandas as pd
from datetime import datetime, timezone
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap
from sklearn.cluster import KMeans

class TeslaMLCompleteAll:
    def __init__(self, db_path="tracks.sqlite"):
        self.db_path = db_path
        self.setup_logging()
        self.load_models()
        
    def setup_logging(self):
        log_file = f"tesla_ml_complete_all_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s UTC - TESLA ALL - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info("🚀 Tesla ML Complete All - User: valkst - 2025-08-02 02:18:46")
        
    def load_models(self):
        self.logger.info("🧠 Loading ML models (already cached)...")
        start_time = time.time()
        
        try:
            self.semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
            self.logger.info("✅ SentenceTransformer ready")
            
            # NOTE: BERTopic is instantiated here but never used later in
            # this script.
            self.topic_model = BERTopic(
                language="multilingual",
                min_topic_size=15,
                nr_topics=12,
                verbose=False
            )
            self.logger.info("✅ BERTopic ready")
            
            load_time = time.time() - start_time
            self.logger.info(f"🎯 Models ready in {load_time:.2f} s")
            self.models_ready = True
            
        except Exception as e:
            self.logger.error(f"❌ Error loading models: {e}")
            self.models_ready = False
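
    def smoke_test_encoder(self):
        """Hedged sanity-check sketch (not part of the original pipeline):
        confirms the cached encoder emits the 384-dim normalized vectors
        that save_fast() assumes when it stores vector_dimension = 384."""
        vec = self.semantic_model.encode(["smoke test"], normalize_embeddings=True)
        assert vec.shape == (1, 384), f"unexpected embedding shape {vec.shape}"
        return vec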
            
    def get_remaining_tracks(self, batch_size=800, offset=0):
        """Fetch the next batch of remaining tracks (any quality)."""
        conn = sqlite3.connect(self.db_path)
        
        # Parameterized LIMIT/OFFSET instead of f-string interpolation.
        query = """
        SELECT t.id, t.title, t.lyric, t.dominant_tags, t.quality_score,
               t.emotion_primary, t.mood_valence, t.mood_energy
        FROM tracks t
        LEFT JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE t.processing_version = 2 
          AND t.lyric != '[Instrumental]'
          AND LENGTH(t.lyric) > 20
          AND ml.track_id IS NULL
        ORDER BY t.quality_score DESC
        LIMIT ? OFFSET ?
        """
        
        df = pd.read_sql_query(query, conn, params=(batch_size, offset))
        conn.close()
        
        return df
        
    def create_embeddings_fast(self, df):
        """Fast embedding creation."""
        self.logger.info(f"🔤 Fast embeddings for {len(df)} tracks...")
        
        texts = []
        for _, row in df.iterrows():
            # Truncate lyrics to 800 characters for speed
            text = f"{row['title']} {str(row['lyric'])[:800]}"
            texts.append(text)
            
        embeddings = self.semantic_model.encode(
            texts,
            batch_size=32,  # larger batch for throughput
            show_progress_bar=True,
            normalize_embeddings=True
        )
        
        self.logger.info(f"✅ {embeddings.shape[0]} embeddings ready")
        return embeddings
        
    def emotions_fast(self, df):
        """Fast heuristic emotion analysis."""
        self.logger.info(f"🎭 Fast emotions for {len(df)} tracks...")
        
        emotions = []
        
        for _, row in df.iterrows():
            # Reuse existing per-track metadata plus simple keyword checks
            text = str(row['lyric']).lower()
            
            # Base values from existing columns; SQL NULLs arrive as NaN,
            # so guard with pd.notna() instead of letting NaN propagate.
            base_emotion = str(row['emotion_primary']).lower() if pd.notna(row['emotion_primary']) else 'neutral'
            valence = float(row['mood_valence']) if pd.notna(row['mood_valence']) else 0.5
            energy = float(row['mood_energy']) if pd.notna(row['mood_energy']) else 0.5  # currently unused by the rules below
            
            # Quick rule-based classification
            if 'joy' in base_emotion or 'happy' in base_emotion or valence > 0.7:
                dominant_vals = {'joy': 0.7, 'sadness': 0.1, 'anger': 0.1, 'fear': 0.1}
            elif 'sad' in base_emotion or valence < 0.3:
                dominant_vals = {'joy': 0.1, 'sadness': 0.7, 'anger': 0.1, 'fear': 0.1}
            elif 'anger' in base_emotion or 'mad' in text:
                dominant_vals = {'joy': 0.1, 'sadness': 0.1, 'anger': 0.7, 'fear': 0.1}
            else:
                # Neutral distribution
                dominant_vals = {'joy': 0.3, 'sadness': 0.3, 'anger': 0.2, 'fear': 0.2}
                
            emotion_data = {
                'joy': float(dominant_vals['joy']),
                'sadness': float(dominant_vals['sadness']),
                'anger': float(dominant_vals['anger']),
                'fear': float(dominant_vals['fear']),
                'disgust': 0.1,
                'surprise': 0.1
            }
            
            emotions.append(emotion_data)
            
        self.logger.info(f"✅ Fast emotions ready: {len(emotions)}")
        return emotions
        
    def clustering_simple(self, embeddings):
        """Simplified clustering."""
        self.logger.info(f"🔍 Simple clustering of {len(embeddings)} vectors...")
        
        # 2D UMAP for visualization coordinates only; KMeans runs on the
        # full-dimensional embeddings below.
        umap_2d = umap.UMAP(
            n_neighbors=10,
            n_components=2,
            metric='cosine',
            random_state=42
        )
        
        coords_2d = umap_2d.fit_transform(embeddings)
        
        # Plain KMeans: cluster count scales with batch size, capped at 12.
        # NOTE: the model is refit per batch, so cluster IDs are not
        # comparable across batches.
        n_clusters = min(12, max(3, len(embeddings) // 100))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(embeddings)
        
        # Fixed placeholder confidence; see distance_confidences() below
        # for a distance-based alternative.
        confidences = [0.5] * len(clusters)
        
        self.logger.info(f"✅ {len(np.unique(clusters))} clusters ready")
        
        return clusters, coords_2d, confidences
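
    @staticmethod
    def distance_confidences(kmeans, embeddings):
        """Hedged alternative sketch (an assumption, not the original method):
        derive per-point confidences from KMeans center distances instead of
        the fixed 0.5 placeholder used in clustering_simple()."""
        dists = kmeans.transform(embeddings)      # (n, k) distances to each center
        nearest = np.sort(dists, axis=1)[:, :2]   # two closest centers per point
        # Relative margin between best and second-best center: 0 = ambiguous.
        margin = (nearest[:, 1] - nearest[:, 0]) / (nearest[:, 1] + 1e-9)
        return np.clip(0.5 + margin, 0.0, 1.0).tolist()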
        
    def save_fast(self, df, embeddings, emotions, clusters, coords_2d, confidences):
        """Fast batched save."""
        self.logger.info("💾 Fast save...")
        
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # Positional index must line up with the emotions/embeddings lists
        df = df.reset_index(drop=True)
        
        # Batch inserts for speed
        ml_data = []
        embedding_data = []
        
        for idx, row in df.iterrows():
            track_id = str(row['id'])
            emotion_data = emotions[idx]
            
            dominant = max(emotion_data.items(), key=lambda x: x[1])[0]
            confidence = float(max(emotion_data.values()))
            
            ml_data.append((
                track_id,
                int(clusters[idx]),
                float(confidences[idx]),
                float(emotion_data['joy']),
                float(emotion_data['sadness']),
                float(emotion_data['anger']),
                float(emotion_data['fear']),
                float(emotion_data['disgust']),
                float(emotion_data['surprise']),
                dominant,
                confidence,
                float(coords_2d[idx][0]),
                float(coords_2d[idx][1])
            ))
            
            embedding_data.append((
                track_id,
                pickle.dumps(embeddings[idx].astype(np.float32)),
                384
            ))
        
        # Batch insert
        cursor.executemany("""
        INSERT OR REPLACE INTO ml_analysis 
        (track_id, semantic_cluster, cluster_confidence,
         ml_joy, ml_sadness, ml_anger, ml_fear, ml_disgust, ml_surprise,
         ml_emotion_dominant, ml_emotion_confidence, umap_x, umap_y)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, ml_data)
        
        cursor.executemany("""
        INSERT OR REPLACE INTO ml_embeddings 
        (track_id, embedding_vector, vector_dimension)
        VALUES (?, ?, ?)
        """, embedding_data)
        
        conn.commit()
        conn.close()
        
        self.logger.info(f"✅ Сохранено {len(df)} результатов")
        
    def run_complete_analysis(self):
        """Process ALL remaining tracks."""
        if not self.models_ready:
            return False
            
        start_time = datetime.now()
        self.logger.info("🚀 TESLA ML COMPLETE ALL ANALYSIS STARTED")
        
        total_processed = 0
        batch_size = 1000
        batch_num = 0
        
        try:
            while True:
                # Saved tracks drop out of the query via the
                # ml.track_id IS NULL filter, so always read from offset 0;
                # advancing the offset here would skip unprocessed rows.
                df = self.get_remaining_tracks(batch_size, offset=0)
                if len(df) == 0:
                    break
                    
                batch_num += 1
                self.logger.info(f"📊 Batch {batch_num}: {len(df)} tracks")
                
                embeddings = self.create_embeddings_fast(df)
                emotions = self.emotions_fast(df)
                clusters, coords_2d, confidences = self.clustering_simple(embeddings)
                
                self.save_fast(df, embeddings, emotions, clusters, coords_2d, confidences)
                
                total_processed += len(df)
                
                self.logger.info(f"✅ Processed {total_processed} tracks so far")
                
            duration = datetime.now() - start_time
            
            self.logger.info(f"""
🎯 TESLA ML COMPLETE ALL ANALYSIS FINISHED!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 ВСЕГО обработано: {total_processed} треков
🔤 Векторов создано: {total_processed} × 384D
🎭 Эмоций проанализировано: {total_processed}
🔍 Кластеров создано: ~12 семантических групп
⏱️ Время выполнения: {duration}
💾 Сохранено в: ml_analysis, ml_embeddings
👤 User: valkst
🕐 Завершено: {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
            """)
            
            return True
            
        except Exception as e:
            self.logger.error(f"❌ Ошибка: {e}")
            return False
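
    def load_embedding(self, track_id):
        """Hedged round-trip sketch: read back one pickled vector stored by
        save_fast(). Assumes the ml_embeddings layout sketched in
        ensure_schema() above."""
        conn = sqlite3.connect(self.db_path)
        row = conn.execute(
            "SELECT embedding_vector FROM ml_embeddings WHERE track_id = ?",
            (track_id,)
        ).fetchone()
        conn.close()
        # Returns a float32 numpy array of length 384, or None if not stored.
        return pickle.loads(row[0]) if row else None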

if __name__ == "__main__":
    print(f"""
🚀 TESLA V100 ML ANALYZER - COMPLETE ALL REMAINING
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
👤 User: valkst
📅 Current: 2025-08-02 02:18:46 UTC
🎯 Task: Process ALL remaining tracks (any quality)
🔧 Previous: 4,133 tracks processed (quality > 0.75)
🎯 Goal: Process remaining ~3,800 tracks
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    """)
    
    # Check how many tracks remain (sqlite3 is already imported above)
    conn = sqlite3.connect("tracks.sqlite")
    cursor = conn.execute("""
    SELECT COUNT(*) FROM tracks t
    LEFT JOIN ml_analysis ml ON t.id = ml.track_id
    WHERE t.processing_version = 2 
      AND t.lyric != '[Instrumental]'
      AND LENGTH(t.lyric) > 20
      AND ml.track_id IS NULL
    """)
    remaining = cursor.fetchone()[0]
    conn.close()
    
    print(f"📊 Треков для обработки: {remaining}")
    
    if remaining > 0:
        analyzer = TeslaMLCompleteAll()
        
        if analyzer.models_ready:
            print("🎯 Обрабатываем ВСЕ оставшиеся треки...")
            success = analyzer.run_complete_analysis()
            
            if success:
                print(f"""
🎉 ПОЛНАЯ ОБРАБОТКА ЗАВЕРШЕНА!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ Обработано дополнительно: {remaining} треков
🎯 Общий итог: 4,133 + {remaining} = {4133 + remaining} треков
💾 Полная ML база данных готова!
                """)
        else:
            print("❌ Модели не готовы")
    else:
        print("✅ Все треки уже обработаны!")
