#!/usr/bin/env python3
"""
TESLA V100 ML ANALYZER - FINAL PRODUCTION
User: valkst
Date: 2025-08-02 01:52:53 UTC
Status: Production ready - ML tables created
"""

import sqlite3
import logging
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
import time
import json
import hashlib
import warnings
warnings.filterwarnings('ignore')

# ML библиотеки (проверены и готовы)
from sentence_transformers import SentenceTransformer
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
from bertopic import BERTopic
import umap
from sklearn.cluster import KMeans

class TeslaMLProduction:
    """Batch ML pipeline over a tracks SQLite database.

    For every unprocessed track it computes sentence embeddings,
    Dostoevsky emotion scores, KMeans cluster assignments and 2-D UMAP
    coordinates, and persists them into the `ml_analysis` and
    `ml_embeddings` tables.
    """

    def __init__(self, db_path="tracks.sqlite"):
        """Set up logging and load all ML models.

        Args:
            db_path: Path to the SQLite database containing the
                `tracks`, `ml_analysis` and `ml_embeddings` tables.
        """
        self.db_path = db_path
        self.setup_logging()
        self.load_models()

    def setup_logging(self):
        """Configure a per-run log file plus console output."""
        log_file = f"tesla_ml_production_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s UTC - TESLA PROD - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info("🚀 Tesla ML Production Analyzer - User: valkst - 2025-08-02 01:52:53")

    def load_models(self):
        """Load semantic, emotion and topic models; set `self.models_ready`."""
        self.logger.info("🧠 Загружаем ML модели...")
        start_time = time.time()

        try:
            # Multilingual sentence encoder (384-dimensional output).
            self.semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
            self.logger.info("✅ SentenceTransformer загружен (384D)")

            # Dostoevsky social-network sentiment model.
            self.tokenizer = RegexTokenizer()
            self.emotion_model = FastTextSocialNetworkModel(tokenizer=self.tokenizer)
            self.logger.info("✅ Dostoevsky emotion model загружен")

            # Topic model (currently loaded but not used in run_analysis).
            self.topic_model = BERTopic(
                language="multilingual",
                min_topic_size=20,
                nr_topics=25,
                verbose=False
            )
            self.logger.info("✅ BERTopic готов")

            load_time = time.time() - start_time
            self.logger.info(f"🎯 Модели загружены за {load_time:.2f} секунд")
            self.models_ready = True

        except Exception as e:
            # Leave the analyzer in a defined, unusable state; callers
            # check `models_ready` before running.
            self.logger.error(f"❌ Ошибка загрузки: {e}")
            self.models_ready = False

    def get_batch_tracks(self, batch_size=1000, offset=0):
        """Fetch the next batch of unprocessed, high-quality tracks.

        The LEFT JOIN with `ml.track_id IS NULL` restricts results to
        tracks that have no row in `ml_analysis` yet, so repeated calls
        with offset=0 naturally walk through the remaining work.

        Args:
            batch_size: Maximum number of rows to return.
            offset: Row offset into the filtered result set.

        Returns:
            DataFrame with columns id, title, lyric, dominant_tags,
            quality_score (possibly empty).
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # Parameterized LIMIT/OFFSET instead of f-string interpolation.
            query = """
            SELECT t.id, t.title, t.lyric, t.dominant_tags, t.quality_score
            FROM tracks t
            LEFT JOIN ml_analysis ml ON t.id = ml.track_id
            WHERE t.processing_version = 2 
              AND t.lyric != '[Instrumental]'
              AND LENGTH(t.lyric) > 25
              AND t.quality_score > 0.8
              AND ml.track_id IS NULL
            ORDER BY t.quality_score DESC
            LIMIT ? OFFSET ?
            """
            return pd.read_sql_query(query, conn, params=(batch_size, offset))
        finally:
            conn.close()

    def process_embeddings(self, df):
        """Encode "title + lyric" into normalized sentence embeddings.

        Returns:
            ndarray of shape (len(df), embedding_dim).
        """
        self.logger.info(f"🔤 Создаем векторы для {len(df)} треков...")

        # Truncate to 1000 chars to bound encoder input length.
        texts = [f"{row['title']} {row['lyric']}"[:1000] for _, row in df.iterrows()]

        embeddings = self.semantic_model.encode(
            texts,
            batch_size=20,
            show_progress_bar=True,
            normalize_embeddings=True
        )

        self.logger.info(f"✅ Векторы созданы: {embeddings.shape}")
        return embeddings

    def process_emotions(self, df):
        """Predict per-track emotion scores with the Dostoevsky model.

        Returns:
            List (one dict per row, positionally aligned with `df`);
            an empty dict marks a track that was too short or failed.
        """
        self.logger.info(f"🎭 Анализируем эмоции в {len(df)} треках...")

        emotions = []
        # enumerate() gives a positional counter that stays aligned with
        # embeddings/clusters even if the DataFrame index is not 0..n-1.
        for pos, (_, row) in enumerate(df.iterrows()):
            if pos % 100 == 0:
                self.logger.info(f"🎭 Эмоции: {pos+1}/{len(df)}")

            try:
                text = str(row['lyric'])[:600]
                if len(text.strip()) > 10:
                    result = self.emotion_model.predict([text])[0]
                    emotions.append(result)
                else:
                    emotions.append({})
            except Exception:
                # Best-effort: one unparseable lyric must not kill the batch.
                emotions.append({})

        valid = len([e for e in emotions if e])
        self.logger.info(f"✅ Эмоции проанализированы: {valid}/{len(df)}")
        return emotions

    def process_clustering(self, embeddings):
        """Compute KMeans cluster labels and 2-D UMAP coordinates.

        Returns:
            Tuple (clusters, coords_2d) positionally aligned with
            `embeddings`.
        """
        self.logger.info(f"🔍 Кластеризация {len(embeddings)} векторов...")

        n_samples = len(embeddings)

        # UMAP requires n_neighbors < n_samples; clamp for tiny batches.
        umap_2d = umap.UMAP(
            n_neighbors=min(15, max(2, n_samples - 1)),
            n_components=2,
            metric='cosine',
            random_state=42
        )
        coords_2d = umap_2d.fit_transform(embeddings)

        # ~1 cluster per 50 tracks, capped at 20; never 0 (KMeans would
        # raise on n_clusters=0 for batches smaller than 50 tracks).
        n_clusters = max(1, min(20, n_samples // 50))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(embeddings)

        self.logger.info(f"✅ Создано {len(np.unique(clusters))} кластеров")
        return clusters, coords_2d

    def save_results(self, df, embeddings, emotions, clusters, coords_2d):
        """Persist per-track ML results into ml_analysis/ml_embeddings.

        All inputs are positionally aligned with `df`.
        """
        self.logger.info("💾 Сохраняем результаты...")

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            for pos, (_, row) in enumerate(df.iterrows()):
                track_id = row['id']

                # Dominant emotion and its confidence ('neutral'/0.0 fallback).
                emotion_data = emotions[pos] if pos < len(emotions) else {}
                if emotion_data:
                    dominant = max(emotion_data.items(), key=lambda x: x[1])[0]
                    confidence = max(emotion_data.values())
                else:
                    dominant = 'neutral'
                    confidence = 0.0

                cursor.execute("""
                INSERT OR REPLACE INTO ml_analysis 
                (track_id, semantic_cluster, ml_joy, ml_sadness, ml_anger, 
                 ml_fear, ml_disgust, ml_surprise, ml_emotion_dominant, 
                 ml_emotion_confidence, umap_x, umap_y)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    track_id,
                    int(clusters[pos]),
                    emotion_data.get('joy', 0.0),
                    emotion_data.get('sadness', 0.0),
                    emotion_data.get('anger', 0.0),
                    emotion_data.get('fear', 0.0),
                    emotion_data.get('disgust', 0.0),
                    emotion_data.get('surprise', 0.0),
                    dominant,
                    confidence,
                    float(coords_2d[pos][0]),
                    float(coords_2d[pos][1])
                ))

                # NOTE(review): pickled embeddings are only safe to unpickle
                # from this trusted local database.
                cursor.execute("""
                INSERT OR REPLACE INTO ml_embeddings 
                (track_id, embedding_vector, vector_dimension)
                VALUES (?, ?, ?)
                """, (
                    track_id,
                    pickle.dumps(embeddings[pos]),
                    embeddings.shape[1]
                ))

            conn.commit()
        finally:
            conn.close()

        self.logger.info(f"✅ Сохранено {len(df)} результатов")

    def run_analysis(self):
        """Process all pending tracks batch by batch.

        Returns:
            True when the full pass completed, False when models were
            not ready or a batch repeatedly failed to persist.
        """
        if not self.models_ready:
            return False

        self.logger.info("🚀 НАЧИНАЕМ ML АНАЛИЗ...")
        start_time = datetime.now()

        total_processed = 0
        batch_size = 800
        batch_num = 0
        prev_ids = None

        while True:
            # Always fetch from offset 0: get_batch_tracks() already
            # excludes tracks present in ml_analysis, so advancing the
            # offset over the shrinking result set would skip every
            # second batch of unprocessed tracks.
            df = self.get_batch_tracks(batch_size, 0)
            if len(df) == 0:
                break

            # Failsafe: if the exact same tracks come back, saving must
            # have failed — stop instead of looping forever.
            batch_ids = set(df['id'])
            if batch_ids == prev_ids:
                self.logger.error("❌ Батч не сохраняется, остановка")
                return False
            prev_ids = batch_ids

            batch_num += 1
            self.logger.info(f"📊 Батч {batch_num}: {len(df)} треков")

            # Per-batch processing pipeline.
            embeddings = self.process_embeddings(df)
            emotions = self.process_emotions(df)
            clusters, coords_2d = self.process_clustering(embeddings)

            self.save_results(df, embeddings, emotions, clusters, coords_2d)

            total_processed += len(df)
            self.logger.info(f"✅ Обработано {total_processed} треков")

        duration = datetime.now() - start_time

        self.logger.info(f"""
🎯 TESLA ML АНАЛИЗ ЗАВЕРШЕН!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 Всего обработано: {total_processed} треков
⏱️ Время выполнения: {duration}
💾 Результаты в: ml_analysis, ml_embeddings
👤 User: valkst
🕐 Завершено: {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        """)

        return True

if __name__ == "__main__":
    # Production entry point: banner, construct the analyzer, run one full pass.
    print("🚀 TESLA V100 ML ANALYZER - PRODUCTION RUN")
    print("User: valkst | Time: 2025-08-02 01:52:53 UTC")
    print("=" * 60)

    analyzer = TeslaMLProduction()

    if not analyzer.models_ready:
        print("❌ Модели не готовы")
    elif analyzer.run_analysis():
        print("🎉 ML АНАЛИЗ ЗАВЕРШЕН УСПЕШНО!")
    else:
        print("❌ Ошибка при анализе")
