#!/usr/bin/env python3
"""
TESLA V100 ML ANALYZER - WORKING VERSION
User: valkst
Date: 2025-08-02 02:05:47 UTC
Status: Working without dostoevsky (problematic model)
"""

import sqlite3
import logging
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

# Only the ML libraries that actually work (dostoevsky excluded)
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap
from sklearn.cluster import KMeans

class TeslaMLWorking:
    """Batch ML analyzer over a local SQLite track database.

    Per batch the pipeline runs: sentence embeddings -> keyword-based emotion
    scoring -> UMAP + KMeans clustering -> BERTopic topic modeling, then
    persists results into the ml_analysis / ml_embeddings tables.
    """

    def __init__(self, db_path: str = "tracks.sqlite"):
        # Path to the SQLite database containing the `tracks` table.
        self.db_path = db_path
        # Order matters: setup_logging() creates self.logger, which
        # load_models() uses for progress/error reporting.
        self.setup_logging()
        self.load_models()
        
    def setup_logging(self):
        """Настройка логирования"""
        log_file = f"tesla_ml_working_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s UTC - TESLA WORK - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info("🚀 Tesla ML Working Analyzer - User: valkst - 2025-08-02 02:05:47")
        
    def load_models(self):
        """Instantiate the embedding and topic models.

        On success sets self.models_ready = True; on any failure logs the
        error and sets self.models_ready = False instead of raising.
        """
        self.logger.info("🧠 Загружаем рабочие ML модели...")
        t0 = time.time()

        try:
            # Multilingual sentence-embedding model (384-dimensional vectors).
            self.semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
            self.logger.info("✅ SentenceTransformer готов (384D)")

            # Multilingual topic model, reduced to at most 20 topics.
            self.topic_model = BERTopic(
                language="multilingual",
                min_topic_size=25,
                nr_topics=20,
                verbose=False,
                calculate_probabilities=True,
            )
            self.logger.info("✅ BERTopic готов")

            load_time = time.time() - t0
            self.logger.info(f"🎯 Модели готовы за {load_time:.2f} секунд")
            self.models_ready = True

        except Exception as e:
            # Deliberate best-effort: callers check self.models_ready.
            self.logger.error(f"❌ Ошибка загрузки: {e}")
            self.models_ready = False
            
    def get_tracks_batch(self, batch_size=1200, offset=0):
        """Получение батча треков для обработки"""
        conn = sqlite3.connect(self.db_path)
        
        query = f"""
        SELECT t.id, t.title, t.lyric, t.dominant_tags, t.quality_score,
               t.emotion_primary, t.mood_valence, t.mood_energy
        FROM tracks t
        LEFT JOIN ml_analysis ml ON t.id = ml.track_id
        WHERE t.processing_version = 2 
          AND t.lyric != '[Instrumental]'
          AND LENGTH(t.lyric) > 30
          AND t.quality_score > 0.7
          AND ml.track_id IS NULL
        ORDER BY t.quality_score DESC, t.mood_valence DESC
        LIMIT {batch_size} OFFSET {offset}
        """
        
        df = pd.read_sql_query(query, conn)
        conn.close()
        
        return df
        
    def create_embeddings(self, df):
        """Создание семантических векторов"""
        self.logger.info(f"🔤 Создаем векторы для {len(df)} треков...")
        
        texts = []
        for _, row in df.iterrows():
            # Комбинируем все текстовые данные
            combined = f"Title: {row['title']} Tags: {row['dominant_tags']} Lyrics: {row['lyric']}"
            texts.append(combined[:1200])
            
        # Создание векторов
        embeddings = self.semantic_model.encode(
            texts,
            batch_size=24,
            show_progress_bar=True,
            normalize_embeddings=True,
            convert_to_numpy=True
        )
        
        self.logger.info(f"✅ Создано {embeddings.shape[0]} векторов размерности {embeddings.shape[1]}")
        return embeddings
        
    def analyze_emotions_simple(self, df):
        """Простой анализ эмоций на основе существующих данных"""
        self.logger.info(f"🎭 Простой анализ эмоций для {len(df)} треков...")
        
        emotions = []
        
        # Словари для анализа
        joy_words = ['happy', 'joy', 'love', 'celebrate', 'dance', 'party', 'fun', 'smile', 
                     'радость', 'счастье', 'любовь', 'танцы', 'веселье', 'улыбка']
        sadness_words = ['sad', 'cry', 'tear', 'lonely', 'depression', 'pain', 'hurt',
                        'грусть', 'печаль', 'слезы', 'одиночество', 'боль']
        anger_words = ['angry', 'rage', 'hate', 'fight', 'war', 'mad', 'fury',
                      'злость', 'ярость', 'ненависть', 'война', 'бешенство']
        fear_words = ['fear', 'afraid', 'scared', 'terror', 'anxiety', 'worry',
                     'страх', 'боязнь', 'ужас', 'тревога', 'беспокойство']
        
        for _, row in df.iterrows():
            text = str(row['lyric']).lower()
            
            # Подсчет эмоциональных слов
            joy_count = sum(1 for word in joy_words if word in text)
            sadness_count = sum(1 for word in sadness_words if word in text)
            anger_count = sum(1 for word in anger_words if word in text)
            fear_count = sum(1 for word in fear_words if word in text)
            
            # Нормализация
            total = max(1, joy_count + sadness_count + anger_count + fear_count)
            
            emotion_data = {
                'joy': joy_count / total,
                'sadness': sadness_count / total,
                'anger': anger_count / total,
                'fear': fear_count / total,
                'disgust': 0.1,  # Базовое значение
                'surprise': 0.1   # Базовое значение
            }
            
            # Используем существующие данные настроения
            if hasattr(row, 'mood_valence') and row['mood_valence'] > 0.7:
                emotion_data['joy'] = max(emotion_data['joy'], 0.6)
            elif hasattr(row, 'mood_valence') and row['mood_valence'] < 0.3:
                emotion_data['sadness'] = max(emotion_data['sadness'], 0.6)
                
            emotions.append(emotion_data)
            
        self.logger.info(f"✅ Простой анализ эмоций завершен для {len(emotions)} треков")
        return emotions
        
    def perform_clustering(self, embeddings, n_clusters=18):
        """Cluster embeddings via UMAP dimensionality reduction + KMeans.

        Args:
            embeddings: (n_samples, dim) array of sentence vectors.
            n_clusters: Desired cluster count; clamped to the number of
                samples so small final batches cannot crash KMeans.

        Returns:
            Tuple of (cluster labels, 2-D UMAP coordinates for plotting,
            per-sample confidences in [0.1, 1.0] derived from distance to
            the assigned cluster center).
        """
        self.logger.info(f"🔍 Кластеризация {len(embeddings)} векторов...")

        # 2-D projection used only to produce plotting coordinates.
        umap_2d = umap.UMAP(
            n_neighbors=20,
            n_components=2,
            metric='cosine',
            random_state=42,
            min_dist=0.1
        )
        coords_2d = umap_2d.fit_transform(embeddings)

        # Separate 12-D projection used for the actual clustering.
        umap_cluster = umap.UMAP(
            n_neighbors=20,
            n_components=12,
            metric='cosine',
            random_state=42
        )
        reduced = umap_cluster.fit_transform(embeddings)

        # FIX: KMeans raises when n_clusters > n_samples (possible for the
        # last, smaller batch) — clamp to the sample count.
        n_clusters = max(1, min(n_clusters, len(embeddings)))

        kmeans = KMeans(
            n_clusters=n_clusters,
            random_state=42,
            n_init=12,
            max_iter=400
        )
        clusters = kmeans.fit_predict(reduced)

        # Confidence decays linearly with distance to the cluster center;
        # the /3.0 scale is a heuristic, floored at 0.1.
        centers = kmeans.cluster_centers_
        confidences = []
        for i, cluster_id in enumerate(clusters):
            distance = np.linalg.norm(reduced[i] - centers[cluster_id])
            confidences.append(max(0.1, 1.0 - distance / 3.0))

        unique_clusters = len(np.unique(clusters))
        self.logger.info(f"✅ Создано {unique_clusters} семантических кластеров")

        return clusters, coords_2d, confidences
        
    def topic_modeling(self, df):
        """Тематическое моделирование"""
        self.logger.info(f"📋 Тематическое моделирование для {len(df)} треков...")
        
        docs = []
        for _, row in df.iterrows():
            doc = f"{row['title']} {row['lyric']}"
            docs.append(doc[:1000])
            
        # Обучение модели тем
        topics, probs = self.topic_model.fit_transform(docs)
        
        # Получение названий тем
        topic_info = self.topic_model.get_topic_info()
        topic_names = {}
        
        for _, topic_row in topic_info.iterrows():
            topic_id = topic_row['Topic']
            if topic_id >= 0:
                words = topic_row['Representation'][:3]
                topic_name = ' + '.join(words) if words else f'Topic_{topic_id}'
                topic_names[topic_id] = topic_name
                
        self.logger.info(f"✅ Найдено {len(topic_names)} тематических групп")
        
        return topics, probs, topic_names
        
    def save_results(self, df, embeddings, emotions, clusters, coords_2d, confidences, topics, topic_probs, topic_names):
        """Сохранение всех результатов"""
        self.logger.info("💾 Сохраняем ML результаты...")
        
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        for idx, row in df.iterrows():
            track_id = row['id']
            
            # Эмоциональные данные
            emotion_data = emotions[idx] if idx < len(emotions) else {}
            if emotion_data:
                dominant = max(emotion_data.items(), key=lambda x: x[1])[0]
                confidence = max(emotion_data.values())
            else:
                dominant = 'neutral'
                confidence = 0.0
                
            # Тематические данные
            topic_id = int(topics[idx]) if idx < len(topics) else -1
            topic_prob = float(topic_probs[idx]) if idx < len(topic_probs) else 0.0
            topic_name = topic_names.get(topic_id, 'Unknown') if topic_id >= 0 else 'Outlier'
            
            # Основная ML таблица
            cursor.execute("""
            INSERT OR REPLACE INTO ml_analysis 
            (track_id, semantic_cluster, cluster_confidence,
             ml_joy, ml_sadness, ml_anger, ml_fear, ml_disgust, ml_surprise,
             ml_emotion_dominant, ml_emotion_confidence,
             topic_id, topic_name, topic_probability,
             umap_x, umap_y)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                track_id,
                int(clusters[idx]),
                float(confidences[idx]),
                emotion_data.get('joy', 0.0),
                emotion_data.get('sadness', 0.0),
                emotion_data.get('anger', 0.0),
                emotion_data.get('fear', 0.0),
                emotion_data.get('disgust', 0.0),
                emotion_data.get('surprise', 0.0),
                dominant,
                confidence,
                topic_id,
                topic_name,
                topic_prob,
                float(coords_2d[idx][0]),
                float(coords_2d[idx][1])
            ))
            
            # Векторы
            cursor.execute("""
            INSERT OR REPLACE INTO ml_embeddings 
            (track_id, embedding_vector, vector_dimension)
            VALUES (?, ?, ?)
            """, (
                track_id,
                pickle.dumps(embeddings[idx].astype(np.float32)),
                embeddings.shape[1]
            ))
        
        conn.commit()
        conn.close()
        
        self.logger.info(f"✅ Сохранено {len(df)} ML результатов")
        
    def run_full_analysis(self):
        """Запуск полного ML анализа"""
        if not self.models_ready:
            self.logger.error("❌ Модели не готовы!")
            return False
            
        start_time = datetime.now()
        self.logger.info("🚀 TESLA ML WORKING ANALYSIS STARTED")
        
        total_processed = 0
        batch_size = 1000
        offset = 0
        
        try:
            while True:
                # Получаем батч
                df = self.get_tracks_batch(batch_size, offset)
                if len(df) == 0:
                    break
                    
                self.logger.info(f"📊 Батч {offset//batch_size + 1}: {len(df)} треков")
                
                # 1. Создание векторов
                embeddings = self.create_embeddings(df)
                
                # 2. Простой анализ эмоций
                emotions = self.analyze_emotions_simple(df)
                
                # 3. Кластеризация
                clusters, coords_2d, confidences = self.perform_clustering(embeddings)
                
                # 4. Тематическое моделирование
                topics, topic_probs, topic_names = self.topic_modeling(df)
                
                # 5. Сохранение
                self.save_results(df, embeddings, emotions, clusters, coords_2d, 
                                confidences, topics, topic_probs, topic_names)
                
                total_processed += len(df)
                offset += batch_size
                
                self.logger.info(f"✅ Обработано {total_processed} треков")
                
            duration = datetime.now() - start_time
            
            self.logger.info(f"""
🎯 TESLA ML WORKING ANALYSIS COMPLETE!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 Всего обработано: {total_processed} треков
🔤 Векторов создано: {total_processed} × 384D
🎭 Эмоций проанализировано: {total_processed}
🔍 Кластеров найдено: 18 семантических групп
📋 Тематических групп: 20 топиков
⏱️ Время выполнения: {duration}
💾 Сохранено в: ml_analysis, ml_embeddings
👤 User: valkst
🕐 Завершено: {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
            """)
            
            return True
            
        except Exception as e:
            self.logger.error(f"❌ Ошибка ML анализа: {e}")
            return False

if __name__ == "__main__":
    # Static startup banner (no placeholders).
    banner = """
🚀 TESLA V100 ML ANALYZER - WORKING VERSION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
👤 User: valkst
📅 Current: 2025-08-02 02:05:47 UTC
🎯 Status: Working version without problematic dostoevsky
🧠 Models: SentenceTransformer + BERTopic + UMAP + KMeans + Simple Emotions
🏗️ Architecture: Separate ML tables
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    """
    print(banner)

    analyzer = TeslaMLWorking()

    # Guard clause: bail out early when model loading failed.
    if not analyzer.models_ready:
        print("❌ Модели не готовы")
    else:
        print("🎯 Запускаем рабочий ML анализ...")
        if analyzer.run_full_analysis():
            print("""
🎉 ML АНАЛИЗ ЗАВЕРШЕН УСПЕШНО!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ Результаты сохранены в:
   • ml_analysis (ML данные)
   • ml_embeddings (384D векторы)

🎯 Готово к анализу и визуализации!
            """)
        else:
            print("❌ Ошибка при анализе")
