agc-chatbot/db/import_lkk_data.py


import os
import json
import mysql.connector
import numpy as np
import re
import sys
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Print environment variables for debugging
print(f"MySQL Host: {os.getenv('MYSQL_HOST')}")
print(f"MySQL User: {os.getenv('MYSQL_USER')}")
print(f"MySQL Database: {os.getenv('MYSQL_DATABASE')}")
print(f"MySQL Password: {'[SET]' if os.getenv('MYSQL_PASSWORD') else '[NOT SET]'}")
# Database configuration
DB_CONFIG = {
    'host': os.getenv('MYSQL_HOST', 'localhost'),
    'user': os.getenv('MYSQL_USER', 'root'),
    'password': os.getenv('MYSQL_PASSWORD', ''),
    'database': os.getenv('MYSQL_DATABASE', 'agc')
}
def get_db_connection():
    """Create a connection to the MySQL database"""
    return mysql.connector.connect(**DB_CONFIG)
def truncate_all_tables():
    """Truncate all tables to remove existing data before importing new data"""
    conn = get_db_connection()
    cursor = conn.cursor()
    print("Truncating all tables to ensure a clean import...")
    # Disable foreign key checks temporarily to allow truncating tables with foreign keys
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
    try:
        # Get all tables in the database
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()
        for table in tables:
            table_name = table[0]
            print(f"Truncating table: {table_name}")
            cursor.execute(f"TRUNCATE TABLE {table_name};")
        print(f"Successfully truncated {len(tables)} tables")
    except mysql.connector.Error as err:
        print(f"Error truncating tables: {err}")
    finally:
        # Re-enable foreign key checks
        cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
        conn.commit()
        cursor.close()
        conn.close()
def setup_tables():
    """Create the tables for document search if they don't exist"""
    conn = get_db_connection()
    cursor = conn.cursor()
    # Create document_search tables
    schema_sql = """
    -- Documents table
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    -- Embeddings table - store as JSON since MySQL doesn't have a vector type
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );
    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")
                print(f"Statement: {statement}")
    conn.commit()
    cursor.close()
    conn.close()
    print("Document search tables created successfully")
def extract_sql_inserts(file_path, table_name):
    """Extract SQL INSERT statements from file and return as a list of SQL statements"""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
        # Extract all INSERT statements for the specified table
        pattern = rf"INSERT INTO {table_name}[^;]*;"
        inserts = re.findall(pattern, content, re.DOTALL)
        # Clean up the inserts to make them more compatible
        cleaned_inserts = []
        for insert in inserts:
            # Replace TEXT fields that might have problematic characters
            # This is a simplistic approach - for a production system, you'd need more robust parsing
            if 'TEXT' in insert or 'text' in insert or 'Text' in insert:
                # Preserve quoted values containing angle brackets. Single-backslash
                # backreferences keep the captured text; double backslashes would insert
                # literal backslashes into the data.
                insert = re.sub(r"'([^']*?)<([^>]*?)>'", r"'\1<\2>'", insert)
            cleaned_inserts.append(insert)
        return cleaned_inserts
    except Exception as e:
        print(f"Error reading SQL file {file_path}: {e}")
        return []
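# Illustrative usage (hypothetical file path): given a dump file such as
# Data/example/LT_LKK_INFO.sql containing statements like
#     INSERT INTO LT_LKK_INFO (LKK_INFOID, LKK_FILE_NO) VALUES (1, 'A/123');
# the helper returns each matching statement as a separate string:
#     stmts = extract_sql_inserts('Data/example/LT_LKK_INFO.sql', 'LT_LKK_INFO')
#     print(f"Found {len(stmts)} INSERT statements for LT_LKK_INFO")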
def execute_sql_statements(statements):
    """Execute a list of SQL statements"""
    if not statements:
        return
    conn = get_db_connection()
    cursor = conn.cursor()
    successful = 0
    failed = 0
    for statement in statements:
        try:
            cursor.execute(statement)
            successful += 1
        except mysql.connector.Error as err:
            # Skip duplicate key errors
            if err.errno == 1062:  # Duplicate entry error
                print(f"Skipping duplicate entry: {err}")
            else:
                print(f"Error executing SQL: {err}")
                print(f"Statement: {statement[:100]}...")  # Print first 100 chars of the statement
            failed += 1
    conn.commit()
    cursor.close()
    conn.close()
    print(f"Executed {successful} statements successfully, {failed} statements failed")
def scan_directory_for_sql_files(directory):
    """Scan a directory and its subdirectories for SQL files"""
    sql_files = {
        'info': [],
        'allegation': [],
        'person': []
    }
    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.sql'):
                full_path = os.path.join(root, file)
                # Categorize files
                if 'LT_LKK_INFO' in file:
                    sql_files['info'].append(full_path)
                elif 'LT_LKK_ALLEGATION' in file:
                    sql_files['allegation'].append(full_path)
                elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                    sql_files['person'].append(full_path)
    return sql_files
def extract_case_data_from_directory(directory_path):
    """Extract legal case data directly from a directory"""
    # Get the directory name (category)
    category = os.path.basename(directory_path)
    # List to store case data
    cases = []
    # Look for PDF files first
    pdf_files = []
    for file in os.listdir(directory_path):
        if file.lower().endswith('.pdf'):
            pdf_files.append(os.path.join(directory_path, file))
    # Look for SQL files to extract data
    info_files = []
    allegation_files = []
    person_files = []
    for file in os.listdir(directory_path):
        if file.endswith('.sql'):
            full_path = os.path.join(directory_path, file)
            if 'LT_LKK_INFO' in file:
                info_files.append(full_path)
            elif 'LT_LKK_ALLEGATION' in file:
                allegation_files.append(full_path)
            elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                person_files.append(full_path)
    # Create a basic case from the directory
    case = {
        'title': f"Legal Case - {category}",
        'content': f"Legal category: {category}\n",
        'source': directory_path,
        'doc_type': "Legal Case Category",
        'pdf_files': pdf_files
    }
    cases.append(case)
    return cases
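# --- Hedged sketch: actual PDF text extraction. ---
# import_pdf_files below only stores the file name as document content. If PyPDF2
# (>= 3.0, which provides PdfReader) is installed, a helper along these lines could
# extract the real text instead. Assumption: PyPDF2 is an optional dependency; it is
# deliberately not imported at module level here.
def _extract_pdf_text(pdf_path):
    """Best-effort text extraction from a PDF; returns '' if PyPDF2 is unavailable."""
    try:
        from PyPDF2 import PdfReader  # optional dependency
    except ImportError:
        return ""
    try:
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""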
def import_pdf_files(directory):
    """Import PDF files as documents"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    imported_count = 0
    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                try:
                    full_path = os.path.join(root, file)
                    # Get the category from directory name
                    category = os.path.basename(root)
                    # Try to extract text from PDF (would need a PDF library)
                    # For now, we'll just use the filename as content
                    print(f"Found PDF: {full_path}")
                    # For a real implementation, you would use PyPDF2 or a similar library
                    # This is a placeholder for actual PDF text extraction
                    title = file.replace('.pdf', '')
                    content = f"PDF Document: {file} from category {category}\n\nSource: {full_path}"
                    # Add document to the database
                    doc_id = add_document(
                        title=title,
                        content=content,
                        source=full_path,
                        doc_type="Legal PDF Document"
                    )
                    # Generate and store embedding
                    embedding = generate_embedding(f"{title} {content}")
                    store_embedding(doc_id, embedding)
                    print(f"Added PDF document: {title}")
                    imported_count += 1
                except Exception as e:
                    print(f"Error processing PDF {file}: {e}")
    return imported_count
def import_data_from_directory(directory):
    """Import data directly from each category directory"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    # Count for imported documents
    imported_count = 0
    # Get all subdirectories
    try:
        subdirs = [os.path.join(directory, d) for d in os.listdir(directory)
                   if os.path.isdir(os.path.join(directory, d))]
        print(f"Found {len(subdirs)} category directories")
        # Process each directory
        for subdir in subdirs:
            category = os.path.basename(subdir)
            print(f"Processing directory: {category}")
            # Extract cases from the directory
            cases = extract_case_data_from_directory(subdir)
            # Add each case as a document
            for case in cases:
                doc_id = add_document(
                    title=case['title'],
                    content=case['content'],
                    source=case['source'],
                    doc_type=case['doc_type']
                )
                # Generate and store embedding
                embedding = generate_embedding(f"{case['title']} {case['content']}")
                store_embedding(doc_id, embedding)
                print(f"Added document: {case['title']}")
                imported_count += 1
    except Exception as e:
        print(f"Error scanning directories: {e}")
    return imported_count
def import_sql_data_to_db():
    """Import SQL files from the Data directory into the database"""
    data_dir = os.path.join(os.getcwd(), 'Data')
    print(f"Scanning directory: {data_dir}")
    sql_files = scan_directory_for_sql_files(data_dir)
    print(f"Found {len(sql_files['info'])} info files, {len(sql_files['allegation'])} allegation files, {len(sql_files['person'])} person files")
    # Import info files
    for file_path in sql_files['info']:
        print(f"Importing info from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_INFO')
        execute_sql_statements(statements)
    # Import allegation files
    for file_path in sql_files['allegation']:
        print(f"Importing allegations from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_ALLEGATION')
        execute_sql_statements(statements)
    # Import person files
    for file_path in sql_files['person']:
        print(f"Importing persons from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_PERSON_INVOLVE')
        execute_sql_statements(statements)
    print("SQL data import complete!")
    # Import directly from directories
    print("Importing data directly from directories...")
    dir_docs = import_data_from_directory(data_dir)
    print(f"Imported {dir_docs} documents from directories")
    # Also import PDF files
    print("Looking for PDF files...")
    pdf_docs = import_pdf_files(data_dir)
    print(f"Imported {pdf_docs} PDF documents")
def check_tables(db_config, label):
    """Check tables in the specified database"""
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()
    # Get all tables
    cursor.execute("SHOW TABLES;")
    tables = cursor.fetchall()
    print(f"\n{label} tables in database {db_config['database']}:")
    for table in tables:
        table_name = table[0]
        # Count records
        cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
        count = cursor.fetchone()[0]
        print(f"  {table_name}: {count} records")
    cursor.close()
    conn.close()
def convert_user_data_to_documents():
    """Convert user data to documents in our application"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    try:
        from db.db_utils import add_document, store_embedding
        from embedding.embedding_service_http import generate_embedding
    except Exception as e:
        print(f"Error importing required modules: {e}")
        return
    # Connect to the source database
    source_conn = get_db_connection()
    source_cursor = source_conn.cursor(dictionary=True)
    # Get all users with their details and preferences - adjusted for actual schema
    try:
        query = """
            SELECT
                u.user_id, u.username, u.email, u.created_at,
                ud.first_name, ud.last_name, ud.date_of_birth, ud.phone_number,
                up.language, up.theme, up.notifications_enabled
            FROM users u
            LEFT JOIN user_details ud ON u.user_id = ud.user_id
            LEFT JOIN user_preferences up ON u.user_id = up.user_id
        """
        source_cursor.execute(query)
        user_records = source_cursor.fetchall()
        if not user_records:
            print("No user records found in source database.")
            return
        print(f"Found {len(user_records)} user records to convert")
        # Process each user as a document
        user_count = 0
        for user in user_records:
            user_id = user.get('user_id')
            # Create title with user information
            username = user.get('username', f"User {user_id}")
            title = f"User Profile: {username}"
            # Compile content from specific fields based on the schema
            content_parts = [
                f"Username: {user.get('username', 'N/A')}",
                f"Email: {user.get('email', 'N/A')}",
                f"Created At: {user.get('created_at', 'N/A')}"
            ]
            # Add user details if available
            if user.get('first_name') or user.get('last_name'):
                name = f"{user.get('first_name', '')} {user.get('last_name', '')}".strip()
                content_parts.append(f"Name: {name}")
            if user.get('date_of_birth'):
                content_parts.append(f"Date of Birth: {user.get('date_of_birth')}")
            if user.get('phone_number'):
                content_parts.append(f"Phone Number: {user.get('phone_number')}")
            # Add user preferences if available
            if user.get('language'):
                content_parts.append(f"Language: {user.get('language')}")
            if user.get('theme'):
                content_parts.append(f"Theme: {user.get('theme')}")
            if user.get('notifications_enabled') is not None:
                notifications = "Enabled" if user.get('notifications_enabled') else "Disabled"
                content_parts.append(f"Notifications: {notifications}")
            # Combine all content
            content = "\n".join(content_parts)
            # Add document to the database
            print(f"Adding document for user {username}")
            doc_id = add_document(
                title=title,
                content=content,
                source=f"User ID: {user_id} from docdoc database",
                doc_type="User Profile"
            )
            # Generate and store embedding
            print(f"Generating embedding for document: {doc_id}")
            embedding = generate_embedding(f"{title} {content}")
            store_embedding(doc_id, embedding)
            user_count += 1
        print(f"Conversion complete! Created {user_count} user profile documents.")
    except mysql.connector.Error as err:
        print(f"Error accessing user data: {err}")
    finally:
        source_cursor.close()
        source_conn.close()
def setup_document_tables():
    """Create the document-related tables if they don't exist"""
    conn = get_db_connection()
    cursor = conn.cursor()
    # Create document search tables
    schema_sql = """
    -- Documents table for storing processed LKK data
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        LKK_INFOID INT,
        FOREIGN KEY (LKK_INFOID) REFERENCES lt_lkk_info(LKK_INFOID)
    );
    -- Embeddings table for semantic search
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );
    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    # Drop existing tables if they exist
    cursor.execute("DROP TABLE IF EXISTS embeddings;")
    cursor.execute("DROP TABLE IF EXISTS documents;")
    cursor.execute("DROP TABLE IF EXISTS search_logs;")
    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")
    conn.commit()
    cursor.close()
    conn.close()
def convert_lkk_to_documents():
    """Convert LKK data into searchable documents based on LLA_CASE_NO"""
    # Import required modules for embeddings
    sys.path.append('.')
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    conn = get_db_connection()
    cursor = conn.cursor(dictionary=True)
    try:
        # Step 1: Get all unique case numbers from allegations
        cursor.execute("""
            SELECT DISTINCT LLA_CASE_NO
            FROM lt_lkk_allegation
            WHERE LLA_CASE_NO IS NOT NULL AND LLA_CASE_NO != ''
        """)
        case_numbers = cursor.fetchall()
        print(f"Found {len(case_numbers)} unique case numbers to process")
        for case_number_row in case_numbers:
            case_number = case_number_row['LLA_CASE_NO']
            try:
                # Step 2: For each case number, get all relevant data from allegations and info tables
                cursor.execute("""
                    SELECT
                        i.*,
                        a.*
                    FROM lt_lkk_allegation a
                    JOIN lt_lkk_info i ON a.LKK_INFOID = i.LKK_INFOID
                    WHERE a.LLA_CASE_NO = %s
                    GROUP BY a.LLA_ALLEGATION_ID
                """, (case_number,))
                allegation_records = cursor.fetchall()
                if not allegation_records:
                    print(f"No records found for case number {case_number}")
                    continue
                # Get the first record to extract basic info
                first_record = allegation_records[0]
                lkk_infoid = first_record['LKK_INFOID']
                # Step 3: Get person data separately
                cursor.execute("""
                    SELECT LTL_PERSON_ID, LTL_DATA
                    FROM lt_lkk_person_involve
                    WHERE LKK_INFOID = %s
                """, (lkk_infoid,))
                person_records = cursor.fetchall()
                involved_persons = []
                for person in person_records:
                    person_id = person['LTL_PERSON_ID']
                    try:
                        # Parse person data from JSON
                        person_data = {}
                        if person['LTL_DATA']:
                            try:
                                person_data = json.loads(person['LTL_DATA'])
                            except (json.JSONDecodeError, TypeError):
                                print(f"Error parsing LTL_DATA JSON for person ID {person_id}")
                        # Instead of just showing the ID, include the full LTL_DATA content
                        person_info = f"Person ID: {person_id}"
                        # Add the full JSON data
                        if person['LTL_DATA']:
                            person_info = f"Person ID: {person_id}\nPerson Data: {person['LTL_DATA']}"
                        involved_persons.append(person_info)
                    except Exception as e:
                        print(f"Error processing person data for ID {person_id}: {e}")
                # Start with basic case info
                base_info = {
                    'case_number': case_number,
                    'file_number': first_record['LKK_FILE_NO'],
                    'status': first_record['LKK_STATUS'],
                    'dpp_suggestion': first_record['LKK_DPP_ANT_SUGGESTION'],
                    'hod_decision': first_record['LKK_HOD_DECISION'],
                    'lkk_infoid': lkk_infoid,
                    'created_date': first_record['LKK_CREATEDDATE']
                }
                # Title using case number
                title = f"Case Number: {case_number}"
                # Compile content
                content_parts = [
                    f"File Number: {base_info['file_number']}",
                    f"Status: {base_info['status']}",
                    f"Case Number: {case_number}",
                    f"DPP Suggestion: {base_info['dpp_suggestion'] or 'None'}",
                    f"HOD Decision: {base_info['hod_decision'] or 'None'}",
                    "\n--- ALLEGATIONS ---"
                ]
                # Add all allegations
                for idx, record in enumerate(allegation_records, 1):
                    allegation_parts = [
                        f"\nALLEGATION #{idx}:",
                        f"Allegation ID: {record['LLA_ALLEGATION_ID']}",
                        f"Case Number: {record['LLA_CASE_NO']}",
                        f"Accused Name: {record['LLA_OKT_NAME'] or 'N/A'}",
                        f"Type: {record['LLA_TYPE'] or 'N/A'}",
                        f"Act ID: {record['LLA_ACT_ID'] or 'N/A'}",
                        f"Act Description: {record['LLA_ACT_DESC'] or 'N/A'}",
                        f"Section: {record['LLA_SECTION'] or 'N/A'}",
                        f"Date: {record['LLA_DATE']}",
                        f"Charge Notes: {record['LLA_CHARGE_NOTES'] or 'N/A'}",
                        f"Charge Type: {record['LKK_CHARGE_TYPE'] or 'N/A'}",
                        f"Charge Reason: {record['LLA_CHARGE_REASON'] or 'N/A'}",
                        f"Charged By: {record['LLA_CHARGE_BY'] or 'N/A'}"
                    ]
                    content_parts.extend(allegation_parts)
                # Add involved persons section
                if involved_persons:
                    content_parts.append("\n--- INVOLVED PERSONS ---")
                    for person in involved_persons:
                        content_parts.append(person)
                # Join all content
                content = "\n".join(filter(None, content_parts))
                # Insert into documents table
                cursor.execute("""
                    INSERT INTO documents (title, content, source, doc_type, LKK_INFOID, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    title,
                    content,
                    f"Case Number: {case_number}",
                    "Legal Case",
                    base_info['lkk_infoid'],
                    base_info['created_date']
                ))
                doc_id = cursor.lastrowid
                # Generate and store embedding
                embedding = generate_embedding(f"{title} {content}")
                # Convert numpy array to list if necessary
                if isinstance(embedding, np.ndarray):
                    embedding = embedding.tolist()
                cursor.execute("""
                    INSERT INTO embeddings (document_id, embedding)
                    VALUES (%s, %s)
                """, (doc_id, json.dumps(embedding)))
                conn.commit()
                print(f"Processed case number {case_number} into document {doc_id}")
            except Exception as e:
                print(f"Error processing case number {case_number}: {e}")
                conn.rollback()
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        cursor.close()
        conn.close()
if __name__ == "__main__":
# Ask for confirmation before proceeding
print("This script will convert LKK data into searchable documents.")
confirm = input("Do you want to proceed? (y/n): ")
if confirm.lower() != 'y':
print("Import cancelled.")
sys.exit(0)
# Set up document tables
setup_document_tables()
# Convert LKK data to documents
convert_lkk_to_documents()
print("\nData conversion completed successfully!")