agc-chatbot/db/import_lkk_data.py


import os
import json
import mysql.connector
import numpy as np
import re
import sys
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Print environment variables for debugging
print(f"MySQL Host: {os.getenv('MYSQL_HOST')}")
print(f"MySQL User: {os.getenv('MYSQL_USER')}")
print(f"MySQL Database: {os.getenv('MYSQL_DATABASE')}")
print(f"MySQL Password: {'[SET]' if os.getenv('MYSQL_PASSWORD') else '[NOT SET]'}")
# Database configuration
DB_CONFIG = {
    'host': os.getenv('MYSQL_HOST', 'localhost'),
    'user': os.getenv('MYSQL_USER', 'root'),
    'password': os.getenv('MYSQL_PASSWORD', ''),
    'database': os.getenv('MYSQL_DATABASE', 'agc')
}
def get_db_connection():
    """Create a connection to the MySQL database"""
    return mysql.connector.connect(**DB_CONFIG)
def truncate_all_tables():
    """Truncate all tables to remove existing data before importing new data"""
    conn = get_db_connection()
    cursor = conn.cursor()
    print("Truncating all tables to ensure a clean import...")
    # Disable foreign key checks temporarily to allow truncating tables with foreign keys
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
    try:
        # Get all tables in the database
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()
        for table in tables:
            table_name = table[0]
            print(f"Truncating table: {table_name}")
            cursor.execute(f"TRUNCATE TABLE {table_name};")
        print(f"Successfully truncated {len(tables)} tables")
    except mysql.connector.Error as err:
        print(f"Error truncating tables: {err}")
    finally:
        # Re-enable foreign key checks
        cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
        conn.commit()
        cursor.close()
        conn.close()
def setup_tables():
    """Create the tables for document search if they don't exist"""
    conn = get_db_connection()
    cursor = conn.cursor()
    # Create document_search tables
    schema_sql = """
    -- Documents table
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    -- Embeddings table - store as JSON since MySQL doesn't have a vector type
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );
    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")
                print(f"Statement: {statement}")
    conn.commit()
    cursor.close()
    conn.close()
    print("Document search tables created successfully")
def extract_sql_inserts(file_path, table_name):
    """Extract SQL INSERT statements from file and return as a list of SQL statements"""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
        # Extract all INSERT statements for the specified table
        pattern = rf"INSERT INTO {table_name}[^;]*;"
        inserts = re.findall(pattern, content, re.DOTALL)
        # Clean up the inserts to make them more compatible
        cleaned_inserts = []
        for insert in inserts:
            # Replace TEXT fields that might have problematic characters
            # This is a simplistic approach - for a production system, you'd need more robust parsing
            if 'TEXT' in insert or 'text' in insert or 'Text' in insert:
                # Preserve quoted values containing angle brackets. Single-backslash
                # backreferences keep the captured text; double backslashes would insert
                # literal backslashes into the data.
                insert = re.sub(r"'([^']*?)<([^>]*?)>'", r"'\1<\2>'", insert)
            cleaned_inserts.append(insert)
        return cleaned_inserts
    except Exception as e:
        print(f"Error reading SQL file {file_path}: {e}")
        return []
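# Illustrative usage (hypothetical file path): given a dump file such as
# Data/example/LT_LKK_INFO.sql containing statements like
#     INSERT INTO LT_LKK_INFO (LKK_INFOID, LKK_FILE_NO) VALUES (1, 'A/123');
# the helper returns each matching statement as a separate string:
#     stmts = extract_sql_inserts('Data/example/LT_LKK_INFO.sql', 'LT_LKK_INFO')
#     print(f"Found {len(stmts)} INSERT statements for LT_LKK_INFO")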
def execute_sql_statements(statements):
    """Execute a list of SQL statements"""
    if not statements:
        return
    conn = get_db_connection()
    cursor = conn.cursor()
    successful = 0
    failed = 0
    for statement in statements:
        try:
            cursor.execute(statement)
            successful += 1
        except mysql.connector.Error as err:
            # Skip duplicate key errors
            if err.errno == 1062:  # Duplicate entry error
                print(f"Skipping duplicate entry: {err}")
            else:
                print(f"Error executing SQL: {err}")
                print(f"Statement: {statement[:100]}...")  # Print first 100 chars of the statement
            failed += 1
    conn.commit()
    cursor.close()
    conn.close()
    print(f"Executed {successful} statements successfully, {failed} statements failed")
def scan_directory_for_sql_files(directory):
    """Scan a directory and its subdirectories for SQL files"""
    sql_files = {
        'info': [],
        'allegation': [],
        'person': []
    }
    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.sql'):
                full_path = os.path.join(root, file)
                # Categorize files
                if 'LT_LKK_INFO' in file:
                    sql_files['info'].append(full_path)
                elif 'LT_LKK_ALLEGATION' in file:
                    sql_files['allegation'].append(full_path)
                elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                    sql_files['person'].append(full_path)
    return sql_files
def extract_case_data_from_directory(directory_path):
    """Extract legal case data directly from a directory"""
    # Get the directory name (category)
    category = os.path.basename(directory_path)
    # List to store case data
    cases = []
    # Look for PDF files first
    pdf_files = []
    for file in os.listdir(directory_path):
        if file.lower().endswith('.pdf'):
            pdf_files.append(os.path.join(directory_path, file))
    # Look for SQL files to extract data
    info_files = []
    allegation_files = []
    person_files = []
    for file in os.listdir(directory_path):
        if file.endswith('.sql'):
            full_path = os.path.join(directory_path, file)
            if 'LT_LKK_INFO' in file:
                info_files.append(full_path)
            elif 'LT_LKK_ALLEGATION' in file:
                allegation_files.append(full_path)
            elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                person_files.append(full_path)
    # Create a basic case from the directory
    case = {
        'title': f"Legal Case - {category}",
        'content': f"Legal category: {category}\n",
        'source': directory_path,
        'doc_type': "Legal Case Category",
        'pdf_files': pdf_files
    }
    cases.append(case)
    return cases
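# --- Hedged sketch: actual PDF text extraction. ---
# import_pdf_files below only stores the file name as document content. If PyPDF2
# (>= 3.0, which provides PdfReader) is installed, a helper along these lines could
# extract the real text instead. Assumption: PyPDF2 is an optional dependency; it is
# deliberately not imported at module level here.
def _extract_pdf_text(pdf_path):
    """Best-effort text extraction from a PDF; returns '' if PyPDF2 is unavailable."""
    try:
        from PyPDF2 import PdfReader  # optional dependency
    except ImportError:
        return ""
    try:
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""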
def import_pdf_files(directory):
    """Import PDF files as documents"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    imported_count = 0
    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                try:
                    full_path = os.path.join(root, file)
                    # Get the category from directory name
                    category = os.path.basename(root)
                    # Try to extract text from PDF (would need a PDF library)
                    # For now, we'll just use the filename as content
                    print(f"Found PDF: {full_path}")
                    # For a real implementation, you would use PyPDF2 or a similar library
                    # This is a placeholder for actual PDF text extraction
                    title = file.replace('.pdf', '')
                    content = f"PDF Document: {file} from category {category}\n\nSource: {full_path}"
                    # Add document to the database
                    doc_id = add_document(
                        title=title,
                        content=content,
                        source=full_path,
                        doc_type="Legal PDF Document"
                    )
                    # Generate and store embedding
                    embedding = generate_embedding(f"{title} {content}")
                    store_embedding(doc_id, embedding)
                    print(f"Added PDF document: {title}")
                    imported_count += 1
                except Exception as e:
                    print(f"Error processing PDF {file}: {e}")
    return imported_count
def import_data_from_directory(directory):
    """Import data directly from each category directory"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    # Count for imported documents
    imported_count = 0
    # Get all subdirectories
    try:
        subdirs = [os.path.join(directory, d) for d in os.listdir(directory)
                   if os.path.isdir(os.path.join(directory, d))]
        print(f"Found {len(subdirs)} category directories")
        # Process each directory
        for subdir in subdirs:
            category = os.path.basename(subdir)
            print(f"Processing directory: {category}")
            # Extract cases from the directory
            cases = extract_case_data_from_directory(subdir)
            # Add each case as a document
            for case in cases:
                doc_id = add_document(
                    title=case['title'],
                    content=case['content'],
                    source=case['source'],
                    doc_type=case['doc_type']
                )
                # Generate and store embedding
                embedding = generate_embedding(f"{case['title']} {case['content']}")
                store_embedding(doc_id, embedding)
                print(f"Added document: {case['title']}")
                imported_count += 1
    except Exception as e:
        print(f"Error scanning directories: {e}")
    return imported_count
def import_sql_data_to_db():
    """Import SQL files from the Data directory into the database"""
    data_dir = os.path.join(os.getcwd(), 'Data')
    print(f"Scanning directory: {data_dir}")
    sql_files = scan_directory_for_sql_files(data_dir)
    print(f"Found {len(sql_files['info'])} info files, {len(sql_files['allegation'])} allegation files, {len(sql_files['person'])} person files")
    # Import info files
    for file_path in sql_files['info']:
        print(f"Importing info from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_INFO')
        execute_sql_statements(statements)
    # Import allegation files
    for file_path in sql_files['allegation']:
        print(f"Importing allegations from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_ALLEGATION')
        execute_sql_statements(statements)
    # Import person files
    for file_path in sql_files['person']:
        print(f"Importing persons from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_PERSON_INVOLVE')
        execute_sql_statements(statements)
    print("SQL data import complete!")
    # Import directly from directories
    print("Importing data directly from directories...")
    dir_docs = import_data_from_directory(data_dir)
    print(f"Imported {dir_docs} documents from directories")
    # Also import PDF files
    print("Looking for PDF files...")
    pdf_docs = import_pdf_files(data_dir)
    print(f"Imported {pdf_docs} PDF documents")
def check_tables(db_config, label):
    """Check tables in the specified database"""
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()
    # Get all tables
    cursor.execute("SHOW TABLES;")
    tables = cursor.fetchall()
    print(f"\n{label} tables in database {db_config['database']}:")
    for table in tables:
        table_name = table[0]
        # Count records
        cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
        count = cursor.fetchone()[0]
        print(f"  {table_name}: {count} records")
    cursor.close()
    conn.close()
def convert_user_data_to_documents():
    """Convert user data to documents in our application"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    try:
        from db.db_utils import add_document, store_embedding
        from embedding.embedding_service_http import generate_embedding
    except Exception as e:
        print(f"Error importing required modules: {e}")
        return
    # Connect to the source database
    source_conn = get_db_connection()
    source_cursor = source_conn.cursor(dictionary=True)
    # Get all users with their details and preferences - adjusted for actual schema
    try:
        query = """
            SELECT
                u.user_id, u.username, u.email, u.created_at,
                ud.first_name, ud.last_name, ud.date_of_birth, ud.phone_number,
                up.language, up.theme, up.notifications_enabled
            FROM users u
            LEFT JOIN user_details ud ON u.user_id = ud.user_id
            LEFT JOIN user_preferences up ON u.user_id = up.user_id
        """
        source_cursor.execute(query)
        user_records = source_cursor.fetchall()
        if not user_records:
            print("No user records found in source database.")
            return
        print(f"Found {len(user_records)} user records to convert")
        # Process each user as a document
        user_count = 0
        for user in user_records:
            user_id = user.get('user_id')
            # Create title with user information
            username = user.get('username', f"User {user_id}")
            title = f"User Profile: {username}"
            # Compile content from specific fields based on the schema
            content_parts = [
                f"Username: {user.get('username', 'N/A')}",
                f"Email: {user.get('email', 'N/A')}",
                f"Created At: {user.get('created_at', 'N/A')}"
            ]
            # Add user details if available
            if user.get('first_name') or user.get('last_name'):
                name = f"{user.get('first_name', '')} {user.get('last_name', '')}".strip()
                content_parts.append(f"Name: {name}")
            if user.get('date_of_birth'):
                content_parts.append(f"Date of Birth: {user.get('date_of_birth')}")
            if user.get('phone_number'):
                content_parts.append(f"Phone Number: {user.get('phone_number')}")
            # Add user preferences if available
            if user.get('language'):
                content_parts.append(f"Language: {user.get('language')}")
            if user.get('theme'):
                content_parts.append(f"Theme: {user.get('theme')}")
            if user.get('notifications_enabled') is not None:
                notifications = "Enabled" if user.get('notifications_enabled') else "Disabled"
                content_parts.append(f"Notifications: {notifications}")
            # Combine all content
            content = "\n".join(content_parts)
            # Add document to the database
            print(f"Adding document for user {username}")
            doc_id = add_document(
                title=title,
                content=content,
                source=f"User ID: {user_id} from docdoc database",
                doc_type="User Profile"
            )
            # Generate and store embedding
            print(f"Generating embedding for document: {doc_id}")
            embedding = generate_embedding(f"{title} {content}")
            store_embedding(doc_id, embedding)
            user_count += 1
        print(f"Conversion complete! Created {user_count} user profile documents.")
    except mysql.connector.Error as err:
        print(f"Error accessing user data: {err}")
    finally:
        source_cursor.close()
        source_conn.close()
def setup_document_tables():
    """Create the document-related tables if they don't exist"""
    conn = get_db_connection()
    cursor = conn.cursor()
    # Create document search tables
    schema_sql = """
    -- Documents table for storing processed LKK data
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        LKK_INFOID INT,
        FOREIGN KEY (LKK_INFOID) REFERENCES lt_lkk_info(LKK_INFOID)
    );
    -- Embeddings table for semantic search
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );
    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    # Drop existing tables if they exist
    cursor.execute("DROP TABLE IF EXISTS embeddings;")
    cursor.execute("DROP TABLE IF EXISTS documents;")
    cursor.execute("DROP TABLE IF EXISTS search_logs;")
    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")
    conn.commit()
    cursor.close()
    conn.close()
def convert_lkk_to_documents():
    """Convert LKK data into searchable documents based on LLA_CASE_NO"""
    # Import required modules for embeddings
    sys.path.append('.')
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding
    conn = get_db_connection()
    cursor = conn.cursor(dictionary=True)
    try:
        # Step 1: Get all unique case numbers from allegations
        cursor.execute("""
            SELECT DISTINCT LLA_CASE_NO
            FROM lt_lkk_allegation
            WHERE LLA_CASE_NO IS NOT NULL AND LLA_CASE_NO != ''
        """)
        case_numbers = cursor.fetchall()
        print(f"Found {len(case_numbers)} unique case numbers to process")
        for case_number_row in case_numbers:
            case_number = case_number_row['LLA_CASE_NO']
            try:
                # Step 2: For each case number, get all relevant data from allegations and info tables
                cursor.execute("""
                    SELECT
                        i.*,
                        a.*
                    FROM lt_lkk_allegation a
                    JOIN lt_lkk_info i ON a.LKK_INFOID = i.LKK_INFOID
                    WHERE a.LLA_CASE_NO = %s
                    GROUP BY a.LLA_ALLEGATION_ID
                """, (case_number,))
                allegation_records = cursor.fetchall()
                if not allegation_records:
                    print(f"No records found for case number {case_number}")
                    continue
                # Get the first record to extract basic info
                first_record = allegation_records[0]
                lkk_infoid = first_record['LKK_INFOID']
                # Step 3: Get person data separately
                cursor.execute("""
                    SELECT LTL_PERSON_ID, LTL_DATA
                    FROM lt_lkk_person_involve
                    WHERE LKK_INFOID = %s
                """, (lkk_infoid,))
                person_records = cursor.fetchall()
                involved_persons = []
                for person in person_records:
                    person_id = person['LTL_PERSON_ID']
                    try:
                        # Parse person data from JSON
                        person_data = {}
                        if person['LTL_DATA']:
                            try:
                                person_data = json.loads(person['LTL_DATA'])
                            except (json.JSONDecodeError, TypeError):
                                print(f"Error parsing LTL_DATA JSON for person ID {person_id}")
                        # Instead of just showing the ID, include the full LTL_DATA content
                        person_info = f"Person ID: {person_id}"
                        # Add the full JSON data
                        if person['LTL_DATA']:
                            person_info = f"Person ID: {person_id}\nPerson Data: {person['LTL_DATA']}"
                        involved_persons.append(person_info)
                    except Exception as e:
                        print(f"Error processing person data for ID {person_id}: {e}")
                # Start with basic case info
                base_info = {
                    'case_number': case_number,
                    'file_number': first_record['LKK_FILE_NO'],
                    'status': first_record['LKK_STATUS'],
                    'dpp_suggestion': first_record['LKK_DPP_ANT_SUGGESTION'],
                    'hod_decision': first_record['LKK_HOD_DECISION'],
                    'lkk_infoid': lkk_infoid,
                    'created_date': first_record['LKK_CREATEDDATE']
                }
                # Title using case number
                title = f"Case Number: {case_number}"
                # Compile content
                content_parts = [
                    f"File Number: {base_info['file_number']}",
                    f"Status: {base_info['status']}",
                    f"Case Number: {case_number}",
                    f"DPP Suggestion: {base_info['dpp_suggestion'] or 'None'}",
                    f"HOD Decision: {base_info['hod_decision'] or 'None'}",
                    "\n--- ALLEGATIONS ---"
                ]
                # Add all allegations
                for idx, record in enumerate(allegation_records, 1):
                    allegation_parts = [
                        f"\nALLEGATION #{idx}:",
                        f"Allegation ID: {record['LLA_ALLEGATION_ID']}",
                        f"Case Number: {record['LLA_CASE_NO']}",
                        f"Accused Name: {record['LLA_OKT_NAME'] or 'N/A'}",
                        f"Type: {record['LLA_TYPE'] or 'N/A'}",
                        f"Act ID: {record['LLA_ACT_ID'] or 'N/A'}",
                        f"Act Description: {record['LLA_ACT_DESC'] or 'N/A'}",
                        f"Section: {record['LLA_SECTION'] or 'N/A'}",
                        f"Date: {record['LLA_DATE']}",
                        f"Charge Notes: {record['LLA_CHARGE_NOTES'] or 'N/A'}",
                        f"Charge Type: {record['LKK_CHARGE_TYPE'] or 'N/A'}",
                        f"Charge Reason: {record['LLA_CHARGE_REASON'] or 'N/A'}",
                        f"Charged By: {record['LLA_CHARGE_BY'] or 'N/A'}"
                    ]
                    content_parts.extend(allegation_parts)
                # Add involved persons section
                if involved_persons:
                    content_parts.append("\n--- INVOLVED PERSONS ---")
                    for person in involved_persons:
                        content_parts.append(person)
                # Join all content
                content = "\n".join(filter(None, content_parts))
                # Insert into documents table
                cursor.execute("""
                    INSERT INTO documents (title, content, source, doc_type, LKK_INFOID, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    title,
                    content,
                    f"Case Number: {case_number}",
                    "Legal Case",
                    base_info['lkk_infoid'],
                    base_info['created_date']
                ))
                doc_id = cursor.lastrowid
                # Generate and store embedding
                embedding = generate_embedding(f"{title} {content}")
                # Convert numpy array to list if necessary
                if isinstance(embedding, np.ndarray):
                    embedding = embedding.tolist()
                cursor.execute("""
                    INSERT INTO embeddings (document_id, embedding)
                    VALUES (%s, %s)
                """, (doc_id, json.dumps(embedding)))
                conn.commit()
                print(f"Processed case number {case_number} into document {doc_id}")
            except Exception as e:
                print(f"Error processing case number {case_number}: {e}")
                conn.rollback()
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        cursor.close()
        conn.close()
if __name__ == "__main__":
# Ask for confirmation before proceeding
print("This script will convert LKK data into searchable documents.")
confirm = input("Do you want to proceed? (y/n): ")
if confirm.lower() != 'y':
print("Import cancelled.")
sys.exit(0)
# Set up document tables
setup_document_tables()
# Convert LKK data to documents
convert_lkk_to_documents()
print("\nData conversion completed successfully!")