import os
import json
import mysql.connector
import numpy as np
import re
import sys
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Print environment variables for debugging
print(f"MySQL Host: {os.getenv('MYSQL_HOST')}")
print(f"MySQL User: {os.getenv('MYSQL_USER')}")
print(f"MySQL Database: {os.getenv('MYSQL_DATABASE')}")
print(f"MySQL Password: {'[SET]' if os.getenv('MYSQL_PASSWORD') else '[NOT SET]'}")

# Database configuration
DB_CONFIG = {
    'host': os.getenv('MYSQL_HOST', 'localhost'),
    'user': os.getenv('MYSQL_USER', 'root'),
    'password': os.getenv('MYSQL_PASSWORD', ''),
    'database': os.getenv('MYSQL_DATABASE', 'agc')
}

def get_db_connection():
    """Create a connection to the MySQL database"""
    return mysql.connector.connect(**DB_CONFIG)

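# Illustrative usage sketch for get_db_connection() (assumes the credentials
# above are valid; not executed by this script):
#   conn = get_db_connection()
#   cur = conn.cursor()
#   cur.execute("SELECT 1;")
#   print(cur.fetchone())
#   cur.close()
#   conn.close()
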
def truncate_all_tables():
    """Truncate all tables to remove existing data before importing new data"""
    conn = get_db_connection()
    cursor = conn.cursor()

    print("Truncating all tables to ensure a clean import...")

    # Disable foreign key checks temporarily to allow truncating tables with foreign keys
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")

    try:
        # Get all tables in the database
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()

        for table in tables:
            table_name = table[0]
            print(f"Truncating table: {table_name}")
            cursor.execute(f"TRUNCATE TABLE {table_name};")

        print(f"Successfully truncated {len(tables)} tables")
    except mysql.connector.Error as err:
        print(f"Error truncating tables: {err}")
    finally:
        # Re-enable foreign key checks
        cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")

    conn.commit()
    cursor.close()
    conn.close()

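# Note on truncate_all_tables() (general MySQL behavior): TRUNCATE TABLE performs
# an implicit commit, so the truncations above take effect immediately and cannot
# be rolled back.
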
def setup_tables():
    """Create the tables for document search if they don't exist"""
    conn = get_db_connection()
    cursor = conn.cursor()

    # Create document_search tables
    schema_sql = """
    -- Documents table
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );

    -- Embeddings table - store as JSON since MySQL doesn't have a vector type
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );

    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """

    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")
                print(f"Statement: {statement}")

    conn.commit()
    cursor.close()
    conn.close()

    print("Document search tables created successfully")

def extract_sql_inserts(file_path, table_name):
    """Extract SQL INSERT statements from file and return as a list of SQL statements"""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()

        # Extract all INSERT statements for the specified table
        pattern = rf"INSERT INTO {table_name}[^;]*;"
        inserts = re.findall(pattern, content, re.DOTALL)

        # Clean up the inserts to make them more compatible
        cleaned_inserts = []
        for insert in inserts:
            # Replace TEXT fields that might have problematic characters
            # This is a simplistic approach - for a production system, you'd need more robust parsing
            if 'TEXT' in insert or 'text' in insert or 'Text' in insert:
                # Try to handle the quotes better; \1 and \2 are backreferences
                # that keep the matched text intact
                insert = re.sub(r"'([^']*?)<([^>]*?)>'", r"'\1<\2>'", insert)

            cleaned_inserts.append(insert)

        return cleaned_inserts
    except Exception as e:
        print(f"Error reading SQL file {file_path}: {e}")
        return []

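# Illustrative input (hypothetical data): for table_name 'LT_LKK_INFO' the
# pattern above captures statements such as
#   INSERT INTO LT_LKK_INFO (LKK_INFOID, LKK_FILE_NO) VALUES (1, 'ABC/123');
# The [^;]* class spans newlines, so multi-line INSERTs are captured up to the
# first semicolon (a semicolon inside a quoted value would cut a match short).
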
def execute_sql_statements(statements):
    """Execute a list of SQL statements"""
    if not statements:
        return

    conn = get_db_connection()
    cursor = conn.cursor()

    successful = 0
    failed = 0

    for statement in statements:
        try:
            cursor.execute(statement)
            successful += 1
        except mysql.connector.Error as err:
            # Skip duplicate key errors without counting them as failures
            if err.errno == 1062:  # Duplicate entry error
                print(f"Skipping duplicate entry: {err}")
            else:
                print(f"Error executing SQL: {err}")
                print(f"Statement: {statement[:100]}...")  # Print first 100 chars of the statement
                failed += 1

    conn.commit()
    cursor.close()
    conn.close()

    print(f"Executed {successful} statements successfully, {failed} statements failed")

def scan_directory_for_sql_files(directory):
    """Scan a directory and its subdirectories for SQL files"""
    sql_files = {
        'info': [],
        'allegation': [],
        'person': []
    }

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.sql'):
                full_path = os.path.join(root, file)

                # Categorize files
                if 'LT_LKK_INFO' in file:
                    sql_files['info'].append(full_path)
                elif 'LT_LKK_ALLEGATION' in file:
                    sql_files['allegation'].append(full_path)
                elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                    sql_files['person'].append(full_path)

    return sql_files

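# Illustrative return value (hypothetical paths):
#   {'info': ['Data/Category1/LT_LKK_INFO.sql'],
#    'allegation': ['Data/Category1/LT_LKK_ALLEGATION.sql'],
#    'person': ['Data/Category1/LT_LKK_PERSON_INVOLVE.sql']}
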
def extract_case_data_from_directory(directory_path):
    """Extract legal case data directly from a directory"""
    # Get the directory name (category)
    category = os.path.basename(directory_path)

    # List of case dictionaries
    cases = []

    # Look for PDF files first
    pdf_files = []
    for file in os.listdir(directory_path):
        if file.lower().endswith('.pdf'):
            pdf_files.append(os.path.join(directory_path, file))

    # Look for SQL files to extract data
    info_files = []
    allegation_files = []
    person_files = []

    for file in os.listdir(directory_path):
        if file.endswith('.sql'):
            full_path = os.path.join(directory_path, file)
            if 'LT_LKK_INFO' in file:
                info_files.append(full_path)
            elif 'LT_LKK_ALLEGATION' in file:
                allegation_files.append(full_path)
            elif 'LT_LKK_PERSON' in file or 'PERSON_RESPONSIBLE' in file:
                person_files.append(full_path)

    # Create a basic case from the directory
    case = {
        'title': f"Legal Case - {category}",
        'content': f"Legal category: {category}\n",
        'source': directory_path,
        'doc_type': "Legal Case Category",
        'pdf_files': pdf_files
    }

    cases.append(case)

    return cases

def import_pdf_files(directory):
    """Import PDF files as documents"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding

    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding

    imported_count = 0

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                try:
                    full_path = os.path.join(root, file)

                    # Get the category from directory name
                    category = os.path.basename(root)

                    # Try to extract text from PDF (would need a PDF library)
                    # For now, we'll just use the filename as content
                    print(f"Found PDF: {full_path}")

                    # For a real implementation, you would use PyPDF2 or a similar library
                    # This is a placeholder for actual PDF text extraction
                    title = file.replace('.pdf', '')
                    content = f"PDF Document: {file} from category {category}\n\nSource: {full_path}"

                    # Add document to the database
                    doc_id = add_document(
                        title=title,
                        content=content,
                        source=full_path,
                        doc_type="Legal PDF Document"
                    )

                    # Generate and store embedding
                    embedding = generate_embedding(f"{title} {content}")
                    store_embedding(doc_id, embedding)

                    print(f"Added PDF document: {title}")
                    imported_count += 1
                except Exception as e:
                    print(f"Error processing PDF {file}: {e}")

    return imported_count

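# Illustrative sketch of real PDF text extraction, as the placeholder comments
# above suggest (assumes the PyPDF2 package is installed; not wired into this
# script):
#   from PyPDF2 import PdfReader
#   def extract_pdf_text(pdf_path):
#       reader = PdfReader(pdf_path)
#       return "\n".join(page.extract_text() or "" for page in reader.pages)
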
def import_data_from_directory(directory):
    """Import data directly from each category directory"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    from db.db_utils import add_document, store_embedding

    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing regular embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding

    # Count for imported documents
    imported_count = 0

    # Get all subdirectories
    try:
        subdirs = [os.path.join(directory, d) for d in os.listdir(directory)
                   if os.path.isdir(os.path.join(directory, d))]

        print(f"Found {len(subdirs)} category directories")

        # Process each directory
        for subdir in subdirs:
            category = os.path.basename(subdir)
            print(f"Processing directory: {category}")

            # Extract cases from the directory
            cases = extract_case_data_from_directory(subdir)

            # Add each case as a document
            for case in cases:
                doc_id = add_document(
                    title=case['title'],
                    content=case['content'],
                    source=case['source'],
                    doc_type=case['doc_type']
                )

                # Generate and store embedding
                embedding = generate_embedding(f"{case['title']} {case['content']}")
                store_embedding(doc_id, embedding)

                print(f"Added document: {case['title']}")
                imported_count += 1

    except Exception as e:
        print(f"Error scanning directories: {e}")

    return imported_count

def import_sql_data_to_db():
    """Import SQL files from Data directory into the database"""
    data_dir = os.path.join(os.getcwd(), 'Data')

    print(f"Scanning directory: {data_dir}")
    sql_files = scan_directory_for_sql_files(data_dir)

    print(f"Found {len(sql_files['info'])} info files, {len(sql_files['allegation'])} allegation files, {len(sql_files['person'])} person files")

    # Import info files
    for file_path in sql_files['info']:
        print(f"Importing info from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_INFO')
        execute_sql_statements(statements)

    # Import allegation files
    for file_path in sql_files['allegation']:
        print(f"Importing allegations from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_ALLEGATION')
        execute_sql_statements(statements)

    # Import person files
    for file_path in sql_files['person']:
        print(f"Importing persons from {file_path}")
        statements = extract_sql_inserts(file_path, 'LT_LKK_PERSON_INVOLVE')
        execute_sql_statements(statements)

    print("SQL data import complete!")

    # Import directly from directories
    print("Importing data directly from directories...")
    dir_docs = import_data_from_directory(data_dir)
    print(f"Imported {dir_docs} documents from directories")

    # Also import PDF files
    print("Looking for PDF files...")
    pdf_docs = import_pdf_files(data_dir)
    print(f"Imported {pdf_docs} PDF documents")

def check_tables(db_config, label):
    """Check tables in the specified database"""
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()

    # Get all tables
    cursor.execute("SHOW TABLES;")
    tables = cursor.fetchall()

    print(f"\n{label} tables in database {db_config['database']}:")

    for table in tables:
        table_name = table[0]

        # Count records
        cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
        count = cursor.fetchone()[0]

        print(f" {table_name}: {count} records")

    cursor.close()
    conn.close()

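# Illustrative usage of check_tables() (not called by the __main__ block):
#   check_tables(DB_CONFIG, "Current")
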
def convert_user_data_to_documents():
    """Convert user data to documents in our application"""
    # Import required modules
    sys.path.append('.')  # Add current directory to path
    try:
        from db.db_utils import add_document, store_embedding
        from embedding.embedding_service_http import generate_embedding
    except Exception as e:
        print(f"Error importing required modules: {e}")
        return

    # Connect to the source database
    source_conn = get_db_connection()
    source_cursor = source_conn.cursor(dictionary=True)

    # Get all users with their details and preferences - adjusted for actual schema
    try:
        query = """
            SELECT
                u.user_id, u.username, u.email, u.created_at,
                ud.first_name, ud.last_name, ud.date_of_birth, ud.phone_number,
                up.language, up.theme, up.notifications_enabled
            FROM users u
            LEFT JOIN user_details ud ON u.user_id = ud.user_id
            LEFT JOIN user_preferences up ON u.user_id = up.user_id
        """

        source_cursor.execute(query)
        user_records = source_cursor.fetchall()

        if not user_records:
            print("No user records found in source database.")
            return

        print(f"Found {len(user_records)} user records to convert")

        # Process each user as a document
        user_count = 0

        for user in user_records:
            user_id = user.get('user_id')

            # Create title with user information
            username = user.get('username', f"User {user_id}")
            title = f"User Profile: {username}"

            # Compile content from specific fields based on the schema
            content_parts = [
                f"Username: {user.get('username', 'N/A')}",
                f"Email: {user.get('email', 'N/A')}",
                f"Created At: {user.get('created_at', 'N/A')}"
            ]

            # Add user details if available
            if user.get('first_name') or user.get('last_name'):
                name = f"{user.get('first_name', '')} {user.get('last_name', '')}".strip()
                content_parts.append(f"Name: {name}")

            if user.get('date_of_birth'):
                content_parts.append(f"Date of Birth: {user.get('date_of_birth')}")

            if user.get('phone_number'):
                content_parts.append(f"Phone Number: {user.get('phone_number')}")

            # Add user preferences if available
            if user.get('language'):
                content_parts.append(f"Language: {user.get('language')}")

            if user.get('theme'):
                content_parts.append(f"Theme: {user.get('theme')}")

            if user.get('notifications_enabled') is not None:
                notifications = "Enabled" if user.get('notifications_enabled') else "Disabled"
                content_parts.append(f"Notifications: {notifications}")

            # Combine all content
            content = "\n".join(content_parts)

            # Add document to the database
            print(f"Adding document for user {username}")

            doc_id = add_document(
                title=title,
                content=content,
                source=f"User ID: {user_id} from docdoc database",
                doc_type="User Profile"
            )

            # Generate and store embedding
            print(f"Generating embedding for document: {doc_id}")
            embedding = generate_embedding(f"{title} {content}")
            store_embedding(doc_id, embedding)

            user_count += 1

        print(f"Conversion complete! Created {user_count} user profile documents.")

    except mysql.connector.Error as err:
        print(f"Error accessing user data: {err}")
    finally:
        source_cursor.close()
        source_conn.close()

def setup_document_tables():
    """Drop and recreate the document-related tables"""
    conn = get_db_connection()
    cursor = conn.cursor()

    # Create document search tables
    schema_sql = """
    -- Documents table for storing processed LKK data
    CREATE TABLE IF NOT EXISTS documents (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        source VARCHAR(255),
        doc_type VARCHAR(50),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        LKK_INFOID INT,
        FOREIGN KEY (LKK_INFOID) REFERENCES lt_lkk_info(LKK_INFOID)
    );

    -- Embeddings table for semantic search
    CREATE TABLE IF NOT EXISTS embeddings (
        id INT AUTO_INCREMENT PRIMARY KEY,
        document_id INT NOT NULL,
        embedding JSON NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    );

    -- Search history
    CREATE TABLE IF NOT EXISTS search_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        query TEXT NOT NULL,
        results JSON,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """

    # Drop existing tables if they exist (embeddings first, because of its
    # foreign key to documents)
    cursor.execute("DROP TABLE IF EXISTS embeddings;")
    cursor.execute("DROP TABLE IF EXISTS documents;")
    cursor.execute("DROP TABLE IF EXISTS search_logs;")

    # Execute schema SQL statements
    for statement in schema_sql.split(';'):
        if statement.strip():
            try:
                cursor.execute(statement + ';')
                print(f"Executed: {statement[:50]}...")
            except mysql.connector.Error as err:
                print(f"Error executing statement: {err}")

    conn.commit()
    cursor.close()
    conn.close()

def convert_lkk_to_documents():
    """Convert LKK data into searchable documents based on LLA_CASE_NO"""
    # Import required modules for embeddings
    sys.path.append('.')
    try:
        from embedding.embedding_service import generate_embedding
    except Exception as e:
        print(f"Error importing embedding service: {e}")
        print("Falling back to HTTP-based embedding service...")
        from embedding.embedding_service_http import generate_embedding

    conn = get_db_connection()
    cursor = conn.cursor(dictionary=True)

    try:
        # Step 1: Get all unique case numbers from allegations
        cursor.execute("""
            SELECT DISTINCT LLA_CASE_NO
            FROM lt_lkk_allegation
            WHERE LLA_CASE_NO IS NOT NULL AND LLA_CASE_NO != ''
        """)

        case_numbers = cursor.fetchall()
        print(f"Found {len(case_numbers)} unique case numbers to process")

        for case_number_row in case_numbers:
            case_number = case_number_row['LLA_CASE_NO']
            try:
                # Step 2: For each case number, get all relevant data from allegations and info tables
                cursor.execute("""
                    SELECT
                        i.*,
                        a.*
                    FROM lt_lkk_allegation a
                    JOIN lt_lkk_info i ON a.LKK_INFOID = i.LKK_INFOID
                    WHERE a.LLA_CASE_NO = %s
                    GROUP BY a.LLA_ALLEGATION_ID
                """, (case_number,))

                allegation_records = cursor.fetchall()

                if not allegation_records:
                    print(f"No records found for case number {case_number}")
                    continue

                # Get the first record to extract basic info
                first_record = allegation_records[0]
                lkk_infoid = first_record['LKK_INFOID']

                # Step 3: Get person data separately
                cursor.execute("""
                    SELECT LTL_PERSON_ID, LTL_DATA
                    FROM lt_lkk_person_involve
                    WHERE LKK_INFOID = %s
                """, (lkk_infoid,))

                person_records = cursor.fetchall()
                involved_persons = []

                for person in person_records:
                    person_id = person['LTL_PERSON_ID']
                    try:
                        # Parse person data from JSON
                        person_data = {}
                        if person['LTL_DATA']:
                            try:
                                person_data = json.loads(person['LTL_DATA'])
                            except (json.JSONDecodeError, TypeError):
                                print(f"Error parsing LTL_DATA JSON for person ID {person_id}")

                        # Instead of just showing the ID, include the full LTL_DATA content
                        person_info = f"Person ID: {person_id}"

                        # Add the full JSON data
                        if person['LTL_DATA']:
                            person_info = f"Person ID: {person_id}\nPerson Data: {person['LTL_DATA']}"

                        involved_persons.append(person_info)
                    except Exception as e:
                        print(f"Error processing person data for ID {person_id}: {e}")

                # Start with basic case info
                base_info = {
                    'case_number': case_number,
                    'file_number': first_record['LKK_FILE_NO'],
                    'status': first_record['LKK_STATUS'],
                    'dpp_suggestion': first_record['LKK_DPP_ANT_SUGGESTION'],
                    'hod_decision': first_record['LKK_HOD_DECISION'],
                    'lkk_infoid': lkk_infoid,
                    'created_date': first_record['LKK_CREATEDDATE']
                }

                # Title using case number
                title = f"Case Number: {case_number}"

                # Compile content
                content_parts = [
                    f"File Number: {base_info['file_number']}",
                    f"Status: {base_info['status']}",
                    f"Case Number: {case_number}",
                    f"DPP Suggestion: {base_info['dpp_suggestion'] or 'None'}",
                    f"HOD Decision: {base_info['hod_decision'] or 'None'}",
                    "\n--- ALLEGATIONS ---"
                ]

                # Add all allegations
                for idx, record in enumerate(allegation_records, 1):
                    allegation_parts = [
                        f"\nALLEGATION #{idx}:",
                        f"Allegation ID: {record['LLA_ALLEGATION_ID']}",
                        f"Case Number: {record['LLA_CASE_NO']}",
                        f"Accused Name: {record['LLA_OKT_NAME'] or 'N/A'}",
                        f"Type: {record['LLA_TYPE'] or 'N/A'}",
                        f"Act ID: {record['LLA_ACT_ID'] or 'N/A'}",
                        f"Act Description: {record['LLA_ACT_DESC'] or 'N/A'}",
                        f"Section: {record['LLA_SECTION'] or 'N/A'}",
                        f"Date: {record['LLA_DATE']}",
                        f"Charge Notes: {record['LLA_CHARGE_NOTES'] or 'N/A'}",
                        f"Charge Type: {record['LKK_CHARGE_TYPE'] or 'N/A'}",
                        f"Charge Reason: {record['LLA_CHARGE_REASON'] or 'N/A'}",
                        f"Charged By: {record['LLA_CHARGE_BY'] or 'N/A'}"
                    ]
                    content_parts.extend(allegation_parts)

                # Add involved persons section
                if involved_persons:
                    content_parts.append("\n--- INVOLVED PERSONS ---")
                    for person in involved_persons:
                        content_parts.append(person)

                # Join all content
                content = "\n".join(filter(None, content_parts))

                # Insert into documents table
                cursor.execute("""
                    INSERT INTO documents (title, content, source, doc_type, LKK_INFOID, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    title,
                    content,
                    f"Case Number: {case_number}",
                    "Legal Case",
                    base_info['lkk_infoid'],
                    base_info['created_date']
                ))

                doc_id = cursor.lastrowid

                # Generate and store embedding
                embedding = generate_embedding(f"{title} {content}")

                # Convert numpy array to list if necessary
                if isinstance(embedding, np.ndarray):
                    embedding = embedding.tolist()

                cursor.execute("""
                    INSERT INTO embeddings (document_id, embedding)
                    VALUES (%s, %s)
                """, (doc_id, json.dumps(embedding)))

                conn.commit()
                print(f"Processed case number {case_number} into document {doc_id}")

            except Exception as e:
                print(f"Error processing case number {case_number}: {e}")
                conn.rollback()

    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        cursor.close()
        conn.close()

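# Illustrative read-back of a stored embedding (sketch only, using a plain
# cursor; it mirrors the JSON format written above rather than an API defined
# elsewhere in this project):
#   cur.execute("SELECT embedding FROM embeddings WHERE document_id = %s", (doc_id,))
#   vector = np.array(json.loads(cur.fetchone()[0]))
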
if __name__ == "__main__":
    # Ask for confirmation before proceeding
    print("This script will convert LKK data into searchable documents.")
    confirm = input("Do you want to proceed? (y/n): ")

    if confirm.lower() != 'y':
        print("Import cancelled.")
        sys.exit(0)

    # Set up document tables
    setup_document_tables()

    # Convert LKK data to documents
    convert_lkk_to_documents()

    print("\nData conversion completed successfully!")