agc-chatbot/embedding/simple_search_service.py

121 lines
4.7 KiB
Python

import json
import re
from typing import List, Dict, Any
from db.db_utils import get_all_documents
def simple_keyword_search(query: str, documents: List[Dict[str, Any]], top_k: int = 5) -> List[Dict[str, Any]]:
    """Rank documents by how often the query's words occur in them (no AI).

    The query is lower-cased and split on whitespace.  Each word is counted
    as a substring of the lower-cased document text: every occurrence in the
    content is worth 1 point and every occurrence in the title is worth
    2 points.  Documents that score 0 are dropped.

    Args:
        query: Free-text search string.
        documents: Dicts with an ``id`` key and optional ``title``,
            ``content`` and ``doc_type`` keys.  A missing, empty or ``None``
            title/content is treated as empty text.
        top_k: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        Up to ``top_k`` result dicts with the keys ``id``, ``title``,
        ``content``, ``content_preview``, ``doc_type`` and ``similarity``,
        best match first.  ``similarity`` is the raw score divided by 100;
        it is not capped at 1.

    Raises:
        KeyError: If a matching document has no ``id`` key.
    """
    query_words = query.lower().split()
    results = []

    for doc in documents:
        # `or ''` also covers keys that are present but hold None, which
        # `doc.get(key, '')` would pass straight through to `.lower()`.
        content = doc.get('content') or ''
        title = doc.get('title') or ''
        content_lower = content.lower()
        title_lower = title.lower()

        # Content match worth 1 point, title match worth 2 points.
        score = sum(
            content_lower.count(word) + 2 * title_lower.count(word)
            for word in query_words
        )
        if score <= 0:
            continue

        # Long content is cut to 300 characters and marked with an ellipsis.
        content_preview = content[:300] + "..." if len(content) > 300 else content
        results.append({
            'id': doc['id'],
            'title': title or 'Untitled',
            'content': content,
            'content_preview': content_preview,
            'doc_type': doc.get('doc_type', 'Unknown'),
            'similarity': score / 100.0,  # Scaled raw score, not a true probability
        })

    # Stable sort: documents with equal scores keep their input order.
    results.sort(key=lambda item: item['similarity'], reverse=True)
    # Clamp so a negative top_k cannot turn into a "drop the last k" slice.
    return results[:max(top_k, 0)]
def generate_simple_answer(query: str, relevant_docs: List[Dict[str, Any]]) -> str:
    """Build a short textual answer from the best-ranked document.

    Only the first entry of ``relevant_docs`` is read for text.  Its content
    is split into sentences on runs of ``.``, ``!`` or ``?``, and the first
    two sentences that contain any lower-cased query word (as a substring)
    are quoted in the answer.

    Args:
        query: The user's original search string.
        relevant_docs: Search results, best match first.  Each dict may
            carry ``title`` and ``content``; a missing, empty or ``None``
            value is treated as ``'Untitled'`` / empty text respectively.

    Returns:
        One of three messages: a "nothing found" notice when
        ``relevant_docs`` is empty, an answer quoting up to two matching
        sentences, or a pointer to the best document when no sentence
        matches.
    """
    if not relevant_docs:
        return "I couldn't find any relevant documents to answer your question. Please try rephrasing your query."

    # Find the most relevant document
    best_doc = relevant_docs[0]
    # `or` guards against keys that exist but hold None: re.split would
    # raise on None content and the title would be rendered as 'None'.
    title = best_doc.get('title') or 'Untitled'
    content = best_doc.get('content') or ''
    query_words = query.lower().split()

    # Only two sentences are ever shown, so stop scanning once we have them.
    max_sentences = 2
    relevant_sentences = []
    for sentence in re.split(r'[.!?]+', content):
        sentence_clean = sentence.strip()
        if any(word in sentence_clean.lower() for word in query_words):
            relevant_sentences.append(sentence_clean)
            if len(relevant_sentences) >= max_sentences:
                break

    if not relevant_sentences:
        return f"I found relevant documents but couldn't extract specific information about '{query}'. You may want to review the document '{title}' for more details."

    answer = f"Based on the document '{title}', here's what I found:\n\n"
    # Original terminators were consumed by the split; re-join with periods.
    answer += ". ".join(relevant_sentences) + "."
    if len(relevant_docs) > 1:
        answer += f"\n\nI found {len(relevant_docs)} relevant documents in total."
    return answer
# Expansions applied to single-word queries.  The entries appear to pair a
# Malay legal term with English equivalents so either wording can match.
_LEGAL_EXPANSIONS = {
    'seksyen': 'seksyen section akta',
    'jenayah': 'jenayah criminal crime',
    'hukuman': 'hukuman punishment penalty',
    'kesalahan': 'kesalahan offense offence',
    'mahkamah': 'mahkamah court tribunal',
}


def simple_search(query: str, profile_search: bool = False) -> Dict[str, Any]:
    """
    Perform simple keyword-based search without AI

    Loads all documents, optionally expands a single-word query with
    related legal terms, ranks the documents by keyword matches and builds
    a short answer from the best one.  Progress is reported via ``print``.

    Args:
        query: The search query
        profile_search: Whether to search in user profiles (not used currently)

    Returns:
        Dict with the keys ``query`` (the input as given), ``enhanced_query``
        (the string actually used for ranking), ``documents`` (ranked
        results) and ``answer`` (generated text).  If anything raises, the
        error is printed and a dict with an empty ``documents`` list and an
        apology message containing the error text is returned instead.
    """
    try:
        print(f"Processing simple search query: {query}")

        # Get all documents
        documents = get_all_documents(include_embeddings=False)
        print(f"Found {len(documents)} documents")

        # Enhanced query (simple expansion)
        enhanced_query = query
        words = query.split()
        if len(words) == 1:
            # Look up the stripped token rather than the raw query, so
            # surrounding whitespace (e.g. " seksyen ") still finds a match.
            enhanced_query = _LEGAL_EXPANSIONS.get(words[0].lower(), query)

        # Get relevant documents
        relevant_docs = simple_keyword_search(enhanced_query, documents)
        print(f"Found {len(relevant_docs)} relevant documents")

        # Generate answer from the user's own wording, not the expansion
        answer = generate_simple_answer(query, relevant_docs)
        print(f"Generated answer: {answer[:100]}...")

        return {
            "query": query,
            "enhanced_query": enhanced_query,
            "documents": relevant_docs,
            "answer": answer
        }
    except Exception as e:
        # Top-level boundary: callers always get a well-formed response dict.
        print(f"Error in simple search: {e}")
        return {
            "query": query,
            "enhanced_query": query,
            "documents": [],
            "answer": f"I apologize, but I encountered an error while processing your query: {str(e)}. Please try again."
        }