agc-chatbot/utils/text_processing.py

import re
from collections import Counter


def preprocess_text(text):
    """Clean and preprocess text for embedding"""
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
    text = re.sub(r'[^\w\s.,?!-]', '', text)
    return text
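
# Example (illustrative input, not taken from the project's data):
#   preprocess_text("  Hello,   WORLD!! (test) ")  ->  'hello, world!! test'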


def chunk_text(text, max_chunk_size=1000, overlap=100):
    """Split text into chunks with overlap for processing long documents"""
    # Split text into sentences on terminal punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        # Sentences are never split, so a single sentence longer than
        # max_chunk_size will produce an oversized chunk. Sizes also
        # ignore the spaces added later by ' '.join.
        sentence_size = len(sentence)
        if current_size + sentence_size <= max_chunk_size:
            current_chunk.append(sentence)
            current_size += sentence_size
        else:
            # Flush the current chunk to the list of chunks
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            # Start a new chunk, potentially carrying over trailing
            # sentences as overlap
            if overlap > 0 and current_chunk:
                # Keep as many trailing sentences as fit the overlap budget
                overlap_size = 0
                overlap_sentences = []
                for s in reversed(current_chunk):
                    if overlap_size + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_size += len(s)
                    else:
                        break
                current_chunk = overlap_sentences + [sentence]
                current_size = overlap_size + sentence_size
            else:
                current_chunk = [sentence]
                current_size = sentence_size
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
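
# Typical call (the parameter values here just restate the defaults):
#   chunks = chunk_text(document_text, max_chunk_size=1000, overlap=100)
# Consecutive chunks end on sentence boundaries and share up to ~100
# characters of trailing sentences, so context survives the split points.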


def extract_keywords(text, max_keywords=5):
    """Extract key terms from text using basic frequency analysis"""
    # Common English stopwords to exclude from keyword candidates
    stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
                 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'to', 'in',
                 'on', 'for', 'with', 'by', 'about', 'is', 'are', 'was', 'were', 'be',
                 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'of'}
    # Normalize and tokenize: lowercase words of three or more letters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())
    # Remove stopwords
    words = [word for word in words if word not in stopwords]
    # Count word frequencies and return the top keywords
    word_counts = Counter(words)
    return [word for word, count in word_counts.most_common(max_keywords)]
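

if __name__ == "__main__":
    # Minimal smoke test of the three helpers above. The sample text is
    # made up for illustration; real inputs would come from the chatbot's
    # document pipeline. Small chunk/overlap values force a visible split.
    sample = (
        "Retrieval-augmented chatbots split source documents into chunks. "
        "Each chunk is embedded separately. "
        "Overlap between chunks preserves context across boundaries."
    )
    cleaned = preprocess_text(sample)
    for i, chunk in enumerate(chunk_text(cleaned, max_chunk_size=80, overlap=30)):
        print(f"chunk {i}: {chunk}")
    print("keywords:", extract_keywords(cleaned))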