import re


def preprocess_text(text):
    """Clean and preprocess text for embedding."""
    # Convert to lowercase
    text = text.lower()

    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove special characters (keep word characters, spaces, and basic punctuation)
    text = re.sub(r'[^\w\s.,?!-]', '', text)

    return text
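
# Usage sketch (the sample string is illustrative, not from the original file):
#   preprocess_text("  Hello,   WORLD! (Test)  ")  ->  'hello, world! test'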


def chunk_text(text, max_chunk_size=1000, overlap=100):
    """Split text into chunks with overlap for processing long documents."""
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence)

        if current_size + sentence_size <= max_chunk_size:
            current_chunk.append(sentence)
            current_size += sentence_size
        else:
            # Add the current chunk to the list of chunks
            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Create a new chunk, potentially with overlap
            if overlap > 0 and current_chunk:
                # Walk backwards through the previous chunk, keeping trailing
                # sentences until the overlap budget is spent
                overlap_size = 0
                overlap_sentences = []

                for s in reversed(current_chunk):
                    if overlap_size + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_size += len(s)
                    else:
                        break

                current_chunk = overlap_sentences + [sentence]
                current_size = overlap_size + sentence_size
            else:
                current_chunk = [sentence]
                current_size = sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
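
# Usage sketch (sample text and sizes are illustrative, not from the original
# file). The middle chunk starts with "Two." carried over as overlap:
#   chunk_text("One. Two. Three. Four.", max_chunk_size=10, overlap=5)
#   ->  ['One. Two.', 'Two. Three.', 'Four.']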


def extract_keywords(text, max_keywords=5):
    """Extract key terms from text using basic frequency analysis."""
    # Common English stopwords to exclude from keyword candidates
    stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
                 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'to', 'in',
                 'on', 'for', 'with', 'by', 'about', 'is', 'are', 'was', 'were', 'be',
                 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'of'}

    # Normalize and tokenize: lowercase words of three or more letters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Remove stopwords
    words = [word for word in words if word not in stopwords]

    # Count word frequencies
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1

    # Sort by frequency, most frequent first
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, count in sorted_words[:max_keywords]]
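

if __name__ == "__main__":
    # Minimal demonstration of the three helpers chained together. The sample
    # text below is invented for illustration; it is not part of the original
    # module.
    sample = (
        "Embeddings map text to vectors. Embeddings power semantic search. "
        "Chunking keeps long documents within a model's context window."
    )

    cleaned = preprocess_text(sample)
    print(chunk_text(cleaned, max_chunk_size=60, overlap=20))
    print(extract_keywords(cleaned))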