import re


def preprocess_text(text):
    """Clean and preprocess text for embedding."""
    # Convert to lowercase
    text = text.lower()
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
    text = re.sub(r'[^\w\s.,?!-]', '', text)
    return text


def chunk_text(text, max_chunk_size=1000, overlap=100):
    """Split text into overlapping chunks for processing long documents.

    Note: sizes are counted over sentence characters only, so the spaces
    added when joining sentences can push a chunk slightly past
    max_chunk_size.
    """
    # Split text into sentences at terminal punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence)
        if current_size + sentence_size <= max_chunk_size:
            current_chunk.append(sentence)
            current_size += sentence_size
        else:
            # Flush the current chunk to the list of chunks
            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Start a new chunk, carrying over trailing sentences as overlap
            if overlap > 0 and current_chunk:
                # Keep as many trailing sentences as fit in the overlap budget
                overlap_size = 0
                overlap_sentences = []
                for s in reversed(current_chunk):
                    if overlap_size + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_size += len(s)
                    else:
                        break
                current_chunk = overlap_sentences + [sentence]
                current_size = overlap_size + sentence_size
            else:
                current_chunk = [sentence]
                current_size = sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def extract_keywords(text, max_keywords=5):
    """Extract key terms from text using basic frequency analysis."""
    # Common words to exclude from the keyword candidates
    stopwords = {
        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as',
        'what', 'when', 'where', 'how', 'who', 'which', 'this', 'that',
        'to', 'in', 'on', 'for', 'with', 'by', 'about', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
        'do', 'does', 'did', 'of',
    }

    # Normalize and tokenize: lowercase words of three or more letters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Remove stopwords
    words = [word for word in words if word not in stopwords]

    # Count word frequencies
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1

    # Sort by frequency, most common first
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Return the top keywords
    return [word for word, count in sorted_words[:max_keywords]]
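

# --- Usage sketch (added illustration; the sample text and the parameter
# values below are assumptions, not part of the original module) ---
# Runs the pipeline end to end: clean a raw string, chunk it with sentence
# overlap, and extract keywords per chunk. Deliberately small chunk and
# overlap sizes are used so the carried-over sentences are visible in the
# printed output.

if __name__ == "__main__":
    raw = ("Chunking splits long documents.  Overlap preserves context. "
           "Each chunk is embedded separately. "
           "Keywords help with routing and search.")

    cleaned = preprocess_text(raw)
    for i, chunk in enumerate(chunk_text(cleaned, max_chunk_size=80, overlap=40)):
        print(f"chunk {i} ({len(chunk)} chars): {chunk}")
        print(f"  keywords: {extract_keywords(chunk, max_keywords=3)}")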