import re


def preprocess_text(text):
    """Clean and preprocess text for embedding."""
    # Convert to lowercase
    text = text.lower()

    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove special characters (keep word characters, spaces, and basic punctuation)
    text = re.sub(r'[^\w\s.,?!-]', '', text)

    return text
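
# Usage sketch (the sample string is illustrative, not from the original file):
#   preprocess_text("  Hello,   WORLD! (Test)  ")  ->  'hello, world! test'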


def chunk_text(text, max_chunk_size=1000, overlap=100):
    """Split text into chunks with overlap for processing long documents."""
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence)

        if current_size + sentence_size <= max_chunk_size:
            current_chunk.append(sentence)
            current_size += sentence_size
        else:
            # Add the current chunk to the list of chunks
            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Create a new chunk, potentially with overlap
            if overlap > 0 and current_chunk:
                # Walk backwards through the previous chunk, keeping trailing
                # sentences until the overlap budget is spent
                overlap_size = 0
                overlap_sentences = []

                for s in reversed(current_chunk):
                    if overlap_size + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_size += len(s)
                    else:
                        break

                current_chunk = overlap_sentences + [sentence]
                current_size = overlap_size + sentence_size
            else:
                current_chunk = [sentence]
                current_size = sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
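
# Usage sketch (sample text and sizes are illustrative, not from the original
# file). The middle chunk starts with "Two." carried over as overlap:
#   chunk_text("One. Two. Three. Four.", max_chunk_size=10, overlap=5)
#   ->  ['One. Two.', 'Two. Three.', 'Four.']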


def extract_keywords(text, max_keywords=5):
    """Extract key terms from text using basic frequency analysis."""
    # Common English stopwords to exclude from keyword candidates
    stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
                 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'to', 'in',
                 'on', 'for', 'with', 'by', 'about', 'is', 'are', 'was', 'were', 'be',
                 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'of'}

    # Normalize and tokenize: lowercase words of three or more letters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Remove stopwords
    words = [word for word in words if word not in stopwords]

    # Count word frequencies
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1

    # Sort by frequency, most frequent first
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, count in sorted_words[:max_keywords]]
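

if __name__ == "__main__":
    # Minimal demonstration of the three helpers chained together. The sample
    # text below is invented for illustration; it is not part of the original
    # module.
    sample = (
        "Embeddings map text to vectors. Embeddings power semantic search. "
        "Chunking keeps long documents within a model's context window."
    )

    cleaned = preprocess_text(sample)
    print(chunk_text(cleaned, max_chunk_size=60, overlap=20))
    print(extract_keywords(cleaned))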