"""Helpers for generating text embeddings and ranking documents by cosine similarity."""

import json
import os

import numpy as np
import requests

from config import OPENAI_API_KEY, OPENAI_EMBEDDING_MODEL


def generate_embedding(text, timeout=60):
    """Generate an embedding vector for text using OpenAI API via direct HTTP request.

    Args:
        text: Value sent as the ``input`` field of the embeddings request.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``
            argument so a stalled connection cannot block forever. Pass
            ``None`` to wait indefinitely.

    Returns:
        A numpy array built from ``data[0].embedding`` of the JSON response.

    Raises:
        Exception: If the API answers with a status code other than 200; the
            message carries the response body.
        requests.exceptions.RequestException: Propagated from ``requests`` on
            connection failures or when the timeout elapses.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }
    payload = {
        "input": text,
        "model": OPENAI_EMBEDDING_MODEL,
    }

    print(f"Requesting embedding using model: {OPENAI_EMBEDDING_MODEL}")
    response = requests.post(
        "https://api.openai.com/v1/embeddings",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Error from OpenAI API: {response.text}")

    # Pull the vector out once; it is used for both the log line and the result.
    embedding = response.json()["data"][0]["embedding"]
    print(f"Successfully generated embedding of dimension: {len(embedding)}")

    # Return the embedding as a numpy array
    return np.array(embedding)


def cosine_similarity(vec_a, vec_b):
    """Calculate cosine similarity between two vectors.

    Args:
        vec_a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The dot product divided by the product of the two norms, or ``0.0``
        when either vector has zero norm (the ratio is undefined there and
        would otherwise evaluate to ``nan`` with a runtime warning).
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product


def search_similar_documents(query_embedding, document_embeddings, top_k=5, threshold=0.7):
    """Find documents most similar to query based on embedding similarity.

    Args:
        query_embedding: Vector to compare every document against.
        document_embeddings: Iterable of mappings, each with an
            ``'embedding'`` entry holding that document's vector.
        top_k: Maximum number of results to return. ``None`` returns every
            match; zero or a negative value returns an empty list.
        threshold: Minimum similarity a document needs to be included.

    Returns:
        A list of ``(doc, similarity)`` tuples sorted by similarity, highest
        first, containing at most ``top_k`` entries.
    """
    # A negative slice bound would drop results from the tail instead of
    # limiting the count, so treat non-positive limits as "no results".
    if top_k is not None and top_k <= 0:
        return []

    similarities = []
    for doc in document_embeddings:
        similarity = cosine_similarity(query_embedding, np.array(doc['embedding']))
        # Only include documents above similarity threshold
        if similarity >= threshold:
            similarities.append((doc, similarity))

    # Sort by similarity (highest first)
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    # Return top_k results
    return similarities[:top_k]