49 lines
1.6 KiB
Python
49 lines
1.6 KiB
Python
import numpy as np
|
|
from openai import OpenAI
|
|
import httpx
|
|
from config import OPENAI_API_KEY, OPENAI_EMBEDDING_MODEL
|
|
import os
|
|
|
|
# Debug aid: echo a masked form of the API key (first 8 and last 4 characters only)
api_key = OPENAI_API_KEY
if not api_key:
    print("OpenAI API Key not found!")
else:
    masked_key = "...".join((api_key[:8], api_key[-4:]))
    print(f"OpenAI API Key: {masked_key}")

# Build the OpenAI client around an explicitly constructed httpx client to avoid proxy issues
http_client = httpx.Client()
client = OpenAI(
    http_client=http_client,
    api_key=OPENAI_API_KEY,
)
|
|
|
|
def generate_embedding(text):
    """Request an embedding for ``text`` and return it as a NumPy array.

    The request goes through the module-level ``client`` using the model
    named by ``OPENAI_EMBEDDING_MODEL``.

    Args:
        text: Input passed unchanged as the ``input`` argument of
            ``client.embeddings.create``.

    Returns:
        ``np.ndarray`` built from the ``embedding`` field of the first item
        in the response's ``data`` list.
    """
    result = client.embeddings.create(model=OPENAI_EMBEDDING_MODEL, input=text)

    # Only the first returned item is used.
    first_item = result.data[0]
    return np.array(first_item.embedding)
|
|
|
|
def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity between two vectors.

    Args:
        vec_a: First vector (array-like accepted by ``np.dot`` and
            ``np.linalg.norm``).
        vec_b: Second vector, compatible with ``vec_a`` for ``np.dot``.

    Returns:
        ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``, or ``0.0`` when
        the product of the norms is zero (i.e. at least one input is an
        all-zero vector).
    """
    # Compute the dot product first so incompatible shapes still raise.
    dot_product = np.dot(vec_a, vec_b)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)

    if norm_product == 0:
        # A zero vector has no direction; dividing would yield nan together
        # with a NumPy RuntimeWarning, so report "no similarity" instead.
        return 0.0

    return dot_product / norm_product
|
|
|
|
def search_similar_documents(query_embedding, document_embeddings, top_k=5, threshold=0.7):
    """Rank documents by cosine similarity to a query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        document_embeddings: Iterable of documents; each one is indexed with
            ``['embedding']`` to obtain its vector.
        top_k: Upper bound on the number of results, applied as a slice.
        threshold: Minimum similarity (inclusive) a document needs to be kept.

    Returns:
        List of ``(document, similarity)`` tuples, highest similarity first,
        sliced to ``top_k`` entries.
    """
    # Lazily pair every document with its similarity score, in input order.
    scored = (
        (entry, cosine_similarity(query_embedding, np.array(entry['embedding'])))
        for entry in document_embeddings
    )

    # Discard anything that falls below the similarity threshold.
    matches = [pair for pair in scored if pair[1] >= threshold]

    # Best matches first; the sort is stable, so ties keep their input order.
    ranked = sorted(matches, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]