56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
import numpy as np
|
|
import requests
|
|
import json
|
|
import os
|
|
from config import OPENAI_API_KEY, OPENAI_EMBEDDING_MODEL
|
|
|
|
def generate_embedding(text, timeout=30):
    """Generate an embedding vector for text using the OpenAI API via direct HTTP request.

    Args:
        text: The input to embed; sent as the ``input`` field of the request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.post``; without it a stalled connection
            would block the caller forever.

    Returns:
        A numpy array built from the first embedding in the API response.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On network-level failures,
            including exceeding ``timeout``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }

    payload = {
        "input": text,
        "model": OPENAI_EMBEDDING_MODEL,
    }

    print(f"Requesting embedding using model: {OPENAI_EMBEDDING_MODEL}")

    response = requests.post(
        "https://api.openai.com/v1/embeddings",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    if response.status_code != 200:
        # RuntimeError is still caught by existing `except Exception` handlers.
        raise RuntimeError(f"Error from OpenAI API: {response.text}")

    # Only the first entry of "data" is used; one input yields one embedding.
    embedding = response.json()["data"][0]["embedding"]
    print(f"Successfully generated embedding of dimension: {len(embedding)}")

    # Return the embedding as a numpy array
    return np.array(embedding)
|
|
|
|
def cosine_similarity(vec_a, vec_b):
    """Calculate cosine similarity between two vectors.

    Args:
        vec_a: First vector (array-like accepted by numpy).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined, so 0.0 is returned instead of dividing by zero (which
        would yield ``nan`` and a numpy runtime warning).
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product
|
|
|
|
def search_similar_documents(query_embedding, document_embeddings, top_k=5, threshold=0.7):
    """Rank documents by cosine similarity to a query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        document_embeddings: Iterable of documents; each is indexed with
            the ``'embedding'`` key to obtain its vector.
        top_k: Maximum number of results to return.
        threshold: Minimum similarity a document needs to be kept.

    Returns:
        A list of ``(doc, similarity)`` tuples, highest similarity first,
        truncated to ``top_k`` entries.
    """
    # Lazily score every document against the query, in input order.
    scored = (
        (entry, cosine_similarity(query_embedding, np.array(entry['embedding'])))
        for entry in document_embeddings
    )

    # Drop anything that falls below the similarity threshold.
    matches = [pair for pair in scored if pair[1] >= threshold]

    # Best matches first; the sort is stable, so ties keep input order.
    ranked = sorted(matches, key=lambda pair: pair[1], reverse=True)

    return ranked[:top_k]