Update app.py
Browse files
app.py
CHANGED
|
@@ -56,20 +56,94 @@ def chunk_document(text):
|
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
def test_single_model(model_name, chunks, question):
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def process_embeddings(document_text, progress=gr.Progress()):
|
| 75 |
"""Process document with all embedding models"""
|
|
|
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
def test_single_model(model_name, chunks, question):
    """Embed document chunks and a question with one model; rank chunks by similarity.

    Loads the named SentenceTransformer model, embeds ``chunks`` in small
    batches (to keep peak memory low), embeds ``question``, scores every chunk
    with cosine similarity, and returns the top ``TOP_K_RESULTS`` chunks.

    Parameters:
        model_name: sentence-transformers / Hugging Face model identifier.
        chunks: list of pre-chunked document text strings.
        question: query string to score against each chunk.

    Returns:
        On success: dict with keys 'status' ('success'), 'total_time',
        'load_time', 'embed_time', and 'top_chunks' (list of
        {'index', 'score', 'text'} dicts, best match first).
        On failure: dict with keys 'status' ('failed'), 'error', 'traceback'.
    """
    # NOTE(review): emoji in the log strings below were mojibake in the pasted
    # diff; restored to plausible glyphs — confirm against the original file.
    print(f"\n🤖 TESTING MODEL: {model_name}")
    print(f"📊 Input data: {len(chunks)} chunks, question: '{question[:50]}...'")
    log_memory_usage("Before model loading")

    # Empty input would otherwise reach np.vstack([]) and surface as a
    # confusing ValueError; fail fast with a clear message instead.
    if not chunks:
        return {
            'status': 'failed',
            'error': 'no chunks provided',
            'traceback': '',
        }

    try:
        start_time = time.time()

        # Load the model FIRST — this defines the 'model' variable used below.
        print(f"📥 Loading SentenceTransformer model: {model_name}")
        model = SentenceTransformer(model_name)
        load_time = time.time() - start_time
        print(f"✅ Model loaded successfully in {load_time:.2f}s")
        log_memory_usage("After model loading")

        # Create embeddings in batches
        print("🧮 Creating embeddings in batches...")
        embed_start = time.time()

        batch_size = 5  # Process 5 chunks at a time
        chunk_embeddings = []

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            print(f"📦 Processing batch {i//batch_size + 1}: chunks {i+1}-{min(i+batch_size, len(chunks))}")
            batch_embeddings = model.encode(batch, show_progress_bar=False)
            chunk_embeddings.append(batch_embeddings)
            print(f"✅ Batch {i//batch_size + 1} completed, shape: {batch_embeddings.shape}")

        # Combine all batch embeddings into one (n_chunks, dim) matrix
        chunk_embeddings = np.vstack(chunk_embeddings)
        print(f"🔗 All chunk embeddings combined shape: {chunk_embeddings.shape}")

        print("❓ Encoding question...")
        question_embedding = model.encode([question], show_progress_bar=False)
        print(f"📏 Question embedding shape: {question_embedding.shape}")

        embed_time = time.time() - embed_start
        print(f"✅ All embeddings created in {embed_time:.2f}s")
        log_memory_usage("After embedding creation")

        # Calculate similarities of the question against every chunk
        print("🔍 Calculating cosine similarities...")
        similarities = cosine_similarity(question_embedding, chunk_embeddings)[0]
        print(f"📈 Similarity scores - Min: {similarities.min():.3f}, Max: {similarities.max():.3f}, Mean: {similarities.mean():.3f}")

        # Top-K indices, best first (argsort is ascending, so take the tail and reverse)
        print(f"🏆 Finding top {TOP_K_RESULTS} results...")
        top_indices = np.argsort(similarities)[-TOP_K_RESULTS:][::-1]
        print(f"📋 Top indices: {top_indices.tolist()}")

        for rank, idx in enumerate(top_indices):
            score = similarities[idx]
            print(f"📍 Result {rank+1}: Chunk #{idx}, Score: {score:.3f}")

        total_time = time.time() - start_time
        print(f"⏱️ Total processing time: {total_time:.2f}s")

        results = {
            'status': 'success',
            'total_time': total_time,
            'load_time': load_time,
            'embed_time': embed_time,
            'top_chunks': [
                {
                    # Cast numpy scalars to builtins so the result dict is
                    # JSON-serializable downstream.
                    'index': int(idx),
                    'score': float(similarities[idx]),
                    'text': chunks[idx],
                }
                for idx in top_indices
            ],
        }

        print(f"✅ Model {model_name} completed successfully!")
        return results

    except Exception as e:
        # Boundary handler: report the failure to the caller's results table
        # instead of crashing the whole comparison run.
        print(f"❌ ERROR in model {model_name}:")
        print(f"📛 Error type: {type(e).__name__}")
        print(f"💬 Error message: {str(e)}")
        print("📜 Full traceback:")
        print(traceback.format_exc())

        return {
            'status': 'failed',
            'error': str(e),
            'traceback': traceback.format_exc(),
        }
|
| 147 |
|
| 148 |
def process_embeddings(document_text, progress=gr.Progress()):
|
| 149 |
"""Process document with all embedding models"""
|