VitaliyPolovyyEN committed on
Commit
f50042a
·
verified ·
1 Parent(s): eed7088

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -13
app.py CHANGED
@@ -56,20 +56,94 @@ def chunk_document(text):
56
  return chunks
57
 
58
  def test_single_model(model_name, chunks, question):
59
- # ... existing code until embedding creation
 
 
 
60
 
61
- print(f"๐Ÿงฎ Creating embeddings in batches...")
62
- batch_size = 5 # Process 5 chunks at a time
63
- chunk_embeddings = []
64
-
65
- for i in range(0, len(chunks), batch_size):
66
- batch = chunks[i:i+batch_size]
67
- print(f"๐Ÿ“ฆ Processing batch {i//batch_size + 1}: chunks {i+1}-{min(i+batch_size, len(chunks))}")
68
- batch_embeddings = model.encode(batch, show_progress_bar=False)
69
- chunk_embeddings.append(batch_embeddings)
70
-
71
- chunk_embeddings = np.vstack(chunk_embeddings)
72
- print(f"๐Ÿ“ All chunk embeddings shape: {chunk_embeddings.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def process_embeddings(document_text, progress=gr.Progress()):
75
  """Process document with all embedding models"""
 
56
  return chunks
57
 
58
def test_single_model(model_name, chunks, question):
    """Embed document chunks and a question with one SentenceTransformer model.

    Loads the model, encodes ``chunks`` in small batches (to bound peak
    memory on constrained hosts), encodes ``question``, ranks chunks by
    cosine similarity and reports the top ``TOP_K_RESULTS`` matches.
    Progress and memory usage are printed throughout.

    Args:
        model_name: Model id passed straight to ``SentenceTransformer``.
        chunks: List of document chunk strings to embed.
        question: Query string to rank the chunks against.

    Returns:
        On success: dict with ``'status': 'success'``, timing fields
        (``total_time``, ``load_time``, ``embed_time``) and ``top_chunks``
        — a list of ``{'index', 'score', 'text'}`` dicts, best match first.
        On any exception: dict with ``'status': 'failed'``, ``'error'``
        and ``'traceback'`` (the function never raises).
    """
    print(f"\n🤖 TESTING MODEL: {model_name}")
    print(f"📊 Input data: {len(chunks)} chunks, question: '{question[:50]}...'")
    log_memory_usage("Before model loading")

    try:
        start_time = time.time()

        # Load model FIRST - this defines the 'model' variable
        print(f"📥 Loading SentenceTransformer model: {model_name}")
        model = SentenceTransformer(model_name)
        load_time = time.time() - start_time
        print(f"✅ Model loaded successfully in {load_time:.2f}s")
        log_memory_usage("After model loading")

        # Create embeddings in batches so peak memory stays bounded.
        print("🧮 Creating embeddings in batches...")
        embed_start = time.time()

        batch_size = 5  # Process 5 chunks at a time
        chunk_embeddings = []

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            print(f"📦 Processing batch {i // batch_size + 1}: chunks {i + 1}-{min(i + batch_size, len(chunks))}")
            batch_embeddings = model.encode(batch, show_progress_bar=False)
            chunk_embeddings.append(batch_embeddings)
            print(f"✅ Batch {i // batch_size + 1} completed, shape: {batch_embeddings.shape}")

        # Combine all batch embeddings into one (n_chunks, dim) matrix.
        chunk_embeddings = np.vstack(chunk_embeddings)
        print(f"📏 All chunk embeddings combined shape: {chunk_embeddings.shape}")

        print("❓ Encoding question...")
        question_embedding = model.encode([question], show_progress_bar=False)
        print(f"📏 Question embedding shape: {question_embedding.shape}")

        embed_time = time.time() - embed_start
        print(f"✅ All embeddings created in {embed_time:.2f}s")
        log_memory_usage("After embedding creation")

        # Rank every chunk against the question.
        print("📊 Calculating cosine similarities...")
        similarities = cosine_similarity(question_embedding, chunk_embeddings)[0]
        print(f"📈 Similarity scores - Min: {similarities.min():.3f}, Max: {similarities.max():.3f}, Mean: {similarities.mean():.3f}")

        # Indices of the TOP_K_RESULTS highest scores, best-first.
        print(f"🔍 Finding top {TOP_K_RESULTS} results...")
        top_indices = np.argsort(similarities)[-TOP_K_RESULTS:][::-1]
        print(f"🏆 Top indices: {top_indices.tolist()}")

        for i, idx in enumerate(top_indices):
            score = similarities[idx]
            print(f"📌 Result {i + 1}: Chunk #{idx}, Score: {score:.3f}")

        total_time = time.time() - start_time
        print(f"⏱️ Total processing time: {total_time:.2f}s")

        results = {
            'status': 'success',
            'total_time': total_time,
            'load_time': load_time,
            'embed_time': embed_time,
            'top_chunks': [
                {
                    # Cast numpy scalars (np.int64 / np.float32) to native
                    # Python types so the result dict stays JSON/Gradio
                    # serializable downstream.
                    'index': int(idx),
                    'score': float(similarities[idx]),
                    'text': chunks[idx]
                }
                for idx in top_indices
            ]
        }

        print(f"✅ Model {model_name} completed successfully!")
        return results

    except Exception as e:
        # Best-effort: report the failure instead of crashing the whole
        # multi-model comparison run.
        print(f"❌ ERROR in model {model_name}:")
        print(f"🐛 Error type: {type(e).__name__}")
        print(f"💬 Error message: {str(e)}")
        print("📋 Full traceback:")
        print(traceback.format_exc())

        return {
            'status': 'failed',
            'error': str(e),
            'traceback': traceback.format_exc()
        }
147
 
148
  def process_embeddings(document_text, progress=gr.Progress()):
149
  """Process document with all embedding models"""