Spaces:

Enderchef
/

SuperBench-Eval

Sleeping

App Files Files Community

Enderchef committed on Jun 25

Commit

566e353

verified ·

1 Parent(s): ff1ae6f

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -97

app.py CHANGED Viewed

@@ -11,7 +11,6 @@ import spaces
 from datetime import datetime
 # --- Environment and Caching ---
 # It's good practice to ensure the cache directory exists.
 CACHE_DIR = "evaluation_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -26,14 +25,14 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # --- Constants for Benchmarks ---
 MMLU_DATASET = "cais/mmlu"
-MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 BENCHMARK_MAP = {
     "MMLU": MMLU_DATASET,
-    "MMLU-Pro": MMLU_PRO_DATASET
 }
 # --- Data Loading and Preparation ---
 def get_all_benchmark_options():
     """
     Fetches and caches the available subjects (configs) for each benchmark dataset.
@@ -41,8 +40,9 @@ def get_all_benchmark_options():
     """
     if benchmark_subject_cache:
         return benchmark_subject_cache
     print("Fetching benchmark configurations for the first time...")
     for key, dataset_id in BENCHMARK_MAP.items():
         try:
             # Fetching dataset configurations requires authentication if the dataset is private
@@ -57,7 +57,6 @@ def get_all_benchmark_options():
 # Initialize the cache on startup
 ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
 @spaces.GPU()
 def load_model(model_id):
     """
@@ -66,16 +65,14 @@ def load_model(model_id):
     """
     if not model_id:
         raise ValueError("Model ID cannot be empty.")
-    gr.Info(f"Attempting to load model: {model_id}...")
     if model_id in model_cache:
         gr.Info(f"Model '{model_id}' found in cache.")
         return model_cache[model_id]
     try:
         # Use bfloat16 for better performance on modern GPUs
         dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
         tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
@@ -84,7 +81,7 @@ def load_model(model_id):
             trust_remote_code=True,
             low_cpu_mem_usage=True, # Optimization for large models
         ).to("cuda" if torch.cuda.is_available() else "cpu")
         # Create the pipeline for text generation
         generator = pipeline(
             "text-generation",
@@ -92,7 +89,7 @@ def load_model(model_id):
             tokenizer=tokenizer,
             device=0 if torch.cuda.is_available() else -1
         )
         model_cache[model_id] = generator
         gr.Info(f"Model '{model_id}' loaded successfully.")
         return generator
@@ -100,9 +97,7 @@ def load_model(model_id):
         # Raise a more specific error to be caught by the main evaluation function
         raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
 # --- Evaluation Logic ---
 def format_prompt(item):
     """Formats the MMLU question and choices into a standardized prompt."""
     prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
@@ -121,12 +116,11 @@ def extract_predicted_letter(output_text):
     match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
     if match:
         return match.group(1).upper()
     # Fallback: if the model just outputs a letter
     match = re.search(r"^\s*([ABCD])\b", output_text.strip())
     if match:
         return match.group(1).upper()
     return None
 def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
@@ -150,23 +144,22 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
     for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
         prompt, correct_answer_idx = format_prompt(item)
         expected_letter = get_choice_letter(correct_answer_idx)
         # The generated text is often just after the prompt. We need to slice it.
         full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
         # Generate a short response, aiming for a single letter answer.
         # do_sample=False (greedy decoding) is crucial for reproducibility.
         raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
         # Isolate the newly generated part
         generated_text_only = raw_output[len(full_prompt_text):].strip()
         predicted_letter = extract_predicted_letter(generated_text_only)
         is_correct = (predicted_letter == expected_letter)
         if is_correct:
             correct_predictions += 1
         results_details.append({
             "Question": item['question'],
             "Correct": "✅" if is_correct else "❌",
@@ -174,11 +167,9 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
             "Predicted": predicted_letter or "N/A",
             "Model Output": generated_text_only
         })
     accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
     return accuracy, results_details
 @spaces.GPU()
 def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
     """
@@ -189,7 +180,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
     try:
         gr.Info("Starting evaluation...")
         generator = load_model(model_id)
         dataset_id = BENCHMARK_MAP.get(benchmark_category)
         if not dataset_id:
             raise ValueError(f"Invalid benchmark category: {benchmark_category}")
@@ -198,7 +189,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
         summary_lines = []
         total_correct = 0
         total_samples = 0
         subjects_to_run = []
         if subject_name == "ALL":
             # Exclude the "ALL" placeholder from the list of subjects to run
@@ -219,23 +210,22 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
             gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
             try:
                 accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
                 all_results_details.extend(subject_details)
                 num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
                 num_evaluated = len(subject_details)
                 total_correct += num_correct
                 total_samples += num_evaluated
                 summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
             except Exception as e:
                 error_trace = traceback.format_exc()
                 gr.Error(f"Skipping {subject} due to an error: {e}")
                 summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
                 continue
         overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
         # --- Prepare Outputs ---
         if subject_name == "ALL":
             result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
@@ -244,7 +234,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
         else:
             result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
             result_summary += f"({total_correct:,}/{total_samples:,} correct)"
         # Save results for leaderboard
         record = {
             "model_id": model_id,
@@ -256,11 +246,11 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
         }
         with open(EVAL_FILE, "a") as f:
             f.write(json.dumps(record) + "\n")
         gr.Info("Evaluation completed successfully!")
         df_details = pd.DataFrame(all_results_details)
         # Return a dictionary of component updates
         return {
             result_summary_output: gr.update(value=result_summary, visible=True),
@@ -268,12 +258,11 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
             details_box: gr.update(visible=True),
             detailed_results_df: gr.update(value=df_details)
         }
     except Exception as e:
         error_message = f"An unexpected error occurred during setup: {e}"
         error_details = traceback.format_exc()
         gr.Error(error_message)
         return {
             result_summary_output: gr.update(visible=False),
             error_box: gr.update(visible=True),
@@ -282,9 +271,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
             details_box: gr.update(visible=False)
         }
 # --- UI Helper Functions ---
 def update_subject_dropdown(benchmark_category):
     """Updates the subject dropdown choices based on the selected benchmark."""
     choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
@@ -300,7 +287,7 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
     try:
         if not os.path.exists(EVAL_FILE):
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         df = pd.read_json(EVAL_FILE, lines=True)
         if df.empty:
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
@@ -308,22 +295,21 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
         # Coerce accuracy to numeric and filter valid entries
         df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
         df.dropna(subset=['accuracy'], inplace=True)
         # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
         df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
         if df_filtered.empty:
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         # Find the latest evaluation for each model
         df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
         latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
         leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
         # Add Rank
         leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
         # Rename and format columns
         leaderboard_df.rename(columns={
             'model_id': 'Model ID',
@@ -331,67 +317,169 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
             'sample_count': 'Total Samples',
             'timestamp': 'Date'
         }, inplace=True)
         leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
         leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
         progress(1, desc="Done.")
         return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc()
         return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
 # --- Gradio Interface Definition ---
-with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="""
-    /* --- Global & Layout --- */
-    body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
-    .gradio-container { max-width: 1280px !important; margin: auto; }
-    .gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; background-color: white; }
-    /* --- Typography --- */
-    h1 { text-align: center; font-size: 2.5rem !important; font-weight: 800; color: #212529; margin-bottom: 0.5rem; letter-spacing: -1.5px; }
-    .subtitle { text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem; max-width: 800px; margin-left: auto; margin-right: auto;}
-    /* --- Buttons & Inputs --- */
-    .gr-button { font-weight: 600 !important; transition: all 0.2s ease; }
-    .gr-button-primary { box-shadow: 0 4px 10px rgba(59, 130, 246, 0.2); }
-    .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(59, 130, 246, 0.3); }
     /* --- Custom Radio Buttons (Segmented Control) --- */
     #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
-    #leaderboard-toggle { background-color: #e9ecef; padding: 5px; border-radius: 10px; display: inline-flex; }
     #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
     #leaderboard-toggle input[type='radio'] { display: none; }
-    #leaderboard-toggle label { padding: 8px 16px; border-radius: 8px; cursor: pointer; transition: all 0.3s ease; font-weight: 500; color: #495057; background: transparent; border: none; box-shadow: none; }
-    #leaderboard-toggle input[type='radio']:checked + label { background-color: white; color: #0d6efd; font-weight: 600; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
     /* --- Dataframe / Table Styling --- */
     .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
-    .leaderboard-table .gr-dataframe thead th { background-color: #f8f9fa !important; color: #495057 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #dee2e6; }
-    .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #fdfdff; }
-    .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #f0f6ff; }
-    .leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #e9ecef; }
-    .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #495057; }
     /* --- Error & Result Panes --- */
-    #error-display-box { background-color: #fff3f3 !important; border-color: #ffc9c9 !important; }
-    #result-summary-box { background-color: #f3f9ff !important; border-color: #cde4ff !important; }
-""") as demo:
     gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
-    gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
     with gr.Tabs() as tabs:
         # --- Leaderboard Tab ---
         with gr.TabItem("📊 Leaderboard", id=0):
             with gr.Column():
                 with gr.Row(elem_id="leaderboard-toggle-group"):
                     leaderboard_type_toggle = gr.Radio(
-                        ["MMLU", "MMLU-Pro"],
                         label="Select Benchmark",
                         value="MMLU",
                         interactive=True,
@@ -400,15 +488,15 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                         show_label=False,
                     )
                     refresh_button = gr.Button("🔄 Refresh", size="sm")
                 leaderboard_table_output = gr.DataFrame(
                     headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
                     interactive=False,
                     datatype=["number", "str", "str", "number", "str"],
-                    row_count=15,
-                    elem_classes="leaderboard-table"
                 )
         # --- Evaluation Tab ---
         with gr.TabItem("🚀 Run Evaluation", id=1):
             with gr.Row(variant='panel'):
@@ -418,10 +506,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                         model_id_input = gr.Textbox(
                             label="Hugging Face Model ID",
                             placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
-                            interactive=True
                         )
                         benchmark_selection_radio = gr.Radio(
-                            ["MMLU", "MMLU-Pro"],
                             label="Benchmark",
                             value="MMLU",
                             interactive=True,
@@ -429,7 +519,8 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                         with gr.Row():
                             benchmark_subject_dropdown = gr.Dropdown(
                                 label="Subject",
-                                choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
                                 value="ALL",
                                 interactive=True
                             )
@@ -437,21 +528,20 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                                 label="Samples per Subject",
                                 minimum=5, maximum=100, value=25, step=5, interactive=True
                             )
                     run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
                 with gr.Column(scale=3):
                     gr.Markdown("### 2. View Results")
                     # Panel for displaying the summary of results
                     with gr.Group(visible=False) as result_summary_box:
                         result_summary_output = gr.Markdown(elem_id="result-summary-box")
                     # Panel for displaying errors
                     with gr.Group(visible=False) as error_box:
                         error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
                         error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
                     # Panel for detailed, row-by-row results
                     with gr.Group(visible=False) as details_box:
                         gr.Markdown("#### Detailed Evaluation Log")
@@ -459,20 +549,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                             headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
                             datatype=["str", "str", "str", "str", "str"],
                             interactive=False,
-                            row_count=10,
-                            col_count=5,
                             wrap=True,
                         )
-    # --- Event Handlers & Logic ---
     # Update subject dropdown when benchmark type changes
     benchmark_selection_radio.change(
         fn=update_subject_dropdown,
         inputs=[benchmark_selection_radio],
         outputs=[benchmark_subject_dropdown]
     )
     # Main evaluation trigger
     run_button.click(
         fn=run_evaluation,
@@ -506,4 +595,4 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
 # Launch the Gradio app
 if __name__ == "__main__":
-    demo.launch(debug=True)

 from datetime import datetime
 # --- Environment and Caching ---
 # It's good practice to ensure the cache directory exists.
 CACHE_DIR = "evaluation_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 # --- Constants for Benchmarks ---
 MMLU_DATASET = "cais/mmlu"
+# Temporarily remove MMLU-Pro references
+# MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 BENCHMARK_MAP = {
     "MMLU": MMLU_DATASET,
+    # "MMLU-Pro": MMLU_PRO_DATASET # Temporarily removed
 }
 # --- Data Loading and Preparation ---
 def get_all_benchmark_options():
     """
     Fetches and caches the available subjects (configs) for each benchmark dataset.
     """
     if benchmark_subject_cache:
         return benchmark_subject_cache
     print("Fetching benchmark configurations for the first time...")
+    # Only iterate over the allowed benchmarks (MMLU)
     for key, dataset_id in BENCHMARK_MAP.items():
         try:
             # Fetching dataset configurations requires authentication if the dataset is private
 # Initialize the cache on startup
 ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
 @spaces.GPU()
 def load_model(model_id):
     """
     """
     if not model_id:
         raise ValueError("Model ID cannot be empty.")
+        gr.Info(f"Attempting to load model: {model_id}...")
     if model_id in model_cache:
         gr.Info(f"Model '{model_id}' found in cache.")
         return model_cache[model_id]
     try:
         # Use bfloat16 for better performance on modern GPUs
         dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
         tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             trust_remote_code=True,
             low_cpu_mem_usage=True, # Optimization for large models
         ).to("cuda" if torch.cuda.is_available() else "cpu")
         # Create the pipeline for text generation
         generator = pipeline(
             "text-generation",
             tokenizer=tokenizer,
             device=0 if torch.cuda.is_available() else -1
         )
         model_cache[model_id] = generator
         gr.Info(f"Model '{model_id}' loaded successfully.")
         return generator
         # Raise a more specific error to be caught by the main evaluation function
         raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
 # --- Evaluation Logic ---
 def format_prompt(item):
     """Formats the MMLU question and choices into a standardized prompt."""
     prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
     match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
     if match:
         return match.group(1).upper()
     # Fallback: if the model just outputs a letter
     match = re.search(r"^\s*([ABCD])\b", output_text.strip())
     if match:
         return match.group(1).upper()
     return None
 def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
     for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
         prompt, correct_answer_idx = format_prompt(item)
         expected_letter = get_choice_letter(correct_answer_idx)
         # The generated text is often just after the prompt. We need to slice it.
         full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
         # Generate a short response, aiming for a single letter answer.
         # do_sample=False (greedy decoding) is crucial for reproducibility.
         raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
         # Isolate the newly generated part
         generated_text_only = raw_output[len(full_prompt_text):].strip()
         predicted_letter = extract_predicted_letter(generated_text_only)
         is_correct = (predicted_letter == expected_letter)
         if is_correct:
             correct_predictions += 1
         results_details.append({
             "Question": item['question'],
             "Correct": "✅" if is_correct else "❌",
             "Predicted": predicted_letter or "N/A",
             "Model Output": generated_text_only
         })
     accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
     return accuracy, results_details
 @spaces.GPU()
 def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
     """
     try:
         gr.Info("Starting evaluation...")
         generator = load_model(model_id)
         dataset_id = BENCHMARK_MAP.get(benchmark_category)
         if not dataset_id:
             raise ValueError(f"Invalid benchmark category: {benchmark_category}")
         summary_lines = []
         total_correct = 0
         total_samples = 0
         subjects_to_run = []
         if subject_name == "ALL":
             # Exclude the "ALL" placeholder from the list of subjects to run
             gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
             try:
                 accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
                 all_results_details.extend(subject_details)
                 num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
                 num_evaluated = len(subject_details)
                 total_correct += num_correct
                 total_samples += num_evaluated
                 summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
             except Exception as e:
                 error_trace = traceback.format_exc()
                 gr.Error(f"Skipping {subject} due to an error: {e}")
                 summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
                 continue
         overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
         # --- Prepare Outputs ---
         if subject_name == "ALL":
             result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
         else:
             result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
             result_summary += f"({total_correct:,}/{total_samples:,} correct)"
         # Save results for leaderboard
         record = {
             "model_id": model_id,
         }
         with open(EVAL_FILE, "a") as f:
             f.write(json.dumps(record) + "\n")
         gr.Info("Evaluation completed successfully!")
         df_details = pd.DataFrame(all_results_details)
         # Return a dictionary of component updates
         return {
             result_summary_output: gr.update(value=result_summary, visible=True),
             details_box: gr.update(visible=True),
             detailed_results_df: gr.update(value=df_details)
         }
     except Exception as e:
         error_message = f"An unexpected error occurred during setup: {e}"
         error_details = traceback.format_exc()
         gr.Error(error_message)
         return {
             result_summary_output: gr.update(visible=False),
             error_box: gr.update(visible=True),
             details_box: gr.update(visible=False)
         }
 # --- UI Helper Functions ---
 def update_subject_dropdown(benchmark_category):
     """Updates the subject dropdown choices based on the selected benchmark."""
     choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
     try:
         if not os.path.exists(EVAL_FILE):
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         df = pd.read_json(EVAL_FILE, lines=True)
         if df.empty:
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         # Coerce accuracy to numeric and filter valid entries
         df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
         df.dropna(subset=['accuracy'], inplace=True)
         # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
         df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
         if df_filtered.empty:
             return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         # Find the latest evaluation for each model
         df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
         latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
         leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
         # Add Rank
         leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
         # Rename and format columns
         leaderboard_df.rename(columns={
             'model_id': 'Model ID',
             'sample_count': 'Total Samples',
             'timestamp': 'Date'
         }, inplace=True)
         leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
         leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
         progress(1, desc="Done.")
         return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc()
         return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
 # --- Gradio Interface Definition ---
+# Black/Orange Theme and bigger to fit screen
+custom_css = """
+    /* --- Global & Layout (Bigger to fit screen) --- */
+    body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
+    .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
+    .gr-group {
+        border-radius: 12px !important;
+        box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; /* Darker shadow */
+        border: 1px solid #333 !important; /* Darker border */
+        background-color: #2a2a2a; /* Darker group background */
+    }
+    .gr-panel {
+        border-radius: 12px !important;
+        box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
+        border: 1px solid #333 !important;
+        background-color: #2a2a2a;
+    }
+    /* --- Typography (Orange Hues) --- */
+    h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
+    h3, h4 { color: #ffa500; } /* Orange headings */
+    .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
+    label { color: #f0f0f0 !important; } /* Label text color */
+    /* --- Tabs --- */
+    .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
+    .gradio-tab-item { color: #f0f0f0; }
+    .gradio-tabs button {
+        background-color: #3a3a3a !important;
+        color: #f0f0f0 !important;
+        border-radius: 8px 8px 0 0 !important;
+        transition: all 0.3s ease;
+    }
+    .gradio-tabs button.selected {
+        background-color: #ff8c00 !important; /* Orange selected tab */
+        color: #1a1a1a !important; /* Dark text on orange */
+        font-weight: 700;
+    }
+    .gradio-tabs button:hover { background-color: #555 !important; }
+    /* --- Inputs --- */
+    .gr-textbox, .gr-dropdown, .gr-slider {
+        background-color: #3a3a3a !important;
+        color: #f0f0f0 !important;
+        border: 1px solid #555 !important;
+        border-radius: 8px !important;
+    }
+    .gr-textbox textarea, .gr-textbox input, .gr-dropdown input {
+        color: #f0f0f0 !important;
+    }
+    .gr-textbox.gr-text-input:focus-within {
+        border-color: #ff8c00 !important; /* Orange focus border */
+        box-shadow: 0 0 0 2px rgba(255, 140, 0, 0.5) !important;
+    }
+    /* --- Buttons --- */
+    .gr-button { font-weight: 600 !important; transition: all 0.2s ease; border-radius: 8px !important; }
+    .gr-button-primary {
+        background-color: #ff8c00 !important; /* Orange primary button */
+        color: #1a1a1a !important;
+        box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3);
+        border: none;
+    }
+    .gr-button-primary:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5);
+        background-color: #ffa500 !important; /* Slightly lighter orange on hover */
+    }
+    .gr-button-secondary {
+        background-color: #444 !important;
+        color: #f0f0f0 !important;
+        border: 1px solid #555 !important;
+    }
+    .gr-button-secondary:hover {
+        background-color: #555 !important;
+    }
     /* --- Custom Radio Buttons (Segmented Control) --- */
     #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
+    #leaderboard-toggle { background-color: #3a3a3a; padding: 5px; border-radius: 10px; display: inline-flex; border: 1px solid #555; }
     #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
     #leaderboard-toggle input[type='radio'] { display: none; }
+    #leaderboard-toggle label {
+        padding: 8px 16px;
+        border-radius: 8px;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        font-weight: 500;
+        color: #f0f0f0;
+        background: transparent;
+        border: none;
+        box-shadow: none;
+    }
+    #leaderboard-toggle input[type='radio']:checked + label {
+        background-color: #ff8c00; /* Orange selected */
+        color: #1a1a1a;
+        font-weight: 600;
+        box-shadow: 0 2px 5px rgba(255, 140, 0, 0.3);
+    }
+    #leaderboard-toggle label:hover {
+        background-color: #555;
+    }
     /* --- Dataframe / Table Styling --- */
     .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
+    .leaderboard-table .gr-dataframe thead th {
+        background-color: #3a3a3a !important;
+        color: #ffa500 !important; /* Orange headers */
+        font-weight: 600 !important;
+        text-align: left;
+        padding: 12px 15px;
+        border-bottom: 2px solid #555;
+    }
+    .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; } /* Alternating row color */
+    .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; } /* Hover effect */
+    .leaderboard-table .gr-dataframe tbody td {
+        padding: 12px 15px;
+        border-bottom: 1px solid #3a3a3a;
+        color: #f0f0f0;
+    }
+    .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #ffcc99; } /* Lighter orange for rank */
     /* --- Error & Result Panes --- */
+    #error-display-box {
+        background-color: #4a1e1e !important; /* Dark red for error */
+        border-color: #8c2f2f !important;
+        color: #ffc9c9 !important; /* Lighter red text */
+    }
+    #result-summary-box {
+        background-color: #1e3a2a !important; /* Dark green for success */
+        border-color: #2f8c4a !important;
+        color: #c9ffc9 !important; /* Lighter green text */
+    }
+    .gr-markdown p { color: #f0f0f0 !important; } /* Ensure markdown paragraph text is visible */
+    .gr-markdown strong { color: #ffa500 !important; } /* Strong text in orange */
+    .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; } /* Gradio Info messages */
+"""
+with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
     # Top-level page layout: a title, a subtitle paragraph, and a tab container.
     # `custom_css` is passed to gr.Blocks together with the Base theme.
     gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
+    gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
     with gr.Tabs() as tabs:
         # --- Leaderboard Tab ---
         with gr.TabItem("📊 Leaderboard", id=0):
             with gr.Column():
                 # Toolbar row holding the benchmark selector and a refresh button.
                 # Its elem_id matches the `#leaderboard-toggle-group` CSS selector.
                 with gr.Row(elem_id="leaderboard-toggle-group"):
+                    # Temporarily remove MMLU-Pro from radio options
                     # NOTE(review): the stylesheet also has `#leaderboard-toggle` rules, but
                     # no elem_id is set on this Radio in the visible lines — confirm.
                     leaderboard_type_toggle = gr.Radio(
+                        ["MMLU"],
                         label="Select Benchmark",
                         value="MMLU",
                         interactive=True,
                         show_label=False,
                     )
                     refresh_button = gr.Button("🔄 Refresh", size="sm")
                 # Read-only results table: five headers paired with five datatypes.
                 # elem_classes matches the `.leaderboard-table` CSS selectors.
                 leaderboard_table_output = gr.DataFrame(
                     headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
                     interactive=False,
                     datatype=["number", "str", "str", "number", "str"],
+                    row_count=15, # Adjusted for more rows
+                    elem_classes="leaderboard-table",
+                    # Removed col_count to allow dynamic width
                 )
         # --- Evaluation Tab ---
         # Form for configuring a run (model, benchmark, subject, sample count) plus
         # initially hidden panels for the summary, errors, and per-question details.
         with gr.TabItem("🚀 Run Evaluation", id=1):
             with gr.Row(variant='panel'):
                 # NOTE(review): the indentation jump below suggests this view omits some
                 # enclosing lines (presumably a column/heading) — confirm in the full file.
                         model_id_input = gr.Textbox(
                             label="Hugging Face Model ID",
                             placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
+                            interactive=True,
+                            scale=2 # Increased scale for textbox
                         )
+                        # Temporarily remove MMLU-Pro from radio options
                         benchmark_selection_radio = gr.Radio(
+                            ["MMLU"],
                             label="Benchmark",
                             value="MMLU",
                             interactive=True,
                         # NOTE(review): the closing parenthesis of the Radio call above is
                         # not visible in this view — presumably elided; confirm.
                         with gr.Row():
                             # Subject choices come from the MMLU entry of
                             # ALL_BENCHMARK_SUBJECTS, falling back to an empty list.
                             benchmark_subject_dropdown = gr.Dropdown(
                                 label="Subject",
+                                # Ensure only MMLU subjects are fetched
+                                choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
                                 value="ALL",
                                 interactive=True
                             )
                             # NOTE(review): the constructor line for the "Samples per Subject"
                             # component is not visible here (presumably a slider) — confirm.
                                 label="Samples per Subject",
                                 minimum=5, maximum=100, value=25, step=5, interactive=True
                             )
                     run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
                 with gr.Column(scale=3):
                     gr.Markdown("### 2. View Results")
                     # Panel for displaying the summary of results (created hidden)
                     with gr.Group(visible=False) as result_summary_box:
                         result_summary_output = gr.Markdown(elem_id="result-summary-box")
                     # Panel for displaying errors (created hidden): a short message plus
                     # an 8-line box for the traceback
                     with gr.Group(visible=False) as error_box:
                         error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
                         error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
                     # Panel for detailed, row-by-row results (created hidden)
                     with gr.Group(visible=False) as details_box:
                         gr.Markdown("#### Detailed Evaluation Log")
                         # NOTE(review): the constructor line for this table is not visible
                         # here (presumably a gr.DataFrame assignment) — confirm.
                             headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
                             datatype=["str", "str", "str", "str", "str"],
                             interactive=False,
+                            row_count=10, # Adjusted for more rows
+                            # Removed col_count to allow dynamic width
                             wrap=True,
                         )
+    # --- Event Handlers & Logic ---
     # Update subject dropdown when benchmark type changes: the radio is the only
     # input to update_subject_dropdown, and the subject dropdown is its only output.
     benchmark_selection_radio.change(
         fn=update_subject_dropdown,
         inputs=[benchmark_selection_radio],
         outputs=[benchmark_subject_dropdown]
     )
     # Main evaluation trigger: clicking the run button calls run_evaluation.
     # NOTE(review): the remaining arguments of this call (inputs/outputs) and its
     # closing parenthesis are not visible in this view — confirm in the full file.
     run_button.click(
         fn=run_evaluation,
 # Launch the Gradio app only when this file is executed as a script, not when
 # it is imported. `debug=True` is passed to launch(); see the Gradio
 # `Blocks.launch` documentation for its exact effect.
 if __name__ == "__main__":
+    demo.launch(debug=True)