Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

App Files Files Community

Enderchef committed on Jun 25, 2025

Commit

ca30b1d

verified ·

1 Parent(s): cda939c

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -38

app.py CHANGED Viewed

@@ -26,13 +26,12 @@ def get_all_benchmark_options():
     and a flattened list suitable for a Gradio dropdown.
     """
     all_options = {}
-    gr_dropdown_options = []
     # Get subjects for MMLU
     try:
         mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
         all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
-        gr_dropdown_options.extend([f"MMLU - {s}" for s in all_options[MMLU_DATASET]])
     except Exception as e:
         print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
         all_options[MMLU_DATASET] = []
@@ -41,15 +40,19 @@ def get_all_benchmark_options():
     try:
         mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
         all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
-        gr_dropdown_options.extend([f"MMLU-Pro - {s}" for s in all_options[MMLU_PRO_DATASET]])
     except Exception as e:
         print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
         all_options[MMLU_PRO_DATASET] = []
     return all_options, gr_dropdown_options
 # Initialize these once globally when the app starts
-ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
 def load_model(model_id):
@@ -186,7 +189,7 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
     return accuracy, subject_results
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
-def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
     """
     Main function to orchestrate the evaluation process.
     Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
@@ -198,25 +201,15 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         # Return updates to hide logs/debug and show empty results
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-    # Parse the selected benchmark and subject from the dropdown string
-    parts = selected_benchmark_subject.split(" - ")
-    if len(parts) != 2:
-        gr.Error("Invalid benchmark selection format. Please select from the dropdown.")
-        return "", gr.update(value="", visible=False), gr.update(visible=False), \
-               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-    benchmark_name = parts[0]
-    subject_name = parts[1]
     dataset_id_map = {
         "MMLU": MMLU_DATASET,
         "MMLU-Pro": MMLU_PRO_DATASET
     }
-    current_dataset_id = dataset_id_map.get(benchmark_name)
     if not current_dataset_id:
-        gr.Error(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
@@ -234,12 +227,12 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
                 subjects_to_evaluate.remove("ALL")
             if not subjects_to_evaluate:
-                gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
                 return "", gr.update(value="", visible=False), gr.update(visible=False), \
                        gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
-                gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                 try:
                     accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                     all_evaluation_results.extend(subject_details)
@@ -249,14 +242,14 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
                     total_correct_overall += num_correct_in_subject
                     total_samples_overall += num_evaluated_samples
-                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
                 except Exception as e:
-                    gr.Error(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
-                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
                     continue
             overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
-            score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
             score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
         else:
@@ -264,7 +257,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
             all_evaluation_results.extend(subject_details)
             overall_accuracy = accuracy
             num_evaluated_samples = len(subject_details)
-            score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
         # Format detailed results for display in the text box
         formatted_details = "\n\n".join([
@@ -283,7 +276,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         # Record the evaluation result to a JSONL file for the leaderboard
         record = {
             "model_id": model_id,
-            "benchmark": benchmark_name,
             "subject": subject_name,
             "accuracy": overall_accuracy,
             "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
@@ -360,6 +353,24 @@ def load_leaderboard(benchmark_filter):
         traceback.print_exc() # Print full traceback for debugging
         return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
@@ -564,12 +575,69 @@ with gr.Blocks(css="""
         border-bottom-right-radius: 12px;
     }
-    /* Horizontal line for separation */
-    hr {
-        border: none;
-        border-top: 1px solid #e2e8f0;
-        margin: 30px 0;
     }
 """) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
@@ -592,19 +660,30 @@ with gr.Blocks(css="""
                     placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
                     interactive=True
                 )
                 with gr.Row():
                     benchmark_subject_dropdown = gr.Dropdown(
-                        label="Choose Benchmark and Subject",
-                        choices=GRADIO_DROPDOWN_OPTIONS,
-                        value="MMLU - ALL", # Default to MMLU ALL for initial load
                         interactive=True,
-                        min_width=400 # Ensure sufficient width
                     )
                     sample_count_slider = gr.Slider(
                         label="Number of Samples per Subject (1-100)",
                         minimum=1,
                         maximum=100,
-                        value=10, # Default to 10 samples
                         step=1,
                         interactive=True,
                         min_width=200
@@ -648,7 +727,7 @@ with gr.Blocks(css="""
             # Define button click actions
             run_button.click(
                 run_evaluation,
-                inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
                 outputs=[
                     acc_output,
                     error_message_output, debug_error_column, # For error state
@@ -656,6 +735,13 @@ with gr.Blocks(css="""
                 ]
             )
             # Toggle visibility of detail_output
             show_details_button.click(
                 lambda s: gr.update(visible=not s), # Toggle visibility
@@ -722,4 +808,4 @@ with gr.Blocks(css="""
             leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 # Launch the Gradio app
-demo.launch()

     and a flattened list suitable for a Gradio dropdown.
     """
     all_options = {}
+    gr_dropdown_options = [] # This is for initial display only, not used for dynamic updates directly
     # Get subjects for MMLU
     try:
         mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
         all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
     except Exception as e:
         print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
         all_options[MMLU_DATASET] = []
     try:
         mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
         all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
     except Exception as e:
         print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
         all_options[MMLU_PRO_DATASET] = []
+    # Flattened list for the initial state of the subject dropdown (e.g., MMLU subjects)
+    if MMLU_DATASET in all_options:
+        gr_dropdown_options.extend(all_options[MMLU_DATASET])
     return all_options, gr_dropdown_options
 # Initialize these once globally when the app starts
+ALL_BENCHMARK_SUBJECTS, INITIAL_GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
 def load_model(model_id):
     return accuracy, subject_results
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
+def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress()):
     """
     Main function to orchestrate the evaluation process.
     Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
         # Return updates to hide logs/debug and show empty results
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
     dataset_id_map = {
         "MMLU": MMLU_DATASET,
         "MMLU-Pro": MMLU_PRO_DATASET
     }
+    current_dataset_id = dataset_id_map.get(benchmark_category)
     if not current_dataset_id:
+        gr.Error(f"Unknown benchmark category selected: {benchmark_category}. This should not happen.")
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
                 subjects_to_evaluate.remove("ALL")
             if not subjects_to_evaluate:
+                gr.Warning(f"No subjects found to evaluate for '{benchmark_category}'.")
                 return "", gr.update(value="", visible=False), gr.update(visible=False), \
                        gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
+            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_category} subjects")):
+                gr.Info(f"Evaluating {benchmark_category} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                 try:
                     accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                     all_evaluation_results.extend(subject_details)
                     total_correct_overall += num_correct_in_subject
                     total_samples_overall += num_evaluated_samples
+                    eval_summary_lines.append(f"- {benchmark_category} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
                 except Exception as e:
+                    gr.Error(f"Skipping {benchmark_category} - {sub} due to an error: {e}")
+                    eval_summary_lines.append(f"- {benchmark_category} - {sub}: Error during evaluation.")
                     continue
             overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
+            score_string = f"Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
             score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
         else:
             all_evaluation_results.extend(subject_details)
             overall_accuracy = accuracy
             num_evaluated_samples = len(subject_details)
+            score_string = f"Accuracy for {benchmark_category} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
         # Format detailed results for display in the text box
         formatted_details = "\n\n".join([
         # Record the evaluation result to a JSONL file for the leaderboard
         record = {
             "model_id": model_id,
+            "benchmark": benchmark_category,
             "subject": subject_name,
             "accuracy": overall_accuracy,
             "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
         traceback.print_exc() # Print full traceback for debugging
         return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+def update_subject_dropdown_choices(benchmark_category):
+    """
+    Build the Gradio update for the subject dropdown when the benchmark changes.
+
+    Args:
+        benchmark_category: Display name of the selected benchmark; only
+            "MMLU" and "MMLU-Pro" are mapped to a dataset id here.
+
+    Returns:
+        A ``gr.update(...)`` carrying the subject list stored in
+        ``ALL_BENCHMARK_SUBJECTS`` for the matching dataset, with the value
+        set to "ALL" when present, else the first subject, else ``None``.
+        For an unmapped category, or a dataset id missing from
+        ``ALL_BENCHMARK_SUBJECTS``, the update clears the dropdown
+        (``choices=[]``, ``value=None``).
+    """
+    # Translate the user-facing benchmark label into its dataset identifier.
+    dataset_id_map = {
+        "MMLU": MMLU_DATASET,
+        "MMLU-Pro": MMLU_PRO_DATASET
+    }
+    # .get() yields None for an unknown label, which falls through to the else branch.
+    selected_dataset_id = dataset_id_map.get(benchmark_category)
+    if selected_dataset_id and selected_dataset_id in ALL_BENCHMARK_SUBJECTS:
+        new_choices = ALL_BENCHMARK_SUBJECTS[selected_dataset_id]
+        # Set default value to "ALL" if available, otherwise the first subject
+        # (None when the stored subject list is empty).
+        default_value = "ALL" if "ALL" in new_choices else (new_choices[0] if new_choices else None)
+        return gr.update(choices=new_choices, value=default_value)
+    else:
+        # No subject list is available for this selection: clear the dropdown.
+        return gr.update(choices=[], value=None)
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
         border-bottom-right-radius: 12px;
     }
+    /* Radio button group for leaderboard */
+    #leaderboard-toggle.gr-form {
+        display: flex;
+        justify-content: center;
+        padding: 0px 0px 20px 0px; /* Reduced padding for more compact look */
+    }
+    #leaderboard-toggle label.gr-radio-label {
+        font-size: 1.1em;
+        font-weight: 600;
+        color: #2d3748;
+        padding: 10px 20px;
+        border-radius: 8px;
+        background-color: #edf2f7; /* Light background for unselected */
+        border: 1px solid #e2e8f0;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        margin: 0 5px; /* Spacing between radio buttons */
+    }
+    #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label {
+        background-color: #2f80ed; /* Blue for selected */
+        color: white;
+        border-color: #2f80ed;
+        box-shadow: 0 3px 10px rgba(47, 128, 237, 0.3);
+    }
+    #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label:hover {
+        background-color: #1a6dcd; /* Darker blue on hover */
+    }
+    #leaderboard-toggle label.gr-radio-label:hover {
+        background-color: #e2e8f0; /* Lighter grey on hover */
+    }
+    /* Radio button group for evaluation benchmark selection */
+    #eval-benchmark-selection {
+        display: flex;
+        justify-content: center;
+        margin-bottom: 20px; /* Space above dropdown */
+    }
+    #eval-benchmark-selection label.gr-radio-label {
+        font-size: 1.05em;
+        font-weight: 500;
+        color: #4a5568;
+        padding: 8px 15px;
+        border-radius: 6px;
+        background-color: #f0f4f7;
+        border: 1px solid #d9e3ed;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        margin: 0 5px;
+    }
+    #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label {
+        background-color: #48bb78; /* A pleasant green for evaluation selection */
+        color: white;
+        border-color: #48bb78;
+        box-shadow: 0 2px 8px rgba(72, 187, 120, 0.2);
     }
+    #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label:hover {
+        background-color: #38a169;
+    }
+    #eval-benchmark-selection label.gr-radio-label:hover {
+        background-color: #e5edf2;
+    }
 """) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
                     placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
                     interactive=True
                 )
+                # New Radio button for benchmark selection for evaluation
+                benchmark_selection_radio = gr.Radio(
+                    ["MMLU", "MMLU-Pro"],
+                    label="Select Benchmark Type",
+                    value="MMLU", # Default selection
+                    interactive=True,
+                    container=False, # Important for custom styling placement
+                    elem_id="eval-benchmark-selection"
+                )
                 with gr.Row():
                     benchmark_subject_dropdown = gr.Dropdown(
+                        label="Choose Subject", # Label changed to be more concise
+                        choices=INITIAL_GRADIO_DROPDOWN_OPTIONS, # Initial choices (MMLU subjects)
+                        value="ALL", # Default to ALL for MMLU initially
                         interactive=True,
+                        min_width=400
                     )
                     sample_count_slider = gr.Slider(
                         label="Number of Samples per Subject (1-100)",
                         minimum=1,
                         maximum=100,
+                        value=10,
                         step=1,
                         interactive=True,
                         min_width=200
             # Define button click actions
             run_button.click(
                 run_evaluation,
+                inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider], # Updated inputs
                 outputs=[
                     acc_output,
                     error_message_output, debug_error_column, # For error state
                 ]
             )
+            # Link benchmark selection radio to subject dropdown
+            benchmark_selection_radio.change(
+                update_subject_dropdown_choices,
+                inputs=[benchmark_selection_radio],
+                outputs=[benchmark_subject_dropdown]
+            )
             # Toggle visibility of detail_output
             show_details_button.click(
                 lambda s: gr.update(visible=not s), # Toggle visibility
             leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 # Launch the Gradio app
+demo.launch()