| | import json |
| | import re |
| | import os |
| | import streamlit as st |
| | import requests |
| | import pandas as pd |
| | from io import StringIO |
| | import plotly.graph_objs as go |
| | from huggingface_hub import HfApi |
| | from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError |
| | import streamlit.components.v1 as components |
| | from datetime import datetime |
| |
|
| | from urllib.parse import quote |
| | from pathlib import Path |
| | import re |
| | import html |
| | from typing import Dict, Any |
| |
|
# Benchmarks shown on the leaderboard, one tab each, in this order. The
# lower-cased name is also the stem of the per-agent results file
# (e.g. "WebArena" -> "webarena.json").
BENCHMARKS = [
    "WebArena",
    "WorkArena-L1",
    "WorkArena-L2",
    "WorkArena-L3",
    "MiniWoB",
    "WebLINX",
    "VisualWebArena",
    "AssistantBench",
]
| |
|
def sanitize_agent_name(agent_name):
    """Validate an agent name before it is used in a path or a URL.

    Args:
        agent_name: Candidate agent name (a string).

    Returns:
        ``agent_name`` unchanged when it is valid.

    Raises:
        ValueError: If the name starts with a dot, is empty, or contains
            anything other than ASCII letters, digits, ``-``, ``_`` and
            (after the first character) ``.``.
    """
    if agent_name.startswith("."):
        raise ValueError("Agent name cannot start with a dot")

    # fullmatch() instead of match() with "$": "$" also matches just before
    # a trailing newline, which would let a name such as "agent\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
| |
|
def safe_path_join(*parts):
    """Join ``parts`` under the ``results`` directory, refusing to leave it.

    Args:
        *parts: Path components appended, in order, to ``results``.

    Returns:
        The resolved absolute ``Path``; it is either the resolved
        ``results`` directory itself or located somewhere below it.

    Raises:
        ValueError: If the components cannot be turned into a path
            ("Invalid path") or the resolved path lies outside ``results``
            ("Path traversal detected").
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (OSError, RuntimeError, TypeError, ValueError) as exc:
        raise ValueError("Invalid path") from exc

    # Compare whole path components rather than string prefixes: a plain
    # str.startswith() test would accept a sibling like "results_evil".
    if path != base and base not in path.parents:
        raise ValueError("Path traversal detected")
    return path
| |
|
def sanitize_column_name(col: str) -> str:
    """Return ``col`` converted to text with HTML special characters escaped."""
    as_text = str(col)
    escaped = html.escape(as_text)
    return escaped
| |
|
def sanitize_cell_value(value: Any) -> str:
    """Render one table cell as text that is safe to embed in HTML.

    Args:
        value: The cell content.

    Returns:
        * ``str(value)`` for ints and floats;
        * for a string containing "±", the escaped text before the first
          "±", a space, and the escaped remainder wrapped (with the "±"
          sign) in a smaller, lighter ``<span>``;
        * the HTML-escaped ``str(value)`` for anything else.
    """
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str) and "±" in value:
        # partition() splits on the first "±" only, so a value holding
        # several "±" no longer raises on unpacking. Both halves are escaped
        # because any string cell, not just formatted scores, can get here.
        score, _, std_err = value.partition("±")
        return (
            f"{html.escape(score.strip())} "
            '<span style="font-size: smaller; color: var(--lighter-color);">'
            f"±{html.escape(std_err.strip())}</span>"
        )
    return html.escape(str(value))
| |
|
def create_html_table_main(df):
    """Draw the sort controls and return the main leaderboard as HTML.

    Two Streamlit widgets are rendered as a side effect: a "Sort by" select
    box (pre-selecting the "WebArena" column) and an "Order" radio button
    (pre-selecting "Descending").

    Args:
        df: DataFrame to display; it must have a "WebArena" column. Values
            of the "Agent" column are inserted into the markup as they are,
            every other cell goes through ``sanitize_cell_value``.

    Returns:
        A string holding a ``<style>`` block followed by the sorted table.
    """
    column_names = df.columns.tolist()
    selector_area, order_area = st.columns([2, 6])
    with selector_area:
        sort_column = st.selectbox(
            "Sort by",
            column_names,
            index=df.columns.tolist().index("WebArena"),
            key="main_sort_column",
        )
    with order_area:
        sort_order = st.radio(
            "Order",
            ["Ascending", "Descending"],
            index=1,
            horizontal=True,
            key="main_sort_order",
        )

    def as_sortable(cell):
        # "-" marks a missing score and is ordered below every number.
        if cell == "-":
            return float('-inf')
        try:
            return float(cell)
        except ValueError:
            return cell

    ordered = df.sort_values(
        by=sort_column,
        ascending=(sort_order == "Ascending"),
        key=lambda series: series.apply(as_sortable),
    )

    markup = [
        '''
<style>
table {
width: 100%;
border-collapse: collapse;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: center;
}
th {
font-weight: bold;
}
.table-container {
padding-bottom: 20px;
}
</style>
''',
        '<div class="table-container">',
        '<table>',
        '<thead><tr>',
    ]
    markup.extend(
        f'<th>{sanitize_column_name(name)}</th>' for name in ordered.columns
    )
    markup.append('</tr></thead>')
    markup.append('<tbody>')
    for _, record in ordered.iterrows():
        markup.append('<tr>')
        for name in ordered.columns:
            # The "Agent" column is emitted verbatim; all other cells are
            # passed through sanitize_cell_value first.
            if name == "Agent":
                cell = record[name]
            else:
                cell = sanitize_cell_value(record[name])
            markup.append(f'<td>{cell}</td>')
        markup.append('</tr>')
    markup.append('</tbody></table>')
    markup.append('</div>')
    return "".join(markup)
| |
|
def create_html_table_benchmark(df, benchmark):
    """Draw the sort controls and return one benchmark's table as HTML.

    Two Streamlit widgets are rendered as a side effect: a "Sort by" select
    box (pre-selecting "Score") and an "Order" radio button (pre-selecting
    "Descending"). Their widget keys include ``benchmark``.

    Args:
        df: DataFrame to display. It must have the columns "Score",
            "std_err", "Reproduced" and "Reproduced_all"; the cells of
            "Reproduced_all" must be iterable.
        benchmark: Benchmark name, used only to build the widget keys.

    Returns:
        A string holding a ``<style>`` block followed by the sorted table.
        "std_err" and "Reproduced_all" get no column of their own: the
        former is appended to the score, the latter is listed inside a
        collapsible ``<details>`` element in the "Reproduced" cell.
    """
    column_names = df.columns.tolist()
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox(
            "Sort by",
            column_names,
            index=column_names.index("Score"),
            key=f"benchmark_sort_column_{benchmark}",
        )
    with col2:
        sort_order = st.radio(
            "Order",
            ["Ascending", "Descending"],
            index=1,
            horizontal=True,
            key=f"benchmark_sort_order_{benchmark}",
        )

    def sort_key(series):
        """Map a column to keys of a single type so they can be compared."""

        def to_number(cell):
            # "-" marks a missing value and is ordered below every number.
            return float("-inf") if cell == "-" else float(cell)

        try:
            return series.apply(to_number)
        except (TypeError, ValueError):
            # Text columns (or list-valued ones): mixing -inf with strings
            # would make the sort raise TypeError, so compare everything as
            # text instead.
            return series.astype(str)

    df = df.sort_values(
        by=sort_column,
        ascending=(sort_order == "Ascending"),
        key=sort_key,
    )

    visible_columns = [
        column for column in df.columns
        if column not in ("Reproduced_all", "std_err")
    ]

    parts = [
        '''
<style>
table {
width: 100%;
border-collapse: collapse;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: center;
}
th {
font-weight: bold;
}
.table-container {
padding-bottom: 20px;
}
</style>
''',
        '<div class="table-container">',
        '<table>',
        '<thead><tr>',
    ]
    parts.extend(
        f'<th>{sanitize_column_name(column)}</th>' for column in visible_columns
    )
    parts.append('</tr></thead>')
    parts.append('<tbody>')
    for _, row in df.iterrows():
        parts.append('<tr>')
        for column in visible_columns:
            value = row[column]
            if column == "Reproduced" and value != "-":
                summary = sanitize_cell_value(value)
                details = "<br>".join(
                    map(sanitize_cell_value, row["Reproduced_all"])
                )
                cell = f'<details><summary>{summary}</summary>{details}</details>'
            elif column == "Score" and value != "-":
                cell = sanitize_cell_value(f'{value} ± {row["std_err"]}')
            else:
                # Includes a missing score, shown as a bare "-" rather
                # than "- ± -".
                cell = sanitize_cell_value(value)
            parts.append(f'<td>{cell}</td>')
        parts.append('</tr>')
    parts.append('</tbody></table>')
    parts.append('</div>')
    return "".join(parts)
| |
|
def check_sanity(agent):
    """Check that every result file of ``agent`` has the expected layout.

    For each benchmark in ``BENCHMARKS`` the file
    ``results/<agent>/<benchmark lower-cased>.json`` is inspected if it
    exists; benchmarks without a file are skipped.

    Args:
        agent: Agent name, i.e. the name of its directory under ``results``.

    Returns:
        True when every existing file is a JSON array of objects that each
        carry all required keys, name this agent and this benchmark, and
        contain exactly one entry whose ``original_or_reproduced`` is
        "Original". False otherwise, including when the agent name is
        invalid or a file cannot be read or parsed.
    """
    required_keys = (
        "agent_name",
        "benchmark",
        "original_or_reproduced",
        "score",
        "std_err",
        "benchmark_specific",
        "benchmark_tuned",
        "followed_evaluation_protocol",
        "reproducible",
        "comments",
        "study_id",
        "date_time",
    )
    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
            # A top-level object or scalar is well-formed JSON but not a
            # result list; reject it here rather than failing with a
            # TypeError in the checks below.
            if not isinstance(results, list):
                return False
            original_count = 0
            for result in results:
                if not isinstance(result, dict):
                    return False
                if not all(key in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            if original_count != 1:
                return False
        return True
    except (ValueError, OSError):
        # ValueError: invalid agent name, rejected path, malformed JSON or
        # undecodable bytes. OSError: the file could not be read.
        return False
| |
|
_THEME_CSS = """
<style>
:root {
--lighter-color: #888; /* Default for light theme */
}
@media (prefers-color-scheme: dark) {
:root {
--lighter-color: #ccc; /* Default for dark theme */
}
}
</style>
"""

# NOTE(review): these tags are injected into the page body through
# st.markdown. Browsers are documented to ignore `frame-ancestors` and
# X-Frame-Options when they come from <meta> elements, so this block likely
# gives little real protection -- confirm, and prefer real HTTP headers.
_SECURITY_META = """
<head>
<meta http-equiv="Content-Security-Policy"
content="default-src 'self' https://huggingface.co;
script-src 'self' 'unsafe-inline';
style-src 'self' 'unsafe-inline';
img-src 'self' data: https:;
frame-ancestors 'none';">
<meta http-equiv="X-Frame-Options" content="DENY">
<meta http-equiv="X-Content-Type-Options" content="nosniff">
<meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
</head>
"""

_ABOUT_MARKDOWN = '''
# BrowserGym Leaderboard

This leaderboard tracks performance of various agents on web navigation tasks.

## How to Submit Results for New Agents

### 1. Create Results Directory
Create a new folder in the `results` directory with your agent's name:
```bash
results/
└── your-agent-name/
├── README.md
├── webarena.json
├── workarena-l1.json
├── workarena-l2.json
├── workarena-l3.json
└── miniwob.json
```


### 2. Add Agent Details

Create a `README.md` in your agent's folder with the following details:

#### Required Information
- **Model Name**: Base model used (e.g., GPT-4, Claude-2)
- **Model Architecture**: Architecture details and any modifications
- **Input/Output Format**: How inputs are processed and outputs generated
- **Training Details**: Training configuration if applicable
- Dataset used
- Number of training steps
- Hardware used
- Training time

#### Optional Information
- **Paper Link**: Link to published paper/preprint if available
- **Code Repository**: Link to public code implementation
- **Additional Notes**: Any special configurations or requirements
- **License**: License information for your agent

Make sure to organize the information in clear sections using Markdown.

### 3. Add Benchmark Results

Create separate JSON files for each benchmark following this format:

```json
[
{
"agent_name": "your-agent-name",
"study_id": "unique-study-identifier-from-agentlab",
"date_time": "YYYY-MM-DD HH:MM:SS",
"benchmark": "WebArena",
"score": 0.0,
"std_err": 0.0,
"benchmark_specific": "Yes/No",
"benchmark_tuned": "Yes/No",
"followed_evaluation_protocol": "Yes/No",
"reproducible": "Yes/No",
"comments": "Additional details",
"original_or_reproduced": "Original"
}
]
```

Please add all the benchmark files in separate json files named as follows:

- `webarena.json`
- `workarena-l1.json`
- `workarena-l2.json`
- `workarena-l3.json`
- `miniwob.json`

Each file must contain a JSON array with a single object following the format above. The benchmark field in each file must match the benchmark name exactly ([`WebArena`, `WorkArena-L1`, `WorkArena-L2`, `WorkArena-L3`, `MiniWoB`]) and benchmark_lowercase.json as the filename.

### 4. Submit PR

1. Open the community tab and press "New Pull Request"
2. Give it a new title to the PR and follow the steps mentioned
3. Publish the branch

## How to Submit Reproducibility Results for Existing Agents

Open the results file for the agent and benchmark you reproduced the results for.

### 1. Add reproduced results


Append the following entry in the json file. Ensure you set `original_or_reproduced` as `Reproduced`.

```json
[
{
"agent_name": "your-agent-name",
"study_id": "unique-study-identifier-from-agentlab",
"date_time": "YYYY-MM-DD HH:MM:SS",
"benchmark": "WebArena",
"score": 0.0,
"std_err": 0.0,
"benchmark_specific": "Yes/No",
"benchmark_tuned": "Yes/No",
"followed_evaluation_protocol": "Yes/No",
"reproducible": "Yes/No",
"comments": "Additional details",
"original_or_reproduced": "Reproduced"
}
]
```

### 2. Submit PR

1. Open the community tab and press "New Pull Request"
2. Give it a new title to the PR and follow the steps mentioned
3. Publish the branch

## License

MIT
'''

# Result-file key -> column title in a benchmark table, in display order.
_ENTRY_FIELDS = {
    "score": "Score",
    "std_err": "std_err",
    "benchmark_specific": "Benchmark Specific",
    "benchmark_tuned": "Benchmark Tuned",
    "followed_evaluation_protocol": "Followed Evaluation Protocol",
    "reproducible": "Reproducible",
    "comments": "Comments",
    "study_id": "Study ID",
}
_BENCHMARK_COLUMNS = [
    "Agent",
    *_ENTRY_FIELDS.values(),
    "Date",
    "Reproduced",
    "Reproduced_all",
]


def _read_agent_results(agent):
    """Return the entries of every existing benchmark file of ``agent``."""
    entries = []
    for benchmark in BENCHMARKS:
        file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
        if not file_path.is_file():
            continue
        with open(file_path, encoding="utf-8") as f:
            entries.extend(json.load(f))
    return entries


def _leaderboard_rows(results):
    """Build one row per agent holding its "Original" score per benchmark."""
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            # First matching entry wins; "-" marks a benchmark without one.
            row[benchmark] = next(
                (
                    entry["score"]
                    for entry in entries
                    if entry["benchmark"] == benchmark
                    and entry["original_or_reproduced"] == "Original"
                ),
                "-",
            )
        rows.append(row)
    return rows


def _format_date_time(raw):
    """Convert "YYYY-MM-DD HH:MM:SS" into e.g. "January 05, 2025 03:04 PM"."""
    parsed = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S")
    return parsed.strftime("%B %d, %Y %I:%M %p")


def _benchmark_rows(results, benchmark):
    """Build one row per agent describing its results on ``benchmark``.

    Fields come from the agent's "Original" entry and default to "-" when
    there is none. "Reproduced" is the "min - max" range of the reproduced
    scores (or "-"), "Reproduced_all" lists each reproduced run as
    "score, date". Rows are filled from defaults up front, so neither an
    agent without any entries nor a file listing reproduced runs before
    the original one loses data. The loaded entries are not modified.
    """
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        row.update({column: "-" for column in _ENTRY_FIELDS.values()})
        row["Date"] = "-"
        reproduced_scores = []
        reproduced_runs = []
        for entry in entries:
            if entry["benchmark"] != benchmark:
                continue
            kind = entry["original_or_reproduced"]
            if kind == "Original":
                for key, column in _ENTRY_FIELDS.items():
                    row[column] = entry[key]
                row["Date"] = _format_date_time(entry["date_time"])
            elif kind == "Reproduced":
                reproduced_scores.append(entry["score"])
                reproduced_runs.append(
                    ", ".join(
                        [str(entry["score"]), _format_date_time(entry["date_time"])]
                    )
                )
        if reproduced_scores:
            row["Reproduced"] = (
                str(min(reproduced_scores)) + " - " + str(max(reproduced_scores))
            )
        else:
            row["Reproduced"] = "-"
        row["Reproduced_all"] = reproduced_runs
        rows.append(row)
    return rows


def _agent_hyperlink(agent_name):
    """Return an HTML link to the agent's README, or "" for an invalid name."""
    try:
        safe_name = sanitize_agent_name(agent_name)
    except ValueError:
        return ""
    safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
    return f'<a href="{html.escape(safe_url)}" target="_blank">{html.escape(safe_name)}</a>'


def main():
    """Build the BrowserGym leaderboard page.

    Reads every entry of the ``results`` directory, reports entries that
    fail ``check_sanity`` with ``st.error`` and skips them, then renders a
    main leaderboard tab (searchable, sortable, exportable to CSV), one tab
    per benchmark in ``BENCHMARKS`` and an "About" tab.
    """
    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
    st.markdown(_THEME_CSS, unsafe_allow_html=True)
    st.markdown(_SECURITY_META, unsafe_allow_html=True)

    all_results = {}
    # sorted() only makes the load order, and hence the order of tied rows,
    # independent of the file system.
    for agent in sorted(os.listdir("results")):
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        all_results[agent] = _read_agent_results(agent)

    st.title("🏆 BrowserGym Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")

    tabs = st.tabs(["🏆 Main Leaderboard",] + BENCHMARKS + ["📝 About"])

    with tabs[0]:
        # Passing the columns explicitly keeps the frame usable (and the
        # column lookups below valid) when no agent was loaded at all.
        df = pd.DataFrame(
            _leaderboard_rows(all_results), columns=["Agent", *BENCHMARKS]
        )
        for benchmark in BENCHMARKS:
            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df[benchmark] = df[benchmark].astype(str)

        search_query = st.text_input("Search agents", "", key="search_main")
        if search_query:
            # regex=False: the query is plain user text; treated as a
            # pattern, input such as "(" would raise instead of filtering.
            df = df[df["Agent"].str.contains(search_query, case=False, regex=False)]

        # Keep the plain agent names for the CSV export; the displayed
        # frame gets HTML links in the "Agent" column next.
        export_df = df.copy()

        df["Agent"] = df["Agent"].apply(_agent_hyperlink)
        html_table = create_html_table_main(df)
        st.markdown(html_table, unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            csv_data = export_df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv_data,
                file_name="leaderboard.csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:
        st.markdown(_ABOUT_MARKDOWN)

    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            df_ = pd.DataFrame(
                _benchmark_rows(all_results, benchmark), columns=_BENCHMARK_COLUMNS
            )
            df_["Score"] = df_["Score"].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df_["std_err"] = df_["std_err"].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
            df_["Score"] = df_["Score"].astype(str)
            html_table = create_html_table_benchmark(df_, benchmark)
            st.markdown(html_table, unsafe_allow_html=True)
| | |
| | |
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|