<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta property="og:title" content="MathArena.ai"/>
<meta property="og:description" content="MathArena: Evaluating LLMs on Uncontaminated Math Competitions"/>
<meta property="description" content="MathArena: Evaluating LLMs on Uncontaminated Math Competitions"/>
<meta property="og:url" content="https://matharena.ai/"/>
<meta property="og:image:width" content="1200"/>
<meta property="og:image:height" content="630"/>
<meta name="keywords" content="Math, LLM, Olympiads, Competitions, Leaderboards, AI, Machine Learning, MathArena, MathArena.ai"/>
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>MathArena</title>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/2.0.7/css/dataTables.dataTables.min.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/material-components-web/14.0.0/material-components-web.min.js">
<link rel="stylesheet" href="static/css/index.css">
<script>
window.MathJax = {
tex: {
inlineMath: [['$', '$']]
}
};
</script>
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.1.0/es5/tex-mml-chtml.js">
</script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/2.0.7/js/dataTables.min.js"></script>
<script src="static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">MathArena: Evaluating LLMs on Uncontaminated Math Competitions</h1>
<div class="is-size-5 publication-authors">
<img class="logos" src="static/images/footer.svg" alt="ETH & SRI Logo">
</div>
<br>
<div class="is-size-2 publication-authors">
<span class="author-block">AIME 2025</span>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero is-light">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-justified">
<!-- <div class="table-info-box">
<b>Model scores on AIME I 2025</b>
</div> -->
<table id="myTopTable" class="display" style="width:100%">
<thead>
<tr>
<!-- <th rowspan="2" class="model-name">Model Name</th> -->
<th colspan="18" class="competition-name">AIME I 2025<br>
<span id="tracesHint">
<i>Select a model name (leftmost column) and a problem number (top row) to see the corresponding solutions</i>
</span>
</th>
</tr>
<tr>
<th colspan="1" class="model-name">Model Name</th>
<th colspan="1" class="tooltip">Acc
<span class="tooltip-text">Acc is average accuracy over all runs.</span>
</th>
<th colspan="1" class="tooltip">Cost
<span class="tooltip-text">Cost is total cost over entire benchmark. Note that this is API-dependent.</span>
</th>
<th colspan="1"><a href="#" id="1">1</a></th>
<th colspan="1"><a href="#" id="2">2</a></th>
<th colspan="1"><a href="#" id="3">3</a></th>
<th colspan="1"><a href="#" id="4">4</a></th>
<th colspan="1"><a href="#" id="5">5</a></th>
<th colspan="1"><a href="#" id="6">6</a></th>
<th colspan="1"><a href="#" id="7">7</a></th>
<th colspan="1"><a href="#" id="8">8</a></th>
<th colspan="1"><a href="#" id="9">9</a></th>
<th colspan="1"><a href="#" id="10">10</a></th>
<th colspan="1"><a href="#" id="11">11</a></th>
<th colspan="1"><a href="#" id="12">12</a></th>
<th colspan="1"><a href="#" id="13">13</a></th>
<th colspan="1"><a href="#" id="14">14</a></th>
<th colspan="1"><a href="#" id="15">15</a></th>
</tr>
</thead>
<tbody>
</tbody>
</table>
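<!--
  The empty <tbody> above is populated client-side by static/js/table_data.js (not shown here).
  Below is a minimal, commented-out sketch of how such a table could be filled using the
  DataTables API that this page already loads; the row shape and the "leaderboard" variable
  are assumptions for illustration, not the actual implementation.

  <script>
    // Hypothetical row format: [modelName, acc, cost, problem1, ..., problem15]
    const leaderboard = [
      ["example-model", "80%", "$1.23", "4/4", "3/4", "2/4", "4/4", "1/4", "0/4", "4/4", "3/4", "4/4", "2/4", "1/4", "4/4", "0/4", "3/4", "2/4"]
    ];
    $('#myTopTable').DataTable({
      data: leaderboard,   // fills the empty <tbody>
      paging: false,       // the leaderboard is small, so show all models on one page
      searching: false     // no search box needed for a handful of rows
    });
  </script>
-->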
<p>
*Run locally; we have not yet settled on a method for computing the cost.
</p>
</div>
</div>
</div>
</section>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div id="traces">
<!-- JS fills -->
</div>
</div>
</div>
</div>
</section>
<section class="section hero is-light">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-ninety">
<h2 class="title is-2">What is MathArena?</h2>
<div class="content has-text-justified">
<!-- <div class="caption">
We pit LLMs against each other and humans on recent math competitions and olympiads, providing the possibility for uncontaminated model evaluation.
We hope that this will lead to better understanding of the capabilities and limitations of LLMs on math tasks.
</div> -->
<p>
MathArena is a platform for the evaluation of LLMs on the latest math competitions and olympiads.
Our mission is the rigorous assessment of the reasoning and generalization capabilities of LLMs on new math problems that the models have not seen during training.
To ensure a fair and uncontaminated evaluation, we exclusively test models on competitions that took place after their release, avoiding retroactive assessments on potentially leaked or pre-trained material.
By performing standardized evaluations, we ensure that model scores are directly comparable and do not depend on the specific evaluation setup of the model provider.
<br><br>
To show the model performance, we publish a leaderboard for each competition showing the scores of different models on individual problems.
Additionally, we will add a main table that summarizes model performance across all competitions.
To evaluate performance, we run each model 4 times on each problem, computing the average score and the cost of the model (in USD) across all runs.
<!-- To make the results easier to interpret, we use a color-coded system in the table: green problems are solved more than 75% of the time, yellow problems are solved 25-75% of the time, and red problems are solved less than 25% of the time. -->
<br><br>
We are committed to making our evaluation code and data publicly available.
</p>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-ninety">
<!-- Add FAQ section -->
<h2 class="title is-2">Frequently Asked Questions</h2>
<div class="content has-text-justified">
<div class="faq-container">
<div class="faq-item">
<div class="faq-question">
How exactly do you compute accuracy?
</div>
<div class="faq-answer" style="display: none;">
We compute the accuracy of a model by prompting it to solve each problem 4 times and computing the success rate for each problem by dividing the number of correct solutions by 4.
This corresponds to the pass@1 metric estimated from 4 samples. The final accuracy is the average pass@1 over all problems. We do not perform majority voting or use other metrics such as pass@k.
</div>
</div>
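<!--
  A minimal sketch of the accuracy computation described in the answer above (pass@1 estimated
  from 4 runs per problem, then averaged over problems). The data shape is an assumption for
  illustration only and is not the actual evaluation code.

  <script>
    // runs[p] holds the 4 graded attempts for problem p (true = correct final answer).
    function pass1Accuracy(runs) {
      const perProblem = runs.map(attempts =>
        attempts.filter(ok => ok).length / attempts.length  // success rate for one problem
      );
      // Final accuracy: average pass@1 over all problems (no majority voting, no pass@k).
      return perProblem.reduce((a, b) => a + b, 0) / perProblem.length;
    }
    // Example: 2 problems, 4 runs each gives (0.75 + 0.25) / 2 = 0.5
    console.log(pass1Accuracy([[true, true, true, false], [false, true, false, false]]));
  </script>
-->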
<div class="faq-item">
<div class="faq-question">
What do the colors in the table mean?
</div>
<div class="faq-answer" style="display: none;">
The colors indicate the success rate on each problem:
<ul>
<li>Green: Problem solved >75% of the time</li>
<li>Yellow: Problem solved 25-75% of the time</li>
<li>Red: Problem solved <25% of the time</li>
</ul>
</div>
</div>
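<!--
  A minimal sketch of the color rule described in the answer above; the function name and
  return values are placeholders, the actual styling lives in the table scripts and CSS.

  <script>
    function successColor(rate) {
      if (rate > 0.75) return "green";    // solved more than 75% of the time
      if (rate >= 0.25) return "yellow";  // solved 25-75% of the time
      return "red";                       // solved less than 25% of the time
    }
  </script>
-->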
<div class="faq-item">
<div class="faq-question">
Can you show the average number of input and output tokens for each model?
</div>
<div class="faq-answer" style="display: none;">
Yes, below you can find the average number of input and output tokens for each model along with the price per million tokens for the API we used.
<table id="secondaryTable" class="display" style="width:100%">
<thead>
<tr>
<!-- <th rowspan="2" class="model-name">Model Name</th> -->
<th colspan="5" class="competition-name">AIME I 2025<br>
</th>
</tr>
<tr>
<th colspan="1" class="model-name">Model Name</th>
<th colspan="1" class="avg-cost-header">Input Tokens</th>
<th colspan="1" class="avg-cost-header">Input Costs</th>
<th colspan="1" class="avg-cost-header">Output Tokens</th>
<th colspan="1" class="avg-cost-header">Output Costs</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
</div>
</div>
<div class="faq-item">
<div class="faq-question">
How is the cost calculated?
</div>
<div class="faq-answer" style="display: none;">
The cost shows the total cost of evaluating the model on the entire benchmark (all problems and all repetitions). It is calculated based on the API pricing for each model.
For open-source models, costs can vary significantly depending on the chosen API provider, and our results may not always be achieved using the most cost-effective option.
In particular, for DeepSeek models we use the Together API, as we found it to be the most reliable endpoint, but it is more expensive than DeepSeek's own API.
For gemini-2.0-flash-thinking, it was impossible to determine the cost since pay-as-you-go pricing is not available and the Google API does not return the number of thinking tokens.
</div>
</div>
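<!--
  A minimal sketch of the cost calculation described in the answer above: per-million-token API
  prices applied to the token counts of all runs in the benchmark. The field names and prices
  are placeholders, not the actual values used on this page.

  <script>
    // Prices are given per million tokens, as in the token table above.
    function totalCostUSD(runs, inputPricePerM, outputPricePerM) {
      return runs.reduce((sum, r) =>
        sum + (r.inputTokens / 1e6) * inputPricePerM
            + (r.outputTokens / 1e6) * outputPricePerM, 0);
    }
    // Example: one run with 1,000 input and 10,000 output tokens at $1 / $3 per million tokens = $0.031
    console.log(totalCostUSD([{ inputTokens: 1000, outputTokens: 10000 }], 1, 3));
  </script>
-->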
<div class="faq-item">
<div class="faq-question">
Why did you use the Together API for DeepSeek models?
</div>
<div class="faq-answer" style="display: none;">
The DeepSeek API only allows responses of up to 8000 tokens, which is not enough for most olympiad problems. Thus, using the DeepSeek API would have led to much lower (non-representative) performance of the DeepSeek models.
We verified that the Together API works well by reproducing the accuracy of DeepSeek-R1 on AIME 2024 using our experimental setup.
</div>
</div>
<div class="faq-item">
<div class="faq-question">
How do you know that your problems are not in the training data?
</div>
<div class="faq-answer" style="display: none;">
First, we always evaluate models on new competitions immediately after the problems are released, guaranteeing that the knowledge cutoff of the model precedes the date of the competition.
While it is impossible to entirely rule out that evaluated problems or their variants appear in the training data (e.g., because they appeared in another competition, see <a href="https://x.com/DimitrisPapail/status/1888325914603516214" style="color: blue">here</a>), the organizers of competitions such as AIME
always strive to ensure the highest quality of their problem set. We therefore believe that the problems are sufficiently novel to evaluate the generalization capabilities of the models.
</div>
</div>
<div class="faq-item">
<div class="faq-question">
Can you evaluate more models?
</div>
<div class="faq-answer" style="display: none;">
Yes, we are planning to add more models while keeping the table concise and informative. Some models are difficult to evaluate due to rate limits of particular APIs, but we will try to add
the most well-known ones. We are also going to release our evaluation scripts to enable the community to evaluate their own models.
</div>
</div>
<div class="faq-item">
<div class="faq-question">
Are you going to release data and code?
</div>
<div class="faq-answer" style="display: none;">
Yes, we are going to open-source our evaluation scripts and data in the next few days.
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content has-text-centered">
<br>
<img class="logos" src="static/images/footer.svg" alt="ETH & SRI Logo">
</div>
</div>
</div>
</div>
</footer>
<!-- Cloudflare Web Analytics -->
<script defer src='https://static.cloudflareinsights.com/beacon.min.js'
data-cf-beacon='{"token": "68baeccd6f3e464aa6ad43275be4f8ca"}'>
</script><!-- End Cloudflare Web Analytics -->
<script src="static/js/traces.js"></script>
<script src="static/js/table_data.js"></script>
<script src="static/js/secondary.js"></script>
</body>
</html>