Fino1 Leaderboard

The Fino1 Leaderboard evaluates the performance of various LLMs, including general-purpose models and reasoning-enhanced models, on complex financial tasks. These tasks, such as mathematical question answering and equation execution, assess an LLM's ability to perform structured financial reasoning and numerical computation.

{
  "headers": [
    "Models",
    "Average",
    "FinQA",
    "DM-Simplong",
    "XBRL-Math",
    "Type"
  ],
  "data": [
    [
      "GPT-4o",
      68.24,
      72.49,
      60,
      72.22,
      "Instruction-tuned"
    ],
    [
      "GPT-4.5",
      67.46,
      68.94,
      59,
      74.44,
      "Instruction-tuned"
    ],
    [
      "GPT-o1",
      59.84,
      49.07,
      56,
      74.44,
      "Reasoning-enhanced"
    ],
    [
      "GPT-o3-mini",
      65.51,
      60.87,
      59,
      76.67,
      "Reasoning-enhanced"
    ],
    [
      "DeepSeek-V3",
      67.62,
      73.2,
      53,
      76.67,
      "Instruction-tuned"
    ],
    [
      "DeepSeek-R1",
      68.93,
      65.13,
      53,
      86.67,
      "Reasoning-enhanced"
    ],
    [
      "Qwen2.5-72B-Instruct",
      66.72,
      73.38,
      59,
      67.78,
      "Instruction-tuned"
    ],
    [
      "Qwen2.5-72B-Instruct-Math",
      65.69,
      69.74,
      42,
      83.33,
      "Reasoning-enhanced"
    ],
    [
      "Qwen2.5-32B-Instruct",
      64.89,
      73.11,
      56,
      65.56,
      "Instruction-tuned"
    ],
    [
      "DeepSeek-R1-Distill-Llama-70B",
      68.8,
      66.73,
      53,
      86.67,
      "Reasoning-enhanced"
    ],
    [
      "Llama3-70B-Instruct",
      52.2,
      58.92,
      41,
      56.67,
      "Instruction-tuned"
    ],
    [
      "Llama3.1-70B-Instruct",
      58.17,
      63.18,
      48,
      63.33,
      "Instruction-tuned"
    ],
    [
      "Llama3.3-70B-Instruct",
      64.05,
      68.15,
      54,
      70,
      "Instruction-tuned"
    ],
    [
      "DeepSeek-R1-Distill-Qwen-32B",
      68.97,
      65.48,
      55,
      84.44,
      "Reasoning-enhanced"
    ],
    [
      "DeepSeek-R1-Distill-Qwen-14B",
      63.9,
      63.27,
      44,
      84.44,
      "Reasoning-enhanced"
    ],
    [
      "DeepSeek-R1-Distill-Llama-8B",
      53.36,
      45.96,
      33,
      81.11,
      "Reasoning-enhanced"
    ],
    [
      "Llama3-8B-Instruct",
      39.95,
      41.97,
      29,
      48.89,
      "Instruction-tuned"
    ],
    [
      "Llama3.1-8B-Instruct",
      50.12,
      54.13,
      34,
      62.22,
      "Instruction-tuned"
    ],
    [
      "LIMO",
      56.52,
      63.44,
      45,
      61.11,
      "Reasoning-enhanced"
    ],
    [
      "s1-32B",
      68.08,
      66.81,
      53,
      84.44,
      "Reasoning-enhanced"
    ],
    [
      "Fino1-8B",
      61.03,
      60.87,
      40,
      82.22,
      "Reasoning-enhanced"
    ]
  ],
  "metadata": null
}