{"date":"2026-05-23T20:46:33Z","drift":[],"headline":"gemini-2.5-flash failing causality-1.","previous":{"anthropic/claude-haiku-4-5":{"ambiguity-1":{"correct":true,"score":4},"causality-1":{"correct":true,"score":4.5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-opus-4-6":{"ambiguity-1":{"correct":true,"score":5},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":4.25},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-sonnet-4-6":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":4.75},"code-1":{"correct":true,"score":4.25},"common-sense-1":{"correct":true,"score":4.5},"logic-1":{"correct":true,"score":4.75},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-flash":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":false,"score":2.5},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":4},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-pro":{"ambiguity-1":{"correct":true,"score":4},"causality-1":{"correct":true,"score":4.75},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":5},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"ollama/llama3":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4-mini":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}}},"run_id":"2026-05-24T01-46-33","scorecard":{"anthropic/claude-haiku-4-5":{"ambiguity-1":{"correct":true,"score":4},"causality-1":{"correct":true,"score":4.5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-opus-4-6":{"ambiguity-1":{"correct":true,"score":5},"causality-1":{"correct":true,"score":4.75},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4.25},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-sonnet-4-6":{"ambiguity-1":{"correct":true,"score":4.25},"causality-1":{"correct":true,"score":4.5},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":4.25},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-flash":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":false,"score":3},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":3.75},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-pro":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":4.5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4.75},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":4.75},"spatial-1":{"correct":true,"score":5}},"ollama/llama3":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4-mini":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}}},"status":{"anthropic/claude-haiku-4-5":"stable","anthropic/claude-opus-4-6":"stable","anthropic/claude-sonnet-4-6":"stable","gemini/gemini-2.5-flash":"stable","gemini/gemini-2.5-pro":"stable"}}