From d722e78702de1d5a3ce68cb373b2ece1207c0fa2 Mon Sep 17 00:00:00 2001 From: cmpatino Date: Mon, 29 Sep 2025 21:09:38 +0200 Subject: [PATCH 1/4] Revert extraction setting for IndicesExtractionConfig Revert `try_extract_without_anchor` to True in `IndicesExtractionConfig` to avoid issues in `gpqa:diamond` eval --- src/lighteval/metrics/utils/extractive_match_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/utils/extractive_match_utils.py b/src/lighteval/metrics/utils/extractive_match_utils.py index cce2b1793..1a3dc518c 100644 --- a/src/lighteval/metrics/utils/extractive_match_utils.py +++ b/src/lighteval/metrics/utils/extractive_match_utils.py @@ -90,7 +90,7 @@ class IndicesExtractionConfig: """ prefix_for_extraction: ChoicePrefix - try_extract_without_anchor: bool = False + try_extract_without_anchor: bool = True ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig From 6c5af4274320f56649400b70d88a330971ed0323 Mon Sep 17 00:00:00 2001 From: cmpatino Date: Tue, 30 Sep 2025 11:33:12 +0200 Subject: [PATCH 2/4] Change `try_extract_without_anchor` only for GPQA --- src/lighteval/metrics/metrics.py | 8 ++++---- src/lighteval/metrics/utils/extractive_match_utils.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 0674d2df1..fbe15dbf4 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -526,8 +526,8 @@ class Metrics(Enum): metric_name="extractive_match", sample_level_fn=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], precision=6, ), category=SamplingMethod.GENERATIVE, @@ -539,8 +539,8 @@ class Metrics(Enum): sample_level_fn=PassAtK( sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], precision=6, ), ), diff --git a/src/lighteval/metrics/utils/extractive_match_utils.py b/src/lighteval/metrics/utils/extractive_match_utils.py index 1a3dc518c..cce2b1793 100644 --- a/src/lighteval/metrics/utils/extractive_match_utils.py +++ b/src/lighteval/metrics/utils/extractive_match_utils.py @@ -90,7 +90,7 @@ class IndicesExtractionConfig: """ prefix_for_extraction: ChoicePrefix - try_extract_without_anchor: bool = True + try_extract_without_anchor: bool = False ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig From 5b213cbdd47213c956743dde31753b5babeb6d6f Mon Sep 17 00:00:00 2001 From: cmpatino Date: Tue, 30 Sep 2025 11:44:07 +0200 Subject: [PATCH 3/4] Fix style --- src/lighteval/metrics/metrics.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index fbe15dbf4..167919974 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -526,8 +526,12 @@ class Metrics(Enum): metric_name="extractive_match", sample_level_fn=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], + gold_extraction_target=[ + IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + ], + pred_extraction_target=[ + IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + ], precision=6, ), category=SamplingMethod.GENERATIVE, @@ -539,8 +543,12 @@ class Metrics(Enum): sample_level_fn=PassAtK( sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)], + gold_extraction_target=[ + IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + ], + pred_extraction_target=[ + IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + ], precision=6, ), ), From 1b6354425b0f6a9651b6dfb07e869d4e10b75646 Mon Sep 17 00:00:00 2001 From: cmpatino Date: Tue, 30 Sep 2025 12:17:58 +0200 Subject: [PATCH 4/4] Update GPQA test to reflect the extract setting --- tests/unit/metrics/test_cases/gpqa_instruct_metric.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json index af68ba3e5..4dddf83cc 100644 --- a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json +++ b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json @@ -249,7 +249,7 @@ ] }, "expected_output": { - "extractive_match": 0.0 + "extractive_match": 1.0 }, "tolerance": 0.01, "description": "Answer with quotes but still extractable"