
Commit 502db83

Merge pull request #105 from MantisAI/feat/adding-min-overlap-threshold
feat: defining a min ground truth percentage to be considered an overlap
2 parents fb2630f + a6c9abd commit 502db83
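
The change, in brief: overlap-based matching previously accepted any non-zero span overlap, whereas this PR lets callers require that a prediction cover at least min_overlap_percentage of the ground-truth span (default 1.0, effectively the old behaviour) before it counts as an overlap; anything below the threshold is scored as spurious. A minimal usage sketch, assuming the package's top-level Evaluator export and the dict loader format used in the tests below:

from nervaluate import Evaluator  # assumed top-level export

true = [[{"label": "PER", "start": 0, "end": 9}]]  # one gold entity covering 10 tokens
pred = [[{"label": "PER", "start": 0, "end": 2}]]  # covers 3 of those 10 tokens (30%)

# Require at least half of the gold span to be covered before a
# prediction is treated as overlapping at all.
evaluator = Evaluator(true, pred, tags=["PER"], loader="dict", min_overlap_percentage=50.0)
results = evaluator.evaluate()
print(results["overall"]["partial"].spurious)  # 1, because 30% < 50%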

File tree

5 files changed: +487 -17 lines changed


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ disable = [
     "R0801", # duplicate-code
     "W9020", # bad-option-value
     "W0621", # redefined-outer-name
+    "W0212", # protected-access
 ]

 [tool.pylint.'DESIGN']

src/nervaluate/evaluator.py

Lines changed: 10 additions & 6 deletions
@@ -17,7 +17,9 @@
 class Evaluator:
     """Main evaluator class for NER evaluation."""

-    def __init__(self, true: Any, pred: Any, tags: List[str], loader: str = "default") -> None:
+    def __init__(
+        self, true: Any, pred: Any, tags: List[str], loader: str = "default", min_overlap_percentage: float = 1.0
+    ) -> None:
         """
         Initialize the evaluator.

@@ -26,8 +28,10 @@ def __init__(self, true: Any, pred: Any, tags: List[str], loader: str = "default
             pred: Predicted entities in any supported format
             tags: List of valid entity tags
             loader: Name of the loader to use
+            min_overlap_percentage: Minimum overlap percentage for partial matches (1-100)
         """
         self.tags = tags
+        self.min_overlap_percentage = min_overlap_percentage
         self._setup_loaders()
         self._load_data(true, pred, loader)
         self._setup_evaluation_strategies()
@@ -37,12 +41,12 @@ def _setup_loaders(self) -> None:
         self.loaders: Dict[str, DataLoader] = {"conll": ConllLoader(), "list": ListLoader(), "dict": DictLoader()}

     def _setup_evaluation_strategies(self) -> None:
-        """Setup evaluation strategies."""
+        """Setup evaluation strategies with overlap threshold."""
         self.strategies: Dict[str, EvaluationStrategy] = {
-            "strict": StrictEvaluation(),
-            "partial": PartialEvaluation(),
-            "ent_type": EntityTypeEvaluation(),
-            "exact": ExactEvaluation(),
+            "strict": StrictEvaluation(self.min_overlap_percentage),
+            "partial": PartialEvaluation(self.min_overlap_percentage),
+            "ent_type": EntityTypeEvaluation(self.min_overlap_percentage),
+            "exact": ExactEvaluation(self.min_overlap_percentage),
         }

     def _load_data(self, true: Any, pred: Any, loader: str) -> None:
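
Because _setup_evaluation_strategies passes the threshold into every strategy constructor, a single Evaluator applies one min_overlap_percentage uniformly across all four modes. A hypothetical spot-check, relying only on the strategies dict and attribute visible in this diff:

from nervaluate import Evaluator  # assumed top-level export, as above

evaluator = Evaluator(
    true=[[{"label": "PER", "start": 0, "end": 9}]],
    pred=[[{"label": "PER", "start": 0, "end": 4}]],
    tags=["PER"],
    loader="dict",
    min_overlap_percentage=75.0,
)
# every strategy instance carries the same threshold
assert all(s.min_overlap_percentage == 75.0 for s in evaluator.strategies.values())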

src/nervaluate/strategies.py

Lines changed: 47 additions & 9 deletions
@@ -7,6 +7,45 @@
 class EvaluationStrategy(ABC):
     """Abstract base class for evaluation strategies."""

+    def __init__(self, min_overlap_percentage: float = 1.0):
+        """
+        Initialize strategy with minimum overlap threshold.
+
+        Args:
+            min_overlap_percentage: Minimum overlap percentage required (1-100)
+        """
+        if not 1.0 <= min_overlap_percentage <= 100.0:
+            raise ValueError("min_overlap_percentage must be between 1.0 and 100.0")
+        self.min_overlap_percentage = min_overlap_percentage
+
+    @staticmethod
+    def _calculate_overlap_percentage(pred: Entity, true: Entity) -> float:
+        """
+        Calculate the percentage overlap between predicted and true entities.
+
+        Returns:
+            Overlap percentage based on true entity span (0-100)
+        """
+        # Check if there's any overlap first
+        if pred.start > true.end or pred.end < true.start:
+            return 0.0
+
+        # Calculate overlap boundaries
+        overlap_start = max(pred.start, true.start)
+        overlap_end = min(pred.end, true.end)
+
+        # Calculate spans (adding 1 because end is inclusive)
+        overlap_span = overlap_end - overlap_start + 1
+        true_span = true.end - true.start + 1
+
+        # Calculate percentage based on true entity span
+        return (overlap_span / true_span) * 100.0
+
+    def _has_sufficient_overlap(self, pred: Entity, true: Entity) -> bool:
+        """Check if entities have sufficient overlap based on threshold."""
+        overlap_percentage = EvaluationStrategy._calculate_overlap_percentage(pred, true)
+        return overlap_percentage >= self.min_overlap_percentage
+
     @abstractmethod
     def evaluate(
         self, true_entities: List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0
@@ -50,8 +89,8 @@ def evaluate(
                     matched_true.add(true_idx)
                     found_match = True
                     break
-                # Check for any overlap
-                if pred.start <= true.end and pred.end >= true.start:
+                # Check for sufficient overlap with min threshold
+                if self._has_sufficient_overlap(pred, true):
                     result.incorrect += 1
                     indices.incorrect_indices.append((instance_index, pred_idx))
                     matched_true.add(true_idx)
@@ -97,8 +136,8 @@ def evaluate(
                 if true_idx in matched_true:
                     continue

-                # Check for overlap
-                if pred.start <= true.end and pred.end >= true.start:
+                # Check for sufficient overlap with min threshold
+                if self._has_sufficient_overlap(pred, true):
                     if pred.start == true.start and pred.end == true.end:
                         result.correct += 1
                         indices.correct_indices.append((instance_index, pred_idx))
@@ -135,7 +174,6 @@ class EntityTypeEvaluation(EvaluationStrategy):
     If there's a predicted entity that doesn't match any true entity, we mark it as spurious.
     If there's a true entity that doesn't match any predicted entity, we mark it as missed.

-    # ToDo: define a minimum overlap threshold - see: https://github.com/MantisAI/nervaluate/pull/83
     """

     def evaluate(
@@ -153,8 +191,8 @@ def evaluate(
                 if true_idx in matched_true:
                     continue

-                # Check for any overlap (perfect or minimum)
-                if pred.start <= true.end and pred.end >= true.start:
+                # Check for sufficient overlap with min threshold
+                if self._has_sufficient_overlap(pred, true):
                     if pred.label == true.label:
                         result.correct += 1
                         indices.correct_indices.append((instance_index, pred_idx))
@@ -216,8 +254,8 @@ def evaluate(
                     matched_true.add(true_idx)
                     found_match = True
                     break
-                # Check for any overlap
-                if pred.start <= true.end and pred.end >= true.start:
+                # Check for sufficient overlap with min threshold
+                if self._has_sufficient_overlap(pred, true):
                     result.incorrect += 1
                     indices.incorrect_indices.append((instance_index, pred_idx))
                     matched_true.add(true_idx)
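
To make the arithmetic concrete, here is _calculate_overlap_percentage traced by hand on the 30%-overlap case used throughout the tests; Entity is stood in by a namedtuple purely for illustration:

from collections import namedtuple

Entity = namedtuple("Entity", ["label", "start", "end"])  # illustrative stand-in for nervaluate's Entity

true = Entity("PER", 0, 9)  # tokens 0..9 -> span of 10, since end is inclusive
pred = Entity("PER", 0, 2)  # tokens 0..2 -> span of 3

overlap_start = max(pred.start, true.start)     # 0
overlap_end = min(pred.end, true.end)           # 2
overlap_span = overlap_end - overlap_start + 1  # 3
true_span = true.end - true.start + 1           # 10
print(overlap_span / true_span * 100.0)         # 30.0 -> spurious under a 50.0 threshold

Note that the percentage is measured against the true span only, so a prediction that fully covers a gold entity scores 100% even if it also sprawls far beyond it.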

tests/test_evaluator.py

Lines changed: 187 additions & 0 deletions
@@ -166,3 +166,190 @@ def test_results_to_csv(sample_data, tmp_path):
     # test invalid scenario for entities mode
     with pytest.raises(ValueError, match="Invalid scenario: must be one of"):
         evaluator.results_to_csv(mode="entities", scenario="invalid")
+
+
+def test_evaluator_with_min_overlap_percentage():
+    """Test Evaluator class with minimum overlap percentage parameter."""
+
+    # Test data: true entity spans positions 0-9 (10 tokens)
+    true_entities = [[{"label": "PER", "start": 0, "end": 9}]]  # 10-token entity
+
+    # Predicted entities with different overlap percentages
+    pred_entities = [[{"label": "PER", "start": 0, "end": 2}]]  # 30% overlap
+
+    # Test with default 1% threshold - should be partial match
+    evaluator_default = Evaluator(true=true_entities, pred=pred_entities, tags=["PER"], loader="dict")
+    results_default = evaluator_default.evaluate()
+    partial_default = results_default["overall"]["partial"]
+    assert partial_default.partial == 1
+    assert partial_default.spurious == 0
+
+    # Test with 50% threshold - should be spurious
+    evaluator_50 = Evaluator(
+        true=true_entities, pred=pred_entities, tags=["PER"], loader="dict", min_overlap_percentage=50.0
+    )
+    results_50 = evaluator_50.evaluate()
+    partial_50 = results_50["overall"]["partial"]
+    assert partial_50.partial == 0
+    assert partial_50.spurious == 1
+
+
+def test_evaluator_min_overlap_validation():
+    """Test that Evaluator validates minimum overlap percentage."""
+    true_entities = [[{"label": "PER", "start": 0, "end": 5}]]
+    pred_entities = [[{"label": "PER", "start": 0, "end": 5}]]
+
+    # Valid values should work
+    Evaluator(true_entities, pred_entities, ["PER"], "dict", min_overlap_percentage=1.0)
+    Evaluator(true_entities, pred_entities, ["PER"], "dict", min_overlap_percentage=50.0)
+    Evaluator(true_entities, pred_entities, ["PER"], "dict", min_overlap_percentage=100.0)
+
+    # Invalid values should raise ValueError during strategy initialization
+    with pytest.raises(ValueError, match="min_overlap_percentage must be between 1.0 and 100.0"):
+        Evaluator(true_entities, pred_entities, ["PER"], "dict", min_overlap_percentage=0.5)
+
+    with pytest.raises(ValueError, match="min_overlap_percentage must be between 1.0 and 100.0"):
+        Evaluator(true_entities, pred_entities, ["PER"], "dict", min_overlap_percentage=101.0)
+
+
+def test_evaluator_min_overlap_affects_all_strategies():
+    """Test that minimum overlap percentage affects all evaluation strategies."""
+    true_entities = [[{"label": "PER", "start": 0, "end": 9}]]  # 10 tokens
+
+    pred_entities = [[{"label": "PER", "start": 0, "end": 2}]]  # 30% overlap
+
+    evaluator = Evaluator(
+        true=true_entities, pred=pred_entities, tags=["PER"], loader="dict", min_overlap_percentage=50.0
+    )
+
+    results = evaluator.evaluate()
+
+    # All strategies should respect the 50% threshold
+    # 30% overlap < 50% threshold, so should be spurious for all strategies
+
+    # Partial strategy
+    partial_result = results["overall"]["partial"]
+    assert partial_result.spurious == 1
+    assert partial_result.correct == 0
+    assert partial_result.partial == 0
+
+    # Strict strategy
+    strict_result = results["overall"]["strict"]
+    assert strict_result.spurious == 1
+    assert strict_result.correct == 0
+    assert strict_result.incorrect == 0
+
+    # Entity type strategy
+    ent_type_result = results["overall"]["ent_type"]
+    assert ent_type_result.spurious == 1
+    assert ent_type_result.correct == 0
+    assert ent_type_result.incorrect == 0
+
+    # Exact strategy
+    exact_result = results["overall"]["exact"]
+    assert exact_result.spurious == 1
+    assert exact_result.correct == 0
+    assert exact_result.incorrect == 0
+
+
+def test_evaluator_min_overlap_with_different_thresholds():
+    """Test Evaluator with different overlap thresholds."""
+    true_entities = [[{"label": "PER", "start": 0, "end": 9}]]  # 10 tokens
+
+    # Test cases with different predicted entities
+    test_cases = [
+        # (pred_entities, threshold, expected_result_type)
+        ([{"label": "PER", "start": 0, "end": 4}], 50.0, "partial"),  # 50% overlap = 50%
+        ([{"label": "PER", "start": 0, "end": 4}], 51.0, "spurious"),  # 50% overlap < 51%
+        ([{"label": "PER", "start": 0, "end": 6}], 75.0, "spurious"),  # 70% overlap < 75%
+        ([{"label": "PER", "start": 0, "end": 7}], 75.0, "partial"),  # 80% overlap > 75%
+        ([{"label": "PER", "start": 0, "end": 9}], 100.0, "correct"),  # 100% overlap = exact match
+    ]
+
+    for pred_data, threshold, expected_type in test_cases:
+        pred_entities = [pred_data]
+
+        evaluator = Evaluator(
+            true=true_entities, pred=pred_entities, tags=["PER"], loader="dict", min_overlap_percentage=threshold
+        )
+
+        results = evaluator.evaluate()
+        partial_results = results["overall"]["partial"]
+
+        if expected_type == "correct":
+            assert partial_results.correct == 1, f"Failed for {pred_data} with threshold {threshold}%"
+            assert partial_results.partial == 0
+            assert partial_results.spurious == 0
+        elif expected_type == "partial":
+            assert partial_results.partial == 1, f"Failed for {pred_data} with threshold {threshold}%"
+            assert partial_results.correct == 0
+            assert partial_results.spurious == 0
+        elif expected_type == "spurious":
+            assert partial_results.spurious == 1, f"Failed for {pred_data} with threshold {threshold}%"
+            assert partial_results.correct == 0
+            assert partial_results.partial == 0
+
+
+def test_evaluator_min_overlap_with_multiple_entities():
+    """Test Evaluator with multiple entities and minimum overlap threshold."""
+    true_entities = [
+        [
+            {"label": "PER", "start": 0, "end": 4},  # 5 tokens
+            {"label": "ORG", "start": 10, "end": 14},  # 5 tokens
+            {"label": "LOC", "start": 20, "end": 24},  # 5 tokens
+        ]
+    ]
+
+    pred_entities = [
+        [
+            {"label": "PER", "start": 0, "end": 1},  # 40% overlap (2/5 tokens)
+            {"label": "ORG", "start": 10, "end": 12},  # 60% overlap (3/5 tokens)
+            {"label": "LOC", "start": 20, "end": 24},  # 100% overlap (exact match)
+            {"label": "MISC", "start": 30, "end": 32},  # No overlap (spurious)
+        ]
+    ]
+
+    # Test with 50% threshold
+    evaluator = Evaluator(
+        true=true_entities,
+        pred=pred_entities,
+        tags=["PER", "ORG", "LOC", "MISC"],
+        loader="dict",
+        min_overlap_percentage=50.0,
+    )
+
+    results = evaluator.evaluate()
+    partial_results = results["overall"]["partial"]
+
+    assert partial_results.correct == 1  # LOC exact match
+    assert partial_results.partial == 1  # ORG 60% overlap > 50%
+    assert partial_results.spurious == 2  # PER 40% < 50% and MISC no overlap
+    assert partial_results.missed == 1  # PER entity not sufficiently matched
+
+
+def test_evaluator_min_overlap_backward_compatibility():
+    """Test that the new feature maintains backward compatibility."""
+    true_entities = [[{"label": "PER", "start": 0, "end": 9}]]
+
+    pred_entities = [[{"label": "PER", "start": 9, "end": 9}]]  # 10% overlap (1 token out of 10)
+
+    # Without specifying min_overlap_percentage (should default to 1.0)
+    evaluator_default = Evaluator(true=true_entities, pred=pred_entities, tags=["PER"], loader="dict")
+
+    # With explicitly setting to 1.0
+    evaluator_explicit = Evaluator(
+        true=true_entities, pred=pred_entities, tags=["PER"], loader="dict", min_overlap_percentage=1.0
+    )
+
+    results_default = evaluator_default.evaluate()
+    results_explicit = evaluator_explicit.evaluate()
+
+    # Results should be identical
+    for strategy in ["strict", "partial", "ent_type", "exact"]:
+        default_result = results_default["overall"][strategy]
+        explicit_result = results_explicit["overall"][strategy]
+
+        assert default_result.correct == explicit_result.correct
+        assert default_result.partial == explicit_result.partial
+        assert default_result.spurious == explicit_result.spurious
+        assert default_result.missed == explicit_result.missed
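
A quick way to exercise only the new tests, assuming a standard pytest setup (the keyword filter matches the test names added above; shown via pytest.main to keep the snippet in Python):

import pytest

# Equivalent to the CLI invocation: pytest tests/test_evaluator.py -k min_overlap
pytest.main(["tests/test_evaluator.py", "-k", "min_overlap"])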
