
Commit

fix context coverage metric input (#33)
* fix context coverage metric input
yisz committed Feb 17, 2024
1 parent 696f8d0 commit 4839e9e
Showing 2 changed files with 41 additions and 31 deletions.
60 changes: 34 additions & 26 deletions continuous_eval/metrics/retrieval_LLM_based_metrics.py
```diff
@@ -85,7 +85,7 @@ def __init__(self, model: Optional[LLMInterface] = None, use_few_shot: bool = True):
     def __str__(self):
         return f"LLMBasedContextCoverage(model={self.model}, use_few_shot={self.use_few_shot})"
 
-    def calculate(self, question, retrieved_contexts, answer, **kwargs):
+    def calculate(self, question, retrieved_contexts, ground_truths, **kwargs):
         """
         Calculate the context coverage score for the given datapoint.
         """
@@ -113,31 +113,39 @@ def calculate(self, question, retrieved_contexts, answer, **kwargs):
             else ""
         )
 
-        prompt = {
-            "system_prompt": (
-                """
-Given a question, context, and answer, analyze each statement in the answer and classify if the statement can be attributed to the given context or not. Output JSON strictly in the following format.
-"""
-                + few_shot_prompt
-            ),
-            "user_prompt": ("question: " + question + "\ncontext: " + context + "\nanswer: " + answer),
-        }
-
-        content = self._llm.run(prompt)
-
-        try:
-            coverage = self.extract_attributed_from_broken_json(content)
-        except Exception as e:
-            print(f"{type(e).__name__} Error: {content}, skipping")
-            return {
-                "LLM_based_context_coverage": None,
-                "LLM_based_context_statements": content,
-            }
-
-        return {
-            "LLM_based_context_coverage": coverage,
-            "LLM_based_context_statements": content,
-        }
+        scores = []
+        for gt in ground_truths:
+            prompt = {
+                "system_prompt": (
+                    """
+Given a question, context, and answer, analyze each statement in the answer and classify if the statement can be attributed to the given context or not. Output JSON strictly in the following format.
+"""
+                    + few_shot_prompt
+                ),
+                "user_prompt": ("question: " + question + "\ncontext: " + context + "\nanswer: " + gt),
+            }
+
+            content = self._llm.run(prompt)
+
+            try:
+                coverage = self.extract_attributed_from_broken_json(content)
+            except Exception as e:
+                print(f"{type(e).__name__} Error: {content}, skipping")
+                scores.append(
+                    {
+                        "LLM_based_context_coverage": -1.0,
+                        "LLM_based_context_statements": content,
+                    }
+                )
+            else:
+                scores.append(
+                    {
+                        "LLM_based_context_coverage": coverage,
+                        "LLM_based_context_statements": content,
+                    }
+                )
+
+        return max(scores, key=lambda x: x["LLM_based_context_coverage"])
 
     @staticmethod
     def extract_attributed_from_broken_json(statements):
@@ -147,6 +155,6 @@ def extract_attributed_from_broken_json(statements):
             attributed_numbers = [int(num) for group in attributed_numbers for num in group if num]
         except Exception as e:
             print(f"{type(e).__name__} Error: {attributed_numbers}, skipping")
-            return None
-        coverage = sum(attributed_numbers) / len(attributed_numbers) if attributed_numbers else None
+            return -1.0
+        coverage = sum(attributed_numbers) / len(attributed_numbers) if attributed_numbers else -1.0
         return coverage
```
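For reference, a minimal, hypothetical usage sketch of the updated `calculate` signature. Only the `calculate(question, retrieved_contexts, ground_truths)` signature, the max-over-ground-truths behavior, and the -1.0 fallback come from this commit; the import path and example data are assumptions.

```python
# Hypothetical sketch; the import path is inferred from the changed module
# (continuous_eval/metrics/retrieval_LLM_based_metrics.py) and may differ.
from continuous_eval.metrics.retrieval_LLM_based_metrics import LLMBasedContextCoverage

# Constructor arguments as in __init__(model=None, use_few_shot=True) shown above.
metric = LLMBasedContextCoverage(use_few_shot=True)

result = metric.calculate(
    question="What is the capital of France?",
    retrieved_contexts=["Paris is the capital and most populous city of France."],
    ground_truths=[
        "Paris is the capital of France.",
        "Paris is the capital of France and Lyon is its second largest city.",
    ],
)

# The metric is computed once per ground truth; the dict with the highest
# LLM_based_context_coverage is returned (failed JSON parses score -1.0).
print(result["LLM_based_context_coverage"])
print(result["LLM_based_context_statements"])
```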
````diff
@@ -16,6 +16,8 @@ $$
 }
 $$
 
+This metric requires the LLM evaluator to output relatively complex JSON correctly. If the JSON cannot be parsed, the metric returns a score of -1.0.
+
 
 ### Example Usage
 
@@ -41,9 +43,9 @@ print(metric.calculate(**datum))
 ### Sample Output
 
 ```JSON
-{
-    'LLM_based_context_coverage': 0.5,
-    'LLM_based_context_statements':
+{
+    'LLM_based_context_coverage': 0.5,
+    'LLM_based_context_statements':
 {
     "classification": [
         {
@@ -52,8 +54,8 @@ print(metric.calculate(**datum))
             "Attributed": 1
         },
         {
-            "statement_2": "Lyon is the second largest city in France.",
-            "reason": "The context does not provide information about the ranking of Lyon in terms of size compared to other French cities.",
+            "statement_2": "Marseille is the second largest city in France.",
+            "reason": "This information is not provided in the context, which only mentions Paris and Lyon.",
             "Attributed": 0
         }
     ]
````
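As a sanity check on the sample output, the 0.5 coverage is simply the mean of the extracted `"Attributed"` flags, matching the `sum(...) / len(...)` computation in `extract_attributed_from_broken_json`. A tiny illustration:

```python
# One of the two classified statements is attributed to the context (flags 1 and 0),
# so coverage = (1 + 0) / 2 = 0.5; an unparsable result would instead yield -1.0.
attributed_flags = [1, 0]
coverage = sum(attributed_flags) / len(attributed_flags) if attributed_flags else -1.0
print(coverage)  # 0.5
```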
