diff --git a/example_data/qa.json b/example_data/qa.json
index 8007f4f4009beb9082d3b343ee5e7262f2544abd..147fa653b97aeef4eab735b9656a1adbb2ffced3 100644
--- a/example_data/qa.json
+++ b/example_data/qa.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4257e05de6242a319640bddf017391e2f8d9d347d3c1c440d85d23c1be13e6bd
-size 1283
+oid sha256:6b9555060dd6c7e9e5e76e95e87d47fe0155a64cc93ecf94fb6cba337bd7d3a3
+size 1413
diff --git a/local_evaluation.py b/local_evaluation.py
index b22f8c5f77e01c01f37eef98215a13cba46734b9..d3b6de6c152377d9945eef0dd1d748d34ccf6646 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -54,25 +54,20 @@ def parse_response(resp: str):
     except:
         return -1
 
-def evaluate_response(response):
-    """Evaluate the response to determine if it's missing or correct."""
-    is_missing = "Missing: True" in response
-    is_correct = "Accuracy: True" in response
-    return is_missing, is_correct
-
 def evaluate(dataset_path, model_name):
     qa = load_json_file(os.path.join(dataset_path, "qa.json"))
     web_results = load_json_file(os.path.join(dataset_path, "web.json"))
     openai_client = OpenAI()
     participant_model = UserModel()
-    character_limit = 50  # todo: Make character limit dynamic
 
     n_miss, n_correct, n_correct_exact = 0, 0, 0
     system_message = get_system_message()
 
     for query_dict, query_web_search_results in tqdm(zip(qa, web_results), total=len(qa)):
-        query, ground_truth = query_dict['q'], query_dict['fact_ans'].strip().lower()
-        prediction = participant_model.generate_answer(query, query_web_search_results, character_limit=character_limit)[:character_limit].strip().lower()
+        query, ground_truth = query_dict['query'], query_dict['answer'].strip().lower()
+        prediction = participant_model.generate_answer(query, query_web_search_results)
+        prediction = prediction.strip().lower()
+
         messages = [
             {"role": "system", "content": system_message},
             {"role": "user", "content": f"Question: {query}\n Ground truth: {ground_truth}\n Prediction: {prediction}\n"},
@@ -80,7 +75,7 @@ def evaluate(dataset_path, model_name):
         if prediction == "i don't know":
             n_miss += 1
             continue
-        if row["prediction"] == row["gold_ans"]:
+        if prediction == ground_truth:
             n_correct_exact += 1
             n_correct += 1
             continue
@@ -94,14 +89,19 @@
 
     n = len(qa)
     results = {
-        "Exact Accuracy": n_exact / n,
-        "Accuracy": n_correct / n,
-        "Hallucination": (n - n_correct - n_miss) / n
-        "Missing": n_miss / n,
-        "Total": n
+        "score": (2*n_correct + n_miss) / n - 1,
+        "exact_accuracy": n_correct_exact / n,
+        "accuracy": n_correct / n,
+        "hallucination": (n - n_correct - n_miss) / n,
+        "missing": n_miss / n,
+        "n_miss": n_miss,
+        "n_correct": n_correct,
+        "n_correct_exact": n_correct_exact,
+        "total": n,        
     }
     logger.info(results)
-    return (2*n_correct + n_miss) / n - 1
+    return results
 
 if __name__ == '__main__':
     DATASET_PATH = "example_data/"
diff --git a/models/dummy_model.py b/models/dummy_model.py
index 0927299b18de033606ebfb74141e9f59aed8677c..8dcb9a54bfb3890417fc0b75bfeefcdd2ba343a7 100644
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
@@ -17,5 +17,5 @@ class DummyModel:
             string response - Your answer in plain text, should be limited to the character limit, 
                               Any longer responses will be trimmed to meet the character limit
         """
-        answer = "I'm sorry, I can't help with that."
+        answer = "i don't know"
         return answer
\ No newline at end of file