
Commit d1339c9

Run factuality benchmark with low temp
1 parent bda1b1a

File tree

4 files changed, +7 -1 lines changed

fact.py

+5 -1

@@ -44,7 +44,11 @@ def main():
        for swap in (False, True):
            prompt = format_prompt(task, swap)
            print(f'{i}. {prompt}')
-           output = llm.create_completion(prompt, max_tokens=2)
+           output = llm.create_completion(
+               prompt,
+               max_tokens=20,
+               temperature=1e-6,
+           )
            answer = output['choices'][0]['text'].strip().split()[0]
            print(answer)
            result = dict(task, answer=answer, swap=swap)
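For reference, a minimal sketch of how the updated call behaves end to end with llama-cpp-python. The model path and prompt below are placeholders, not part of this commit, and the task-loading and prompt-formatting code from fact.py is omitted:

from llama_cpp import Llama

# Placeholder model path; the actual model file is not part of this commit.
llm = Llama(model_path="./models/llama-2-7b.ggmlv3.q4_0.bin")

prompt = "Q: Is Paris the capital of France? Answer yes or no.\nA:"

# temperature=1e-6 makes sampling effectively greedy, so repeated benchmark
# runs give near-deterministic answers; max_tokens=20 leaves room for a short
# answer plus any trailing tokens.
output = llm.create_completion(
    prompt,
    max_tokens=20,
    temperature=1e-6,
)

# Same extraction as fact.py: keep the first whitespace-separated token.
answer = output['choices'][0]['text'].strip().split()[0]
print(answer)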

requirements-llama-2.txt

+1
@@ -0,0 +1 @@
+llama-cpp-python==0.1.77  # Works with Llama 2 GGML

requirements-wizard.txt

+1
@@ -0,0 +1 @@
+llama-cpp-python==0.1.83  # Works with Wizard GGUF
File renamed without changes.
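The two pins track llama.cpp's GGML-to-GGUF format change: llama-cpp-python 0.1.77 still loads the older GGML model files, while 0.1.83 expects GGUF. A rough sketch of the difference, with placeholder model file names:

from llama_cpp import Llama

# Under requirements-llama-2.txt (llama-cpp-python==0.1.77): GGML weights.
# Placeholder file name.
llm_llama2 = Llama(model_path="./models/llama-2-13b-chat.ggmlv3.q4_0.bin")

# Under requirements-wizard.txt (llama-cpp-python==0.1.83): GGUF weights.
# Placeholder file name.
llm_wizard = Llama(model_path="./models/wizardlm-13b-v1.2.Q4_0.gguf")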
