The most influential parameters/settings for the quality of LLM output are top-p, top-k, temperature, repetition_penalty, and turn templates.
You can think of top-p and top-k as controlling the “vocabulary size” of a large language model at inference time.
Since these models predict the next token (word) by assigning a probability to every candidate token, we can control how the model picks the next token when several tokens are plausible.
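As a minimal sketch of that step, assuming a tiny five-token vocabulary with made-up logits, this is how raw model scores become next-token probabilities via a softmax (Python with NumPy; the numbers are purely illustrative):

```python
import numpy as np

def softmax(logits: np.ndarray) -> np.ndarray:
    """Turn raw model scores (logits) into a probability distribution."""
    shifted = logits - logits.max()   # subtract the max for numerical stability
    exps = np.exp(shifted)
    return exps / exps.sum()

# A made-up five-token vocabulary with made-up next-token logits.
logits = np.array([2.0, 1.0, 0.5, 0.2, -1.0])
probs = softmax(logits)
print(probs.round(2))  # -> [0.55 0.2  0.12 0.09 0.03]
```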
The top-p parameter keeps the smallest set of most probable tokens whose cumulative probability exceeds the threshold p.
The top-k parameter keeps only the k most probable tokens.
With a high top-p value (like 0.8), you allow more rarely used tokens with lower probability to appear, but with a low top-p value (like 0.15) you essentially remove them from the generation vocabulary.
With a small top-k like 1, you always sample the single most probable token (greedy decoding); with a larger top-k, you get more varied results.
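As a concrete illustration, here is a minimal sketch of both filters, reusing the toy distribution from above (the helper names `top_k_filter` and `top_p_filter` are hypothetical, not taken from any particular library):

```python
import numpy as np

def top_k_filter(probs: np.ndarray, k: int) -> np.ndarray:
    """Keep only the k most probable tokens; zero out and renormalize the rest."""
    cutoff = np.sort(probs)[-k]                   # probability of the k-th best token
    filtered = np.where(probs >= cutoff, probs, 0.0)
    return filtered / filtered.sum()

def top_p_filter(probs: np.ndarray, p: float) -> np.ndarray:
    """Keep the smallest set of top tokens whose cumulative probability exceeds p."""
    order = np.argsort(probs)[::-1]               # token indices, most probable first
    cumulative = np.cumsum(probs[order])
    keep = order[: np.searchsorted(cumulative, p) + 1]
    filtered = np.zeros_like(probs)
    filtered[keep] = probs[keep]
    return filtered / filtered.sum()

probs = np.array([0.55, 0.20, 0.12, 0.09, 0.03])   # toy distribution from above
print(top_k_filter(probs, k=2).round(2))   # -> [0.73 0.27 0.   0.   0.  ]
print(top_p_filter(probs, p=0.8).round(2)) # -> [0.63 0.23 0.14 0.   0.  ]
```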
Temperature comes into play after the pool of probable tokens has been selected by top-p or top-k: it controls how randomly the model samples from that pool.
What temperature actually does is reshape the token probabilities: the higher the temperature, the closer the tokens in the pool get to being equally likely, and thus the more random the result.
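A short sketch of that effect, again on the toy distribution (the helper `apply_temperature` is hypothetical; raising the probabilities to the power 1/T and renormalizing is mathematically the same as dividing the logits by T before the softmax):

```python
import numpy as np

def apply_temperature(probs: np.ndarray, temperature: float) -> np.ndarray:
    """Rescale a probability distribution by temperature.

    T < 1 sharpens the distribution toward the top token;
    T > 1 flattens it toward uniform (more random sampling).
    """
    scaled = probs ** (1.0 / temperature)
    return scaled / scaled.sum()

probs = np.array([0.55, 0.20, 0.12, 0.09, 0.03])
print(apply_temperature(probs, 0.5).round(2))  # sharper: [0.83 0.11 0.04 0.02 0.  ]
print(apply_temperature(probs, 2.0).round(2))  # flatter: [0.37 0.22 0.17 0.15 0.09]
```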
The repetition penalty is a parameter that tells the model how strongly to penalize tokens it has already used when generating text.
If the repetition penalty is high, the model is less likely to repeat what it has already said or get stuck in a loop generating the same sentence.
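One common scheme, used for example by the CTRL paper and by the `repetition_penalty` option in Hugging Face transformers, divides the logit of every already-generated token by the penalty when it is positive and multiplies it when negative. A minimal sketch of that scheme (the function name and toy values are mine):

```python
import numpy as np

def apply_repetition_penalty(logits: np.ndarray, generated_ids: list[int],
                             penalty: float) -> np.ndarray:
    """Make tokens that already appear in the generated text less likely.

    The logit of each previously generated token is divided by `penalty`
    if positive, or multiplied by it if negative, lowering its score.
    """
    adjusted = logits.copy()
    for token_id in set(generated_ids):
        if adjusted[token_id] > 0:
            adjusted[token_id] /= penalty
        else:
            adjusted[token_id] *= penalty
    return adjusted

logits = np.array([2.0, 1.0, 0.5, 0.2, -1.0])
print(apply_repetition_penalty(logits, generated_ids=[0, 4], penalty=1.3))
# token 0 drops from 2.0 to ~1.54, token 4 from -1.0 to -1.3
```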