[trainer] new kto mismatch pair creation strategy (#7509)

himalalps · web-flow · commit 6d6e0f44fccb · 2025-04-01T15:21:53.000+08:00
diff --git a/src/llamafactory/data/processor/feedback.py b/src/llamafactory/data/processor/feedback.py
@@ -83,8 +83,8 @@ def _encode_data_example(
         return input_ids, labels, kl_input_ids, kl_labels, kto_tag
 
     def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
-        # create unrelated input-output pairs for estimating the KL term by flipping the matched pairs
-        kl_response = examples["_response"][::-1]
+        # Create mismatched prompt-completion pairs for estimating the KL term by rotating the completions one position: each prompt is paired with the previous example's completion, and the first prompt with the last one's.
+        kl_response = [examples["_response"][-1]] + examples["_response"][:-1]
         model_inputs = defaultdict(list)
         for i in range(len(examples["_prompt"])):
             if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) < 2: