diff --git a/docs/source/dpo_trainer.md b/docs/source/dpo_trainer.md index 863cefc6eb..d41fb13cc6 100644 --- a/docs/source/dpo_trainer.md +++ b/docs/source/dpo_trainer.md @@ -111,6 +111,7 @@ Several formulations of the objective have been proposed in the literature. Init | `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model and in fact the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. | | `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. | | `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors argue the logit transform can overfit and propose the identity transform to optimize preferences directly; TRL exposes this as `loss_type="ipo"`. | +| `"sigmoid_norm"` | The [SimPO](https://huggingface.co/papers/2405.14734) authors address the length bias in the original sigmoid loss by normalizing by the number of non-mask tokens; TRL exposes this as `loss_type="sigmoid_norm"`. | | `"exo_pair"` | The [EXO](https://huggingface.co/papers/2402.00856) authors propose reverse-KL preference optimization. `label_smoothing` must be strictly greater than `0.0`; a recommended value is `1e-3` (see Eq. 16 for the simplified pairwise variant). The full method uses `K>2` SFT completions and approaches PPO as `K` grows. | | `"nca_pair"` | The [NCA](https://huggingface.co/papers/2402.05369) authors shows that NCA optimizes the absolute likelihood for each response rather than the relative likelihood. | | `"robust"` | The [Robust DPO](https://huggingface.co/papers/2403.00409) authors propose an unbiased DPO loss under noisy preferences. 
Use `label_smoothing` in [`DPOConfig`] to model label-flip probability; valid values are in the range `[0.0, 0.5)`. | diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md index 6f8ba7daf3..8f956692a1 100644 --- a/docs/source/paper_index.md +++ b/docs/source/paper_index.md @@ -1121,6 +1121,25 @@ training_args = DPOConfig( ) ``` +### Length-Normalized DPO (Sigmoid Norm) + +**📜 Paper**: https://huggingface.co/papers/2405.14734 + +The length-normalized sigmoid loss addresses length bias in DPO by dividing chosen and rejected log-ratio scores by their respective completion lengths before computing the Bradley-Terry loss. This per-token normalization was introduced in [SimPO](https://huggingface.co/papers/2405.14734) as an average log-probability reward for a reference-free setting, and was later adopted for standard reference-model-based DPO in post-training recipes such as [Tulu 3](https://huggingface.co/papers/2411.15124) (Section 4.3). The loss is: + +$$ +\mathcal{L}_{\text{sigmoid\_norm}} = -\log\sigma\!\left(\beta \left({\color{red}\frac{1}{|y_w|}}\log\frac{\pi_\theta(y_w|x)}{\pi_{\text{ref}}(y_w|x)} - {\color{red}\frac{1}{|y_l|}}\log\frac{\pi_\theta(y_l|x)}{\pi_{\text{ref}}(y_l|x)}\right)\right), +$$ +which can be set with: + +```python +from trl import DPOConfig + +training_args = DPOConfig( + loss_type=["sigmoid_norm"], +) +``` + ### Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization **📜 Paper**: https://huggingface.co/papers/2411.10442 diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index 288dbb2ccb..7456005f51 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -266,6 +266,7 @@ def test_train_model(self): "sigmoid", "hinge", "ipo", + "sigmoid_norm", "exo_pair", "nca_pair", "robust", diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 979747dd07..fcbeaec672 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ 
-70,7 +70,7 @@ class DPOConfig(_BaseConfig): > Parameters that control the training loss_type (`list[str]`, *optional*, defaults to `["sigmoid"]`): - Type of loss to use. Possible values are: `'sigmoid'`, `'hinge'`, `'ipo'`, `'exo_pair'`, `'nca_pair'`, + Type of loss to use. Possible values are: `'sigmoid'`, `'hinge'`, `'ipo'`, `'sigmoid_norm'`, `'exo_pair'`, `'nca_pair'`, `'robust'`, `'bco_pair'`, `'sppo_hard'`, `'aot'`, `'aot_unpaired'`, `'apo_zero'`, `'apo_down'`, `'discopop'`, `'sft'`. If multiple loss types are provided, they will be combined using the weights specified in `loss_weights`. @@ -211,7 +211,7 @@ class DPOConfig(_BaseConfig): loss_type: list[str] = field( default_factory=lambda: ["sigmoid"], metadata={ - "help": "Type of loss to use. Possible values are: `'sigmoid'`, `'hinge'`, `'ipo'`, `'exo_pair'`, " + "help": "Type of loss to use. Possible values are: `'sigmoid'`, `'hinge'`, `'ipo'`, `'sigmoid_norm'`, `'exo_pair'`, " "`'nca_pair'`, `'robust'`, `'bco_pair'`, `'sppo_hard'`, `'aot'`, `'aot_unpaired'`, `'apo_zero'`, " "`'apo_down'`, `'discopop'`, `'sft'`. If multiple loss types are provided, they will be combined using " "the weights specified in `loss_weights`.", diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index 0ae3e90668..0003da66ea 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -1248,6 +1248,13 @@ def _compute_loss(self, model, inputs, return_outputs): # (Eq. 17) of the paper where beta is the regularization parameter for the IPO loss, denoted by τ. 
per_sequence_loss = (ipo_delta - 1 / (2 * self.beta)) ** 2 + elif loss_type == "sigmoid_norm": + chosen_mask, rejected_mask = completion_mask.chunk(2, dim=0) + chosen_avg_score = chosen_scores / chosen_mask.sum(dim=1).clamp(min=1.0) + rejected_avg_score = rejected_scores / rejected_mask.sum(dim=1).clamp(min=1.0) + delta = chosen_avg_score - rejected_avg_score + per_sequence_loss = -F.logsigmoid(self.beta * delta) + elif loss_type == "exo_pair": # Implements EXO-pref from the paper https://huggingface.co/papers/2402.00856, (Eq. 16) # Minimize KL(p_fθ || p_rh) for K=2; p_fθ = softmax(βπ * (log πθ − log π_ref)) over {chosen, rejected} @@ -1348,7 +1355,7 @@ def _compute_loss(self, model, inputs, return_outputs): else: raise ValueError( - f"Unknown loss type: {loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', " + f"Unknown loss type: {loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'sigmoid_norm', 'exo_pair', " "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_unpaired', 'apo_zero', 'apo_down', " "'discopop', 'sft']" )