
Commit 96ba54e

Add descriptions of LigerJSD and modify the reference model with lerp
1 parent a5d0352 commit 96ba54e

4 files changed: 53 additions (+), 14 deletions (−)

benchmark/scripts/benchmark_jsd.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ def forward(
     ):
         log_p, log_q = log_p.to(torch.float), log_q.to(torch.float)
         log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1))
-        m = self.beta * torch.exp(log_p) + (1 - self.beta) * torch.exp(log_q)
+        m = torch.lerp(torch.exp(log_p), torch.exp(log_q), self.beta)
         loss = self.beta * self.kl(torch.log(m), log_p) + (1 - self.beta) * self.kl(
             torch.log(m), log_q
         )
```
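
For context (not part of the commit): `torch.lerp(start, end, weight)` computes `start + weight * (end - start)`, so the `end` argument is the one scaled by `weight`, and at `weight = 0.5` the order of the first two arguments makes no difference. A minimal sketch of that identity with illustrative tensors:

```python
import torch

beta = 0.5
log_p = torch.randn(4, 16).log_softmax(dim=-1)
log_q = torch.randn(4, 16).log_softmax(dim=-1)

# lerp(start, end, weight) = start + weight * (end - start)
#                          = (1 - weight) * start + weight * end
m_lerp = torch.lerp(torch.exp(log_p), torch.exp(log_q), beta)
m_sum = (1 - beta) * torch.exp(log_p) + beta * torch.exp(log_q)
assert torch.allclose(m_lerp, m_sum)
```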

src/liger_kernel/ops/jsd.py

Lines changed: 21 additions & 11 deletions
```diff
@@ -87,16 +87,18 @@ def jsd_backward(dX, grad_output):
 
 
 class LigerJSDFunction(torch.autograd.Function):
-    """
-    Class implementing the forward and backward pass for the JS Divergence using Triton, as defined by the following formula:
-
-    Parameters:
-    _input (tensor): predict values with shape (BT, V) in logspace
-    target (tensor): ground truth values with shape (BT, V) in logspace
-    beta (float): coefficient beta of generalized JSD in the open interval (0, 1)
-
-    Returns:
-    loss (tensor): JSD
+    r"""
+    This class implements the forward and backward pass for the generalized Jensen-Shannon Divergence.
+    .. math::
+        JSD(\beta)(P || Q)
+            = \beta * KLDiv(P || (\beta * P + (1 - \beta) * Q)) + (1 - \beta) * KLDiv(Q || (\beta * P + (1 - \beta) * Q))
+
+    .. note::
+        As all the other losses in PyTorch, this function expects the first argument,
+        :attr:`_input`, to be the predictions, the output of the student model, in log-space
+        and the second, :attr:`target`, to be the observations, the output of the teacher model, in log-space.
+        This differs from the standard mathematical notation :math:`JSD(P || Q)` where
+        :math:`P` denotes the teacher model and :math:`Q` denotes the student model.
     """
 
     @staticmethod
@@ -107,7 +109,15 @@ def forward(
         target: torch.Tensor,
         beta: float = 0.5,
     ) -> torch.Tensor:
-
+        """
+        Args:
+            _input (torch.Tensor): predict values with shape (BT, V) in logspace
+            target (torch.Tensor): ground truth values with shape (BT, V) in logspace
+            beta (float): coefficient beta of generalized JSD in the open interval (0, 1)
+
+        Returns:
+            loss (torch.Tensor): generalized JSD
+        """
         loss, dX = jsd_forward(_input, target, beta)
         ctx.save_for_backward(dX)
         return loss
```
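
To make the docstring's formula concrete, here is a minimal pure-PyTorch sketch of the generalized JSD. The function name `generalized_jsd_reference` and the `batchmean` reduction are illustrative assumptions; this is not the repo's Triton kernel:

```python
import torch
import torch.nn.functional as F

def generalized_jsd_reference(
    _input: torch.Tensor,  # student log-probabilities, shape (BT, V)
    target: torch.Tensor,  # teacher log-probabilities, shape (BT, V)
    beta: float = 0.5,
) -> torch.Tensor:
    log_q, log_p = _input, target
    # Mixture distribution M = beta * P + (1 - beta) * Q
    log_m = torch.log(beta * torch.exp(log_p) + (1 - beta) * torch.exp(log_q))
    # F.kl_div(input, target, log_target=True) computes KL(target || input),
    # with both arguments given in log-space.
    kl_pm = F.kl_div(log_m, log_p, reduction="batchmean", log_target=True)
    kl_qm = F.kl_div(log_m, log_q, reduction="batchmean", log_target=True)
    return beta * kl_pm + (1 - beta) * kl_qm
```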

src/liger_kernel/transformers/jsd.py

Lines changed: 29 additions & 0 deletions
````diff
@@ -4,6 +4,35 @@
 
 
 class LigerJSD(nn.Module):
+    r"""The generalized Jensen-Shannon Divergence.
+    .. math::
+        JSD(\beta)(P || Q)
+            = \beta * KLDiv(P || (\beta * P + (1 - \beta) * Q)) + (1 - \beta) * KLDiv(Q || (\beta * P + (1 - \beta) * Q))
+    .. note::
+        As all the other losses in PyTorch, this function expects the first argument,
+        :attr:`log_q`, to be the predictions, the output of the student model in log-space,
+        and the second, :attr:`log_p`, to be the observations, the output of the teacher model in log-space.
+        This differs from the standard mathematical notation :math:`JSD(P || Q)` where
+        :math:`P` denotes the teacher model and :math:`Q` denotes the student model.
+
+    Args:
+        beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: a scalar.
+
+    Examples:
+    ```python
+    >>> jsd = LigerJSD(beta=0.1)
+    >>> # input should be a distribution in the log space
+    >>> input = torch.randn(3, 5, requires_grad=True).log_softmax(dim=-1)
+    >>> target = torch.randn(3, 5, requires_grad=True).log_softmax(dim=-1)
+    >>> output = jsd(input, target)
+    ```
+    """
+
     def __init__(self, beta=0.5):
         super().__init__()
         assert (
````
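
A usage sketch for a distillation-style setting, mirroring the argument order described in the note above (student prediction first, teacher second, both in log-space); the import path and tensor shapes are assumptions for illustration:

```python
import torch
from liger_kernel.transformers import LigerJSD  # import path assumed

jsd = LigerJSD(beta=0.5)

# Hypothetical student/teacher logits for a batch of 4 examples over 128 classes.
student_logits = torch.randn(4, 128, requires_grad=True)
teacher_logits = torch.randn(4, 128)

log_q = student_logits.log_softmax(dim=-1)  # student predictions (first argument)
log_p = teacher_logits.log_softmax(dim=-1)  # teacher observations (second argument)

loss = jsd(log_q, log_p)
loss.backward()  # gradients flow only to student_logits
```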

test/transformers/test_jsd.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -24,7 +24,7 @@ def forward(
     ):
         log_p, log_q = log_p.to(torch.float), log_q.to(torch.float)
         log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1))
-        m = self.beta * torch.exp(log_p) + (1 - self.beta) * torch.exp(log_q)
+        m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta)
         loss = self.beta * self.kl(torch.log(m), log_p) + (1 - self.beta) * self.kl(
             torch.log(m), log_q
         )
@@ -36,7 +36,7 @@ def forward(
     [
         (2, 4096, 32000),  # llama2, mistral
         (2, 4096, 32000),  # llama2, mistral
-        # # weird shape
+        # weird shape
         (41, 401, 1271),
         pytest.param(
             1,
```
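
As an aside (illustrative, not part of the test suite): with the argument order used in this reference model, `torch.lerp` reproduces the original weighted sum exactly for any `beta`:

```python
import torch

beta = 0.3
log_p = torch.randn(8, 32).log_softmax(dim=-1)
log_q = torch.randn(8, 32).log_softmax(dim=-1)

# lerp(exp(log_q), exp(log_p), beta) = (1 - beta) * exp(log_q) + beta * exp(log_p)
m_lerp = torch.lerp(torch.exp(log_q), torch.exp(log_p), beta)
m_sum = beta * torch.exp(log_p) + (1 - beta) * torch.exp(log_q)
assert torch.allclose(m_lerp, m_sum)
```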
